In [None]:
%pip install nbimporter
%pip install scikit-misc

import nbimporter  
import gzip
import shutil
import os
import json 
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectFromModel
from main import generate_predictions


## Access SG-NEx data through AWS

In [None]:
# # list all samples that have processed data for RNA modification detection using m6Anet
!aws s3 ls --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/  

# saves all samples that have processed data for RNA modification detection using m6Anet under data directory
!aws s3 cp --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/ ../data/sg-nex-data/raw/

## Prediction of SG-NEx data using Random Forest

In [None]:
def generate_prediction_files(raw_directory, prediction_directory,
                              selector_path='../model/selector.joblib.gz',
                              classifier_path='../model/rf_classifier.joblib.gz',
                              predict_fn=None):
    """Run the prediction function on every JSON file below ``raw_directory``.

    One CSV is written to ``prediction_directory`` per JSON file. A folder that
    holds a single JSON file produces ``<folder>.csv``; a folder that holds
    several produces ``<folder>_<json stem>.csv`` for each, so the results do
    not overwrite one another.

    WARNING: this function is destructive. Every folder whose JSON files were
    processed is deleted, and finally the *parent* of ``raw_directory`` is
    deleted together with everything inside it.

    Parameters
    ----------
    raw_directory : str
        Directory that is walked recursively for ``*.json`` inputs.
    prediction_directory : str
        Directory receiving the CSV files; created if it does not exist.
    selector_path : str, optional
        Path forwarded as the second positional argument of the prediction function.
    classifier_path : str, optional
        Path forwarded as the third positional argument of the prediction function.
    predict_fn : callable, optional
        Called as ``predict_fn(input_path, selector_path, classifier_path,
        output_path, include_features=True)``. Defaults to the notebook-level
        ``generate_predictions``.
    """
    if predict_fn is None:
        # Resolved at call time so the default follows whatever the notebook imported
        predict_fn = generate_predictions

    # Ensure the output directory exists
    os.makedirs(prediction_directory, exist_ok=True)

    # Walk bottom-up: a folder is deleted once it has been processed, so its
    # sub-folders must be visited before that happens.
    for root, _dirs, files in os.walk(raw_directory, topdown=False):
        json_files = sorted(name for name in files if name.endswith('.json'))
        if not json_files:
            continue

        # normpath guards against a trailing separator yielding an empty name
        folder_name = os.path.basename(os.path.normpath(root))

        for file in json_files:
            input_path = os.path.join(root, file)

            # Keep the historical "<folder>.csv" name for the common single-file
            # case; disambiguate only when a folder holds several JSON files.
            if len(json_files) == 1:
                output_name = f"{folder_name}.csv"
            else:
                output_name = f"{folder_name}_{os.path.splitext(file)[0]}.csv"
            output_path = os.path.join(prediction_directory, output_name)

            print(f"Processing: {input_path}")
            print(f"Output will be saved to: {output_path}")

            # Call prediction function
            predict_fn(input_path, selector_path, classifier_path, output_path, include_features=True)

        # Remove the folder only after ALL of its JSON files were exported
        shutil.rmtree(root)
        print(f"Removed folder: {root}")

    # Remove the parent of raw_directory (e.g. '../data/sg-nex-data') and everything in it.
    # Skipped when there is no parent component or it is already gone, so that
    # re-running the cell does not crash.
    sgnex_directory = os.path.dirname(os.path.normpath(raw_directory))
    if sgnex_directory and os.path.isdir(sgnex_directory):
        shutil.rmtree(sgnex_directory)
        print(f"Removed sg-nex-data directory: {sgnex_directory}")

# Where the downloaded SG-NEx folders live, and where the prediction CSVs are written
raw_directory = '../data/sg-nex-data/raw'
prediction_directory = '../output'

generate_prediction_files(
    raw_directory=raw_directory,
    prediction_directory=prediction_directory,
)


In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Directory containing the prediction CSV files
prediction_directory = '../output'

# Cell lines recognised in the prediction file names (case-sensitive substring match)
CELL_LINES = ['A549', 'Hct116', 'K562', 'HepG2', 'MCF7']

# Dictionary to store the loaded dataframes by cell line (one dataframe per file)
cell_line_dataframes = {cell_line: [] for cell_line in CELL_LINES}

# Sorted so the row order of combined_data is reproducible; os.listdir order is arbitrary
for file_name in sorted(os.listdir(prediction_directory)):
    if not file_name.endswith('.csv'):
        continue

    # The first cell line (in CELL_LINES order) contained in the file name wins
    cell_line = next((name for name in CELL_LINES if name in file_name), None)
    if cell_line is None:
        # Previously such files were read and then silently dropped
        print(f"Skipping '{file_name}': no known cell line in the file name")
        continue

    # Load the CSV file and tag every row with its cell line
    df = pd.read_csv(os.path.join(prediction_directory, file_name))
    df['cell_line'] = cell_line
    cell_line_dataframes[cell_line].append(df)

# Flatten in cell-line order; fail with a clear message instead of pandas'
# "No objects to concatenate" when nothing was loaded.
loaded_frames = [frame for frames in cell_line_dataframes.values() for frame in frames]
if not loaded_frames:
    raise FileNotFoundError(
        f"No prediction CSV for any of {CELL_LINES} found in '{prediction_directory}'"
    )

# Concatenate all dataframes into a single dataframe
combined_data = pd.concat(loaded_frames, ignore_index=True)

combined_data.head()

# import os
# import pandas as pd

# # Directory containing the prediction CSV files
# prediction_directory = '../data/sg-nex-data/predictions'

# # Dictionary to store dataframes by cell line
# cell_line_dataframes = {
#     'A549': [],
#     'Hct116': [],
#     'K562': [],
#     'HepG2': [],
#     'MCF7': []
# }

# # Iterate through the files in the prediction directory
# for file_name in os.listdir(prediction_directory):
#     if file_name.endswith('.csv'):
#         file_path = os.path.join(prediction_directory, file_name)
        
#         # Load the CSV file into a dataframe
#         df = pd.read_csv(file_path)
        
#         # Check the file name to determine which cell line it belongs to
#         if 'A549' in file_name:
#             cell_line_dataframes['A549'].append(df)
#         elif 'Hct116' in file_name:
#             cell_line_dataframes['Hct116'].append(df)
#         elif 'K562' in file_name:
#             cell_line_dataframes['K562'].append(df)
#         elif 'HepG2' in file_name:
#             cell_line_dataframes['HepG2'].append(df)
#         elif 'MCF7' in file_name:
#             cell_line_dataframes['MCF7'].append(df)

# # Optionally, concatenate the dataframes for each cell line into a single dataframe
# for cell_line, dfs in cell_line_dataframes.items():
#     # Concatenate dataframes if the list is not empty
#     cell_line_dataframes[cell_line] = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

# # Now, cell_line_dataframes dictionary contains concatenated dataframes for each cell line

# # Example: Print the first few rows of each cell line dataframe to verify
# for cell_line, df in cell_line_dataframes.items():
#     print(f"First few rows of {cell_line} dataframe:")
#     print(df.head())
#     print("\n")


## Visualisations

### Boxplot to analyse features

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# List of features to plot
features = [
    '-1_dwelling_time_mean', '-1_dwelling_time_min', '-1_dwelling_time_max',
    '-1_standard_dev_mean', '-1_mean_current_mean', '-1_mean_current_min',
    '-1_mean_current_max', '0_dwelling_time_mean', '0_dwelling_time_min',
    '0_dwelling_time_max', '0_standard_dev_mean', '0_mean_current_mean',
    '0_mean_current_min', '0_mean_current_max', '+1_dwelling_time_mean',
    '+1_dwelling_time_min', '+1_dwelling_time_max', '+1_standard_dev_mean',
    '+1_mean_current_mean', '+1_mean_current_min', '+1_mean_current_max'
]

# Build the plotting frame from combined_data, which already carries a 'cell_line'
# column. cell_line_dataframes maps each cell line to a *list* of dataframes, so
# it cannot be used as if its values were dataframes.
required_columns = {'transcript_id', 'prediction', 'cell_line'}
if required_columns.issubset(combined_data.columns):
    # assign() returns a new frame, so combined_data itself is left untouched
    combined_df = combined_data.assign(
        # Map prediction 1 to Modified and 0 to Unmodified
        Modification=combined_data['prediction'].map({1: 'Modified', 0: 'Unmodified'})
    )
else:
    missing_columns = sorted(required_columns - set(combined_data.columns))
    print(f"combined_data is missing required columns: {missing_columns}")
    combined_df = pd.DataFrame()

# Create a combined boxplot for each feature
for feature in features:
    if feature not in combined_df.columns:
        print(f"Feature '{feature}' not found in combined dataframe.")
        continue

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(
        data=combined_df,
        x='cell_line',
        y=feature,
        hue='Modification',  # Modified vs Unmodified side by side per cell line
        palette='Set2',
        ax=ax
    )
    ax.set_title(f'Boxplot of {feature} for Different Cell Lines (Modified vs Unmodified)')
    ax.set_xlabel('Cell Line')
    ax.set_ylabel(f'{feature}')
    ax.legend(title='Modification')
    ax.tick_params(axis='x', labelrotation=45)
    # Limit y-axis to the 99th percentile to reduce the effect of outliers
    ax.set_ylim(0, combined_df[feature].quantile(0.99))
    fig.tight_layout()
    plt.show()
    # Release the figure; one figure per feature otherwise accumulates in memory
    plt.close(fig)

    


### Density Plot to analyse Positions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the rows whose score exceeds 0.9
high_score_mask = combined_data['score'] > 0.9
filtered_data = combined_data[high_score_mask]

# A single axes shared by all cell lines so their curves overlay
fig, ax = plt.subplots(figsize=(12, 8))

# One density line per cell line, in order of first appearance
for cell_line in filtered_data['cell_line'].unique():
    cell_line_rows = filtered_data[filtered_data['cell_line'] == cell_line]
    sns.kdeplot(
        data=cell_line_rows,
        x='transcript_position',
        label=cell_line,
        fill=False,      # lines only, no shaded area
        bw_adjust=0.5,   # adjusts the smoothness of the density curve
        ax=ax
    )

# Labels, title and legend
ax.set_xlabel('Transcript Position')
ax.set_ylabel('Density')
ax.set_title('Density Plot of Transcript Positions for Each Cell Line (Score > 0.9)')
ax.legend(title='Cell Line')
plt.show()


### Scatterplot to analyse positions

In [None]:
import pandas as pd
from plotnine import ggplot, aes, geom_point, geom_smooth, labs, theme_bw, theme

# Upper bound (exclusive) on the transcript positions shown in the scatterplots
MAX_TRANSCRIPT_POSITION = 20000

# filtered_data already holds only scores > 0.9; additionally restrict the positions
filtered_data_scatterplot = filtered_data[
    filtered_data['transcript_position'] < MAX_TRANSCRIPT_POSITION
]

# Create individual plots for each cell line
for cell in filtered_data_scatterplot['cell_line'].unique():
    subset = filtered_data_scatterplot[filtered_data_scatterplot['cell_line'] == cell]

    # Create the plot for the individual cell line
    plot = (
        ggplot(subset, aes(x='transcript_position', y='score'))
        + geom_point(size=0.5, alpha=0.5)
        + geom_smooth(method='loess', color='blue', span=0.3, se=True)  # Adjust span as needed
        + labs(
            title=f'Score by Transcript Position for Cell Line: {cell}',
            x='Transcript Position',
            y='Score'
        )
        + theme_bw()
        + theme(figure_size=(12, 6))
    )

    # display() renders the figure on both old and new plotnine releases;
    # print() only emits a "<ggplot: ...>" text line on recent versions.
    display(plot)

# Combined plot for all cell lines: points plus one LOESS line per cell line
overlay_plot = (
    ggplot(
        filtered_data_scatterplot,
        aes(x='transcript_position', y='score', color='cell_line', group='cell_line')
    )
    + geom_point(size=0.5, alpha=0.3)  # lower alpha keeps overlapping points readable
    + geom_smooth(method='loess', span=0.3, se=True)  # inherits the grouping from aes()
    + labs(
        title='Overlay of LOESS Smooth Lines for All Cell Lines',
        x='Transcript Position',
        y='Score'
    )
    + theme_bw()
    + theme(figure_size=(12, 6))
)

display(overlay_plot)


In [None]:
def _loess_overlay(data, show_confidence_band):
    """Build the LOESS-only overlay plot with one smooth line per cell line.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the 'transcript_position', 'score' and 'cell_line' columns.
    show_confidence_band : bool
        Passed to ``geom_smooth`` as ``se``; when False no band is drawn and
        the ``alpha`` setting has no visible effect.
    """
    return (
        ggplot(
            data,
            aes(
                x='transcript_position',
                y='score',
                color='cell_line',
                fill='cell_line',
                group='cell_line'
            )
        )
        # alpha controls the transparency of the confidence band
        + geom_smooth(method='loess', span=0.3, se=show_confidence_band, alpha=0.3)
        + labs(
            title='Overlay of LOESS Smooth Lines for All Cell Lines',
            x='Transcript Position',
            y='Score'
        )
        + theme_bw()
        + theme(figure_size=(12, 6))
    )


# LOESS smooth lines for all cell lines, with confidence bands.
# display() renders the figure on both old and new plotnine releases;
# print() only emits a "<ggplot: ...>" text line on recent versions.
overlay_plot = _loess_overlay(filtered_data_scatterplot, show_confidence_band=True)
display(overlay_plot)

# Same plot without the confidence bands
overlay_plot_1 = _loess_overlay(filtered_data_scatterplot, show_confidence_band=False)
display(overlay_plot_1)


In [None]:
# Number of unique transcript_id values per cell line, as a two-column frame
transcripts_per_cell_line = combined_data.groupby('cell_line')['transcript_id'].nunique()
distinct_transcripts = (
    transcripts_per_cell_line
    .rename('distinct_transcript_count')  # name the count column up front
    .reset_index()
)

# Show the resulting table
print(distinct_transcripts)
