In [None]:
%pip install nbimporter
import nbimporter
import main


## Access SG-NEx data through AWS

In [None]:
# list all samples that have processed data for RNA modification detection using m6Anet
!aws s3 ls --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/  

# Download the processed m6Anet data of every sample into ../data/sg-nex-data/raw (s3 sync only copies new or updated files)
!aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/ ../data/sg-nex-data/raw

In [None]:
import os
import json
import pandas as pd
import shutil

def process_sg_nex_json_files(data_directory, output_directory):
    """
    Converts every folder of line-delimited JSON under data_directory into one CSV

    For each folder that contains at least one .json file, every non-blank line of
    every such file is parsed as a JSON object, the records are flattened with
    main.flatten_json and written to <output_directory>/<folder name>.csv.
    The folder is deleted once its CSV has been written.

    Inputs:
    - data_directory: str
      Root directory that is searched recursively for .json files.
    - output_directory: str
      Directory that receives the CSV files. Created if it does not exist.

    Output:
    - None
      CSV files are written and the processed folders are removed from disk.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Walk bottom-up: a folder is only deleted after all of its sub-folders
    # have been visited, so removing it can never discard unprocessed data.
    for root, _dirs, files in os.walk(data_directory, topdown=False):
        json_files = sorted(file for file in files if file.endswith('.json'))
        if not json_files:
            continue

        # Gather the records of ALL JSON files in the folder before writing.
        # The CSV is named after the folder and the folder is removed afterwards,
        # so handling the files one by one would overwrite the CSV and then fail
        # on the second file because its folder no longer exists.
        data = []
        for file in json_files:
            file_path = os.path.join(root, file)
            print(f"Processing: {file_path}")
            with open(file_path, 'r') as f:
                for line in f:
                    if line.strip():  # skip blank lines (e.g. a trailing newline)
                        data.append(json.loads(line))

        # Flatten the data and convert to a DataFrame
        flattened_data = main.flatten_json(data)
        df = pd.DataFrame(flattened_data)

        # Name the CSV after the folder; normpath guards against a trailing separator
        folder_name = os.path.basename(os.path.normpath(root))
        output_path = os.path.join(output_directory, f"{folder_name}.csv")

        # Export as CSV
        df.to_csv(output_path, index=False)
        print(f"Saved: {output_path}")

        # Remove the folder only after the CSV is successfully exported
        shutil.rmtree(root)
        print(f"Removed folder: {root}")

# Convert the downloaded raw folders into one CSV per folder.
# Note: each raw folder is deleted once its CSV has been written.
data_directory = '../data/sg-nex-data/raw'
output_directory = '../data/sg-nex-data/processed'
process_sg_nex_json_files(
    data_directory=data_directory,
    output_directory=output_directory,
)


In [None]:
import pandas as pd

# Labelled sites; merged onto the features later via transcript_id / transcript_position
labels = pd.read_csv('../data/data.info.labelled')

# info() prints its summary itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
labels.info()
print(labels.head())  # Preview the first few rows of the data

In [None]:
# Signal features of dataset0; aggregated per transcript position in the next step
features = pd.read_csv('../data/dataset0.csv')

# info() prints its summary itself and returns None, so it is not wrapped in print()
features.info()
print(features.head())  # Preview the first few rows of the data

In [None]:
def aggregate_by_transcript_position(features):
    """
    Averages the signal features of all rows that share the same site

    Inputs:
    - features: pd.DataFrame
      Dataframe with transcript_id, transcript_position, sequence and the nine signal
      columns (dwelling_time, standard_dev, mean_current for the offsets -1, 0 and +1).

    Output:
    - pd.DataFrame
      One row per (transcript_id, transcript_position, sequence) holding the mean of
      every signal column, with the three grouping keys restored as ordinary columns.
    """
    site_keys = ['transcript_id', 'transcript_position', 'sequence']

    # Same nine columns, in the same order: all measures of -1, then 0, then +1
    signal_columns = [
        f'{offset}_{measure}'
        for offset in ('-1', '0', '+1')
        for measure in ('dwelling_time', 'standard_dev', 'mean_current')
    ]

    per_site = features.groupby(site_keys)
    site_means = per_site[signal_columns].mean()
    return site_means.reset_index()

In [None]:
# Collapse the features to one row per (transcript_id, transcript_position, sequence)
features_agg = aggregate_by_transcript_position(features)
# info() prints its summary itself and returns None, so it is not wrapped in print()
features_agg.info()

In [None]:
def add_gene_and_label(features, labels):
    """
    Attaches the columns of labels (gene_id and label) to the features dataframe

    Inputs:
    - features: pd.DataFrame
      Feature table; must contain transcript_id and transcript_position.
    - labels: pd.DataFrame
      Table with gene_id, transcript_id, transcript_position, and label.

    Output:
    - pd.DataFrame
      Inner join of both tables on (transcript_id, transcript_position): only sites
      present in both inputs are kept, in the row order of features.
    """
    join_keys = ['transcript_id', 'transcript_position']
    return features.merge(labels, how='inner', on=join_keys)


In [None]:
# Keep only the sites that have a label and attach gene_id / label to them
features_labelled = add_gene_and_label(features_agg, labels)
# info() prints its summary itself and returns None, so it is not wrapped in print()
features_labelled.info()

In [None]:
# features_labelled.to_csv('../data/features_labelled.csv',index=False)

In [None]:
def train_test_split_by_gene_id(features_labelled, features_columns, test_size=0.2, random_state=42):
    """
    Performs train test split based on gene_id, so that all rows of a gene end up
    on the same side of the split.

    Inputs:
    - features_labelled: pd.DataFrame
      Features dataframe with added columns gene_id and label. Must also contain
      transcript_id, transcript_position and every column in features_columns.
    - features_columns: list of str
      Names of the columns used as model inputs.
    - test_size: float, default 0.2
      Passed to sklearn's train_test_split. It is applied to the unique genes,
      so the share of rows in the test set can differ from this value.
    - random_state: int, default 42
      Seed passed to sklearn's train_test_split to make the gene split reproducible.

    Output (tuple, in this order):
    - X_train: pd.DataFrame
    - X_test: pd.DataFrame
    - y_train: pd.Series
    - y_test: pd.Series
    - id_train: pd.DataFrame with transcript_id and transcript_position of the training rows
    - id_test: pd.DataFrame with transcript_id and transcript_position of the test rows
    """

    from sklearn.model_selection import train_test_split
    df = features_labelled

    # Get unique genes
    unique_genes = df['gene_id'].unique()

    # Perform the train-test split on genes rather than rows
    genes_train, genes_test = train_test_split(
        unique_genes, test_size=test_size, random_state=random_state
    )

    # Split the dataset based on the gene split
    train_data = df[df['gene_id'].isin(genes_train)]
    test_data = df[df['gene_id'].isin(genes_test)]

    # Create the identifier, feature and target variables for training and testing
    id_columns = ['transcript_id', 'transcript_position']
    id_train = train_data[id_columns]
    X_train = train_data[features_columns]
    y_train = train_data['label']
    id_test = test_data[id_columns]
    X_test = test_data[features_columns]
    y_test = test_data['label']

    # Output the shapes to verify the split
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Test Features Shape: {X_test.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Test Labels Shape: {y_test.shape}")
    return (X_train, X_test, y_train, y_test, id_train, id_test)

In [None]:
# The nine model inputs: every measure for the offsets -1, 0 and +1 (in that order)
features_columns = [
    f'{offset}_{measure}'
    for offset in ('-1', '0', '+1')
    for measure in ('dwelling_time', 'standard_dev', 'mean_current')
]
(
    X_train, X_test,
    y_train, y_test,
    id_train, id_test,
) = train_test_split_by_gene_id(features_labelled, features_columns)

In [None]:
def balance_train_data(X_train,y_train):
    """
    Oversamples the training data with SMOTE to account for the imbalanced labels
    and prints the label counts before and after resampling.

    Inputs:
    - X_train: pd.DataFrame
      Training features.
    - y_train: pd.Series
      Training labels (presumably 0/1 with the positive class in the minority).

    Output:
    - X_train_resampled: resampled training features
    - y_train_resampled: resampled training labels with balanced classes,
      ie the same number of 0s and 1s
    """

    from imblearn.over_sampling import SMOTE

    counts_before = pd.Series(y_train).value_counts()
    print('Label distribution before resampling:')
    print(counts_before)

    # Fixed seed keeps the synthetic samples reproducible between runs
    oversampler = SMOTE(k_neighbors=5, random_state=42)
    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

    counts_after = pd.Series(y_train_resampled).value_counts()
    print('Label distribution after resampling:')
    print(counts_after)

    return X_train_resampled, y_train_resampled

In [None]:
# Only the training split is resampled; the test split keeps its original label distribution
X_train_resampled, y_train_resampled = balance_train_data(
    X_train=X_train,
    y_train=y_train,
)

In [None]:
# Score X with a fitted classifier and attach the site identifiers to the result
def predict(classifier, id, X):
    """
    Predicts labels and positive-class probabilities for X

    Inputs:
    - classifier: fitted model exposing predict and predict_proba
    - id: pd.DataFrame
      Identifier columns of the rows in X (same number of rows, same order).
    - X: pd.DataFrame
      Features to score.

    Output:
    - pd.DataFrame
      The columns of id followed by 'prediction' and 'probability', with a fresh 0..n-1 index.
    """
    predicted_label = classifier.predict(X)
    # second column of predict_proba: probability estimate for y=1
    positive_probability = classifier.predict_proba(X)[:, 1]

    y_out = pd.DataFrame({'prediction': predicted_label, 'probability': positive_probability})

    # Both parts are re-indexed so they are joined by row position, not by the original index labels
    id_part = id.reset_index(drop=True)
    score_part = y_out.reset_index(drop=True)
    return pd.concat((id_part, score_part), axis=1)

In [None]:
# Model evaluation
def evaluate(y_test, predict_df):
    """
    Prints classification metrics and plots the confusion matrix, ROC curve and
    precision-recall curve of a set of predictions.

    Inputs:
    - y_test: array-like
      True binary labels, in the same row order as predict_df.
    - predict_df: pd.DataFrame
      Output of predict(); must contain the columns 'prediction' and 'probability'.

    Output:
    - None
      Metrics are printed and three figures are shown.
    """
    # Imported locally, like the other helpers in this notebook, so the function
    # also works after Restart Kernel -> Run All.
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import (
        accuracy_score, auc, balanced_accuracy_score, confusion_matrix, f1_score,
        precision_recall_curve, precision_score, recall_score, roc_auc_score, roc_curve,
    )

    y_pred = predict_df['prediction']
    y_prob = predict_df['probability']
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test,  y_pred)
    balancedaccuracy = balanced_accuracy_score(y_test,  y_pred)
    f1score = f1_score(y_test,  y_pred) #F1 is a good scoring metric for imbalanced data when more attention is needed on the positives
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob) # Computes ROC AUC score using probabilities of positive class
    precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_prob)
    pr_auc = auc(recall_vals, precision_vals) # Area under the precision-recall curve

    print(f"Accuracy = {round(accuracy, ndigits=3)}")
    print(f"Balanced Accuracy = {round(balancedaccuracy, ndigits=3)}")
    print(f"f1 score = {round(f1score, ndigits=3)}")
    print(f"Precision = {round(precision, ndigits=3)}")
    print(f"Recall = {round(recall, ndigits=3)}")
    print(f"ROC AUC = {round(roc_auc, ndigits=3)}")
    print(f"PR AUC = {round(pr_auc, ndigits=3)}")

    # Plot confusion matrix; the cells are integer counts, hence fmt="d"
    plt.figure(figsize=(2,2))
    sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('Confusion Matrix')
    plt.show()

    # Plot ROC curve. The larger figure is created only after the confusion matrix
    # has been shown; creating it earlier would display an extra, empty figure.
    plt.figure(figsize=(5,5))
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right')  # without a legend the AUC label is never displayed
    plt.show()

    # Plot Precision-Recall curve
    plt.plot(recall_vals, precision_vals, label='PR curve (AUC = %0.2f)' % pr_auc)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='lower left')
    plt.show()

In [None]:
# Logistic Regression Model
def LR(X_train,y_train):
    """
    Fits a class-weighted logistic regression

    Inputs:
    - X_train: pd.DataFrame of training features
    - y_train: training labels

    Output:
    - The fitted sklearn LogisticRegression classifier
    """
    # Imported locally, like the other helpers in this notebook, so the function
    # also works after Restart Kernel -> Run All.
    from sklearn.linear_model import LogisticRegression

    # class_weight='balanced' re-weights the classes to compensate for label imbalance
    classifier = LogisticRegression(max_iter=1000, verbose = 1,  random_state = 123, class_weight = 'balanced')
    classifier.fit(X_train, y_train)
    return classifier

In [None]:
# ML workflow for Logistic Regression - without SMOTE
# Evaluated on the held-out gene split (X_test, y_test, id_test) returned by
# train_test_split_by_gene_id, so ids, features and labels stay row-aligned.
LR_classifier = LR(X_train, y_train)
result = predict(LR_classifier, id_test, X_test)
evaluate(y_test, result)

In [None]:
# ML workflow for Logistic Regression - with SMOTE
# Trained on the resampled data, evaluated on the untouched held-out gene split
# (X_test, y_test, id_test) returned by train_test_split_by_gene_id.
LR_classifier = LR(X_train_resampled, y_train_resampled)
result = predict(LR_classifier, id_test, X_test)
evaluate(y_test, result)


In [None]:
def RF(X_train, y_train):
    """
    Fits a class-weighted random forest with 100 trees

    Inputs:
    - X_train: pd.DataFrame of training features
    - y_train: training labels

    Output:
    - The fitted sklearn RandomForestClassifier
    """
    # Imported locally, like the other helpers in this notebook, so the function
    # also works after Restart Kernel -> Run All.
    from sklearn.ensemble import RandomForestClassifier

    # class_weight='balanced' re-weights the classes to compensate for label imbalance
    classifier = RandomForestClassifier(n_estimators=100, random_state=123, class_weight='balanced', verbose=1)
    classifier.fit(X_train, y_train)
    return classifier

In [None]:
# ML workflow for Random Forest - without SMOTE
# Evaluated on the held-out gene split (X_test, y_test, id_test) returned by
# train_test_split_by_gene_id, so ids, features and labels stay row-aligned.
RF_classifier = RF(X_train, y_train)
result = predict(RF_classifier, id_test, X_test)
evaluate(y_test, result)

In [None]:
# ML workflow for Random Forest - with SMOTE
# Trained on the resampled data, evaluated on the untouched held-out gene split
# (X_test, y_test, id_test) returned by train_test_split_by_gene_id.
RF_classifier = RF(X_train_resampled, y_train_resampled)
result = predict(RF_classifier, id_test, X_test)
evaluate(y_test, result)

In [None]:
# Score every aggregated site of dataset0, labelled or not
site_id_columns = ['transcript_id', 'transcript_position']
X_test_0 = features_agg.loc[:, features_columns]
id_test_0 = features_agg.loc[:, site_id_columns]

In [None]:
# Predict with the most recently fitted logistic regression, rank the sites by
# probability (highest first) and keep only the identifier and score columns
result = (
    predict(LR_classifier, id_test_0, X_test_0)
    .sort_values(by='probability', ascending=False)
    .rename(columns={'probability': 'score'})
    [['transcript_id', 'transcript_position', 'score']]
)


In [None]:
# Write the ranked scores to disk without the dataframe index
output_csv_path = '../data/geneiuses_dataset0_1.csv'
result.to_csv(path_or_buf=output_csv_path, index=False)

print("Results exported to {}".format(output_csv_path))


## Prediction of SG-NEx data using Random Forest

In [None]:
import pandas as pd

# Read one processed sample to inspect which columns it provides
sample_csv_path = '../data/sg-nex-data/processed/SGNex_A549_directRNA_replicate5_run1.csv'
df = pd.read_csv(sample_csv_path)

# Show the column headers as a plain list
print(list(df.columns))


In [None]:
from joblib import load

# Load a previously saved classifier from the working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code -
# only load rf_classifier.joblib when it comes from a trusted source.
# NOTE(review): this rebinds RF_classifier, replacing the model fitted earlier in
# this notebook; the saved model presumably expects the aggregated
# *_mean / *_min / *_max feature columns used in the prediction step - confirm.
RF_classifier = load('rf_classifier.joblib')  # Load the model


In [None]:
import os
import pandas as pd

def predict_processed_csv_files(processed_directory, prediction_directory, classifier=None):
    """
    Scores every processed CSV file and writes one prediction CSV per input file

    Each CSV in processed_directory is read, aggregated with
    main.aggregate_by_transcript_position, scored with main.predict and written to
    prediction_directory as <input name>_prediction.csv with the columns
    transcript_id, transcript_position and score, sorted by score (highest first).

    Inputs:
    - processed_directory: str
      Directory containing the processed feature CSV files.
    - prediction_directory: str
      Directory that receives the prediction CSV files. Created if it does not exist.
    - classifier: fitted model with predict and predict_proba, optional
      Defaults to the notebook-level RF_classifier when omitted.

    Output:
    - None
      Prediction CSV files are written to prediction_directory.
    """
    if classifier is None:
        # Fall back to the model loaded earlier in the notebook
        classifier = RF_classifier

    # Ensure the output directory exists
    os.makedirs(prediction_directory, exist_ok=True)

    # Columns used for prediction. They do not depend on the file being processed,
    # so they are defined once instead of on every loop iteration.
    features_columns = [
        '-1_dwelling_time_mean', '-1_dwelling_time_min', '-1_dwelling_time_max',
        '-1_standard_dev_mean',
        '-1_mean_current_mean', '-1_mean_current_min', '-1_mean_current_max',
        '0_dwelling_time_mean', '0_dwelling_time_min', '0_dwelling_time_max',
        '0_standard_dev_mean',
        '0_mean_current_mean', '0_mean_current_min', '0_mean_current_max',
        '+1_dwelling_time_mean', '+1_dwelling_time_min', '+1_dwelling_time_max',
        '+1_standard_dev_mean',
        '+1_mean_current_mean', '+1_mean_current_min', '+1_mean_current_max',
    ]

    # Sorted so the files are processed in a reproducible order
    for file_name in sorted(os.listdir(processed_directory)):
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(processed_directory, file_name)
        print(f"Processing: {file_path}")

        # Read the features from the CSV file
        features = pd.read_csv(file_path)

        # Aggregate the features by transcript position
        features_agg = main.aggregate_by_transcript_position(features)

        # Prepare the data for prediction
        X_test = features_agg[features_columns]
        id_test = features_agg[['transcript_id', 'transcript_position']]

        # Make predictions and keep the ranked scores only
        result = main.predict(classifier, id_test, X_test)
        result = result.sort_values(by='probability', ascending=False)
        result = result.rename(columns={'probability': 'score'})
        result = result[['transcript_id', 'transcript_position', 'score']]

        # Construct the output CSV file name. Only the final extension is replaced;
        # str.replace would rewrite every '.csv' occurring inside the name.
        stem, _extension = os.path.splitext(file_name)
        output_csv_path = os.path.join(prediction_directory, f"{stem}_prediction.csv")

        # Export the results to the output directory
        result.to_csv(output_csv_path, index=False)
        print(f"Results exported to {output_csv_path}")

# Score every processed sample and store the ranked predictions next to the processed data
processed_directory = '../data/sg-nex-data/processed'
prediction_directory = '../data/sg-nex-data/predictions'
predict_processed_csv_files(
    processed_directory=processed_directory,
    prediction_directory=prediction_directory,
)


In [None]:
import os
import pandas as pd

# Folder holding the prediction CSV files written by the previous step
prediction_directory = '../data/sg-nex-data/predictions'

# One bucket of prediction dataframes per cell line, keyed by the
# substring that identifies the cell line in a file name
cell_line_dataframes = {
    name: [] for name in ('A549', 'Hct116', 'K562', 'HepG2', 'MCF7')
}

# Sort every prediction CSV into the bucket of its cell line
for file_name in os.listdir(prediction_directory):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(prediction_directory, file_name)
    df = pd.read_csv(file_path)

    # The first cell line (in dictionary order) whose name occurs in the file
    # name receives the dataframe; files matching no cell line are dropped.
    for cell_line in cell_line_dataframes:
        if cell_line in file_name:
            cell_line_dataframes[cell_line].append(df)
            break

# Collapse each bucket into a single dataframe (an empty one when nothing matched)
for cell_line, dfs in cell_line_dataframes.items():
    if dfs:
        cell_line_dataframes[cell_line] = pd.concat(dfs, ignore_index=True)
    else:
        cell_line_dataframes[cell_line] = pd.DataFrame()

# cell_line_dataframes now maps each cell line to one concatenated dataframe;
# print the first few rows of each to verify
for cell_line, df in cell_line_dataframes.items():
    print(f"First few rows of {cell_line} dataframe:")
    print(df.head())
    print("\n")
