In [None]:
%pip install pandas
%pip install scikit-learn
%pip install imblearn
%pip install matplotlib
%pip install seaborn


In [None]:
import gzip
import shutil
import pandas as pd
import json


# Reading and Parsing JSON

In [None]:
# Specify the input and output file names (adjust file paths if needed)
input_file = '../data/dataset0.json.gz'  
output_file = '../data/dataset0.json'    

# Unzip the file
def unzip_file(input, output):
    """Decompress a gzip archive into a plain file.

    Parameters
    ----------
    input : str or path-like
        Path of the gzip-compressed file to read.
    output : str or path-like
        Path of the decompressed file to write; an existing file is overwritten.

    Notes
    -----
    The parameter names shadow the ``input`` builtin inside this function.
    They are kept unchanged so existing positional and keyword callers still work.
    """
    # Read from / write to the paths that were passed in, not the notebook-level
    # input_file / output_file globals, so the helper works for any dataset.
    with gzip.open(input, 'rb') as f_in, open(output, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    print("File unzipped successfully.")

unzip_file(input_file,output_file)

In [None]:
input_file = '../data/dataset0.json'

# Load the unzipped file as JSON lines: every line holds one JSON object.
# `data` has to exist before flatten_json(data) is called further down in this
# cell, otherwise a fresh kernel fails with a NameError.
data = []
with open(input_file, 'r') as file:
    for line in file:
        # Parse each line as a JSON object
        data.append(json.loads(line))
def flatten_json(data):
    """Flatten parsed JSON-lines records into one dict per feature set.

    Each element of ``data`` is iterated as a nested mapping
    ``{transcript_id: {position: {sequence: [feature_set, ...]}}}``.
    Every feature set is indexed at positions 0-8 and contributes one row that
    holds the three identifiers plus nine named signal values
    (dwelling time, standard deviation and mean current for the -1, 0 and +1
    positions, in that order).

    Returns a list of dicts suitable for ``pd.DataFrame``.
    """
    signal_columns = (
        '-1_dwelling_time', '-1_standard_dev', '-1_mean_current',
        '0_dwelling_time', '0_standard_dev', '0_mean_current',
        '+1_dwelling_time', '+1_standard_dev', '+1_mean_current',
    )

    rows = []
    for record in data:
        for tid, by_position in record.items():
            for pos, by_sequence in by_position.items():
                for kmer, reads in by_sequence.items():
                    for read in reads:
                        flat = {
                            'transcript_id': tid,
                            'transcript_position': pos,
                            'sequence': kmer,
                        }
                        # Index explicitly so a short feature set still raises IndexError.
                        for idx, column in enumerate(signal_columns):
                            flat[column] = read[idx]
                        rows.append(flat)
    return rows

# Flatten the data
flattened_data = flatten_json(data)

# Convert to a DataFrame
df = pd.DataFrame(flattened_data)

In [None]:
print(df.head())
print(df.info())

In [None]:
# Export as csv
path = '../data/dataset0.csv'
df.to_csv(path, index=False)

In [None]:
import os
import pandas as pd
import json

def flatten_json(data):
    """Return one flat row (dict) per feature set found in ``data``.

    ``data`` is iterated as a sequence of nested mappings of the form
    ``{transcript_id: {position: {sequence: [feature_set, ...]}}}``.
    Each feature set is indexed at 0-8; the values are stored under the
    dwelling time / standard deviation / mean current column names for the
    -1, 0 and +1 positions. The identifiers come first in every row.
    """
    value_names = [
        f'{offset}_{measure}'
        for offset in ('-1', '0', '+1')
        for measure in ('dwelling_time', 'standard_dev', 'mean_current')
    ]

    def _as_row(transcript_id, position, sequence, feature_set):
        # Identifier keys first, then the nine signal values in index order.
        ids = {
            'transcript_id': transcript_id,
            'transcript_position': position,
            'sequence': sequence,
        }
        values = {name: feature_set[i] for i, name in enumerate(value_names)}
        return {**ids, **values}

    return [
        _as_row(transcript_id, position, sequence, feature_set)
        for entry in data
        for transcript_id, positions in entry.items()
        for position, sequences in positions.items()
        for sequence, features in sequences.items()
        for feature_set in features
    ]

def process_sg_nex_json_files(data_directory, output_directory):
    """Convert every ``.json`` file found under ``data_directory`` into a CSV.

    The directory tree is walked recursively. Each file is read as JSON lines
    (one object per line), flattened with ``flatten_json`` and written to
    ``<output_directory>/<folder name>.csv``, where the folder name is the
    directory that directly contains the JSON file.

    Parameters
    ----------
    data_directory : str
        Root directory to search for ``.json`` files.
    output_directory : str
        Directory that receives the CSV files; it is created if missing.

    Notes
    -----
    The CSV is named after the containing folder, not the file, so several
    JSON files in one folder write to the same CSV and the last one processed
    overwrites the others.
    """
    # DataFrame.to_csv does not create missing directories.
    os.makedirs(output_directory, exist_ok=True)

    # Iterate through each folder in the data directory
    for root, dirs, files in os.walk(data_directory):
        for file in files:
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(root, file)

            # Parse the file as JSON lines; blank lines (e.g. a trailing
            # newline at the end of the file) are skipped instead of crashing json.loads.
            with open(file_path, 'r') as f:
                data = [json.loads(line) for line in f if line.strip()]

            # Flatten the data and convert to a DataFrame
            df = pd.DataFrame(flatten_json(data))

            # Name the CSV after the containing folder. abspath removes a
            # trailing separator (which would otherwise give an empty name and
            # a file called ".csv") and resolves relative roots such as ".".
            folder_name = os.path.basename(os.path.abspath(root))
            output_path = os.path.join(output_directory, f"{folder_name}.csv")

            # Export as CSV
            df.to_csv(output_path, index=False)
            print(f"Saved: {output_path}")

# Example usage
data_directory = '../data'  
output_directory = '../data'
process_sg_nex_json_files(data_directory, output_directory)


# Load labels and features, perform aggregation

## Read in labels and features

In [None]:
labels = pd.read_csv('../data/data.info.labelled')

print(labels.info())  # To get summary information about the DataFrame
print(labels.head())  # Preview the first few rows of the data

In [None]:
features = pd.read_csv('../data/dataset0.csv')

print(features.info()) 
print(features.head())  

## Aggregate and label features

In [None]:
def aggregate_by_transcript_position(features):
    """Collapse per-read rows into one row per (transcript, position, sequence).

    For each of the -1, 0 and +1 positions, dwelling time and mean current are
    summarised by mean, min and max, and the standard deviation by its mean
    only. The two-level column index produced by the aggregation is flattened
    to ``<column>_<statistic>`` (for example ``'0_mean_current_max'``); the
    three grouping columns keep their original names.

    Returns the aggregated DataFrame with a fresh integer index.
    """
    group_keys = ['transcript_id', 'transcript_position', 'sequence']

    # Statistics per signal type; the same set is applied at every position.
    stats_by_signal = {
        'dwelling_time': ['mean', 'min', 'max'],
        'standard_dev': ['mean'],
        'mean_current': ['mean', 'min', 'max'],
    }
    agg_spec = {
        f'{offset}_{signal}': list(stats)
        for offset in ('-1', '0', '+1')
        for signal, stats in stats_by_signal.items()
    }

    grouped = features.groupby(group_keys).agg(agg_spec)
    features_agg = grouped.reset_index()

    # Flatten (column, statistic) pairs; grouping keys have an empty second level.
    flat_names = []
    for base, stat in features_agg.columns:
        flat_names.append('_'.join((base, stat)).strip() if stat else base)
    features_agg.columns = flat_names

    return features_agg

In [None]:
def one_hot_encode_sequence(features_agg, column='sequence'):
    """One-hot encode the flanking letters of each sequence string.

    Every value in ``features_agg[column]`` is split into single characters,
    the character at index 3 (the middle letter of a 7-letter sequence) is
    dropped, and the remaining six positions are one-hot encoded with the
    prefixes pos1, pos2, pos3, pos5, pos6 and pos7.

    Returns a DataFrame that contains only the indicator columns (for example
    ``pos1_A``); the other columns of ``features_agg`` are not included.
    """
    middle_index = 3
    kept_indices = [0, 1, 2, 4, 5, 6]
    position_prefixes = ['pos1', 'pos2', 'pos3', 'pos5', 'pos6', 'pos7']

    # One column per character position, indexed like features_agg.
    letters = features_agg[column].apply(lambda kmer: pd.Series(list(kmer)))
    flanking_letters = letters.drop(columns=[middle_index])

    # get_dummies creates one indicator column per (position, letter) pair seen in the data.
    return pd.get_dummies(
        flanking_letters,
        prefix=position_prefixes,
        columns=kept_indices,
    )


In [None]:
# Aggregate the per-read features first: features_agg is required by the
# one-hot encoding below and by the labelling and prediction cells later on.
features_agg = aggregate_by_transcript_position(features)
features_agg_with_seq = one_hot_encode_sequence(features_agg)
features_agg_with_seq.info()  # info() prints its report itself and returns None
print(features_agg_with_seq.head())

In [None]:
def add_gene_and_label(features, labels):
    """
    Attach the label information to the feature rows.

    Inputs:
    - features: pd.DataFrame
      Feature table; must contain transcript_id and transcript_position.
    - labels: pd.DataFrame
      Label table; must contain transcript_id and transcript_position. Its
      remaining columns (the notebook later uses gene_id and label) are
      carried over to the result.

    Output:
    - pd.DataFrame
      Inner join of both tables on (transcript_id, transcript_position), so
      feature rows without a matching label row are dropped.
    """
    join_keys = ['transcript_id', 'transcript_position']
    return features.merge(labels, how='inner', on=join_keys)


In [None]:
features_labelled = add_gene_and_label(features_agg, labels)
print(features_labelled.info())

In [None]:
# features_labelled.to_csv('../data/features_labelled.csv',index=False)

# Train Test Split

In [None]:
def train_test_split_by_gene_id(features_labelled, features_columns):
    """
    Splits the labelled features into train and test sets by gene.

    The unique gene_id values are split 80/20 (random_state=42) and every row
    follows its gene, so rows of one gene never end up on both sides.

    Inputs:
    - features_labelled: pd.DataFrame
      Must contain gene_id, transcript_id, transcript_position, label and all
      columns named in features_columns.
    - features_columns: list of str
      Columns used as model inputs.

    Output (tuple, in this order):
    - X_train: pd.DataFrame
    - X_test: pd.DataFrame
    - y_train: pd.Series (the label column)
    - y_test: pd.Series (the label column)
    - id_train: pd.DataFrame with transcript_id and transcript_position
    - id_test: pd.DataFrame with transcript_id and transcript_position
    """

    from sklearn.model_selection import train_test_split

    id_columns = ['transcript_id', 'transcript_position']

    def _ids_inputs_target(subset):
        # Slice one side of the split into (ids, model inputs, target).
        return subset[id_columns], subset[features_columns], subset['label']

    # Split on genes, not on rows.
    gene_ids = features_labelled['gene_id'].unique()
    genes_train, genes_test = train_test_split(gene_ids, test_size=0.2, random_state=42)

    in_train = features_labelled['gene_id'].isin(genes_train)
    in_test = features_labelled['gene_id'].isin(genes_test)

    id_train, X_train, y_train = _ids_inputs_target(features_labelled[in_train])
    id_test, X_test, y_test = _ids_inputs_target(features_labelled[in_test])

    # Report the shapes so the split can be checked by eye.
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Test Features Shape: {X_test.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Test Labels Shape: {y_test.shape}")
    return (X_train, X_test, y_train, y_test, id_train, id_test)

In [None]:
# Model inputs are the aggregated statistics created by
# aggregate_by_transcript_position (for example '-1_dwelling_time_mean').
# The raw per-read column names ('-1_dwelling_time', ...) no longer exist after
# aggregation, so selecting them would raise a KeyError.
features_columns = [
    col for col in features_agg.columns
    if col not in ('transcript_id', 'transcript_position', 'sequence')
]
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split_by_gene_id(features_labelled, features_columns)

# Balancing Data

In [None]:
def balance_train_data(X_train,y_train):
    """
    Oversamples the training data with SMOTE (k_neighbors=5, random_state=42)
    and prints the label counts before and after resampling.

    Inputs:
    - X_train: training features
    - y_train: training labels

    Output:
    - X_train_resampled: features returned by SMOTE.fit_resample
    - y_train_resampled: labels returned by SMOTE.fit_resample
    """

    from imblearn.over_sampling import SMOTE

    print('Label distribution before resampling:')
    print(pd.Series(y_train).value_counts())

    oversampler = SMOTE(k_neighbors=5, random_state=42)
    resampled = oversampler.fit_resample(X_train, y_train)
    X_train_resampled, y_train_resampled = resampled

    print('Label distribution after resampling:')
    print(pd.Series(y_train_resampled).value_counts())

    return X_train_resampled, y_train_resampled

In [None]:
X_train_resampled, y_train_resampled = balance_train_data(X_train,y_train)

In [None]:
# can remove?
# print(X_train.isnull().sum())
# print(X_train_resampled.isnull().sum())
# print(X_test.isnull().sum())

In [None]:
# Remove rows with null values in X_test, together with the matching y_test
# and id_test rows. The *_clean variables are used by every model cell below,
# so they must be defined on a fresh kernel.
X_test_clean = X_test.dropna()
y_test_clean = y_test.loc[X_test_clean.index]
id_test_clean = id_test.loc[X_test_clean.index]

# Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Predict and evaluate functions

In [None]:
# Predict the output of xtest using the trained models
def predict(classifier, id, X):
    """Run a fitted classifier on ``X`` and attach the row identifiers.

    Parameters
    ----------
    classifier : object with ``predict`` and ``predict_proba``
    id : pd.DataFrame
        Identifier columns, one row per row of ``X`` in the same order.
    X : features passed to the classifier.

    Returns
    -------
    pd.DataFrame
        The columns of ``id`` followed by 'prediction' (output of ``predict``)
        and 'probability' (second column of ``predict_proba``), on a fresh
        0..n-1 index.
    """
    y_out = pd.DataFrame({
        'prediction': classifier.predict(X),
        # Column 1 of predict_proba is taken as the probability of y=1.
        'probability': classifier.predict_proba(X)[:, 1],
    })

    # Both parts get a fresh positional index so they line up row by row.
    pieces = [id.reset_index(drop=True), y_out.reset_index(drop=True)]
    return pd.concat(pieces, axis=1)

In [None]:
# Model evaluation
def evaluate(y_test, predict_df):
    """Print classification metrics and plot diagnostics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True binary labels, in the same row order as ``predict_df``.
    predict_df : pd.DataFrame
        Needs a 'prediction' column (hard class labels) and a 'probability'
        column (score of the positive class), as produced by ``predict``.

    Prints accuracy, balanced accuracy, F1, precision, recall, ROC AUC and
    PR AUC (rounded to 3 digits), then shows the confusion matrix, the ROC
    curve and the precision-recall curve. Returns None.
    """
    y_pred = predict_df['prediction']
    y_prob = predict_df['probability']
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    balancedaccuracy = balanced_accuracy_score(y_test, y_pred)
    # F1 is a good scoring metric for imbalanced data when more attention is needed on the positives
    f1score = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # ROC AUC is computed from the probabilities of the positive class
    roc_auc = roc_auc_score(y_test, y_prob)
    # PR AUC is the area under the precision-recall curve
    precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_prob)
    pr_auc = auc(recall_vals, precision_vals)

    print(f"Accuracy = {round(accuracy, ndigits=3)}")
    print(f"Balanced Accuracy = {round(balancedaccuracy, ndigits=3)}")
    print(f"f1 score = {round(f1score, ndigits=3)}")
    print(f"Precision = {round(precision, ndigits=3)}")
    print(f"Recall = {round(recall, ndigits=3)}")
    print(f"ROC AUC = {round(roc_auc, ndigits=3)}")
    print(f"PR AUC = {round(pr_auc, ndigits=3)}")

    # Plot confusion matrix (cells are integer counts, so format them as integers)
    plt.figure(figsize=(2,2))
    sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('Confusion Matrix')
    plt.show()

    # Plot ROC curve on its own, larger figure. The figure is created after the
    # previous plt.show() so that no empty figure is displayed.
    plt.figure(figsize=(5,5))
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right')  # without a legend the AUC label is never drawn
    plt.show()

    # Plot Precision-Recall curve
    plt.figure(figsize=(5,5))
    plt.plot(recall_vals, precision_vals, label='PR curve (AUC = %0.2f)' % pr_auc)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='lower left')
    plt.show()

## Logistic Regression

In [None]:
# Logistic Regression Model
def LR(X_train,y_train):
    """Fit a logistic regression with balanced class weights and return the fitted model."""
    settings = dict(max_iter=1000, verbose=1, random_state=123, class_weight='balanced')
    model = LogisticRegression(**settings)
    model.fit(X_train, y_train)
    return model

In [None]:
# ML workflow for Logistic Regression - without SMOTE
LR_classifier = LR(X_train, y_train)
# Pair the cleaned test features with the cleaned ids (not the full id_test),
# so ids and predictions stay aligned when rows were dropped from X_test.
result = predict(LR_classifier, id_test_clean, X_test_clean)
evaluate(y_test_clean, result)

In [None]:
# ML workflow for Logistic Regression - with SMOTE
LR_classifier = LR(X_train_resampled, y_train_resampled)
result = predict(LR_classifier, id_test_clean, X_test_clean)
evaluate(y_test_clean, result)


## Random Forest

In [None]:
def RF(X_train, y_train):
    """Fit a 100-tree random forest with balanced class weights and return the fitted model."""
    settings = dict(n_estimators=100, random_state=123, class_weight='balanced', verbose=1)
    model = RandomForestClassifier(**settings)
    model.fit(X_train, y_train)
    return model

In [None]:
RF_classifier = RF(X_train, y_train)
result = predict(RF_classifier, id_test_clean, X_test_clean)
evaluate(y_test_clean, result)

In [None]:
RF_classifier = RF(X_train_resampled, y_train_resampled)
result = predict(RF_classifier, id_test_clean, X_test_clean)
evaluate(y_test_clean, result)

# Predictions for dataset0

In [None]:
X_test_0 = features_agg[features_columns]
id_test_0 = features_agg[['transcript_id', 'transcript_position']]

In [None]:
result = predict(LR_classifier, id_test_0, X_test_0)

result = result.sort_values(by='probability', ascending=False)
result = result.rename(columns={'probability': 'score'})
result = result[['transcript_id', 'transcript_position', 'score']]


In [None]:
# Export results to CSV
output_csv_path = '../data/geneiuses_dataset0_1.csv'
result.to_csv(output_csv_path, index=False)

print(f"Results exported to {output_csv_path}")


# Predictions for dataset1

In [None]:
import gzip
import shutil

# Specify the input and output file names (adjust file paths if needed)
input_file_1 = '../data/dataset1.json.gz'  
output_file_1 = '../data/dataset1.json'    

# Unzip the file
with gzip.open(input_file_1, 'rb') as f_in:
    with open(output_file_1, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

print("File unzipped successfully.")

In [None]:
import pandas as pd
import json

input_file_1 = '../data/dataset1.json' 

data_1 = []
with open(input_file_1, 'r') as file:
    for line in file:
        # Parse each line as a JSON object
        data_1.append(json.loads(line))

# Flatten the data
flattened_data_1 = flatten_json(data_1)

# Convert to a DataFrame
df_1 = pd.DataFrame(flattened_data_1)

In [None]:
# Export as csv
path = '../data/dataset1.csv'
df_1.to_csv(path, index=False)

In [None]:
features_1 = pd.read_csv('../data/dataset1.csv')
features_agg_1 = aggregate_by_transcript_position(features_1)
# Select the aggregated statistic columns (for example '-1_dwelling_time_mean').
# The raw per-read names do not exist after aggregation and would raise a
# KeyError. Everything except the grouping keys is a model input; the column
# order is fixed by aggregate_by_transcript_position.
features_columns_1 = [
    col for col in features_agg_1.columns
    if col not in ('transcript_id', 'transcript_position', 'sequence')
]

X_test_1 = features_agg_1[features_columns_1]
id_test_1 = features_agg_1[['transcript_id', 'transcript_position']]

In [None]:
result_1 = predict(LR_classifier, id_test_1, X_test_1)
result_1 = result_1.sort_values(by='probability', ascending=False)
result_1 = result_1.rename(columns={'probability': 'score'})
result_1 = result_1[['transcript_id', 'transcript_position', 'score']]


In [None]:
# Export results to CSV
output_csv_path = '../data/geneiuses_dataset1_1.csv'
result_1.to_csv(output_csv_path, index=False)

print(f"Results exported to {output_csv_path}")


# Predictions for dataset2

In [None]:
import gzip
import shutil

# Specify the input and output file names (adjust file paths if needed)
input_file_2 = '../data/dataset2.json.gz'  
output_file_2 = '../data/dataset2.json'    

# Unzip the file
with gzip.open(input_file_2, 'rb') as f_in:
    with open(output_file_2, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

print("File unzipped successfully.")

In [None]:
import pandas as pd
import json

input_file_2 = '../data/dataset2.json' 

data_2 = []
with open(input_file_2, 'r') as file:
    for line in file:
        # Parse each line as a JSON object
        data_2.append(json.loads(line))

# Flatten the data
flattened_data_2 = flatten_json(data_2)

# Convert to a DataFrame
df_2 = pd.DataFrame(flattened_data_2)

In [None]:
# Export as csv
path = '../data/dataset2.csv'
df_2.to_csv(path, index=False)

In [None]:
features_2 = pd.read_csv('../data/dataset2.csv')
features_agg_2 = aggregate_by_transcript_position(features_2)
# Select the aggregated statistic columns (for example '-1_dwelling_time_mean').
# The raw per-read names do not exist after aggregation and would raise a
# KeyError. Everything except the grouping keys is a model input; the column
# order is fixed by aggregate_by_transcript_position.
features_columns_2 = [
    col for col in features_agg_2.columns
    if col not in ('transcript_id', 'transcript_position', 'sequence')
]

X_test_2 = features_agg_2[features_columns_2]
id_test_2 = features_agg_2[['transcript_id', 'transcript_position']]


In [None]:
# ML workflow for Logistic Regression
result_2 = predict(LR_classifier, id_test_2, X_test_2)

In [None]:
result_2 = result_2.sort_values(by='probability', ascending=False)
result_2 = result_2.rename(columns={'probability': 'score'})
result_2 = result_2[['transcript_id', 'transcript_position', 'score']]


In [None]:
# Export results to CSV
output_csv_path = '../data/geneiuses_dataset2_1.csv'
result_2.to_csv(output_csv_path, index=False)

print(f"Results exported to {output_csv_path}")
