In [None]:
%pip install pandas
%pip install scikit-learn
%pip install imblearn

In [None]:
import pandas as pd

# Load the labelled-sites table from disk.
labels = pd.read_csv('../code/data.info.labelled')

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
labels.info()  # To get summary information about the DataFrame
print(labels.head())  # Preview the first few rows of the data

In [None]:
# Load the engineered feature table from disk.
features = pd.read_csv('../data/dataset0.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
features.info()
print(features.head())

In [None]:
def add_gene_and_label(features, labels):
    """
    Adds gene_id and label to features dataframe

    The input dataframes are not modified; a new merged dataframe is returned.

    Inputs:
    - features: pd.DataFrame
      Dataframe with selected features after feature engineering. Dataframe must contain
      transcript_id and transcript_position (or their raw names Transcript_ID and Position,
      which are renamed before merging).
    - labels: pd.DataFrame
      Dataframe with gene_id, transcript_id, transcript_position, and label.

    Output:
    - pd.DataFrame
      Inner join of features and labels on (transcript_id, transcript_position): the feature
      columns plus the remaining columns of labels. Rows without a match in both inputs are dropped.

    Raises:
    - KeyError (from pd.merge) if either dataframe lacks one of the join columns.
    """

    # rename() without inplace returns a new frame, so the caller's dataframe keeps its
    # original column names and re-running the calling cell stays idempotent.
    features_renamed = features.rename(columns={
        'Transcript_ID': 'transcript_id',
        'Position': 'transcript_position'
    })
    features_labelled = pd.merge(
        features_renamed,
        labels,
        on=['transcript_id', 'transcript_position'],
        how='inner',
    )

    return features_labelled


In [None]:
# Attach gene_id and label to the feature rows (inner join on transcript id and position).
features_labelled = add_gene_and_label(features, labels)
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" that print(...info()) would add.
features_labelled.info()

In [None]:
# features_labelled.to_csv('../data/features_labelled.csv',index=False)

In [None]:
def train_test_split_by_gene_id(features_labelled, features_columns, test_size=0.2, random_state=42):
    """
    Performs train test split based on gene_id, so that every row of a given gene lands
    on the same side of the split. Returns X_train and X_test restricted to features_columns.

    Inputs:
    - features_labelled: pd.DataFrame
      Updated features dataframe with added columns: gene_id and label from labels.
      Must also contain every column named in features_columns.
    - features_columns: list of str
      Names of the columns to use as model features.
    - test_size: float, default 0.2
      Forwarded to sklearn's train_test_split; applies to the unique genes, not to rows.
    - random_state: int, default 42
      Seed forwarded to sklearn's train_test_split for a reproducible gene split.

    Output (as a tuple, in this order):
    - X_train: pd.DataFrame
    - X_test: pd.DataFrame
    - y_train: pd.Series
    - y_test: pd.Series

    Side effects:
    - Prints the shapes of the four returned objects.
    """

    from sklearn.model_selection import train_test_split
    df = features_labelled

    # Get unique genes
    unique_genes = df['gene_id'].unique()

    # Perform the train-test split on genes
    genes_train, genes_test = train_test_split(
        unique_genes, test_size=test_size, random_state=random_state
    )

    # Split the dataset based on the gene split
    train_data = df[df['gene_id'].isin(genes_train)]
    test_data = df[df['gene_id'].isin(genes_test)]

    # Create the feature and target variables for training and testing.
    # Use the features_columns argument (not a same-looking global name) to select inputs.
    X_train = train_data[features_columns]
    y_train = train_data['label']
    X_test = test_data[features_columns]
    y_test = test_data['label']

    # Output the shapes to verify the split
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Test Features Shape: {X_test.shape}")
    print(f"Training Labels Shape: {y_train.shape}")
    print(f"Test Labels Shape: {y_test.shape}")
    return (X_train, X_test, y_train, y_test)

In [None]:
# Model inputs: three per-position statistics (dwelling time, standard dev, mean current)
# for each of the positions -1, 0 and +1.
features_columns = [
    '-1 Dwelling Time',
    '-1 Standard Dev',
    '-1 Mean Current',
    '0 Dwelling Time',
    '0 Standard Dev',
    '0 Mean Current',
    '+1 Dwelling Time',
    '+1 Standard Dev',
    '+1 Mean Current',
]

# Split by gene and keep only the columns listed above as features.
(X_train, X_test, y_train, y_test) = train_test_split_by_gene_id(
    features_labelled,
    features_columns,
)

In [None]:
def balance_train_data(X_train, y_train, k_neighbors=5, random_state=42):
    """
    Performs SMOTE on train data to account for imbalanced dataset.

    Only the training split should be passed in; the function prints the label
    counts before and after resampling.

    Inputs:
    - X_train: pd.DataFrame
      Training features.
    - y_train: pd.Series (or array-like)
      Training labels aligned with X_train.
    - k_neighbors: int, default 5
      Forwarded to SMOTE(k_neighbors=...).
    - random_state: int, default 42
      Forwarded to SMOTE(random_state=...) for reproducible resampling.

    Output (as a tuple, in this order):
    - X_train_resampled: resampled features, as returned by SMOTE.fit_resample
    - y_train_resampled: resampled labels, as returned by SMOTE.fit_resample
    """

    from imblearn.over_sampling import SMOTE

    print('Label distribution before resampling:')
    print(pd.Series(y_train).value_counts())

    # NOTE(review): SMOTE is used with its default sampling strategy; which class(es)
    # get oversampled is decided by imbalanced-learn from the label counts.
    smote = SMOTE(k_neighbors=k_neighbors, random_state=random_state)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    print('Label distribution after resampling:')
    print(pd.Series(y_train_resampled).value_counts())

    return X_train_resampled, y_train_resampled

In [None]:
# Resample the training split only; the test split is not passed in here.
(X_train_resampled, y_train_resampled) = balance_train_data(X_train, y_train)