In [14]:
import pandas as pd
import numpy as np
import sys
import os 
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [3]:
def get_percent(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

In [4]:
info_data_path = "data.info"
with open(info_data_path, 'r') as f:
    info = f.read().splitlines()
data_df1 = pd.read_parquet("merged_data_v2_1.parquet")
data_df2 = pd.read_parquet("merged_data_v2_2.parquet")
data_df3 = pd.read_parquet("merged_data_v2_3.parquet")

In [5]:
print(data_df1.shape)
print(data_df2.shape)
print(data_df3.shape)

(5000000, 15)
(5000000, 15)
(1027106, 15)


In [6]:
raw_df =  pd.concat([data_df1, data_df2, data_df3])

In [7]:
def model_features_and_clean(df):
    '''Function to select features for modelling and clean the initial raw data. 
        Removes rows which contain NAN values in key_columns, and converts all NAN values to 0 for non_nan_cols

    :Parameters:
    ------------
        self.df: DataFrame
            DataFrame containing raw data

    :Returns:
    ---------
        self.df: DataFrame
            DataFrame containing features required for modelling
    '''
    model_features_list =  ['transcript', 'position', 'nucleotides', 'reads_count', 'gene_id',
                            'dwellingtime_-1', 'std_-1', 'mean_-1',
                            'dwellingtime_0', 'std_0', 'mean_0',
                            'dwellingtime_+1', 'std_+1', 'mean_+1','label']
    non_nan_cols =  []
    key_columns =['label']
    df = df[model_features_list]
    df[non_nan_cols] = df[non_nan_cols].fillna(0)
    df = df.dropna(subset=key_columns)
    return df

In [8]:
cleaned_df = model_features_and_clean(raw_df)

In [9]:
# function to clean
def feature_eng(df):
    temp = pd.DataFrame(df.groupby(['gene_id', 'transcript', 'position', 'nucleotides', 'reads_count', 'label'], as_index=False)
                           .agg({'dwellingtime_-1': [get_percent(25), get_percent(50), get_percent(75), np.mean],
                                'std_-1': [get_percent(25), get_percent(50), get_percent(75), np.mean],
                                'mean_-1': [get_percent(25), get_percent(50), get_percent(75), np.mean],
                                'dwellingtime_0': [get_percent(25), get_percent(50), get_percent(75), np.mean],
                                'std_0': [get_percent(25), get_percent(50), get_percent(75), np.mean],
                                'mean_0': [get_percent(25), get_percent(50), get_percent(75), np.mean],
                                'dwellingtime_+1': [get_percent(25), get_percent(50), get_percent(75), np.mean],
                                'std_+1': [get_percent(25), get_percent(50), get_percent(75), np.mean],
                                'mean_+1': [get_percent(25), get_percent(50), get_percent(75), np.mean]}))
    temp.columns = ['gene_id', 'transcript', 'position', 'nucleotides', 'reads_count', 'label',
                        'dwelling_time_-1_25', 'dwelling_time_-1_50', 'dwelling_time_-1_75', 'dwelling_time_-1_mean',
                        'std_-1_25', 'std_-1_50', 'std_-1_75', 'std_-1_mean',
                        'mean_-1_25', 'mean_-1_50', 'mean_-1_75', 'mean_-1_mean',
                        'dwelling_time_0_25', 'dwelling_time_0_50', 'dwelling_time_0_75', 'dwelling_time_0_mean',
                        'std_0_25', 'std_0_50', 'std_0_75', 'std_0_mean',
                        'mean_0_25', 'mean_0_50', 'mean_0_75', 'mean_0_mean',
                        'dwelling_time_+1_25', 'dwelling_time_+1_50', 'dwelling_time_+1_75', 'dwelling_time_+1_mean',
                        'std_+1_25', 'std_+1_50', 'std_+1_75', 'std_+1_mean',
                        'mean_+1_25', 'mean_+1_50', 'mean_+1_75', 'mean_+1_mean']
    return temp

In [10]:
percentiles_df = feature_eng(cleaned_df)

In [17]:
def split_stratified_into_train_val_test(df, random_state=None):

    # getting unique id_col and stratify_col for splitting
    id_col = ['gene_id']
    stratify_col = 'label'
    train_percent = 0.8 # percentage of train data
    validation_percent = 0.1 # percentage of validation data
    test_percent = 0.1 # percentage of test data
    #position_col = ['transcript', 'position']
    target_col = 'label'

    temp_col = id_col + [stratify_col]
    df_target = df[temp_col].drop_duplicates()
    X = df_target  
    y = df_target[[stratify_col]]
    

    # parameter checks
    if train_percent + validation_percent + test_percent != 1.0:
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' %
                            (train_percent, validation_percent, test_percent))

    if stratify_col not in df_target.columns:
        raise ValueError('%s is not a column in the dataframe' %
                            (stratify_col))

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(X, y, stratify=y,
                                                                    test_size=(
                                                                        1.0 - train_percent),
                                                                    random_state=random_state)

    # Split the temp dataframe into val and test dataframes.
    relative_test_percent = test_percent / \
        (validation_percent + test_percent)
    df_val, df_test, y_val, y_test = train_test_split(df_temp, y_temp,
                                                        stratify=y_temp,
                                                        test_size=relative_test_percent,
                                                        random_state=random_state)

    assert len(df_target) == len(df_train) + \
        len(df_val) + len(df_test)


    list_train = df_train[id_col]
    y_train_n = y_train
    new_train_df = pd.concat([list_train,y_train_n],axis=1)
    a = new_train_df.merge(df,how = "left", left_on = ["gene_id", "label"], 
                    right_on = ["gene_id", "label"])
    list_test = df_test[id_col]
    y_test_n = y_test
    new_test_df = pd.concat([list_test,y_test_n],axis=1)
    b = new_test_df.merge(df,how = "left", left_on = ["gene_id", "label"], 
                    right_on = ["gene_id", "label"])
    list_val = df_val[id_col]
    y_val_n = y_val
    new_val_df = pd.concat([list_val,y_val_n], axis=1)
    c = new_val_df.merge(df,how = "left", left_on = ["gene_id", "label"], 
                    right_on = ["gene_id", "label"])
    
    
    # printing percentages
    print("train target percentage:", len(
        a[a[target_col] == '1'])/len(a))
    print("test target percentage:", len(
        b[b[target_col] == '1'])/len(b))
    print("val target percentage:", len(
     
    c[c[target_col] == '1'])/len(c))


    # separating df from target column: features -> X | target -> y
    X_train = a.drop(
        columns = ['gene_id','transcript','position','label']).reset_index(drop=True)
    y_train = pd.DataFrame(
        a[target_col]).reset_index(drop=True)
    X_val = c.drop(
        columns = ['gene_id','transcript','position','label']).reset_index(drop=True)
    y_val = pd.DataFrame(
        c[target_col]).reset_index(drop=True)
    X_test = b.drop(
        columns = ['gene_id','transcript','position','label']).reset_index(drop=True)
    y_test = pd.DataFrame(
        b[target_col]).reset_index(drop=True)
    # printing df shape
    print("train data shape:", X_train.shape)
    print("validation data shape:", X_val.shape)
    print("test data shape:", X_test.shape)

    return(a, b, c, X_train, y_train,
            X_val, y_val, X_test, y_test, new_val_df, list_train)

In [18]:
df_train, df_test, df_val, X_train, y_train, X_val, y_val, X_test, y_test, df_val_id, list_train = split_stratified_into_train_val_test(percentiles_df, random_state=42)

train target percentage: 0.043292122202922244
test target percentage: 0.0506553911205074
val target percentage: 0.052622910318702136
train data shape: (97870, 38)
validation data shape: (12143, 38)
test data shape: (11825, 38)


In [21]:
X_train.head()

Unnamed: 0,nucleotides,reads_count,dwelling_time_-1_25,dwelling_time_-1_50,dwelling_time_-1_75,dwelling_time_-1_mean,std_-1_25,std_-1_50,std_-1_75,std_-1_mean,mean_-1_25,mean_-1_50,mean_-1_75,mean_-1_mean,dwelling_time_0_25,dwelling_time_0_50,dwelling_time_0_75,dwelling_time_0_mean,std_0_25,std_0_50,std_0_75,std_0_mean,mean_0_25,mean_0_50,mean_0_75,mean_0_mean,dwelling_time_+1_25,dwelling_time_+1_50,dwelling_time_+1_75,dwelling_time_+1_mean,std_+1_25,std_+1_50,std_+1_75,std_+1_mean,mean_+1_25,mean_+1_50,mean_+1_75,mean_+1_mean
0,AAGACCA,48,0.004085,0.00605,0.009362,0.007326,3.37,4.125,4.9675,4.432083,122.0,124.0,125.0,123.229167,0.007865,0.0109,0.0159,0.012958,5.1475,6.41,7.715,6.455208,127.0,128.0,129.25,128.166667,0.005557,0.00764,0.010225,0.008463,1.9275,3.545,4.9325,3.763854,82.0,83.4,84.175,83.0875
1,GAAACAG,50,0.005855,0.00716,0.010675,0.008841,2.43,2.92,3.3675,3.0754,107.0,108.0,108.0,107.92,0.003395,0.005405,0.00697,0.006933,2.2425,3.215,4.035,3.25758,95.4,101.5,106.0,100.246,0.004065,0.00602,0.00896,0.007349,2.455,3.16,4.63,3.50604,87.45,89.3,90.775,88.902
2,AGGACCC,51,0.00556,0.0105,0.0192,0.011989,5.245,6.08,6.665,5.923529,115.0,117.0,118.0,116.54902,0.00797,0.0133,0.0174,0.013976,4.645,5.23,5.915,5.457059,121.0,122.0,123.0,121.784314,0.00398,0.00554,0.0072,0.005917,1.76,2.27,3.47,2.767451,80.25,82.2,83.25,81.935294
3,CTGACTC,49,0.00498,0.00631,0.0083,0.007012,3.43,3.85,4.41,3.97898,107.0,110.0,110.0,108.816327,0.00598,0.00963,0.0131,0.010202,5.22,6.39,8.1,6.756735,122.0,124.0,126.0,124.204082,0.00403,0.00651,0.00863,0.006681,2.11,2.66,3.34,2.870408,89.6,91.3,93.2,91.830612
4,GGGACAA,35,0.003485,0.0073,0.0146,0.009141,3.11,4.52,6.97,5.36,114.5,117.0,120.0,116.885714,0.00398,0.00664,0.010315,0.008617,6.97,11.4,12.9,10.157714,109.0,113.0,116.0,112.028571,0.00398,0.00558,0.00859,0.00663,2.04,2.45,2.8,2.756571,78.3,79.5,82.2,80.108571


In [22]:
X_val.head()

Unnamed: 0,nucleotides,reads_count,dwelling_time_-1_25,dwelling_time_-1_50,dwelling_time_-1_75,dwelling_time_-1_mean,std_-1_25,std_-1_50,std_-1_75,std_-1_mean,mean_-1_25,mean_-1_50,mean_-1_75,mean_-1_mean,dwelling_time_0_25,dwelling_time_0_50,dwelling_time_0_75,dwelling_time_0_mean,std_0_25,std_0_50,std_0_75,std_0_mean,mean_0_25,mean_0_50,mean_0_75,mean_0_mean,dwelling_time_+1_25,dwelling_time_+1_50,dwelling_time_+1_75,dwelling_time_+1_mean,std_+1_25,std_+1_50,std_+1_75,std_+1_mean,mean_+1_25,mean_+1_50,mean_+1_75,mean_+1_mean
0,TGAACCG,97,0.00492,0.0064,0.00987,0.007655,7.05,8.45,9.38,8.280619,119.0,120.0,122.0,119.797938,0.00558,0.00853,0.0126,0.010538,3.03,3.45,3.97,3.613505,95.8,97.6,99.4,97.394845,0.00365,0.00498,0.00668,0.005532,1.51,1.79,2.18,1.983536,85.1,86.5,87.7,86.273196
1,TGGACAA,88,0.004638,0.007225,0.010325,0.008287,2.6975,3.165,4.26,3.666364,116.0,117.0,118.0,116.806818,0.004645,0.00665,0.011075,0.008651,5.455,7.185,8.625,7.224886,116.0,118.0,120.0,117.886364,0.004545,0.00661,0.009572,0.00757,1.8125,2.285,3.035,2.600114,80.975,83.05,84.225,83.103409
2,TGGACCA,81,0.00531,0.00738,0.0102,0.008319,2.21,2.76,3.24,3.141852,116.0,117.0,118.0,116.901235,0.00398,0.0073,0.0113,0.00912,5.5,7.82,11.3,8.54716,113.0,117.0,119.0,116.17284,0.00365,0.00526,0.0073,0.0061,1.55,2.1,3.44,2.746889,75.0,76.8,77.7,76.661728
3,AGGACAT,95,0.005455,0.00737,0.0116,0.009615,3.975,4.82,5.97,5.029263,115.0,117.0,118.0,116.842105,0.00609,0.00976,0.01475,0.011336,5.755,6.99,8.515,7.141053,120.0,122.0,123.0,121.305263,0.00299,0.00498,0.00747,0.006009,2.24,2.94,3.66,3.188211,80.15,82.4,84.25,82.343158
4,GAAACCG,89,0.00547,0.00991,0.0142,0.010929,2.5,3.09,3.55,3.344382,105.0,106.0,107.0,106.348315,0.00432,0.00697,0.0153,0.010756,3.5,4.0,4.42,4.023483,100.0,103.0,105.0,101.855056,0.00332,0.00498,0.00787,0.006155,1.49,1.72,2.12,1.848303,85.3,86.7,87.6,86.291011


In [23]:
df_train.shape

(97870, 42)

In [24]:
df_test.shape

(11825, 42)