# Workflow

1. Identify datasets to be predicted
2. Parse the datasets into dataframes
3. Perform pre-processing on dataframes
4. Train chosen model on full training data
5. Predict labels for all datasets
6. Save predictions into CSV files

# Identify datasets to be predicted

In [1]:
## libraries to read and parse json file
import json
import pandas as pd
import os
import sys

In [2]:
## get current working directory; the relative paths used later in this notebook
## (e.g. ../data/final_round, ../raw_data, ../../util/model) are resolved against it
os.getcwd()

'/Users/claudia/DSA4262-ACMXZ/prediction'

# Parse datasets into dataframes

### Functions needed to parse JSON files

In [3]:
## function to get the first key of a dictionary
def get_key(dictionary):
    """Return the first key of ``dictionary`` (dicts iterate in insertion order).

    Raises IndexError when the dictionary has no keys.
    """
    return list(dictionary.keys())[0]

In [4]:
## function to build the identifier columns: transcript, position, nucleotides, reads_count
def concat_col(transcript, position, nucleotide, n):
    """Return an ``n``-row dataframe that repeats one site's identifiers on every row.

    Parameters
    ----------
    transcript, position, nucleotide
        Values copied unchanged into the 'transcript', 'position' and
        'nucleotides' columns of every row.
    n
        Number of rows to create; also stored in the 'reads_count' column.

    Returns
    -------
    pd.DataFrame with columns ['transcript', 'position', 'nucleotides',
    'reads_count'] and a default 0..n-1 index. For ``n == 0`` an empty frame
    with those four columns is returned.
    """
    ## one constructor call instead of four single-column frames plus a concat;
    ## the column names come from the dict keys, so n == 0 no longer fails on renaming
    return pd.DataFrame({
        'transcript': [transcript] * n,
        'position': [position] * n,
        'nucleotides': [nucleotide] * n,
        'reads_count': [n] * n,
    })

In [5]:
## function to parse one record (one line) of the json file
def parse_line(line):
    """Flatten one nested record into a dataframe with one row per read.

    ``line`` is indexed as line[transcript][position][nucleotides] -> list of
    reads; only the first key found at each of the three levels is used.
    Assigning the 13 column names below requires every read to carry 9 values.

    Returns a dataframe whose first four columns repeat the site identifiers
    (built by ``concat_col``) and whose remaining nine columns hold the
    per-read values.
    """
    ## walk down the three nesting levels, taking the first key each time
    transcript = get_key(line)
    position = get_key(line[transcript])
    site = line[transcript][position]
    nucleotides = get_key(site)
    read_values = site[nucleotides]

    ## identifier columns, repeated once per read
    id_cols = concat_col(transcript, position, nucleotides, len(read_values))

    ## one row per read
    reads = pd.DataFrame(read_values)

    ## place identifiers and read values side by side, then name every column
    df = pd.concat([id_cols, reads], axis = 1)
    df.columns = [
        'transcript', 'position', 'nucleotides', 'reads_count',
        'dwellingtime_-1', 'std_-1', 'mean_-1',
        'dwellingtime_0', 'std_0', 'mean_0',
        'dwellingtime_+1', 'std_+1', 'mean_+1',
    ]

    return df

In [19]:
## function to break a dataframe into smaller chunks and save each chunk as a parquet file
def save_file(df, filename, nrows = 2500000):
    """Write ``df`` to numbered parquet files of at most ``nrows`` rows each.

    Chunk k (counting from 1) is written to ``f"{filename}_{k}.parquet"``.
    For every chunk the start/stop row offsets and the file name are printed.
    An empty dataframe still produces one (empty) ``{filename}_1.parquet``.

    Parameters
    ----------
    df : dataframe to split (rows are selected positionally with ``iloc``).
    filename : path prefix of the output files, without the ``_k.parquet`` suffix.
    nrows : maximum number of rows per file; must be positive.

    Raises
    ------
    ValueError
        If ``nrows`` is not positive (the chunking could never terminate).
    """
    if nrows <= 0:
        raise ValueError(f"nrows must be a positive integer, got {nrows}")

    total_rows = len(df)

    ## max(total_rows, 1) makes the loop run once for an empty dataframe,
    ## so a single empty file is still written
    for count, start in enumerate(range(0, max(total_rows, 1), nrows), start = 1):
        ## the last chunk is cut at the end of the dataframe
        stop = min(start + nrows, total_rows)
        print(start, stop)
        fname = f"{filename}_{count}.parquet"
        df.iloc[start:stop, :].to_parquet(fname)
        print(f"Saved a file called {fname}")

### Parse datasets

In [11]:
def parse(file, filename):
    """Parse a line-delimited JSON file into one dataframe and save it as parquet chunks.

    Parameters
    ----------
    file : path of the input file; every non-blank line must be one JSON record
        in the shape expected by ``parse_line``.
    filename : output path prefix handed to ``save_file``.

    Returns
    -------
    The concatenated dataframe (one row per read). The row index is not reset,
    so it keeps the per-record index of each frame returned by ``parse_line``.

    ``pd.concat`` raises ValueError when the file contains no records.
    """
    ## read and parse line by line; the context manager closes the file handle,
    ## and blank lines (e.g. a trailing newline) are skipped instead of failing in json.loads
    with open(file, 'r') as json_file:
        reads = [parse_line(json.loads(line)) for line in json_file if line.strip()]

    ## concatenate dataframes
    result_df = pd.concat(reads, axis = 0)
    print(f"Shape of Dataset = {result_df.shape}")

    ## save dataframe into parquet files
    save_file(result_df, filename)

    return result_df

In [12]:
## parse dataset1: read the raw json from a hard-coded local path and write the
## parsed reads to parquet chunks named ../data/final_round/dataset1_<k>.parquet
dataset1_path = "/Users/claudia/Downloads/dataset1.json"
dataset1_filename = "../data/final_round/dataset1"
dataset1 = parse(dataset1_path, dataset1_filename)

Shape of Dataset = (7907952, 13)
0 5000000
Saved a file called ../data/final_round/dataset1_1.parquet
5000000 7907952
Saved a file called ../data/final_round/dataset1_2.parquet


In [22]:
## parse dataset2: read the raw json from a hard-coded local path and write the
## parsed reads to parquet chunks named ../data/final_round/dataset2_<k>.parquet
dataset2_path = "/Users/claudia/Downloads/dataset2.json"
dataset2_filename = "../data/final_round/dataset2"
dataset2 = parse(dataset2_path, dataset2_filename)

Shape of Dataset = (6903936, 13)
0 2500000
Saved a file called ../data/final_round/dataset2_1.parquet
2500000 5000000
Saved a file called ../data/final_round/dataset2_2.parquet
5000000 6903936
Saved a file called ../data/final_round/dataset2_3.parquet


In [23]:
## parse dataset3: read the raw json from a hard-coded local path and write the
## parsed reads to parquet chunks named ../data/final_round/dataset3_<k>.parquet
dataset3_path = "/Users/claudia/Downloads/dataset3.json"
dataset3_filename = "../data/final_round/dataset3"
dataset3 = parse(dataset3_path, dataset3_filename)

Shape of Dataset = (1171940, 13)
0 1171940
Saved a file called ../data/final_round/dataset3_1.parquet


# Perform pre-processing on dataframes

### Functions needed for pre-processing

In [9]:
import numpy as np
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from category_encoders import OneHotEncoder

sys.path.append(os.path.abspath("../../util/model"))
from training import get_percent



In [16]:
# aggregate the per-read values of every site into summary statistics
# note: gene_id and label are not part of the groupby keys or of the output columns here
def feature_eng(df):
    """Collapse the per-read rows of each site into a single row of summary statistics.

    Rows are grouped by (transcript, position, nucleotides, reads_count). Each
    of the nine per-read columns is reduced to six values: get_percent(25),
    get_percent(50), get_percent(75), mean, min and max. The result therefore
    has 4 + 9 * 6 = 58 columns, renamed to '<signal>_<offset>_<stat>'.

    ``get_percent`` is imported from the project's ``training`` module and is
    not visible here -- presumably it returns a percentile aggregator; confirm
    against that module.
    """
    site_keys = ['transcript', 'position', 'nucleotides', 'reads_count']
    stat_suffixes = ['25', '50', '75', 'mean', 'min', 'max']
    # (name in the parsed frame, name used in the output columns)
    signals = [('dwellingtime', 'dwelling_time'), ('std', 'std'), ('mean', 'mean')]

    agg_spec = {}
    out_cols = list(site_keys)
    for offset in ['-1', '0', '+1']:
        for raw_name, out_name in signals:
            # a fresh aggregator list per column, in the same order as stat_suffixes
            agg_spec[f"{raw_name}_{offset}"] = [get_percent(25), get_percent(50), get_percent(75),
                                                np.mean, np.min, np.max]
            out_cols.extend(f"{out_name}_{offset}_{suffix}" for suffix in stat_suffixes)

    temp = df.groupby(site_keys, as_index=False).agg(agg_spec)

    # replace the two-level (column, aggregator) header with flat names
    temp.columns = out_cols
    return temp

In [17]:
# add the relative position of each site within its transcript
# note: transcript is the only grouping key (no gene_id)
def relative_position(df):
    """Add a 'relative_position' column: the site's position min-max scaled per transcript.

    The 'position' column is cast to int in place. For every transcript,
    relative_position = (position - min) / (max - min), so it runs from 0 at the
    smallest position to 1 at the largest. Transcripts where max equals min
    (e.g. a single position) would give 0/0 = NaN; those values are set to 0.

    ``df`` is modified in place and the same object is returned.
    """
    df["position"] = df["position"].astype(int)

    ## per-transcript min and span, broadcast back to every row; built-in
    ## reductions avoid calling a Python lambda once per transcript
    per_transcript = df.groupby("transcript")["position"]
    lowest = per_transcript.transform("min")
    span = per_transcript.transform("max") - lowest

    ## span is 0 when a transcript has one distinct position -> NaN, filled with 0
    df["relative_position"] = ((df["position"] - lowest) / span).fillna(0)

    return df

In [24]:
## record the Python interpreter version this notebook was run with
sys.version

'3.8.8 (default, Apr 13 2021, 12:59:45) \n[Clang 10.0.0 ]'

In [12]:
## record the pandas version currently loaded in the kernel
pd.__version__

'1.2.4'

In [17]:
%pip install pandas==1.5.1

Collecting pandas==1.5.1
  Downloading pandas-1.5.1-cp38-cp38-macosx_10_9_x86_64.whl (11.9 MB)
[K     |████████████████████████████████| 11.9 MB 2.3 MB/s eta 0:00:01
Collecting numpy>=1.20.3
  Downloading numpy-1.23.4-cp38-cp38-macosx_10_9_x86_64.whl (18.1 MB)
[K     |████████████████████████████████| 18.1 MB 303 kB/s eta 0:00:01
Installing collected packages: numpy, pandas
  Attempting uninstall: numpy
    Found existing installation: numpy 1.20.1
    Uninstalling numpy-1.20.1:
      Successfully uninstalled numpy-1.20.1
  Attempting uninstall: pandas
    Found existing installation: pandas 1.0.2
    Uninstalling pandas-1.0.2:
      Successfully uninstalled pandas-1.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scipy 1.6.2 requires numpy<1.23.0,>=1.16.5, but you have numpy 1.23.4 which is incompatible.[0m
Successfully installed numpy-1.23.4 panda

In [12]:
## load the fitted encoding pipeline used by encoding() below
## NOTE: unpickling can execute arbitrary code and is sensitive to library versions;
## only load a pipeline file from a trusted source
with open("../raw_data/encoding_pipeline.pkl", "rb") as pipeline_file:
    pipe = pickle.load(pipeline_file)

def encoding(df, columns_to_map):
    """Run the pickled pipeline ``pipe`` on ``df`` and return its output as a named dataframe.

    Seven columns 'position_0' .. 'position_6' are first added to ``df`` in
    place, holding the characters at index 0..6 of each 'nucleotides' value
    (assumes every value has at least 7 characters). ``pipe.transform(df)`` is
    then called and its output columns are paired, in order, with the names in
    ``columns_to_map``.

    Returns a new dataframe containing only the columns named in
    ``columns_to_map``.
    """
    ## one single-character column per index of the nucleotide string
    for idx in range(7):
        df[f"position_{idx}"] = df['nucleotides'].apply(lambda seq: seq[idx])

    transformed = pipe.transform(df)

    ## zip stops at the shorter input, so a length mismatch between the names
    ## and the transformed columns is not reported
    df_enc = pd.DataFrame(dict(zip(columns_to_map, transformed.T)))

    return df_enc

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


### Perform pre-processing on dataframes

In [21]:
def preprocess(df):
    """Turn a parsed per-read dataframe into the encoded per-site feature dataframe.

    Steps, each followed by a printed shape:
      1. ``feature_eng``: aggregate the reads of every site into summary statistics.
      2. ``relative_position``: add the site's relative position within its transcript.
      3. ``encoding``: run the pickled pipeline and name its output columns.

    Parameters
    ----------
    df : per-read dataframe as returned by ``parse``.

    Returns
    -------
    The dataframe returned by ``encoding``; it contains only the columns listed
    in ``columns_to_map``.
    """
    ## get percentiles
    percentile_df = feature_eng(df)
    print(f"After feature engineering, the shape is {percentile_df.shape}")

    ## get relative position
    relative_pos_df = relative_position(percentile_df)
    print(f"After finding the relative position, the shape is {relative_pos_df.shape}")

    ## perform encoding
    ## names for the pipeline's output columns; they are matched by order in encoding()
    columns_to_map = ['reads_count', 'dwelling_time_-1_25', 'dwelling_time_-1_50', 'dwelling_time_-1_75', 
                        'dwelling_time_-1_mean', 'dwelling_time_-1_min', 'dwelling_time_-1_max', 
                        'std_-1_25', 'std_-1_50', 'std_-1_75', 'std_-1_mean', 'std_-1_min', 'std_-1_max', 
                        'mean_-1_25', 'mean_-1_50', 'mean_-1_75', 'mean_-1_mean', 'mean_-1_min', 'mean_-1_max', 
                        'dwelling_time_0_25', 'dwelling_time_0_50', 'dwelling_time_0_75', 'dwelling_time_0_mean', 
                        'dwelling_time_0_min', 'dwelling_time_0_max', 'std_0_25', 'std_0_50', 'std_0_75', 'std_0_mean',
                        'std_0_min', 'std_0_max', 'mean_0_25', 'mean_0_50', 'mean_0_75', 'mean_0_mean', 'mean_0_min', 
                        'mean_0_max', 'dwelling_time_+1_25', 'dwelling_time_+1_50', 'dwelling_time_+1_75', 
                        'dwelling_time_+1_mean', 'dwelling_time_+1_min', 'dwelling_time_+1_max', 'std_+1_25', 'std_+1_50', 
                        'std_+1_75', 'std_+1_mean', 'std_+1_min', 'std_+1_max', 'mean_+1_25', 'mean_+1_50', 'mean_+1_75', 
                        'mean_+1_mean', 'mean_+1_min', 'mean_+1_max', 'relative_position', 'position_0_C', 'position_0_G', 
                        'position_0_T', 'position_0_A', 'position_1_A', 'position_1_G', 'position_1_T', 'position_2_A', 
                        'position_2_G', 'position_3_A', 'position_4_C', 'position_5_C', 'position_5_A', 'position_5_T', 
                        'position_6_T', 'position_6_A', 'position_6_G', 'position_6_C']

    ## encode the aggregated frame only; the raw per-read `df` is neither
    ## modified nor passed to the pipeline
    enc_df = encoding(relative_pos_df, columns_to_map)
    print(f"After encoding, the shape is {enc_df.shape}")
    
    return enc_df


In [22]:
## inspect the raw parsed frame: 4 identifier columns followed by 9 per-read value columns
dataset1.columns

Index(['transcript', 'position', 'nucleotides', 'reads_count',
       'dwellingtime_-1', 'std_-1', 'mean_-1', 'dwellingtime_0', 'std_0',
       'mean_0', 'dwellingtime_+1', 'std_+1', 'mean_+1'],
      dtype='object')

In [23]:
## preprocess dataset1 only; dataset2 and dataset3 are left disabled
## NOTE(review): the disabled calls previously referenced data2_df / data3_df, which are
## not defined in the visible cells -- the parsed frames are named dataset2 / dataset3
data1_pp = preprocess(dataset1)
#data2_pp = preprocess(dataset2)
#data3_pp = preprocess(dataset3)

After feature engineering, the shape is (90810, 58)
After finding the relative position, the shape is (90810, 59)


In [24]:
## inspect the encoded frame; its columns are the names in columns_to_map (see preprocess)
data1_pp.columns

Index(['reads_count', 'dwelling_time_-1_25', 'dwelling_time_-1_50',
       'dwelling_time_-1_75', 'dwelling_time_-1_mean', 'dwelling_time_-1_min',
       'dwelling_time_-1_max', 'std_-1_25', 'std_-1_50', 'std_-1_75',
       'std_-1_mean', 'std_-1_min', 'std_-1_max', 'mean_-1_25', 'mean_-1_50',
       'mean_-1_75', 'mean_-1_mean', 'mean_-1_min', 'mean_-1_max',
       'dwelling_time_0_25', 'dwelling_time_0_50', 'dwelling_time_0_75',
       'dwelling_time_0_mean', 'dwelling_time_0_min', 'dwelling_time_0_max',
       'std_0_25', 'std_0_50', 'std_0_75', 'std_0_mean', 'std_0_min',
       'std_0_max', 'mean_0_25', 'mean_0_50', 'mean_0_75', 'mean_0_mean',
       'mean_0_min', 'mean_0_max', 'dwelling_time_+1_25',
       'dwelling_time_+1_50', 'dwelling_time_+1_75', 'dwelling_time_+1_mean',
       'dwelling_time_+1_min', 'dwelling_time_+1_max', 'std_+1_25',
       'std_+1_50', 'std_+1_75', 'std_+1_mean', 'std_+1_min', 'std_+1_max',
       'mean_+1_25', 'mean_+1_50', 'mean_+1_75', 'mean_+1_mean',

# Train model on full training dataset

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
## load in Train Set
X_train_path = "../data/preprocessed_data/training/X_train_enc.parquet"
y_train_path = "../data/preprocessed_data/training/y_train.parquet"
X_train = pd.read_parquet(X_train_path)

### read the labels, flatten them to a 1-D array and cast to int
y_train = pd.read_parquet(y_train_path).values.ravel().astype(int)

In [None]:
## subset of feature columns used to fit the model and to predict
## (presumably chosen by recursive feature elimination -- the selection step is not in this notebook)
rfe_features = ['std_-1_25', 'std_-1_50', 'std_-1_75', 'std_-1_mean', 'std_-1_min',
       'mean_-1_25', 'mean_-1_50', 'mean_-1_75', 'mean_-1_mean', 'mean_-1_min',
       'dwelling_time_0_50', 'dwelling_time_0_mean', 'std_0_25', 'std_0_50',
       'std_0_75', 'std_0_mean', 'std_0_min', 'std_0_max', 'mean_0_25',
       'mean_0_50', 'mean_0_75', 'mean_0_mean', 'mean_0_min', 'mean_0_max',
       'dwelling_time_+1_mean', 'std_+1_25', 'std_+1_50', 'mean_+1_25',
       'mean_+1_50', 'mean_+1_75', 'mean_+1_mean', 'mean_+1_min',
       'mean_+1_max', 'relative_position', 'position_1_G', 'position_5_T']

In [None]:
## random forest with fixed hyper-parameters; random_state makes the fit reproducible
rfc = RandomForestClassifier(random_state = 42, n_estimators = 220, max_features = "sqrt", max_depth = 30,
                        min_samples_split = 2, min_samples_leaf = 1, bootstrap = False)

## fit on the full training set, restricted to the selected features
rfc.fit(X_train[rfe_features], y_train)

## predict on the preprocessed frame: the raw `dataset1` holds per-read columns only and
## has none of the rfe_features columns, which exist only after preprocess()
## [:,1] keeps the probability of the second class in rfc.classes_
y_test_pred_proba = rfc.predict_proba(data1_pp[rfe_features])[:,1]