# Workflow

1. Install requirements
2. Load in data to be predicted
3. Load the model to be used for prediction
4. Parse the data into dataframes
5. Perform pre-processing on dataframes
6. Predict scores for all datasets
7. Save predictions into CSV files

# 1. Install the requirements needed to run the notebook

In [1]:
pip install -r ../requirements.txt

Collecting category_encoders==2.5.0
  Using cached category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
Collecting hyperopt==0.2.7
  Using cached hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
Collecting ipython==8.6.0
  Using cached ipython-8.6.0-py3-none-any.whl (761 kB)
Collecting pandas==1.2.5
  Using cached pandas-1.2.5-cp38-cp38-macosx_10_9_x86_64.whl (10.5 MB)
Collecting pip==22.2.2
  Using cached pip-22.2.2-py3-none-any.whl (2.0 MB)
Collecting scikit-learn==1.1.2
  Using cached scikit_learn-1.1.2-cp38-cp38-macosx_10_9_x86_64.whl (8.6 MB)
Collecting shap==0.39.0
  Using cached shap-0.39.0-cp38-cp38-macosx_10_9_x86_64.whl
Collecting fastparquet
  Using cached fastparquet-0.8.3-cp38-cp38-macosx_10_9_x86_64.whl (602 kB)
Collecting pyarrow==10.0.0
  Using cached pyarrow-10.0.0-cp38-cp38-macosx_10_14_x86_64.whl (24.6 MB)
Collecting joblib==1.2.0
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting lightgbm==3.2.1
  Using cached lightgbm-3.2.1-py3-none-macosx_10_14_x86_64.ma

# 2. Load in data to be predicted

In [2]:
import json
import os
import pandas as pd

In [3]:
data_path = "../data/small_test_data.json"
# Each line of the file is decoded as its own JSON document.
# The context manager closes the file handle once every line has been read.
with open(data_path, 'r') as data_file:
    data = [json.loads(line) for line in data_file]

# 3. Load the model to be used for prediction

In [4]:
# Columns selected from the pre-processed dataframe before it is handed to the
# model's predict_proba (see prediction_pickled_model). Order is significant.
rfe_features = [
    # signal summaries for offset -1
    'std_-1_25', 'std_-1_50', 'std_-1_75', 'std_-1_mean', 'std_-1_min',
    'mean_-1_25', 'mean_-1_50', 'mean_-1_75', 'mean_-1_mean', 'mean_-1_min',
    # signal summaries for offset 0
    'dwelling_time_0_50', 'dwelling_time_0_mean',
    'std_0_25', 'std_0_50', 'std_0_75', 'std_0_mean', 'std_0_min', 'std_0_max',
    'mean_0_25', 'mean_0_50', 'mean_0_75', 'mean_0_mean', 'mean_0_min', 'mean_0_max',
    # signal summaries for offset +1
    'dwelling_time_+1_mean',
    'std_+1_25', 'std_+1_50',
    'mean_+1_25', 'mean_+1_50', 'mean_+1_75', 'mean_+1_mean', 'mean_+1_min', 'mean_+1_max',
    # positional / one-hot encoded sequence features
    'relative_position', 'position_1_G', 'position_5_T',
]

In [6]:
# Load pickled random forest model
# The loaded object is kept in the module-level name `pickled_model`, which
# prediction_pickled_model reads when it calls predict_proba.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
import joblib
pickled_model = joblib.load('../deployment/rf.pkl')

# 4. Parse the Data into Dataframe

### Functions needed

In [7]:
## function to get key of a dictionary
def get_key(dictionary):
    """Return the first key of ``dictionary`` in its iteration order.

    Raises:
        IndexError: if ``dictionary`` has no keys.
    """
    return [*dictionary.keys()][0]

In [8]:
## function to help concatenate columns to get transcript, position, nucleotides
def concat_col(transcript, position, nucleotide, n):
    """Build the identifier columns that are repeated for every read of one site.

    Args:
        transcript: value placed in the 'transcript' column of every row.
        position: value placed in the 'position' column of every row.
        nucleotide: value placed in the 'nucleotides' column of every row.
        n: number of rows to produce; also stored in 'reads_count'.

    Returns:
        A dataframe with ``n`` rows and the columns
        'transcript', 'position', 'nucleotides', 'reads_count'.
    """
    # Build the frame in one step instead of concatenating four single-column
    # frames. This also yields a well-formed empty frame (0 rows, 4 columns)
    # when n == 0, where renaming the concatenated empty frames used to fail.
    return pd.DataFrame({
        'transcript': [transcript] * n,
        'position': [position] * n,
        'nucleotides': [nucleotide] * n,
        'reads_count': [n] * n,
    })

In [9]:
## function to parse line in json file
def parse_line(line):
    """Flatten one nested record into a dataframe with one row per read.

    ``line`` is indexed as ``line[transcript][position][nucleotides]`` and the
    first key found at each level is used. The innermost value is the list of
    reads; assigning the 13 column names below requires each read to contribute
    nine values.

    Returns:
        A dataframe holding the repeated identifier columns (transcript,
        position, nucleotides, reads_count) followed by the per-read values.
    """
    ## walk down the nesting: transcript -> position -> nucleotide seq
    transcript_id = get_key(line)
    by_position = line[transcript_id]

    site = get_key(by_position)
    by_sequence = by_position[site]

    sequence = get_key(by_sequence)
    read_rows = by_sequence[sequence]

    ## one row per read
    measurements = pd.DataFrame(read_rows)

    ## identifier columns repeated once per read
    id_columns = concat_col(transcript_id, site, sequence, len(read_rows))

    ## put identifiers and measurements side by side and name every column
    combined = pd.concat([id_columns, measurements], axis = 1)
    combined.columns = [
        'transcript', 'position', 'nucleotides', 'reads_count',
        'dwellingtime_-1', 'std_-1', 'mean_-1',
        'dwellingtime_0', 'std_0', 'mean_0',
        'dwellingtime_+1', 'std_+1', 'mean_+1',
    ]

    return combined

In [10]:
def parse(data):
    """Parse every record in ``data[0]`` and stack the results into one dataframe.

    Only the first element of ``data`` is read; it must be a sequence of records
    accepted by ``parse_line``. The number of parsed records and the shape of
    the stacked dataframe are printed.

    Returns:
        The row-wise concatenation of the per-record dataframes.
    """
    ## one dataframe per record
    reads = [parse_line(record) for record in data[0]]
    print(len(reads))

    ## stack them on top of each other
    result_df = pd.concat(reads, axis = 0)
    print(f"Shape of Dataset = {result_df.shape}")

    return result_df

### Parse data

In [11]:
# Flatten the loaded records into one dataframe with a row per read
# (parse prints the number of records and the resulting shape).
data_df = parse(data)

1500
Shape of Dataset = (141681, 13)


# 5. Perform Pre-Processing on Data

### Functions

In [12]:
import pickle
import sys  # needed for the sys.path.append call below; the earlier import cell does not provide it

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from category_encoders import OneHotEncoder

# Put ../util/model on the import path so the project-local `training` module resolves.
sys.path.append(os.path.abspath("../util/model"))
from training import get_percent

In [13]:
def feature_eng(df):
    """Summarise the per-read signal columns for each site.

    Rows are grouped by (transcript, position, nucleotides, reads_count). Each
    of the nine signal columns is aggregated with ``get_percent(25)``,
    ``get_percent(50)``, ``get_percent(75)``, mean, min and max, and the
    resulting columns are renamed to flat ``<signal>_<offset>_<stat>`` names
    (with ``dwellingtime`` spelled ``dwelling_time`` in the output).

    Returns:
        A dataframe with the four grouping columns followed by 54 summary columns.
    """
    group_keys = ['transcript', 'position', 'nucleotides', 'reads_count']
    offsets = ['-1', '0', '+1']
    stat_labels = ['25', '50', '75', 'mean', 'min', 'max']

    ## aggregation spec: same six statistics for every input signal column,
    ## ordered offset by offset (dwellingtime, std, mean within each offset)
    aggregations = {
        f"{signal}_{offset}": [get_percent(25), get_percent(50), get_percent(75), np.mean, np.min, np.max]
        for offset in offsets
        for signal in ['dwellingtime', 'std', 'mean']
    }

    temp = pd.DataFrame(df.groupby(group_keys, as_index=False).agg(aggregations))

    ## flatten the aggregated column labels, following the order of the spec above
    summary_names = [
        f"{signal}_{offset}_{label}"
        for offset in offsets
        for signal in ['dwelling_time', 'std', 'mean']
        for label in stat_labels
    ]
    temp.columns = group_keys + summary_names
    return temp

In [14]:
def relative_position(df):
    """Add a min-max scaled ``relative_position`` column computed per transcript.

    The input dataframe is left untouched; the returned dataframe is a copy in
    which ``position`` has been cast to int and ``relative_position`` holds
    ``(position - min) / (max - min)`` within each transcript.

    Args:
        df: dataframe with 'transcript' and 'position' columns.

    Returns:
        A new dataframe with the extra 'relative_position' column.
    """
    ## work on a copy so the caller's dataframe is not modified
    result = df.copy()
    result["position"] = result["position"].astype(int)

    ## find relative position of each read in each transcript
    positions = result.groupby(["transcript"])["position"]
    lowest = positions.transform("min")
    highest = positions.transform("max")
    scaled = (result["position"] - lowest) / (highest - lowest)

    ## note: have NAs because there's transcripts with only one position (0 / 0)
    ## fill the NAs with 0
    result["relative_position"] = scaled.fillna(0)

    return result

In [16]:
## variables needed for encoding
# NOTE(review): pickle.load can execute arbitrary code; only load pipeline files from a trusted source.
# The context manager closes the file handle once the pipeline has been unpickled.
with open("../data/model_training/raw_data/encoding_pipeline.pkl", "rb") as pipeline_file:
    pipe = pickle.load(pipeline_file)

# Column names given, in order, to the columns of the encoding pipeline's output.
cols_to_map = (
    ['reads_count']
    # six summary statistics per signal, offset by offset
    + [
        f"{signal}_{offset}_{stat}"
        for offset in ('-1', '0', '+1')
        for signal in ('dwelling_time', 'std', 'mean')
        for stat in ('25', '50', '75', 'mean', 'min', 'max')
    ]
    + ['relative_position']
    # one-hot columns for the sequence positions
    + [
        'position_0_C', 'position_0_G', 'position_0_T', 'position_0_A',
        'position_1_A', 'position_1_G', 'position_1_T',
        'position_2_A', 'position_2_G',
        'position_3_A',
        'position_4_C',
        'position_5_C', 'position_5_A', 'position_5_T',
        'position_6_T', 'position_6_A', 'position_6_G', 'position_6_C',
    ]
)

In [17]:
def encoding(df, columns_to_map):
    """Run the module-level ``pipe`` on ``df`` and label the transformed columns.

    The first seven characters of ``nucleotides`` are split into the columns
    ``position_0`` .. ``position_6`` before the pipeline is applied. These
    columns are added to a new dataframe, so the caller's ``df`` is not modified.

    Args:
        df: dataframe with 'transcript', 'position' and 'nucleotides' columns
            plus whatever other columns ``pipe`` expects.
        columns_to_map: names for the transformed columns, in output order.

    Returns:
        ``(df_enc, id_val)``: the transformed data as a dataframe, and the
        'transcript'/'position' columns of the input.
    """
    id_val = df[['transcript','position']] ## needed to concat with pred proba for submission

    ## assign() returns a new frame, leaving the caller's dataframe without the helper columns
    with_bases = df.assign(**{
        'position_' + str(i): df['nucleotides'].apply(lambda seq, idx=i: seq[idx])
        for i in range(7)
    })

    ## NOTE: zip stops at the shorter input, so a length mismatch between the
    ## transformed columns and columns_to_map is truncated silently.
    transformed = pipe.transform(with_bases)
    df_enc = pd.DataFrame({col: vals for vals, col in zip(transformed.T, columns_to_map)})

    return df_enc, id_val

### Pre-Processing

In [18]:
# Step 1: aggregate the per-read rows into one row of summary statistics per group.
percentile_df = feature_eng(data_df)
print(percentile_df.shape)

# Step 2: add the per-transcript relative_position column.
relative_position_df = relative_position(percentile_df)
print(relative_position_df.shape)

# Step 3: apply the encoding pipeline; id_val_df keeps transcript/position for the output file.
encoded_df, id_val_df = encoding(relative_position_df, cols_to_map)
print(encoded_df.shape, id_val_df.shape)

# data_pp is the pre-processed frame passed to the prediction step.
data_pp = encoded_df

(1492, 58)
(1492, 59)
(1492, 74) (1492, 2)


# 6. Predicting Probabilities

In [19]:
def prediction_pickled_model(data_id_col, data_pp):
    """Score ``data_pp`` with the module-level ``pickled_model`` and attach the ids.

    Only the ``rfe_features`` columns of ``data_pp`` are passed to the model, and
    the second column of ``predict_proba`` is used as the score. The number of
    predictions and the shape of the result are printed.

    Args:
        data_id_col: dataframe of identifier columns, one row per row of ``data_pp``.
        data_pp: pre-processed dataframe containing every column in ``rfe_features``.

    Returns:
        ``data_id_col`` with an added 'score' column.
    """
    ## predict using pickled_model
    data_pred = pickled_model.predict_proba(data_pp[rfe_features])[:,1]
    print(len(data_pred))

    ## convert predictions to dataframe
    ## Reuse the id frame's index: concat(axis=1) aligns on index labels, so a
    ## fresh 0..n-1 index would mis-pair rows whenever data_id_col is indexed
    ## differently. A row-count mismatch now raises instead of padding with NaN.
    data_pred_df = pd.DataFrame(data_pred, columns = ['score'], index = data_id_col.index)

    ## put the identifiers and their scores side by side
    data_pred_df = pd.concat([data_id_col, data_pred_df], axis = 1)
    print(f"Prediction file is of shape: {data_pred_df.shape}")

    return data_pred_df

In [20]:
# Score every pre-processed row and pair it with its transcript/position ids,
# then show the first rows as the cell output.
prediction_df = prediction_pickled_model(id_val_df, data_pp)
prediction_df.head()

1492
Prediction file is of shape: (1492, 3)


Unnamed: 0,transcript,position,score
0,ENST00000005386,1758,0.009091
1,ENST00000009041,1086,0.009948
2,ENST00000009041,955,0.027273
3,ENST00000012443,701,0.027273
4,ENST00000013807,822,0.004545


# 7. Save predictions

In [21]:
## save predictions as csv file
# The file is written relative to the notebook's working directory.
# NOTE(review): to_csv is called with its defaults, so the dataframe index is written
# as an extra unnamed first column — confirm the consumer of this file expects that
# (otherwise pass index=False).
prediction_fname = "test_predictions.csv"
prediction_df.to_csv(prediction_fname)