# Workflow

1. Identify datasets to be predicted
2. Parse the datasets into dataframes
3. Perform pre-processing on dataframes
4. Train chosen model on full training data
5. Predict labels for all datasets
6. Save predictions into CSV files

# Identify datasets to be predicted

In [1]:
## libraries to read and parse json file
import json
import pandas as pd
import os
import sys

In [2]:
## show the current working directory; the relative paths used in the
## following cells are resolved against it
os.getcwd()

'/Users/claudia/DSA4262-ACMXZ/prediction'

In [3]:
## move into the folder that holds the final-round datasets
os.chdir("../data/final round/")

## every JSON file in that folder is a dataset to parse; keep them in
## alphabetical order so files[0], files[1], ... are stable between runs
files = sorted(entry for entry in os.listdir(".") if entry.endswith(".json"))
files

['dataset1.json', 'dataset2.json', 'dataset3.json']

# Parse datasets into dataframes

### Functions needed to parse json files

In [4]:
## function to get key of a dictionary
def get_key(dictionary):
    """Return the first key of ``dictionary``, following its iteration order.

    Raises ``IndexError`` when the dictionary has no keys.
    """
    return list(dictionary.keys())[0]

In [5]:
## function to help concatenate columns to get transcript, position, nucleotides
def concat_col(transcript, position, nucleotide, n):
    """Build an ``n``-row dataframe that repeats one site's identifiers.

    Parameters
    ----------
    transcript, position, nucleotide
        Values repeated on every row.
    n : int
        Number of rows to create; the same number is stored in the
        ``reads_count`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``transcript``, ``position``, ``nucleotides`` and
        ``reads_count`` on a default 0..n-1 index.
    """
    ## build all four columns in a single frame; unlike concatenating four
    ## one-column frames, this also works for n == 0 (the result is an empty
    ## frame that still carries the four columns)
    return pd.DataFrame({
        'transcript': [transcript] * n,
        'position': [position] * n,
        'nucleotides': [nucleotide] * n,
        'reads_count': [n] * n,
    })

In [6]:
## function to parse line in json file
def parse_line(line):
    """Turn one parsed JSON record into a dataframe with one row per read.

    ``line`` is a nested dictionary of the form
    ``{transcript: {position: {nucleotides: [read, read, ...]}}}``; only the
    first key at each level is used.  Every read supplies nine values, which
    are labelled dwelling time / std / mean for the -1, 0 and +1 offsets.

    Returns a dataframe whose first four columns (transcript, position,
    nucleotides, reads_count) are identical on every row, followed by the
    nine per-read columns.
    """
    ## walk down the three nesting levels, taking the first key each time
    transcript = get_key(line)
    per_position = line[transcript]
    position = get_key(per_position)
    per_sequence = per_position[position]
    sequence = get_key(per_sequence)
    read_rows = per_sequence[sequence]

    ## one row per read: the measurements, and the repeated identifiers
    signal_cols = pd.DataFrame(read_rows)
    id_cols = concat_col(transcript, position, sequence, len(read_rows))

    ## put identifiers and measurements side by side and name every column
    parsed = pd.concat([id_cols, signal_cols], axis = 1)
    parsed.columns = [
        'transcript', 'position', 'nucleotides', 'reads_count',
        'dwellingtime_-1', 'std_-1', 'mean_-1',
        'dwellingtime_0', 'std_0', 'mean_0',
        'dwellingtime_+1', 'std_+1', 'mean_+1',
    ]
    return parsed

### Parse datasets

In [7]:
## open dataset_1 json file (one JSON record per line); the with-block
## closes the file as soon as every line has been read
with open(files[0], 'r', encoding='utf-8') as handle:
    data1 = [json.loads(line) for line in handle]

## parse all lines into dataframes
data1_reads = [parse_line(record) for record in data1]

## concatenate dataframes
data1_df = pd.concat(data1_reads, axis = 0)

print(f"Shape of Dataset 1 = {data1_df.shape}")
data1_df.head()

Shape of Dataset 1 = (7907952, 13)


Unnamed: 0,transcript,position,nucleotides,reads_count,dwellingtime_-1,std_-1,mean_-1,dwellingtime_0,std_0,mean_0,dwellingtime_+1,std_+1,mean_+1
0,ENST00000000233,244,AAGACCA,165,0.00465,2.16,127.0,0.0064,3.9,127.0,0.00797,8.75,83.7
1,ENST00000000233,244,AAGACCA,165,0.0269,4.43,106.0,0.0186,10.0,123.0,0.00863,6.2,80.0
2,ENST00000000233,244,AAGACCA,165,0.00432,3.1,108.0,0.012,8.26,125.0,0.0159,2.89,78.7
3,ENST00000000233,244,AAGACCA,165,0.00996,4.52,123.0,0.0175,8.51,128.0,0.00498,2.63,80.0
4,ENST00000000233,244,AAGACCA,165,0.00764,2.81,124.0,0.00772,4.22,126.0,0.00474,5.84,80.9


In [8]:
## open dataset_2 json file (one JSON record per line); the with-block
## closes the file as soon as every line has been read
with open(files[1], 'r', encoding='utf-8') as handle:
    data2 = [json.loads(line) for line in handle]

## parse all lines into dataframes
data2_reads = [parse_line(record) for record in data2]

## concatenate dataframes
data2_df = pd.concat(data2_reads, axis = 0)

print(f"Shape of Dataset 2 = {data2_df.shape}")
data2_df.head()

Shape of Dataset 2 = (6903936, 13)


Unnamed: 0,transcript,position,nucleotides,reads_count,dwellingtime_-1,std_-1,mean_-1,dwellingtime_0,std_0,mean_0,dwellingtime_+1,std_+1,mean_+1
0,AT1G01050.1,155,GAAACTA,36,0.00232,1.93,109.0,0.0126,1.97,111.0,0.00421,1.5,95.3
1,AT1G01050.1,155,GAAACTA,36,0.00896,2.27,110.0,0.00536,2.49,110.0,0.00797,2.28,96.0
2,AT1G01050.1,155,GAAACTA,36,0.00498,6.29,114.0,0.00442,2.07,111.0,0.00785,1.97,96.0
3,AT1G01050.1,155,GAAACTA,36,0.00617,5.16,106.0,0.0083,2.7,105.0,0.00199,2.82,97.4
4,AT1G01050.1,155,GAAACTA,36,0.00664,2.01,110.0,0.00495,1.89,110.0,0.011,1.64,97.3


In [1]:
## open dataset_3 json file (one JSON record per line).
## files is sorted, so dataset3.json is files[2]; files[1] is dataset 2 and
## would silently load the second dataset a second time.
## The with-block closes the file as soon as every line has been read.
with open(files[2], 'r', encoding='utf-8') as handle:
    data3 = [json.loads(line) for line in handle]

## parse all lines into dataframes
data3_reads = [parse_line(record) for record in data3]

## concatenate dataframes
data3_df = pd.concat(data3_reads, axis = 0)

print(f"Shape of Dataset 3 = {data3_df.shape}")
data3_df.head()

NameError: name 'files' is not defined

# Perform pre-processing on dataframes

### Functions needed for pre-processing

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from category_encoders import OneHotEncoder

sys.path.append(os.path.abspath("../../util/model"))
from training import get_percent

In [None]:
def feature_eng(df):
    """Collapse the per-read rows of every site into one row of summary statistics.

    Rows are grouped by transcript, position, nucleotides and reads_count,
    plus gene_id when ``df`` has that column (the frames produced by
    ``parse_line`` do not, and grouping by a missing column raises KeyError).
    For each of the nine signal columns six statistics are computed:
    ``get_percent(25)``, ``get_percent(50)``, ``get_percent(75)``
    (presumably percentiles -- the helper is imported from the project's
    ``training`` module, confirm there), mean, min and max.

    Returns a dataframe with the grouping columns followed by 54 statistic
    columns named ``<signal>_<stat>``, where the ``dwellingtime_*`` inputs
    are spelled ``dwelling_time_*`` in the output names.
    """
    group_cols = ['transcript', 'position', 'nucleotides', 'reads_count']
    if 'gene_id' in df.columns:
        group_cols.insert(0, 'gene_id')

    signal_cols = ['dwellingtime_-1', 'std_-1', 'mean_-1',
                   'dwellingtime_0', 'std_0', 'mean_0',
                   'dwellingtime_+1', 'std_+1', 'mean_+1']
    stat_suffixes = ['25', '50', '75', 'mean', 'min', 'max']

    ## same six statistics for every signal column, in the order of stat_suffixes
    agg_spec = {col: [get_percent(25), get_percent(50), get_percent(75), np.mean, np.min, np.max]
                for col in signal_cols}
    temp = df.groupby(group_cols, as_index=False).agg(agg_spec)

    ## flatten the (column, statistic) header into plain names; the order
    ## follows agg_spec, so names and values stay aligned
    stat_names = []
    for col in signal_cols:
        prefix = col.replace('dwellingtime', 'dwelling_time')
        stat_names.extend(prefix + '_' + suffix for suffix in stat_suffixes)
    temp.columns = group_cols + stat_names
    return temp

In [None]:
def relative_position(df):
    """Add a ``relative_position`` column scaled to [0, 1] within each transcript.

    ``position`` is cast to int, then min-max scaled inside each group of
    rows sharing the same transcript (and the same gene_id, when ``df`` has
    that column -- grouping by a missing column would raise KeyError).

    ``df`` is modified in place and also returned.
    """
    df["position"] = df["position"].astype(int)

    group_keys = [key for key in ("transcript", "gene_id") if key in df.columns]
    positions = df.groupby(group_keys)["position"]

    ## find relative position of each read in each transcript
    lowest = positions.transform("min")
    span = positions.transform("max") - lowest

    ## a transcript with a single position gives 0/0 = NaN; report it as 0
    df["relative_position"] = ((df["position"] - lowest) / span).fillna(0)

    return df

In [None]:
def encoding(df, columns_to_map):
    """Split the nucleotide string into per-letter columns and run the encoder.

    Adds ``position_0`` .. ``position_6`` to ``df`` (in place), one column
    per letter of the 7-letter ``nucleotides`` string, then passes ``df``
    through the notebook-level ``pipe`` object (not defined in this cell;
    presumably a fitted scikit-learn pipeline -- confirm where it is created).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``nucleotides`` column of strings with at least 7 characters.
    columns_to_map : sequence of str
        Names for the columns of the transformed output, in output order.

    Returns
    -------
    pandas.DataFrame
        One column per (transformed column, name) pair.
    """
    for i in range(7):
        ## idx=i pins the letter index for this iteration
        df['position_' + str(i)] = df['nucleotides'].apply(lambda seq, idx=i: seq[idx])

    ## transform once, after all seven position_* columns exist; doing it
    ## inside the loop repeated the work and ran on a half-built frame
    encoded = pipe.transform(df)
    df_enc = pd.DataFrame({col: vals for vals, col in zip(encoded.T, columns_to_map)})

    return df_enc

### Perform pre-processing on dataframes

In [None]:
def preprocess(df, columns_to_map=None):
    """Run the full pre-processing chain on one parsed dataset.

    Steps: ``feature_eng`` (per-site summary statistics), then
    ``relative_position``, then ``encoding``.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-read dataframe as produced by ``parse_line``.
    columns_to_map : sequence of str, optional
        Output column names handed to ``encoding``.  When omitted, a
        notebook-level variable called ``columns_to_map`` is used, in the
        same way ``encoding`` picks up the notebook-level ``pipe``.

    Returns
    -------
    pandas.DataFrame
        The encoded dataframe returned by ``encoding``.

    Raises
    ------
    ValueError
        If ``columns_to_map`` is neither passed nor defined at notebook level.
    """
    ## resolve the encoder's column names first so a missing value fails
    ## before the expensive aggregation runs
    if columns_to_map is None:
        columns_to_map = globals().get("columns_to_map")
    if columns_to_map is None:
        raise ValueError("columns_to_map must be passed to preprocess() or defined at notebook level")

    ## get percentiles
    percentile_df = feature_eng(df)
    print(f"After feature engineering, the shape is {percentile_df.shape}")

    ## get relative position
    relative_pos_df = relative_position(percentile_df)
    print(f"After find the relative position, the shape is {relative_pos_df.shape}")

    ## perform encoding
    enc_df = encoding(relative_pos_df, columns_to_map)

    return enc_df


In [None]:
## run the pre-processing chain on each parsed dataset
data1_pp = preprocess(data1_df)
data2_pp = preprocess(data2_df)
data3_pp = preprocess(data3_df)

# Train model on full training dataset

In [None]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, auc, accuracy_score, plot_confusion_matrix

In [None]:
## load in Train Set
## NOTE(review): an earlier cell changes the working directory to
## "../data/final round/", so these relative paths are resolved from there --
## confirm they still point at the preprocessed training data
X_train_path = "../data/preprocessed_data/training/X_train_enc.parquet"
y_train_path = "../data/preprocessed_data/training/y_train.parquet"
X_train = pd.read_parquet(X_train_path)

### labels: flatten the loaded frame to a 1-D array and convert it into int
y_train = pd.read_parquet(y_train_path).values.ravel().astype(int)

In [None]:
## names of the feature columns to keep: summary statistics of the signal
## columns, relative_position, and two position_<i>_<letter> columns
## (presumably one-hot encoded nucleotide letters).
## NOTE(review): going by the variable name this is presumably the subset
## chosen by recursive feature elimination (RFE) -- confirm against the
## training notebook
rfe_features = ['std_-1_25', 'std_-1_50', 'std_-1_75', 'std_-1_mean', 'std_-1_min',
       'mean_-1_25', 'mean_-1_50', 'mean_-1_75', 'mean_-1_mean', 'mean_-1_min',
       'dwelling_time_0_50', 'dwelling_time_0_mean', 'std_0_25', 'std_0_50',
       'std_0_75', 'std_0_mean', 'std_0_min', 'std_0_max', 'mean_0_25',
       'mean_0_50', 'mean_0_75', 'mean_0_mean', 'mean_0_min', 'mean_0_max',
       'dwelling_time_+1_mean', 'std_+1_25', 'std_+1_50', 'mean_+1_25',
       'mean_+1_50', 'mean_+1_75', 'mean_+1_mean', 'mean_+1_min',
       'mean_+1_max', 'relative_position', 'position_1_G', 'position_5_T']
       