# Importing Libraries

In [1]:
# Move the working directory two levels up so that the relative `data/...` paths
# and the `util` package imports used in later cells resolve from there
# (presumably the repository root — confirm against the project layout).
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves up another two levels.
%cd ../..

c:\Users\lowmi\OneDrive\Desktop\Uni\DSA4262\DSA4262-ACMXZ


In [2]:
import pandas as pd
import numpy as np
import json
import pickle

In [3]:
from util.preprocess import (Preprocessing)
from util.preprocess.parameters import model_features_list
from util.model.training import get_percent

# Loading Datasets

In [4]:
# Each line of the dataset files is parsed as its own JSON document.
# The files are opened through context managers so the handles are closed as
# soon as parsing finishes (a bare open() inside the comprehension leaves them
# open until garbage collection).
with open("data/raw_data/dataset1.json", 'r') as f:
    data_1 = [json.loads(line) for line in f]
with open("data/raw_data/dataset2.json", 'r') as f:
    data_2 = [json.loads(line) for line in f]

In [5]:
# Read data.info as a list of text lines with the line terminators stripped.
# The lines are split on "," further down, with the first line used as the header.
with open("data/raw_data/data.info", 'r') as f:
    info = f.read().splitlines()

In [6]:
# encoding pipeline
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file when it comes from a trusted source.
# The file is opened through a context manager so the handle is closed right
# after loading (`infile` stays bound, but to a closed file object).
with open("data/raw_data/encoding_pipeline.pkl", 'rb') as infile:
    enc_pipe = pickle.load(infile)

# Helper Functions

In [7]:


## function to get key of a dictionary
def get_key(dictionary):
    """Return the first key of ``dictionary`` in its iteration order.

    Args:
        dictionary: a mapping exposing ``.keys()``.

    Returns:
        The first key yielded by ``dictionary.keys()``.

    Raises:
        IndexError: if ``dictionary`` is empty.
    """
    # Stop at the first key instead of materialising the whole key list.
    for key in dictionary.keys():
        return key
    # Same exception type an empty mapping raised before, with a clearer message.
    raise IndexError("get_key() called with an empty dictionary")

## function to help concatenate columns to get transcript, position, nucleotides
def concat_col(transcript, position, nucleotide, n):
    """Build the identifier columns for one site, repeated once per read.

    Args:
        transcript: value placed in the ``transcript`` column.
        position: value placed in the ``position`` column.
        nucleotide: value placed in the ``nucleotides`` column.
        n: number of rows to produce; also the value of ``reads_count``.

    Returns:
        A DataFrame with ``n`` rows and the columns ``transcript``,
        ``position``, ``nucleotides`` and ``reads_count``. When ``n`` is 0 an
        empty frame with those four columns is returned.
    """
    # One constructor call replaces four single-column frames plus a concat,
    # and keeps the column labels even when there are no rows.
    return pd.DataFrame({
        'transcript': [transcript] * n,
        'position': [position] * n,
        'nucleotides': [nucleotide] * n,
        'reads_count': [n] * n,
    })

## function to parse line in json file
def parse_line(line):
    """Flatten one parsed JSON record into a DataFrame with one row per read.

    ``line`` is expected to be nested as
    ``{transcript: {position: {nucleotides: [read, ...]}}}``; only the first
    key at each level is used. Each read has to carry nine values, because the
    result is labelled with thirteen column names: four identifier columns
    followed by dwelling time, std and mean suffixed ``-1``, ``0`` and ``+1``.
    """
    # walk down the three nesting levels, taking the first key each time
    transcript_id = get_key(line)
    per_position = line[transcript_id]
    position = get_key(per_position)
    per_kmer = per_position[position]
    kmer = get_key(per_kmer)
    read_rows = per_kmer[kmer]

    # number of reads recorded for this site
    n_reads = len(read_rows)

    # signal values, one row per read
    signal_df = pd.DataFrame(read_rows)

    # identifier columns repeated per read, placed to the left of the signal values
    id_df = concat_col(transcript_id, position, kmer, n_reads)
    parsed = pd.concat([id_df, signal_df], axis = 1)
    parsed.columns = [
        'transcript', 'position', 'nucleotides', 'reads_count',
        'dwellingtime_-1', 'std_-1', 'mean_-1',
        'dwellingtime_0', 'std_0', 'mean_0',
        'dwellingtime_+1', 'std_+1', 'mean_+1',
    ]

    return parsed



# Parsing Data

In [8]:
# Convert every parsed record of dataset 1 into its own per-read DataFrame,
# then stack the pieces row-wise into one long table.
reads_1 = [parse_line(record) for record in data_1]
df_1 = pd.concat(reads_1, axis = 0)

# Same two steps for dataset 2.
reads_2 = [parse_line(record) for record in data_2]
df_2 = pd.concat(reads_2, axis = 0)

In [9]:
# Split each raw info line on commas; the first line supplies the column names
# and the remaining lines become the rows of the table.
info_list = [row.split(",") for row in info]
header, records = info_list[0], info_list[1:]
info_df = pd.DataFrame(records)
info_df.columns = header

In [10]:
# Left-join the info table onto the per-read tables: every read row is kept and
# picks up the info columns wherever (transcript, position) has a match.
read_keys = ["transcript", "position"]
info_keys = ["transcript_id", "transcript_position"]
df_1 = pd.merge(df_1, info_df, how = "left", left_on = read_keys, right_on = info_keys)
df_2 = pd.merge(df_2, info_df, how = "left", left_on = read_keys, right_on = info_keys)

In [11]:
# The right-hand join keys duplicate `transcript` / `position`, so remove them
# from both merged tables (in place, as before).
redundant_keys = ['transcript_id', 'transcript_position']
for frame in (df_1, df_2):
    frame.drop(columns = redundant_keys, inplace = True)


# Preprocessing

In [12]:
# Wrap each merged table in the project's Preprocessing helper; the later
# cleaning and feature-engineering cells operate on these wrapper objects.
df_1_pre, df_2_pre = Preprocessing(df_1), Preprocessing(df_2)

In [13]:
# Run the project's `model_features_and_clean` step on both wrappers.
# The table rendered under this cell is the value of the last expression, i.e.
# the result for dataset 2 only.
# NOTE(review): what the step selects/cleans is defined in util.preprocess and
# is not visible here — confirm there.
df_1_pre.model_features_and_clean()
df_2_pre.model_features_and_clean()

Unnamed: 0,transcript,position,nucleotides,reads_count,gene_id,dwellingtime_-1,std_-1,mean_-1,dwellingtime_0,std_0,mean_0,dwellingtime_+1,std_+1,mean_+1,label
0,AT1G01050.1,155,GAAACTA,36,,0.00232,1.93,109.0,0.01260,1.97,111.0,0.00421,1.50,95.3,2
1,AT1G01050.1,155,GAAACTA,36,,0.00896,2.27,110.0,0.00536,2.49,110.0,0.00797,2.28,96.0,2
2,AT1G01050.1,155,GAAACTA,36,,0.00498,6.29,114.0,0.00442,2.07,111.0,0.00785,1.97,96.0,2
3,AT1G01050.1,155,GAAACTA,36,,0.00617,5.16,106.0,0.00830,2.70,105.0,0.00199,2.82,97.4,2
4,AT1G01050.1,155,GAAACTA,36,,0.00664,2.01,110.0,0.00495,1.89,110.0,0.01100,1.64,97.3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6903931,AT5G67600.1,154,AGAACTC,30,,0.01080,7.15,136.0,0.00410,1.46,106.0,0.00930,1.46,102.0,2
6903932,AT5G67600.1,154,AGAACTC,30,,0.02460,5.91,136.0,0.00836,1.77,105.0,0.00664,3.19,88.3,2
6903933,AT5G67600.1,154,AGAACTC,30,,0.00842,8.67,136.0,0.00697,5.23,110.0,0.00978,1.62,95.6,2
6903934,AT5G67600.1,154,AGAACTC,30,,0.00316,6.28,135.0,0.00712,1.91,105.0,0.00232,3.04,96.8,2


In [15]:
# Replace missing gene_id values in dataset 2 with 0.
# NOTE(review): only df_2_pre is treated here; no equivalent step for df_1_pre
# is visible in this notebook — confirm whether that is intentional.
df_2_pre.df['gene_id'] = df_2_pre.df['gene_id'].fillna(0)

In [17]:
# Run the project's `feature_eng` step on both wrappers.
# The table rendered under this cell is the value of the last expression
# (dataset 2): one row per transcript/position with `_25`, `_50`, `_75` and
# `_mean` columns for each signal feature.
# NOTE(review): the aggregation itself lives in util.preprocess — presumably
# per-site quartiles and mean over reads; verify there.
df_1_pre.feature_eng()
df_2_pre.feature_eng()

Unnamed: 0,gene_id,transcript,position,nucleotides,reads_count,label,dwelling_time_-1_25,dwelling_time_-1_50,dwelling_time_-1_75,dwelling_time_-1_mean,...,dwelling_time_+1_75,dwelling_time_+1_mean,std_+1_25,std_+1_50,std_+1_75,std_+1_mean,mean_+1_25,mean_+1_50,mean_+1_75,mean_+1_mean
0,0,AT1G01050.1,155,GAAACTA,36,2,0.004235,0.005310,0.006790,0.006574,...,0.007888,0.006340,1.5300,1.720,1.9825,1.804167,94.700,95.5,96.500,95.641667
1,0,AT1G01050.1,165,ATAACCA,38,2,0.004980,0.005705,0.008533,0.006866,...,0.007065,0.005963,1.6100,2.105,2.5400,2.255000,81.800,82.2,83.375,82.560526
2,0,AT1G01050.1,347,AAGACAG,35,2,0.003835,0.007300,0.010280,0.008893,...,0.009300,0.008132,2.4700,3.610,5.0800,4.088857,76.600,77.8,79.300,78.120000
3,0,AT1G01050.1,435,AAGACAA,35,2,0.005310,0.006970,0.008465,0.007647,...,0.009065,0.007299,2.5100,3.460,4.6950,3.717714,79.900,80.9,84.050,81.951429
4,0,AT1G01050.1,549,ATGACAA,35,2,0.004105,0.007120,0.011450,0.008467,...,0.008030,0.008201,1.8800,2.740,3.8750,2.998000,85.000,88.2,89.350,86.457143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98825,0,AT5G67590.1,367,AAAACTC,117,2,0.007160,0.010200,0.014500,0.011100,...,0.008630,0.006633,2.0400,2.330,2.6900,2.416410,94.300,95.4,96.400,95.182051
98826,0,AT5G67590.1,444,GTGACCC,113,2,0.004430,0.006190,0.008880,0.007500,...,0.006270,0.005211,2.3300,3.090,4.1400,3.472566,78.300,80.3,82.100,80.369912
98827,0,AT5G67590.1,465,GTGACTC,118,2,0.004320,0.006340,0.009132,0.007294,...,0.008390,0.006698,2.0350,2.885,3.8300,3.127305,88.025,89.2,90.300,88.889831
98828,0,AT5G67590.1,663,GAAACCT,101,2,0.004980,0.007060,0.010800,0.008229,...,0.007300,0.006016,2.0400,2.380,2.7700,2.483366,81.200,82.5,83.900,82.502970


In [19]:
# Display the frame currently held by the dataset 2 wrapper, for inspection.
df_2_pre.df

Unnamed: 0,gene_id,transcript,position,nucleotides,reads_count,label,dwelling_time_-1_25,dwelling_time_-1_50,dwelling_time_-1_75,dwelling_time_-1_mean,...,dwelling_time_+1_75,dwelling_time_+1_mean,std_+1_25,std_+1_50,std_+1_75,std_+1_mean,mean_+1_25,mean_+1_50,mean_+1_75,mean_+1_mean
0,0,AT1G01050.1,155,GAAACTA,36,2,0.004235,0.005310,0.006790,0.006574,...,0.007888,0.006340,1.5300,1.720,1.9825,1.804167,94.700,95.5,96.500,95.641667
1,0,AT1G01050.1,165,ATAACCA,38,2,0.004980,0.005705,0.008533,0.006866,...,0.007065,0.005963,1.6100,2.105,2.5400,2.255000,81.800,82.2,83.375,82.560526
2,0,AT1G01050.1,347,AAGACAG,35,2,0.003835,0.007300,0.010280,0.008893,...,0.009300,0.008132,2.4700,3.610,5.0800,4.088857,76.600,77.8,79.300,78.120000
3,0,AT1G01050.1,435,AAGACAA,35,2,0.005310,0.006970,0.008465,0.007647,...,0.009065,0.007299,2.5100,3.460,4.6950,3.717714,79.900,80.9,84.050,81.951429
4,0,AT1G01050.1,549,ATGACAA,35,2,0.004105,0.007120,0.011450,0.008467,...,0.008030,0.008201,1.8800,2.740,3.8750,2.998000,85.000,88.2,89.350,86.457143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98825,0,AT5G67590.1,367,AAAACTC,117,2,0.007160,0.010200,0.014500,0.011100,...,0.008630,0.006633,2.0400,2.330,2.6900,2.416410,94.300,95.4,96.400,95.182051
98826,0,AT5G67590.1,444,GTGACCC,113,2,0.004430,0.006190,0.008880,0.007500,...,0.006270,0.005211,2.3300,3.090,4.1400,3.472566,78.300,80.3,82.100,80.369912
98827,0,AT5G67590.1,465,GTGACTC,118,2,0.004320,0.006340,0.009132,0.007294,...,0.008390,0.006698,2.0350,2.885,3.8300,3.127305,88.025,89.2,90.300,88.889831
98828,0,AT5G67590.1,663,GAAACCT,101,2,0.004980,0.007060,0.010800,0.008229,...,0.007300,0.006016,2.0400,2.380,2.7700,2.483366,81.200,82.5,83.900,82.502970
