# Usage:
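This notebook parses the raw data from `dataset0.json`/`dataset1.json`/`dataset2.json` (selected via `CHOSEN_DATASET`) and flattens it into a single dataframe with one row per read. It then normalizes the features with the saved scaler, summarizes each (transcript, position) bag by the quantiles of its features, and scores every bag with the saved random-forest model. The predictions are written to `<CHOSEN_DATASET>_naiveRF_predictions.csv` in the `data/curated` directory. Exporting the flattened dataset as `dataset.csv` is available as a commented-out cell and is disabled by default.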

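**Note: Ensure that you are in the project's `notebooks` directory and the data files are in the project's `data` directory. All file paths in this notebook are relative (e.g. `./../../data/raw/`), so the code will fail due to inconsistent file paths if it is run from a different working directory.**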

# Setting things up...

In [1]:
# This is the only cell that users should change
# Base name (without the ".json" extension) of the input file to process. It is
# interpolated into the input path and the predictions output path defined below.
CHOSEN_DATASET = "dataset0" # Can be dataset0 or dataset1 or dataset2

In [2]:
import json
from pathlib import Path

import joblib
import pandas as pd
from tqdm import tqdm

In [14]:
# All paths are relative to the notebook's working directory.
# NOTE(review): this cell's recorded execution count (14) is out of sequence with its
# neighbours; it must be run before the loading cell, which reads DATASET_PATH.

# Input: raw JSON file of the dataset selected via CHOSEN_DATASET.
DATASET_PATH = f"./../../data/raw/{CHOSEN_DATASET}.json"
# Target of the CSV export of the flattened dataframe; the only reference to it in
# this notebook is a commented-out `to_csv` call.
SAVE_PATH = "./../../data/raw/dataset.csv"
# joblib artifact that is loaded as the feature scaler.
SCALER_SAVE_PATH = "./../../model/minmaxscaler"
# joblib artifact that is loaded as the prediction model.
MODEL_SAVE_PATH = "./../../model/rf-ntrees-1000"
# Output: per-bag predictions for the selected dataset.
PREDICTIONS_SAVE_PATH = f"./../../data/curated/{CHOSEN_DATASET}_naiveRF_predictions.csv"

# Converting dataset from JSON format into a dataframe

In [4]:
# Read the input file as JSON lines: every non-blank line is parsed as one JSON document
# and appended to `dataset` in file order.
dataset = []
# Decode explicitly as UTF-8 instead of relying on the platform's locale default.
with open(DATASET_PATH, "r", encoding="utf-8") as file:
    for line in tqdm(file):
        # json.loads raises on an empty/whitespace-only line, so skip those
        # rather than aborting the whole load.
        if not line.strip():
            continue
        dataset.append(json.loads(line))

121838it [01:05, 1860.84it/s]


In [5]:
# Flatten the nested records into one list per read:
#   [transcript_id, transcript_position, k_mer, *read_values]
# Each record is nested as {transcript_id: {transcript_position: {k_mer: [read, ...]}}}.
# Every key at every level is visited, so a record that holds more than one
# transcript / position / k-mer is flattened completely instead of being silently
# truncated to its first key. For single-key records the output is unchanged.
processed_data = []
for row in tqdm(dataset):
    for transcript_id, positions in row.items():
        for transcript_position, k_mers in positions.items():
            for k_mer, reads in k_mers.items():
                for read in reads:
                    # Prefix each read's values with the identifiers of its bag.
                    processed_data.append([transcript_id, transcript_position, k_mer, *read])

100%|████████████████████████████████████████████████████████████████████████| 121838/121838 [00:26<00:00, 4594.88it/s]


In [6]:
# Column layout of one flattened read: three identifier columns followed by
# dwell / std / mean values for the left, middle and right positions.
columns = [
    "transcript_id", "transcript_position", "k_mer",
    "left_dwell", "left_std", "left_mean",
    "mid_dwell", "mid_std", "mid_mean",
    "right_dwell", "right_std", "right_mean",
]

# Build the read-level dataframe. The position was a JSON object key (hence a string),
# so it is cast to a 64-bit integer as part of the same expression.
processed_data_df = pd.DataFrame(processed_data, columns=columns).astype(
    {"transcript_position": "int64"}
)

In [7]:
# Preview the first rows of the flattened, read-level dataframe.
processed_data_df.head()

Unnamed: 0,transcript_id,transcript_position,k_mer,left_dwell,left_std,left_mean,mid_dwell,mid_std,mid_mean,right_dwell,right_std,right_mean
0,ENST00000000233,244,AAGACCA,0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,10.9,84.1
1,ENST00000000233,244,AAGACCA,0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,6.3,80.9
2,ENST00000000233,244,AAGACCA,0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,2.13,79.6
3,ENST00000000233,244,AAGACCA,0.00398,2.06,125.0,0.0083,5.01,130.0,0.00498,3.78,80.4
4,ENST00000000233,244,AAGACCA,0.00664,2.92,120.0,0.00266,3.94,129.0,0.013,7.15,82.2


In [8]:
# Optional export of the flattened read-level dataframe to SAVE_PATH; left disabled.
# NOTE(review): if re-enabled as written, `to_csv` also writes the row index as an
# extra first column; pass `index=False` if that is not wanted.
# processed_data_df.to_csv(SAVE_PATH)

# Dataset Normalization

In [9]:
# Load the persisted scaler. It is used below via `.transform` only, so it must
# already be fitted (presumably a MinMaxScaler, judging by the file name; confirm).
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading; only load artifacts that come from a trusted source.
scaler = joblib.load(SCALER_SAVE_PATH)

In [10]:
# Feature names in position-major order: left_*, mid_*, right_*, each with
# dwell, std and mean. This matches the column order of processed_data_df.
feature_names = [
    f"{pos}_{stat}"
    for pos in ("left", "mid", "right")
    for stat in ("dwell", "std", "mean")
]

# Scale everything after the three identifier columns with the loaded scaler and
# wrap the resulting array in a dataframe carrying the feature names.
scaled_values = scaler.transform(processed_data_df.iloc[:, 3:])
normalized_features = pd.DataFrame(scaled_values, columns=feature_names)

In [11]:
# Put the bag identifiers (the first two columns; k_mer is dropped) next to the
# scaled features. Both frames share the same default row index, so rows line up.
bag_id_columns = processed_data_df[["transcript_id", "transcript_position"]]
normalized_data = pd.concat([bag_id_columns, normalized_features], axis="columns")
normalized_data

Unnamed: 0,transcript_id,transcript_position,left_dwell,left_std,left_mean,mid_dwell,mid_std,mid_mean,right_dwell,right_std,right_mean
0,ENST00000000233,244,0.011239,0.009548,0.649123,0.117647,0.050283,0.578164,0.075390,0.058543,0.281707
1,ENST00000000233,244,0.039294,0.011831,0.649123,0.049729,0.022461,0.627792,0.085258,0.033525,0.242683
2,ENST00000000233,244,0.025266,0.018581,0.448622,0.087575,0.058051,0.602978,0.032761,0.010845,0.226829
3,ENST00000000233,244,0.019605,0.009548,0.649123,0.048702,0.024112,0.677419,0.032761,0.019819,0.236585
4,ENST00000000233,244,0.042082,0.013725,0.586466,0.007335,0.018917,0.665012,0.111901,0.038148,0.258537
...,...,...,...,...,...,...,...,...,...,...,...
11027101,ENST00000641834,1693,0.021295,0.035919,0.436090,0.029192,0.049312,0.503722,0.082297,0.010192,0.187805
11027102,ENST00000641834,1693,0.042082,0.008820,0.448622,0.031685,0.059508,0.429280,0.157292,0.013456,0.165854
11027103,ENST00000641834,1693,0.046899,0.021787,0.398496,0.017016,0.031735,0.466501,0.014802,0.011661,0.296341
11027104,ENST00000641834,1693,0.008450,0.010859,0.448622,0.054789,0.050283,0.404467,0.049142,0.023409,0.192683


# Feature Engineering
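Encode each bag (all reads that share a transcript ID and transcript position) with quantile statistics of each of its features: the minimum, the 5th, 25th, 50th, 75th and 95th percentiles, and the maximum.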

In [12]:
# A bag is every read that shares a (transcript_id, transcript_position) pair.
bag_keys = ["transcript_id", "transcript_position"]

# Long format: one row per (bag, quantile), one column per feature.
bag_quantiles = normalized_data.groupby(bag_keys).quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])

# Wide format: one row per bag, one (feature, quantile) column pair per statistic.
# After reset_index the unnamed quantile level shows up as the column "level_2".
bag_wide = bag_quantiles.reset_index().pivot(
    index=bag_keys,
    columns=["level_2"],
    values=[
        "left_dwell", "left_std", "left_mean",
        "mid_dwell", "mid_std", "mid_mean",
        "right_dwell", "right_std", "right_mean",
    ],
)

# Bring the bag keys back as ordinary columns, then flatten the two-level column index
# into "<feature> <quantile>" names. The key columns have an empty second level, so
# stripping the trailing space leaves their names untouched.
bag_data = bag_wide.reset_index()
new_colnames = [f"{feature} {quantile!s}".strip() for feature, quantile in bag_data.columns]
bag_data.columns = new_colnames
bag_data

Unnamed: 0,transcript_id,transcript_position,left_dwell 0.0,left_dwell 0.05,left_dwell 0.25,left_dwell 0.5,left_dwell 0.75,left_dwell 0.95,left_dwell 1.0,left_std 0.0,...,right_std 0.75,right_std 0.95,right_std 1.0,right_mean 0.0,right_mean 0.05,right_mean 0.25,right_mean 0.5,right_mean 0.75,right_mean 0.95,right_mean 1.0
0,ENST00000000233,244,0.002789,0.008450,0.022478,0.044871,0.078925,0.134021,0.272435,0.008140,...,0.030153,0.056912,0.083562,0.147561,0.192927,0.219512,0.237805,0.256098,0.296341,0.332927
1,ENST00000000233,261,0.002789,0.008450,0.022478,0.033632,0.053321,0.106093,0.173568,0.004007,...,0.018242,0.029495,0.075947,0.336585,0.360427,0.387805,0.403659,0.423476,0.454756,0.512195
2,ENST00000000233,316,0.005577,0.009008,0.023154,0.039294,0.067348,0.126077,0.238634,0.005760,...,0.012531,0.019340,0.036516,0.285366,0.320976,0.334146,0.343902,0.354878,0.380488,0.429268
3,ENST00000000233,332,0.005577,0.016782,0.040688,0.062194,0.099206,0.177877,0.298631,0.005469,...,0.013143,0.017282,0.034558,0.248780,0.264573,0.316768,0.352439,0.368598,0.409817,0.423171
4,ENST00000000233,368,0.002789,0.008450,0.034730,0.061687,0.103431,0.194947,0.389894,0.005129,...,0.025897,0.036674,0.047176,0.202439,0.244939,0.275915,0.297561,0.315854,0.335549,0.359756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,0.005577,0.005577,0.035406,0.055011,0.083995,0.143485,0.351022,0.004031,...,0.023735,0.039997,0.067245,0.158537,0.187317,0.229268,0.257317,0.279268,0.341463,0.387805
121834,ENST00000641834,1429,0.005577,0.010681,0.025266,0.038195,0.075545,0.154808,0.207369,0.008140,...,0.021886,0.030251,0.034613,0.170732,0.183171,0.218293,0.239024,0.254878,0.295122,0.360976
121835,ENST00000641834,1531,0.005577,0.011239,0.024801,0.044871,0.063165,0.161357,0.252155,0.006343,...,0.012042,0.020254,0.038365,0.209756,0.236341,0.273476,0.286585,0.297561,0.313232,0.330488
121836,ENST00000641834,1537,0.005577,0.005577,0.019436,0.041744,0.062616,0.133852,0.394119,0.006634,...,0.014870,0.024757,0.034613,0.202439,0.222683,0.237805,0.256098,0.278049,0.305854,0.321951


# Making Predictions

In [15]:
# Load the persisted model used for scoring (presumably a random forest, judging by
# the file name; confirm).
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading; only load artifacts that come from a trusted source.
rf = joblib.load(MODEL_SAVE_PATH)

In [16]:
# Score every bag with the loaded model's predict method.
# The feature columns are selected by name, excluding the two identifier columns and
# any "score" column left over from a previous run. Selecting by position
# (`iloc[:, 2:]`) would include the freshly added "score" column when the cell is
# re-run and hand the model one feature too many.
non_feature_columns = {"transcript_id", "transcript_position", "score"}
feature_columns = [col for col in bag_data.columns if col not in non_feature_columns]
# NOTE(review): if `rf` is a classifier, `predict` returns class labels rather than
# probabilities; confirm that this is the intended "score".
bag_data["score"] = rf.predict(bag_data[feature_columns])

In [17]:
# Keep only the bag identifiers and the model output for the export.
prediction_columns = ["transcript_id", "transcript_position", "score"]
predictions = bag_data[prediction_columns]

In [18]:
# Write one row per bag (transcript_id, transcript_position, score), without the index.
# Create the output directory first in case it does not exist yet, so the notebook
# does not fail at its very last step after all the expensive work is done.
Path(PREDICTIONS_SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)
predictions.to_csv(PREDICTIONS_SAVE_PATH, index=False)