# Usage:
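This notebook parses the raw reads from `dataset0.json`/`dataset1.json`/`dataset2.json` (selected with `CHOSEN_DATASET`) into a single table with one row per read, normalizes the features with a previously saved scaler, summarizes them per transcript position, and writes the predictions of a previously saved random forest model to `<dataset>_naiveRF_predictions.csv` in the project's `data/curated` directory. The flattened table can also be saved as `dataset.csv` in the project's `data/raw` directory, but that step is currently commented out.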

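**Note: Ensure that you are in the project's `notebooks` directory, that the data files are in the project's `data/raw` directory, and that the saved scaler and model are in the project's `model` directory. Otherwise, the code will fail because the relative file paths will not resolve.**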

# Setting things up...

In [1]:
# This is the only cell that users should change.
# The chosen name is interpolated into both the input path (`./../data/raw/<name>.json`)
# and the predictions output path, so it must match a JSON file in `data/raw`.
CHOSEN_DATASET = "dataset2" # Can be dataset0 or dataset1 or dataset2

In [2]:
import pandas as pd
import json
from tqdm import tqdm
import joblib

In [3]:
# All paths are relative to the current working directory, which is expected
# to be the project's `notebooks` directory.

# Raw input file, selected through CHOSEN_DATASET and read line by line as JSON.
DATASET_PATH = f"./../data/raw/{CHOSEN_DATASET}.json"
# Target for the flattened per-read table; only referenced by a commented-out `to_csv` call.
SAVE_PATH = "./../data/raw/dataset.csv"
# Previously saved feature scaler, loaded with joblib.
SCALER_SAVE_PATH = "./../model/minmaxscaler"
# Previously saved model, loaded with joblib.
MODEL_SAVE_PATH = "./../model/rf3"
# Destination CSV for the per-bag predictions.
PREDICTIONS_SAVE_PATH = f"./../data/curated/{CHOSEN_DATASET}_naiveRF_predictions.csv"

# Converting dataset from JSON format into a dataframe

In [4]:
# Read the raw dataset, which is parsed as JSON Lines: one JSON object per line.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale, and whitespace-only lines (for example a trailing
# empty line at the end of the file) are skipped instead of making
# json.loads raise.
dataset = []
with open(DATASET_PATH, 'r', encoding="utf-8") as file:
    for line in tqdm(file):
        if line.strip():
            dataset.append(json.loads(line))

1323it [00:02, 494.52it/s]


In [5]:
# Flatten the nested records into one row per read.
# Each record is shaped {transcript_id: {position: {k_mer: [read, ...]}}}, where
# every read is a list of feature values. Every entry at every nesting level is
# walked, so a record that holds more than one transcript, position or k-mer is
# flattened completely instead of keeping only its first key.
processed_data = []
for row in tqdm(dataset):
    for transcript_id, positions in row.items():
        for transcript_position, k_mers in positions.items():
            for k_mer, reads in k_mers.items():
                # Prefix each read with the identifiers of the bag it belongs to.
                processed_data.extend(
                    [transcript_id, transcript_position, k_mer] + read
                    for read in reads
                )

100%|████████████████████████████████████████████████████████████████████████████| 1323/1323 [00:01<00:00, 1195.50it/s]


In [6]:
# Every flattened row holds three identifier fields followed by nine signal
# statistics (dwell, std, mean for the left, middle and right positions).
identifier_columns = ["transcript_id", "transcript_position", "k_mer"]
signal_columns = [
    "left_dwell", "left_std", "left_mean",
    "mid_dwell", "mid_std", "mid_mean",
    "right_dwell", "right_std", "right_mean",
]
columns = identifier_columns + signal_columns
processed_data_df = pd.DataFrame(processed_data, columns=columns)

# Positions come from JSON object keys and are therefore strings; store them as integers.
processed_data_df["transcript_position"] = processed_data_df["transcript_position"].astype("int64")

In [7]:
# Preview the first rows of the flattened per-read table as a sanity check on the column layout.
processed_data_df.head()

Unnamed: 0,transcript_id,transcript_position,k_mer,left_dwell,left_std,left_mean,mid_dwell,mid_std,mid_mean,right_dwell,right_std,right_mean
0,tx_id_0,0,AAAACCT,0.0122,3.99,106.0,0.00337,4.56,102.0,0.00664,4.2,84.2
1,tx_id_0,0,AAAACCT,0.0302,2.32,107.0,0.00443,2.36,102.0,0.00332,2.13,79.2
2,tx_id_0,0,AAAACCT,0.00232,5.55,110.0,0.00664,7.04,99.3,0.00232,2.21,86.6
3,tx_id_0,0,AAAACCT,0.00465,2.1,104.0,0.00996,3.9,108.0,0.00401,2.18,82.2
4,tx_id_0,0,AAAACCT,0.0211,3.49,103.0,0.00531,3.8,101.0,0.00997,2.18,81.2


In [8]:
# Optional: uncomment the next line to save the flattened per-read table to SAVE_PATH.
# processed_data_df.to_csv(SAVE_PATH)

# Dataset Normalization

In [9]:
# Load the previously saved scaler (presumably a fitted MinMaxScaler, judging by the file name; confirm).
# joblib.load unpickles the file, which can execute arbitrary code, so only load files from a trusted source.
scaler = joblib.load(SCALER_SAVE_PATH)

In [10]:
# Rebuild the nine feature names in column order: position (left/mid/right)
# crossed with statistic (dwell/std/mean).
feature_names = [
    region + "_" + measure
    for region in ["left", "mid", "right"]
    for measure in ["dwell", "std", "mean"]
]

# Columns from index 3 onwards are the per-read features; the first three are identifiers.
scaled_values = scaler.transform(processed_data_df.iloc[:, 3:])
normalized_features = pd.DataFrame(scaled_values, columns=feature_names)

In [11]:
# Put the bag identifiers (the first two columns: transcript id and position)
# back next to the scaled features; the k_mer column is left out.
bag_identifiers = processed_data_df.iloc[:, :2]
normalized_data = pd.concat([bag_identifiers, normalized_features], axis=1)
normalized_data

Unnamed: 0,transcript_id,transcript_position,left_dwell,left_std,left_mean,mid_dwell,mid_std,mid_mean,right_dwell,right_std,right_mean
0,tx_id_0,0,0.089065,0.018921,0.411028,0.012542,0.021927,0.330025,0.049142,0.022103,0.282927
1,tx_id_0,0,0.241170,0.010811,0.423559,0.020317,0.011245,0.330025,0.016381,0.010845,0.221951
2,tx_id_0,0,0.005577,0.026498,0.461153,0.036526,0.033968,0.296526,0.006513,0.011280,0.312195
3,tx_id_0,0,0.025266,0.009742,0.385965,0.060877,0.018722,0.404467,0.023189,0.011117,0.258537
4,tx_id_0,0,0.164272,0.016493,0.373434,0.026771,0.018237,0.317618,0.082001,0.011117,0.246341
...,...,...,...,...,...,...,...,...,...,...,...
1171935,tx_id_6,1880,0.043012,0.039999,0.473684,0.043494,0.029647,0.354839,0.072035,0.012422,0.373171
1171936,tx_id_6,1880,0.030674,0.022904,0.411028,0.012175,0.035813,0.317618,0.081903,0.009812,0.364634
1171937,tx_id_6,1880,0.109346,0.037036,0.385965,0.007335,0.005661,0.210918,0.195777,0.010736,0.353659
1171938,tx_id_6,1880,0.037519,0.019990,0.573935,0.004841,0.026200,0.240695,0.003256,0.010138,0.263415


# Feature Engineering
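Encode each bag (all reads that share a transcript ID and transcript position) with quantile statistics of each of its features: the minimum, the 5th, 25th, 50th, 75th and 95th percentiles, and the maximum.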

In [12]:
# Summarise every bag (one transcript_id / transcript_position pair) by seven
# quantiles of each normalized feature.
bag_keys = ["transcript_id", "transcript_position"]
quantile_levels = [0, 0.05, 0.25, 0.5, 0.75, 0.95, 1]
per_bag_quantiles = normalized_data.groupby(bag_keys).quantile(quantile_levels)

# reset_index exposes the unnamed quantile level as the column "level_2", which
# the pivot then spreads into one column per (feature, quantile) pair.
long_form = per_bag_quantiles.reset_index()
bag_data = long_form.pivot(
    index=bag_keys,
    columns=['level_2'],
    values=['left_dwell', 'left_std', 'left_mean',
            'mid_dwell', 'mid_std', 'mid_mean',
            'right_dwell', 'right_std', 'right_mean'],
).reset_index()

# Flatten the two-level column labels into "<feature> <quantile>"; the identifier
# columns have an empty second level, so strip() leaves their names unchanged.
new_colnames = [(label[0] + " " + str(label[1])).strip() for label in bag_data.columns]
bag_data.columns = new_colnames
bag_data

Unnamed: 0,transcript_id,transcript_position,left_dwell 0.0,left_dwell 0.05,left_dwell 0.25,left_dwell 0.5,left_dwell 0.75,left_dwell 0.95,left_dwell 1.0,left_std 0.0,...,right_std 0.75,right_std 0.95,right_std 1.0,right_mean 0.0,right_mean 0.05,right_mean 0.25,right_mean 0.5,right_mean 0.75,right_mean 0.95,right_mean 1.0
0,tx_id_0,0,0.000000,0.005577,0.017069,0.034223,0.065827,0.134697,0.277505,0.001593,...,0.013401,0.020069,0.063982,0.169512,0.214878,0.237805,0.250000,0.262195,0.291463,0.368293
1,tx_id_0,10,0.000000,0.008450,0.028055,0.053997,0.095826,0.176610,0.549603,0.001530,...,0.026998,0.045479,0.078123,0.102439,0.119512,0.139024,0.153659,0.171951,0.218293,0.339024
2,tx_id_0,20,0.000000,0.018844,0.045716,0.072165,0.106811,0.187933,0.474396,0.002268,...,0.015359,0.023681,0.117282,0.226829,0.300000,0.330488,0.343902,0.354878,0.373171,0.487805
3,tx_id_0,30,0.000000,0.008083,0.022478,0.044702,0.073644,0.151597,0.385668,0.001112,...,0.025625,0.038564,0.081386,0.059756,0.079268,0.097561,0.118293,0.146341,0.190427,0.287805
4,tx_id_0,40,0.000000,0.008788,0.023745,0.043519,0.073010,0.136936,0.370458,0.003701,...,0.019778,0.033307,0.100422,0.254878,0.337805,0.357317,0.369512,0.380488,0.402439,0.468293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1318,tx_id_6,1840,0.002789,0.008450,0.020872,0.036505,0.061687,0.111881,0.255535,0.005808,...,0.016461,0.023952,0.063982,0.157317,0.235366,0.265854,0.285366,0.306098,0.342256,0.435366
1319,tx_id_6,1850,0.005577,0.015126,0.032238,0.049434,0.076390,0.138499,0.330742,0.003978,...,0.021124,0.030397,0.062350,0.204878,0.251220,0.279878,0.296341,0.322561,0.385976,0.424390
1320,tx_id_6,1860,0.000000,0.008450,0.025266,0.046730,0.073010,0.152865,0.522562,0.003327,...,0.031839,0.043247,0.086825,0.104878,0.170732,0.195122,0.223171,0.258537,0.294512,0.382927
1321,tx_id_6,1870,0.000000,0.005577,0.016816,0.033632,0.061687,0.128781,0.263985,0.004497,...,0.016447,0.031132,0.059087,0.101220,0.179268,0.192683,0.206098,0.229268,0.278049,0.359756


# Making Predictions

In [13]:
# Load the previously saved model used for the predictions below.
# joblib.load unpickles the file, which can execute arbitrary code, so only load files from a trusted source.
rf = joblib.load(MODEL_SAVE_PATH)

In [14]:
# Use the forest's predict method on the bag-level features.
# The feature columns are selected by name rather than with `iloc[:, 2:]`, so
# re-running this cell does not feed the previously added "score" column back
# into the model as an extra feature.
# NOTE(review): the output of `predict` is stored as "score"; if a probability
# is expected here, confirm whether `predict_proba` was intended.
non_feature_columns = {"transcript_id", "transcript_position", "score"}
feature_columns = [c for c in bag_data.columns if c not in non_feature_columns]
bag_data["score"] = rf.predict(bag_data[feature_columns])

In [15]:
# Keep only the bag identifiers and the model output for the export.
output_columns = ["transcript_id", "transcript_position", "score"]
predictions = bag_data[output_columns]

In [16]:
# Write the per-bag predictions to CSV; index=False leaves the DataFrame's row index out of the file.
# The target directory must already exist, because to_csv does not create it.
predictions.to_csv(PREDICTIONS_SAVE_PATH, index=False)