In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# File locations used by this notebook (all relative to the notebook's working directory).
# Raw input CSV loaded with pd.read_csv in the data-loading cell.
DATA_READ_PATH = "./../data/raw/dataset.csv"
# Destination CSV for the min-max scaled dataset.
DATA_WRITE_PATH = "./../data/curated/dataset_scaled.csv"
# Destination of the fitted MinMaxScaler, serialized with joblib.dump.
SCALER_SAVE_PATH = "./../model/minmaxscaler"

In [2]:
# Load the raw dataset, then discard the first column of the CSV
# (presumably a saved row index - TODO confirm against the raw file).
data = pd.read_csv(DATA_READ_PATH)
data = data.iloc[:, 1:]

In [3]:
# Attach each row's bag_id from the bag metadata table. Once bag_id is present,
# the gene_id, transcript_id and transcript_position columns are dropped for clarity.
bag_meta = pd.read_csv("./../data/raw/bag_meta.csv")
bag_meta = bag_meta.iloc[:, 1:]  # skip the first column of the CSV
merge_keys = ["gene_id", "transcript_id", "transcript_position", "label"]
data = pd.merge(data, bag_meta, on=merge_keys)

# The nine feature columns, grouped by position:
# ["left_dwell", "left_std", "left_mean", "mid_dwell", ..., "right_std", "right_mean"]
feature_names = [
    f"{position}_{statistic}"
    for position in ("left", "mid", "right")
    for statistic in ("dwell", "std", "mean")
]
# Keep only the bag identifier, the label and the features, in that order.
data = data[["bag_id", "label"] + feature_names]
data

Unnamed: 0,bag_id,label,left_dwell,left_std,left_mean,mid_dwell,mid_std,mid_mean,right_dwell,right_std,right_mean
0,1,0,0.00299,2.06,125.0,0.01770,10.40,122.0,0.00930,10.90,84.1
1,1,0,0.00631,2.53,125.0,0.00844,4.67,126.0,0.01030,6.30,80.9
2,1,0,0.00465,3.92,109.0,0.01360,12.00,124.0,0.00498,2.13,79.6
3,1,0,0.00398,2.06,125.0,0.00830,5.01,130.0,0.00498,3.78,80.4
4,1,0,0.00664,2.92,120.0,0.00266,3.94,129.0,0.01300,7.15,82.2
...,...,...,...,...,...,...,...,...,...,...,...
11027101,121838,0,0.00418,7.49,108.0,0.00564,10.20,116.0,0.01000,2.01,76.4
11027102,121838,0,0.00664,1.91,109.0,0.00598,12.30,110.0,0.01760,2.61,74.6
11027103,121838,0,0.00721,4.58,105.0,0.00398,6.58,113.0,0.00316,2.28,85.3
11027104,121838,0,0.00266,2.33,109.0,0.00913,10.40,108.0,0.00664,4.44,76.8


In [4]:
# Fit a MinMaxScaler on the feature columns and rescale them. With the scaler's
# default feature_range of (0, 1), each column is mapped using its own min and max.
# The fitted `scaler` is kept so the same transformation can be reapplied later.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[feature_names])

# fit_transform returns a bare array, so the row labels have to be restored
# explicitly. pd.concat(axis=1) aligns on index labels; without index=data.index
# the scaled frame would get a fresh 0..n-1 index and silently misalign (yielding
# NaN rows) whenever `data` does not carry a default RangeIndex.
normalized_features = pd.DataFrame(
    scaled_values,
    columns=feature_names,
    index=data.index,
)

# Re-attach the untouched identifier and label columns in front of the scaled features.
normalized_data = pd.concat(
    [data[["bag_id", "label"]], normalized_features],
    axis=1,
)
normalized_data

Unnamed: 0,bag_id,label,left_dwell,left_std,left_mean,mid_dwell,mid_std,mid_mean,right_dwell,right_std,right_mean
0,1,0,0.011239,0.009548,0.649123,0.117647,0.050283,0.578164,0.075390,0.058543,0.281707
1,1,0,0.039294,0.011831,0.649123,0.049729,0.022461,0.627792,0.085258,0.033525,0.242683
2,1,0,0.025266,0.018581,0.448622,0.087575,0.058051,0.602978,0.032761,0.010845,0.226829
3,1,0,0.019605,0.009548,0.649123,0.048702,0.024112,0.677419,0.032761,0.019819,0.236585
4,1,0,0.042082,0.013725,0.586466,0.007335,0.018917,0.665012,0.111901,0.038148,0.258537
...,...,...,...,...,...,...,...,...,...,...,...
11027101,121838,0,0.021295,0.035919,0.436090,0.029192,0.049312,0.503722,0.082297,0.010192,0.187805
11027102,121838,0,0.042082,0.008820,0.448622,0.031685,0.059508,0.429280,0.157292,0.013456,0.165854
11027103,121838,0,0.046899,0.021787,0.398496,0.017016,0.031735,0.466501,0.014802,0.011661,0.296341
11027104,121838,0,0.008450,0.010859,0.448622,0.054789,0.050283,0.404467,0.049142,0.023409,0.192683


In [5]:
# Persist the scaled dataset and the fitted scaler.
# Create the destination folders first: neither to_csv nor joblib.dump creates
# missing directories, and failing here would waste the scaling work done above.
for target in (DATA_WRITE_PATH, SCALER_SAVE_PATH):
    Path(target).parent.mkdir(parents=True, exist_ok=True)

# The DataFrame index is written as the first CSV column (to_csv default), so
# readers of this file need to drop or ignore that column.
normalized_data.to_csv(DATA_WRITE_PATH)
joblib.dump(scaler, SCALER_SAVE_PATH)