# Preprocessing pipeline

In [1]:
# Make the project root (the parent of the current working directory) importable,
# so that project packages can be imported from this notebook.
import os
import sys

project_root = os.path.abspath('..')
sys.path.insert(0, project_root)

In [2]:
# Name of the column that holds the disease label
DISEASE_COLNAME: str = "DISEASE"
# Relative path of the directory where the datasets are stored
DATASET_PATH: str = "../../dataset/first_disease_sel/"

## Starting the pipeline

In [3]:
# Build the preprocessing pipeline from the configuration constants and run it.
from analysis.preprocess import PreprocessPipeline

pipeline = PreprocessPipeline(
    disease_col_name=DISEASE_COLNAME,
    datasets_path=DATASET_PATH,
)
pipeline.execute_pipeline()

INFO:root:Pipeline already executed, found dataset inside /tmp/chl
INFO:root:Splitting dataset
INFO:root:Pipeline executed


## Inspecting the dataset we have built

The dataset has shape $\text{rows} \times (n_{features} + 1)$, where the extra $+1$ column is the target $y$.

In [None]:
# Grab the dataset exposed by the pipeline; leaving it as the last expression
# of the cell makes the notebook render it.
dataset = pipeline.dataset
dataset

## Plot disease distribution

In [None]:
# Count the samples of each disease and plot the counts as a bar chart.
# The label column is looked up through DISEASE_COLNAME (the same constant the
# pipeline was configured with) instead of a second hard-coded 'DISEASE' literal.
disease = dataset[DISEASE_COLNAME].value_counts()
print(disease)
ax = disease.plot.bar()
# Label the figure so it can be read on its own; the trailing `;` hides the repr.
ax.set(title="Number of samples per disease", xlabel=DISEASE_COLNAME, ylabel="Number of samples");

## Experiments

In [None]:
import seaborn as sns

# Keep only the samples of one disease. The subset gets its own name instead of
# overwriting `dataset`, so cells executed afterwards still see every sample.
dataset_a1a = dataset[dataset[DISEASE_COLNAME] == 'A1A']
# Correlate the feature columns only: the label column holds strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr = dataset_a1a.drop(columns=[DISEASE_COLNAME]).corr()
sns.heatmap(corr)


In [None]:
import numpy as np

# Drop one feature out of every highly correlated pair: a column is discarded when
# any earlier column (in `corr` order) has a correlation >= CORR_THRESHOLD with it.
# Only positive correlations are considered, as in the original loop.
CORR_THRESHOLD = 0.9
# Strict upper triangle: entry (i, j) with i < j is True when the pair is too correlated.
too_correlated = np.triu(corr.to_numpy() >= CORR_THRESHOLD, k=1)
# True for the columns to keep (no earlier column is too correlated with them).
columns = ~too_correlated.any(axis=0)
# Index `corr.columns`, not `dataset.columns`: the mask has one entry per column of
# `corr`, while `dataset` also carries the non-numeric disease column, so using its
# columns would mismatch the mask length and misalign the labels.
selected_columns = corr.columns[columns]

In [None]:
## Selecting only the train and test data

The feature matrices of `train_set` and `test_set` have shape $R \times n_{features}$, where $R < \text{rows}$.

In [5]:
# Unpack the test split into its two parts and display the first one
# (the feature matrix shown in the output; y is presumably the target — TODO confirm).
x, y = pipeline.test_set
x

Unnamed: 0,SWAP70,SERTAD2,TAPT1,LYST,ZNF83,HSD17B10,COX7B,FN3KRP,FAM120A,ZNF155,...,IPO13,NUP188,DDX60,MLH3,IGFBP7,YAP1,EML4,POMT2,LTBP4,FIBP
330,0.212567,1.000000,0.000000,0.339730,0.000000,0.359792,0.202816,0.500993,0.384262,0.063050,...,0.229384,0.826634,1.000000,1.000000,0.016118,0.392841,0.105317,0.000000,1.000000,0.298693
312,0.145114,0.484298,0.261904,0.749082,1.000000,0.053916,0.073590,0.178453,0.000000,0.454354,...,0.460398,0.775799,1.000000,0.180764,0.620879,0.230877,0.815269,0.642125,0.825034,0.123821
97,0.157042,0.681858,0.472924,0.265096,0.482143,0.142436,0.336447,0.403672,0.643755,0.205084,...,0.324652,0.274891,0.095989,0.198475,0.277550,0.083364,0.421012,0.336790,0.354213,0.090360
23,0.306159,0.639392,0.768268,0.730576,0.178221,0.138684,0.812343,0.708128,0.515353,0.202960,...,0.000000,0.445399,0.057466,0.690620,0.652229,0.623564,0.353394,0.444441,0.417545,0.594054
44,0.730299,0.433469,0.786303,0.464067,0.439204,1.000000,1.000000,0.275885,1.000000,0.000000,...,0.180092,0.982381,0.607558,0.508344,0.550279,0.121410,0.866431,0.639267,0.435709,0.489652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,0.782049,0.694581,0.300556,0.032058,0.514782,0.289088,0.523735,0.077712,0.556361,0.184867,...,0.734491,0.543154,0.185832,0.983004,1.000000,0.443915,0.275828,0.532254,0.000000,0.632733
201,0.945956,0.323621,0.377974,0.574941,0.277886,0.061082,0.044363,0.273338,0.992451,0.000000,...,0.741231,0.620693,0.149020,0.104523,0.079473,0.070119,0.641081,0.000000,0.285432,0.053640
258,0.103774,0.079268,0.842520,0.272059,0.409357,0.682540,0.370588,1.000000,0.426471,0.683258,...,0.959459,1.000000,0.744094,0.210526,0.000000,0.982759,0.305344,0.817568,0.568905,0.437500
264,0.117925,0.378049,0.944882,0.492647,0.660819,0.507937,0.429412,0.396947,0.720588,0.814480,...,0.783784,0.478788,0.267717,0.458647,0.575630,0.840517,0.564885,0.851351,0.402827,0.000000


In [None]:
# Unpack the train split into its two parts.
# NOTE(review): presumably x1 is the feature matrix and x2 the target, mirroring
# the (x, y) pair unpacked from pipeline.test_set — confirm and consider renaming.
x1, x2 = pipeline.train_set

In [None]:
# Display the first element unpacked from pipeline.train_set
x1