# Testing dataset loading

This notebook contains code that runs the pipeline we have created to analyze a set of datasets.

In [1]:
# Put the parent directory (the project root) at the front of the import
# path so that the project's packages can be imported from this notebook.
import os
import sys

sys.path.insert(0, os.path.abspath(os.pardir))

In [2]:
# Root directory where the datasets are stored. The loader log further down
# suggests one sub-directory per disease (GS, MCM, NALD, ...), each holding
# one or more CSV files.
DATASET_PATH: str = "../../dataset/first_disease_sel/"
# Name of the disease label column of the dataset built by this notebook.
# NOTE(review): presumably must match the column created by the loading
# pipeline in analysis.dataset -- confirm.
DISEASE_COLNAME: str = 'DISEASE'

In [3]:
from analysis.dataset import load_datasets, compute_ds_col_intersection, clean_datasets, build_dataset, scale_minmax

## Load datasets

In [4]:
# Load the datasets stored under DATASET_PATH.
datasets = load_datasets(DATASET_PATH)
# Compute the intersection of the column names across the loaded datasets
# (presumably the genes present in all of them -- confirm in analysis.dataset).
colname_intersection = compute_ds_col_intersection(datasets)
# Clean the datasets using that column intersection. The name `datasets` is
# rebound here, so from this point on it refers to the cleaned result.
datasets = clean_datasets(datasets, colname_intersection)

INFO:root:Inspecting directory ../../dataset/first_disease_sel/GS
INFO:root:Setting disease as GS
INFO:root:Loading file ../../dataset/first_disease_sel/GS/eGSE117146m.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/MCM
INFO:root:Setting disease as MCM
INFO:root:Loading file ../../dataset/first_disease_sel/MCM/MCM_GSE149607.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/NALD
INFO:root:Setting disease as NALD
INFO:root:Loading file ../../dataset/first_disease_sel/NALD/eGSE85804m.csv
INFO:root:Loading file ../../dataset/first_disease_sel/NALD/eGSE34308m.csv
INFO:root:Loading file ../../dataset/first_disease_sel/NALD/eGSE117647m.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/A1A
INFO:root:Setting disease as A1A
INFO:root:Loading file ../../dataset/first_disease_sel/A1A/A1A_deficiency_GSE109516_pbmc.csv
INFO:root:Inspecting directory ../../dataset/first_disease_sel/DIABETE
INFO:root:Setting disease as DIABETE
INFO:root:Loading file

## Scaling

In [5]:
# Min-max scale the cleaned datasets. The return value is not captured, so the
# scaling presumably happens in place -- NOTE(review): confirm in analysis.dataset.
scale_minmax(datasets)

## Building the unique dataset

In [6]:
# Finally, build the single dataset from the scaled per-source datasets.
dataset = build_dataset(datasets)
# Bare last expression: render the resulting table as the cell output.
dataset

Unnamed: 0,DBF4,HDGF,FGGY,ZNF212,MATN2,EZH2,SP1,CIAO1,SPAG16,TIE1,...,KIF13A,RELA,CD320,SUMO3,S100PBP,TSPAN2,HDAC5,GTPBP1,EXOC6B,LRRK1
0,0.845741,0.262321,0.056685,0.642673,0.391867,0.159400,0.648183,0.641888,0.142962,0.767586,...,0.549536,0.416490,0.619608,0.444056,0.777795,0.073474,0.582168,0.134254,0.000000,0.527766
1,0.920240,0.586209,1.000000,0.066759,0.261315,0.000000,0.963466,0.476715,0.913618,0.608851,...,0.650507,0.396693,0.402936,0.558398,1.000000,0.697394,0.000000,0.507015,0.324098,0.588302
2,0.779848,0.339573,0.290705,0.673761,0.350022,0.626847,0.832938,0.914359,0.374409,0.731490,...,0.567397,0.362949,0.510207,0.727660,0.868839,0.847086,0.734903,0.480767,0.214531,0.657040
3,0.938717,0.456911,0.404222,0.604067,0.930749,0.381957,0.582604,1.000000,0.000000,0.649896,...,0.834061,0.456284,0.113139,0.846422,0.629524,0.344342,0.465653,0.445645,0.368990,0.854131
4,0.676631,0.758689,0.820812,0.721471,0.030534,0.052575,0.732706,0.386558,0.513853,0.831849,...,0.621713,0.531190,0.421745,0.892459,0.146682,0.862078,0.499185,0.443758,0.416868,0.462537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,0.709804,0.320942,0.884615,0.467742,0.281690,0.923611,0.864787,0.294118,0.631579,0.166667,...,0.576119,0.286530,0.166667,0.381853,0.901370,0.902527,0.235759,0.523902,0.734043,0.344037
395,0.427451,0.389503,0.275641,0.568548,0.535211,0.118056,0.166746,0.247549,0.263158,0.000000,...,0.096517,0.149543,0.371429,0.000000,0.386301,0.348977,0.197785,0.258537,0.361702,0.123853
396,0.298039,0.629534,0.205128,0.762097,0.845070,0.145833,0.210702,0.485294,0.438596,0.111111,...,0.000000,0.262557,0.338095,0.550095,0.460274,0.048135,0.314873,0.334634,0.382979,0.282110
397,0.074510,0.350270,0.230769,0.459677,1.000000,0.069444,0.350693,0.127451,0.280702,0.166667,...,0.109453,0.176941,0.138095,0.024575,0.126027,0.176895,0.177215,0.251707,0.478723,0.316514


## Inspecting the dataset we have built

In [None]:
# Show the disease label of every sample. The column name comes from the
# configuration constant rather than a repeated string literal, so the
# label column only has to be renamed in one place.
dataset[DISEASE_COLNAME]

In [None]:
# Count the number of samples for each disease to inspect the class balance.
# The column name comes from the configuration constant, not a string literal.
disease = dataset[DISEASE_COLNAME].value_counts()
print(disease)
# Label the figure so it can be read on its own; the trailing semicolon keeps
# the textual repr of the last call out of the cell output.
ax = disease.plot.bar(title="Samples per disease")
ax.set_xlabel(DISEASE_COLNAME)
ax.set_ylabel("Number of samples");

In [None]:
import matplotlib.pyplot as plt
from bioinfokit import analys, visuz
import numpy as np

In [None]:
# Heat map of the expression values only.
# NOTE(review): the label column appears to hold disease names such as 'GS'
# (per the loader log), which `astype(float)` cannot convert, so it is dropped
# before the cast -- confirm against build_dataset. `errors="ignore"` keeps
# the cell working if the label column is absent.
expression = dataset.drop(columns=[DISEASE_COLNAME], errors="ignore").astype(float)
# No row/column clustering: the samples and genes are drawn in dataset order.
visuz.gene_exp.hmap(df=expression, rowclus=False, colclus=False, dim=(10, 10), tickfont=(2, 4), show=True)