# Train-Test Split

This section focuses on splitting the datasets into train and test sets, so that we can train our model on one part and evaluate it on the other.


## Additional Imports


In [1]:
#already in the main:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#to add:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Metadata files (tab-separated; the first column becomes the index)
filepath_HCC = "raw_data/HCC1806_SmartS_MetaData.tsv"
filepath_MCF = "raw_data/MCF7_SmartS_MetaData.tsv"

# already loaded in the main notebook:
df_meta_HCC = pd.read_csv(filepath_HCC, delimiter="\t", engine='python', index_col=0)
df_meta_MCF = pd.read_csv(filepath_MCF, delimiter="\t", engine='python', index_col=0)

# Filtered files
# The separator is the two-character regex `\ ` (an escaped space). It is written
# as a raw string because "\ " in a normal string literal is an invalid escape
# sequence, which recent Python versions warn about. The value passed to pandas
# is unchanged, so the parsed frames are identical.
# NOTE(review): the quote characters stay in the row/column labels (see the
# output below), presumably because a regex separator bypasses quote handling.
df_HCC_s_f = pd.read_csv("raw_data/HCC1806_SmartS_Filtered_Data.txt", delimiter=r"\ ", engine='python', index_col=0)
df_MCF_s_f = pd.read_csv("raw_data/MCF7_SmartS_Filtered_Data.txt", delimiter=r"\ ", engine='python', index_col=0)

# Transposition: after this step rows are cells and columns are genes
df_HCC_F = df_HCC_s_f.T
df_MCF_F = df_MCF_s_f.T
df_HCC_F.head()

Unnamed: 0,"""CICP27""","""DDX11L17""","""WASH9P""","""OR4F29""","""MTND1P23""","""MTND2P28""","""MTCO1P12""","""MTCO2P12""","""MTATP8P1""","""MTATP6P1""",...,"""MT-TH""","""MT-TS2""","""MT-TL2""","""MT-ND5""","""MT-ND6""","""MT-TE""","""MT-CYB""","""MT-TT""","""MT-TP""","""MAFIP"""
"""output.STAR.PCRPlate1A10_Normoxia_S123_Aligned.sortedByCoord.out.bam""",0,0,0,2,250,54,1740,6,1,974,...,17,5,15,3852,900,22,4208,26,66,0
"""output.STAR.PCRPlate1A12_Normoxia_S26_Aligned.sortedByCoord.out.bam""",0,0,0,0,424,100,2340,5,1,1642,...,49,29,36,7457,1439,43,6491,62,71,4
"""output.STAR.PCRPlate1A2_Hypoxia_S104_Aligned.sortedByCoord.out.bam""",0,0,0,0,63,16,1020,3,0,653,...,43,17,8,1479,234,0,4819,11,3,7
"""output.STAR.PCRPlate1A3_Hypoxia_S4_Aligned.sortedByCoord.out.bam""",0,0,0,0,27,3,303,0,0,91,...,0,0,3,303,33,0,310,4,9,0
"""output.STAR.PCRPlate1A4_Hypoxia_S8_Aligned.sortedByCoord.out.bam""",0,1,0,0,81,1,587,0,0,161,...,1,0,0,650,79,3,695,0,14,9


---
## Pipeline Definition and Application

We define a generic pipeline to apply to the two datasets: 

(((In case additional scalers need to be applied, add the import in the section above and add the corresponding tuple in the list below.)))

In [3]:
# Ordered (name, transformer) pairs: the 'scaler' step runs first, then the
# 'normalizer' step is applied to its output.
# NOTE(review): both steps are per-column affine rescalings, so the result looks
# identical to using MinMaxScaler alone — confirm the StandardScaler step is needed.
steps = [
    ('scaler', StandardScaler()),
    ('normalizer', MinMaxScaler()),
]
pipeline = Pipeline(steps=steps)

In [4]:
# Fit the pipeline on each dataset separately and wrap the returned array back
# into a DataFrame carrying the original row (cell) and column (gene) labels.
# `fit_transform` re-fits the same `pipeline` object on every call, so after
# this cell it holds the parameters fitted on the MCF data.
# NOTE(review): the scalers are fit on the full datasets, before the train/test
# split performed later in this notebook, so test rows influence the scaling of
# the training features. Consider fitting on the training rows only.
df_HCC_F_N = pd.DataFrame(
    data=pipeline.fit_transform(df_HCC_F),
    index=df_HCC_F.index,
    columns=df_HCC_F.columns,
)
df_MCF_F_N = pd.DataFrame(
    data=pipeline.fit_transform(df_MCF_F),
    index=df_MCF_F.index,
    columns=df_MCF_F.columns,
)
df_HCC_F_N.head()

Unnamed: 0,"""CICP27""","""DDX11L17""","""WASH9P""","""OR4F29""","""MTND1P23""","""MTND2P28""","""MTCO1P12""","""MTCO2P12""","""MTATP8P1""","""MTATP6P1""",...,"""MT-TH""","""MT-TS2""","""MT-TL2""","""MT-ND5""","""MT-ND6""","""MT-TE""","""MT-CYB""","""MT-TT""","""MT-TP""","""MAFIP"""
"""output.STAR.PCRPlate1A10_Normoxia_S123_Aligned.sortedByCoord.out.bam""",0.0,0.0,0.0,0.5,0.359307,0.45,0.486813,0.5,0.25,0.412065,...,0.326923,0.116279,0.263158,0.428189,0.624128,0.103774,0.365044,0.168831,0.228374,0.0
"""output.STAR.PCRPlate1A12_Normoxia_S26_Aligned.sortedByCoord.out.bam""",0.0,0.0,0.0,0.0,0.61039,0.833333,0.655163,0.416667,0.25,0.695837,...,0.942308,0.674419,0.631579,0.830802,1.0,0.20283,0.56708,0.402597,0.245675,0.166667
"""output.STAR.PCRPlate1A2_Hypoxia_S104_Aligned.sortedByCoord.out.bam""",0.0,0.0,0.0,0.0,0.089466,0.133333,0.284792,0.25,0.0,0.275701,...,0.826923,0.395349,0.140351,0.163167,0.159693,0.0,0.419115,0.071429,0.010381,0.291667
"""output.STAR.PCRPlate1A3_Hypoxia_S4_Aligned.sortedByCoord.out.bam""",0.0,0.0,0.0,0.0,0.037518,0.025,0.083614,0.0,0.0,0.036958,...,0.0,0.0,0.052632,0.031829,0.019526,0.0,0.020088,0.025974,0.031142,0.0
"""output.STAR.PCRPlate1A4_Hypoxia_S8_Aligned.sortedByCoord.out.bam""",0.0,0.083333,0.0,0.0,0.11544,0.008333,0.1633,0.0,0.0,0.066695,...,0.019231,0.0,0.0,0.070583,0.051604,0.014151,0.054159,0.0,0.048443,0.375


---
## Main Genes Selection

In order to limit the dimensionality of the dataset, we keep only 3000 genes for each dataset. The genes are ranked according to a criterion (for now, the number of cells in which each gene is expressed; **TODO**: confirm this choice), and the 3000 top-ranked genes are kept.

In [5]:
def MainCols(df, n):
    """Keep the ``n`` columns of ``df`` that have the most non-zero entries.

    Columns are ranked by how many of their entries differ from 0 (NaN
    compares unequal to 0 and therefore counts as non-zero). Ties are broken
    in favour of the column that appears first in ``df``. The selected columns
    are returned in their original left-to-right order, not in ranking order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    n : int
        Number of columns to keep. If ``n`` exceeds the number of columns,
        every column is returned.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``df`` and at most ``n`` columns.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Per-column count of entries different from zero.
    non_zero_counts = (df != 0).sum(axis=0).to_numpy()

    # A stable ascending sort of the negated counts gives a descending ranking
    # in which equal counts keep their original column order.
    ranking = np.argsort(-non_zero_counts, kind="stable")

    # Select by position rather than by label, so that duplicated column labels
    # cannot pull in extra columns. Sorting restores the original column order.
    keep = np.sort(ranking[:n])
    return df.iloc[:, keep]
    

In [6]:
N_GENES = 3000  # number of genes kept per dataset

# Rank the genes on the raw (unscaled) counts, then apply that selection to the
# scaled frames. After min-max scaling each gene's minimum is mapped to 0, so a
# non-zero count taken on the scaled data would measure "cells above the gene's
# minimum" instead of "cells in which the gene is expressed".
hcc_genes = MainCols(df_HCC_F, N_GENES).columns
mcf_genes = MainCols(df_MCF_F, N_GENES).columns

# The boolean mask keeps the original column order of the scaled frames.
df_HCC = df_HCC_F_N.loc[:, df_HCC_F_N.columns.isin(hcc_genes)]
df_MCF = df_MCF_F_N.loc[:, df_MCF_F_N.columns.isin(mcf_genes)]
df_HCC.head()

Unnamed: 0,"""MTND1P23""","""MTND2P28""","""MTCO1P12""","""MTATP6P1""","""MTCO3P12""","""ISG15""","""SDF4""","""UBE2J2""","""PUSL1""","""INTS11""",...,"""MT-ATP8""","""MT-ATP6""","""MT-CO3""","""MT-ND3""","""MT-ND4L""","""MT-ND4""","""MT-ND5""","""MT-ND6""","""MT-CYB""","""MT-TP"""
"""output.STAR.PCRPlate1A10_Normoxia_S123_Aligned.sortedByCoord.out.bam""",0.359307,0.45,0.486813,0.412065,0.345992,0.044321,0.118367,0.183635,0.00277,0.08377,...,0.545094,0.431119,0.341359,0.455,0.787973,0.85185,0.428189,0.624128,0.365044,0.228374
"""output.STAR.PCRPlate1A12_Normoxia_S26_Aligned.sortedByCoord.out.bam""",0.61039,0.833333,0.655163,0.695837,0.763713,0.155298,0.298776,0.282588,0.144044,0.352531,...,0.862167,0.734575,0.634929,0.29,0.556423,0.750157,0.830802,1.0,0.56708,0.245675
"""output.STAR.PCRPlate1A2_Hypoxia_S104_Aligned.sortedByCoord.out.bam""",0.089466,0.133333,0.284792,0.275701,0.295359,0.159107,0.347755,0.813511,0.024931,0.513089,...,0.295519,0.317658,0.253352,0.0,0.139269,0.314246,0.163167,0.159693,0.419115,0.010381
"""output.STAR.PCRPlate1A3_Hypoxia_S4_Aligned.sortedByCoord.out.bam""",0.037518,0.025,0.083614,0.036958,0.042194,0.021814,0.083265,0.056137,0.078947,0.055846,...,0.014748,0.033612,0.030461,0.025,0.035533,0.046022,0.031829,0.019526,0.020088,0.031142
"""output.STAR.PCRPlate1A4_Hypoxia_S8_Aligned.sortedByCoord.out.bam""",0.11544,0.008333,0.1633,0.066695,0.075949,0.042763,0.245714,0.016175,0.076177,0.152705,...,0.03063,0.072905,0.062107,0.06,0.072237,0.081721,0.070583,0.051604,0.054159,0.048443


---
## Split

This function splits the datasets into train and test, according to a ratio that we set.

(((The function below is the one defined on page 49 of Lucibello's book.)))

In [7]:
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split the rows of ``data`` into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose rows are shuffled and split.
    test_ratio : float
        Fraction of rows assigned to the test set, between 0 and 1 inclusive.
        The test set size is ``int(len(data) * test_ratio)`` (rounded down).
    random_state : int or None, optional
        Seed for a dedicated random generator, giving a reproducible split.
        With the default ``None`` the permutation is drawn from NumPy's global
        random state, exactly as before, so ``np.random.seed`` still controls it.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``; together they contain every row of ``data`` once.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio}")

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [8]:
SPLIT_SEED = 42   # fixed seed so that Restart & Run All reproduces the same split
TEST_RATIO = 0.2  # fraction of cells held out for testing

# split_train_test draws its permutation from NumPy's global random state,
# so seeding it here makes both splits deterministic.
# NOTE(review): the split is purely random, not stratified by condition.
np.random.seed(SPLIT_SEED)
df_HCC_train, df_HCC_test = split_train_test(df_HCC, TEST_RATIO)
df_MCF_train, df_MCF_test = split_train_test(df_MCF, TEST_RATIO)

In [9]:
# Preview the first rows of the HCC training split (rows are in shuffled order).
df_HCC_train.head()

Unnamed: 0,"""MTND1P23""","""MTND2P28""","""MTCO1P12""","""MTATP6P1""","""MTCO3P12""","""ISG15""","""SDF4""","""UBE2J2""","""PUSL1""","""INTS11""",...,"""MT-ATP8""","""MT-ATP6""","""MT-CO3""","""MT-ND3""","""MT-ND4L""","""MT-ND4""","""MT-ND5""","""MT-ND6""","""MT-CYB""","""MT-TP"""
"""output.STAR.PCRPlate4C6_Hypoxia_S231_Aligned.sortedByCoord.out.bam""",0.102453,0.466667,0.493827,0.376381,0.362869,0.238054,0.314286,0.099905,0.108033,0.17103,...,0.458877,0.366893,0.388509,0.41,0.328778,0.395919,0.482689,0.559275,0.409115,0.131488
"""output.STAR.PCRPlate3E12_Normoxia_S217_Aligned.sortedByCoord.out.bam""",0.111111,0.058333,0.0867,0.06627,0.046414,0.021988,0.057959,0.056137,0.037396,0.120419,...,0.066364,0.084583,0.066119,0.02,0.130678,0.09547,0.102189,0.079498,0.07469,0.107266
"""output.STAR.PCRPlate2A8_Normoxia_S151_Aligned.sortedByCoord.out.bam""",0.124098,0.091667,0.110269,0.083263,0.07173,0.021641,0.119184,0.093245,0.124654,0.070681,...,0.073738,0.071327,0.057091,0.085,0.099701,0.103719,0.106656,0.122036,0.042124,0.083045
"""output.STAR.PCRPlate3C3_Hypoxia_S71_Aligned.sortedByCoord.out.bam""",0.085137,0.091667,0.106341,0.053101,0.059072,0.008137,0.12898,0.060894,0.166205,0.199825,...,0.038003,0.057283,0.053899,0.045,0.076793,0.068503,0.089122,0.071827,0.044779,0.044983
"""output.STAR.PCRPlate4A7_Normoxia_S201_Aligned.sortedByCoord.out.bam""",0.034632,0.083333,0.012626,0.02294,0.004219,0.054017,0.083265,0.26451,0.210526,0.192845,...,0.014748,0.020672,0.012221,0.03,0.0328,0.037001,0.048582,0.048117,0.010442,0.00346


---
## Labels

Create four dataframes, each containing a single column: the condition label, encoded as 0 for hypoxia and 1 for normoxia.

In [10]:
def Label(df, meta):
    """Build a one-column frame of binary condition labels for the rows of ``df``.

    Each index label of ``df`` has its first and last character removed
    (presumably the surrounding quote characters — verify against the data)
    and is then looked up in the "Condition" column of ``meta``. The label is
    1 when that value equals "Normo" and 0 for any other value.

    Returns a DataFrame with a single "Condition" column and the same index
    as ``df``.
    """
    encoded = []
    for cell in df.index:
        condition = meta.loc[cell[1:-1], "Condition"]
        if condition == "Normo":
            encoded.append(1)
        else:
            encoded.append(0)
    return pd.DataFrame(encoded, index=df.index, columns=["Condition"])

In [11]:
# Label frames for each split, built from the metadata table of the matching cell line.
Y_HCC_train, Y_HCC_test = (
    Label(split, df_meta_HCC) for split in (df_HCC_train, df_HCC_test)
)
Y_MCF_train, Y_MCF_test = (
    Label(split, df_meta_MCF) for split in (df_MCF_train, df_MCF_test)
)

---
---

In [12]:
# import csv
# display(df_HCC_train.T)
# display(df_HCC_test.T)
# display(df_MCF_train.T)
# display(df_MCF_test.T)
# df_HCC_train.T.to_csv("./processed_data/HCC1806_SmartS_Filtered_Standardized-Normalised_3000_Data_train.txt", sep=" ", quoting=csv.QUOTE_NONE)
# df_HCC_test.T.to_csv("./processed_data/HCC1806_SmartS_Filtered_Standardized-Normalised_3000_Data_test.txt", sep=" ", quoting=csv.QUOTE_NONE)
# df_MCF_train.T.to_csv("./processed_data/MCF7_SmartS_Filtered_Standardized-Normalised_3000_Data_train.txt", sep=" ", quoting=csv.QUOTE_NONE)
# df_MCF_test.T.to_csv("./processed_data/MCF7_SmartS_Filtered_Standardized-Normalised_3000_Data_test.txt", sep=" ", quoting=csv.QUOTE_NONE)