# Train-Test Split

---
## Generic Imports


In [1]:
import csv
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Metadata Files
# filepath_HCC = "DropSeq_ignore_data/HCC1806_SmartS_MetaData.tsv"
# filepath_MCF = "DropSeq_ignore_data/MCF7_SmartS_MetaData.tsv"
# df_meta_HCC = pd.read_csv(filepath_HCC,delimiter="\t",engine='python',index_col=0)
# df_meta_MCF = pd.read_csv(filepath_MCF,delimiter="\t",engine='python',index_col=0)

#Filtered Files
# The separator is the two-character regex "backslash + space" (matches one literal space);
# multi-character separators are treated as regular expressions by the python engine.
# It is written as a raw string so the backslash is not parsed as an (invalid) string
# escape, which emits a SyntaxWarning on recent Python versions. The pattern is unchanged.
SPACE_SEPARATOR = r"\ "
df_HCC_s_f = pd.read_csv("DropSeq_ignore_data/HCC1806_Filtered_Normalised_3000_Data_train.txt", delimiter=SPACE_SEPARATOR, engine='python', index_col=0)
df_MCF_s_f = pd.read_csv("DropSeq_ignore_data/MCF7_Filtered_Normalised_3000_Data_train.txt", delimiter=SPACE_SEPARATOR, engine='python', index_col=0)

#Transposition
# After transposing, each row is one cell (barcode_condition) and each column one gene,
# as shown by the preview below. Labels keep their literal double quotes.
df_HCC_F = df_HCC_s_f.T
df_MCF_F = df_MCF_s_f.T
df_HCC_F.head()

Unnamed: 0,"""H1-5""","""MALAT1""","""MT-RNR2""","""ARVCF""","""BCYRN1""","""ATXN7L2""","""IGFBP3""","""H1-3""","""CTIF""","""RNF123""",...,"""BATF3""","""CDKN3""","""DLD""","""PMPCA""","""ZNF165""","""SCCPDH""","""NTAN1""","""CLIP2""","""DUSP23""","""ZNF682"""
"""AAAAAACCCGGC_Normoxia""",2,3,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
"""AAAACCGGATGC_Normoxia""",2,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""AAAACGAGCTAG_Normoxia""",5,2,0,0,1,0,1,3,0,0,...,0,0,0,0,0,0,0,0,0,0
"""AAAACTTCCCCG_Normoxia""",1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""AAAAGCCTACCC_Normoxia""",0,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---
## Pipeline Definition and Application

I defined a generic pipeline to apply to the two datasets

If additional scalers need to be applied, import them in the section above and add the corresponding tuple to the `steps` list below.

In [3]:
# steps = [
#         ('scaler', StandardScaler()),
#         ('normalizer', MinMaxScaler())
#         ]
# pipeline = Pipeline(steps)

In [4]:
# df_HCC_F_N = pd.DataFrame(pipeline.fit_transform(df_HCC_F), columns=df_HCC_F.columns, index=df_HCC_F.index)
# df_MCF_F_N = pd.DataFrame(pipeline.fit_transform(df_MCF_F), columns=df_MCF_F.columns, index=df_MCF_F.index)
# df_HCC_F_N.head()

# The scaling pipeline above is commented out, so the transposed frames are
# passed on unchanged under the names used by the rest of the notebook.
df_HCC, df_MCF = df_HCC_F, df_MCF_F

---
## Main Genes Selection

I want to limit the dimension of the dataset by picking only 3000 genes for each dataset (same as she did in the train-test sets). To do so, I sort the genes by a criterion (for now, the number of cells in which each gene is expressed) and keep the first 3000 columns of the dataset.

In [5]:
# def MainCols(df,n):
#     #Given Dataframe df and number n, returns the first n columns of df in term of nonzero elements
#     non_zero_counts = pd.DataFrame((df != 0).sum(axis=0), columns = ["Counts"]).T
#     sorted_df = non_zero_counts.sort_values(by = 'Counts', axis = 1, ascending = False)
#     columns = (sorted_df.iloc[:,0:n]).columns
#     filtered = df.loc[:, df.columns.isin(columns)]
#     return filtered
    

In [6]:
# df_HCC = MainCols(df_HCC_F_N, 3000)
# df_MCF = MainCols(df_MCF_F_N, 3000)
# df_HCC.head()

---
## Split

I just use the function defined on page 49 of Lucibello's book.

In [7]:
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split the rows of ``data`` into a training and a test subset.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object to split; it must support ``len()`` and positional ``.iloc`` indexing.
    test_ratio : float
        Fraction of rows assigned to the test subset, between 0 and 1 inclusive.
        The test size is ``int(len(data) * test_ratio)`` (rounded down).
    random_state : int, numpy.random.Generator or None, optional
        Seed (or generator) for the shuffle. With ``None`` (the default) the
        global NumPy random state is used, so the behaviour of existing
        two-argument calls is unchanged and ``np.random.seed`` still applies.

    Returns
    -------
    tuple
        ``(train, test)``: two row subsets of ``data`` that are disjoint by
        position and together contain every row exactly once.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    # Out-of-range ratios would otherwise slice the permutation in surprising
    # ways (e.g. a negative ratio puts almost everything in the test set).
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [8]:
# Seed the global NumPy random state so that Restart & Run All always produces
# the same train/test partition (and therefore the same exported files).
SPLIT_SEED = 42
TEST_RATIO = 0.2
np.random.seed(SPLIT_SEED)

df_HCC_train, df_HCC_test = split_train_test(df_HCC, TEST_RATIO)
df_MCF_train, df_MCF_test = split_train_test(df_MCF, TEST_RATIO)

# Sanity check: every cell ends up in exactly one of the two subsets.
assert len(df_HCC_train) + len(df_HCC_test) == len(df_HCC)
assert len(df_MCF_train) + len(df_MCF_test) == len(df_MCF)

In [9]:
# Preview the first rows of the HCC1806 training split (rows are cells, columns are genes).
df_HCC_train.head()

Unnamed: 0,"""H1-5""","""MALAT1""","""MT-RNR2""","""ARVCF""","""BCYRN1""","""ATXN7L2""","""IGFBP3""","""H1-3""","""CTIF""","""RNF123""",...,"""BATF3""","""CDKN3""","""DLD""","""PMPCA""","""ZNF165""","""SCCPDH""","""NTAN1""","""CLIP2""","""DUSP23""","""ZNF682"""
"""GGTGCAGTAGCT_Normoxia""",5,1,0,0,1,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
"""CACGCTGCTTCC_Hypoxia""",1,5,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""TCCGCTTGGGAC_Hypoxia""",0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""CGCCCCGTTATC_Normoxia""",1,2,1,0,0,0,2,5,0,0,...,0,1,0,0,0,0,0,0,0,0
"""CAATGCGCGTGT_Hypoxia""",0,2,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---
## Labels

Create four DataFrames, each containing a single column: the condition label, encoded as 0 for hypoxia and 1 for normoxia.

In [10]:
# def Label(df, meta):
#     Names = [cell[1:-1] for cell in df.index]
#     Hypo = [meta.loc[i,"Condition"] for i in Names]
#     OH_Hypo = [0 if x=="Hypo" else 1 for x in Hypo]
#     result = pd.DataFrame(OH_Hypo, columns=["Condition"], index=df.index)
#     return result

In [11]:
# Y_HCC_train = Label(df_HCC_train, df_meta_HCC)
# Y_HCC_test = Label(df_HCC_test, df_meta_HCC)
# Y_MCF_train = Label(df_MCF_train, df_meta_MCF)
# Y_MCF_test = Label(df_MCF_test, df_meta_MCF)

---
---

In [12]:
# Write each split back to disk, transposed again so the files have the same
# orientation as the input files that were read and transposed above.
# NOTE(review): the file names mention "SmartS" and "Standardized_Normalised" although the
# inputs come from DropSeq_ignore_data and the scaling pipeline is commented out. The names
# are kept unchanged so existing consumers still find the files; confirm they are intended.
outputs = {
    "./processed_data(DropSeq)/HCC1806_SmartS_Filtered_Standardized_Normalised_3000_Data_train.txt": df_HCC_train,
    "./processed_data(DropSeq)/HCC1806_SmartS_Filtered_Standardized_Normalised_3000_Data_test.txt": df_HCC_test,
    "./processed_data(DropSeq)/MCF7_SmartS_Filtered_Standardized_Normalised_3000_Data_train.txt": df_MCF_train,
    "./processed_data(DropSeq)/MCF7_SmartS_Filtered_Standardized_Normalised_3000_Data_test.txt": df_MCF_test,
}

for output_path, frame in outputs.items():
    # to_csv does not create missing directories; make sure the target exists first.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # QUOTE_NONE: labels already carry their own literal double quotes, so none are added.
    frame.T.to_csv(output_path, sep=" ", quoting=csv.QUOTE_NONE)