In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

---

# LIHC

In [2]:
# Read TCGA_LIHC_gene_expression_data.csv (relative path, resolved against the
# kernel's working directory) into a DataFrame.
lihc_df = pd.read_csv(filepath_or_buffer = "TCGA_LIHC_gene_expression_data.csv")

In [3]:
# Show the loaded frame as this cell's output (pandas abbreviates rows/columns with "...").
lihc_df

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,-0.4579,-0.2749,0.2537,0.3082,0.0792,0.3080,0.7932,0.0252,-0.3093,0.7072,...,1.1714,0.8965,0.3385,0.3242,-0.2765,-0.8901,0.7811,1,1,23.78
1,0.2590,-0.6611,-1.5083,1.2439,-2.0270,-0.6591,0.3035,0.0952,1.1039,0.8076,...,0.6240,0.5703,0.5101,-0.7796,0.6530,-0.0861,-0.0875,1,1,53.35
2,-0.7977,1.4223,0.9841,-0.9023,-2.0270,-0.5989,0.1142,0.3536,0.5283,0.5764,...,0.4081,-0.0825,0.0121,-1.2220,-0.5053,1.0836,-0.1570,0,0,63.70
3,0.4761,-1.3221,-1.9983,1.4541,-2.0270,0.8220,0.3332,1.0881,0.0164,1.3414,...,0.7449,0.8242,-2.4195,-1.5752,0.4504,0.6772,-0.2676,0,1,83.18
4,0.3577,0.0060,0.0038,-0.6596,-2.0270,-0.6835,-0.4084,-0.7568,-0.4736,0.3716,...,-0.5116,-0.9817,0.4782,0.0868,0.9322,-0.8486,1.5964,1,1,41.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,-0.6873,0.4198,-0.1862,-0.4012,-2.0270,-0.3729,0.0078,-1.1184,-0.4914,0.4142,...,-0.9760,-0.3673,-0.0690,-0.7032,-0.4128,-0.8771,0.3031,1,1,45.53
368,-0.6714,-0.6625,-1.1085,0.7950,-2.0270,1.1797,0.2652,0.1413,-0.1388,1.1840,...,-1.3322,0.8841,-0.0415,0.0540,-1.2380,0.0221,0.3087,0,0,40.77
369,0.5088,-0.3136,-0.8363,1.7350,-2.0270,-1.2987,0.2478,0.8356,1.6430,1.6705,...,-0.8610,0.3497,0.9416,-0.0258,1.7398,-0.0156,-1.8457,1,0,79.24
370,0.5742,-0.1260,-1.9095,1.5389,1.1931,-0.4865,1.0476,1.3813,2.0297,1.1241,...,1.4361,1.5646,1.1577,-0.1308,0.5543,0.9046,-2.9768,1,0,79.24


In [4]:
# Column-wise partition of the loaded frame: every column except the last three
# goes to `data`; the last three columns (Sex, OS_STATUS, OS_MONTHS in the preview)
# go to `label`.
# NOTE(review): `data` therefore still contains the leading "Unnamed: 0" column shown
# in the preview - presumably a saved row index; confirm it should be kept as a feature.
data, label = lihc_df.iloc[:, :-3], lihc_df.iloc[:, -3:]

In [5]:
# Cohort tag; the split loop interpolates it into the output CSV file names
# (TCGA_{data_type}_Train/Valid/Test_{experiment}.csv).
data_type = "LIHC"

In [6]:
# Generate 10 stratified train / validation / test partitions of `data` / `label`
# (both defined in an earlier cell) and write every partition to CSV.
#
# For each repetition:
#   * 20% of the rows are held out as the test set, stratified on label["Sex"];
#   * 20% of the remaining rows become the validation set, stratified the same way;
#   * a StandardScaler is fitted on the training features only and then applied to
#     all three partitions;
#   * the scaled features (columns named 0..n-1, because the scaler returns a plain
#     array) are joined column-wise with the label columns and saved as
#     TCGA_{data_type}_{Train|Valid|Test}_{experiment}.csv in the working directory.
split_seed = 42  # fixed seed: without it every re-run silently produced different files
outer_splitter = StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = split_seed)
for experiment, (train, test) in enumerate(outer_splitter.split(data, label["Sex"]), start = 1):
    # `train` / `test` are positional row numbers, so rows are selected with .iloc;
    # .loc only gives the same rows while the index is the default 0..n-1 range.
    trainData, validData, trainLabel, validLabel = train_test_split(
        data.iloc[train],
        label.iloc[train],
        test_size = 0.2,
        shuffle = True,
        stratify = label["Sex"].iloc[train],
        random_state = split_seed + experiment,  # reproducible, but distinct per repetition
    )
    testData = data.iloc[test]
    testLabel = label.iloc[test]
    print("#######################  %d experiment  #######################\n" % experiment)
    ##################################################################################################
    # Scaling statistics come from the training partition only.
    data_scaler = StandardScaler()
    data_scaler.fit(trainData)
    scaled_tcga_train_data = data_scaler.transform(trainData)
    scaled_tcga_valid_data = data_scaler.transform(validData)
    scaled_tcga_test_data = data_scaler.transform(testData)
    ##################################################################################################
    ### entire data
    scaled_tcga_train_df = pd.DataFrame(scaled_tcga_train_data)
    scaled_tcga_valid_df = pd.DataFrame(scaled_tcga_valid_data)
    scaled_tcga_test_df = pd.DataFrame(scaled_tcga_test_data)
    ### entire label (index reset to 0..n-1 so it lines up row-for-row with the scaled frame)
    reindexed_tcga_train_label = trainLabel.reset_index(drop = True)
    reindexed_tcga_valid_label = validLabel.reset_index(drop = True)
    reindexed_tcga_test_label = testLabel.reset_index(drop = True)
    ### concat entire data & label
    tcga_train_df = pd.concat([scaled_tcga_train_df, reindexed_tcga_train_label], axis = 1)
    tcga_valid_df = pd.concat([scaled_tcga_valid_df, reindexed_tcga_valid_label], axis = 1)
    tcga_test_df = pd.concat([scaled_tcga_test_df, reindexed_tcga_test_label], axis = 1)
    ##################################################################################################
    tcga_train_df.to_csv(f"TCGA_{data_type}_Train_{experiment}.csv", index = False, header = True)
    tcga_valid_df.to_csv(f"TCGA_{data_type}_Valid_{experiment}.csv", index = False, header = True)
    tcga_test_df.to_csv(f"TCGA_{data_type}_Test_{experiment}.csv", index = False, header = True)

#######################  1 experiment  #######################

#######################  2 experiment  #######################

#######################  3 experiment  #######################

#######################  4 experiment  #######################

#######################  5 experiment  #######################

#######################  6 experiment  #######################

#######################  7 experiment  #######################

#######################  8 experiment  #######################

#######################  9 experiment  #######################

#######################  10 experiment  #######################



---

# STAD

In [9]:
# Read TCGA_STAD_gene_expression_data.csv (relative path, resolved against the
# kernel's working directory) into a DataFrame.
stad_df = pd.read_csv(filepath_or_buffer = "TCGA_STAD_gene_expression_data.csv")

In [10]:
# Show the loaded frame as this cell's output (pandas abbreviates rows/columns with "...").
stad_df

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,0.5935,-0.7905,0.1913,0.7205,-0.9637,0.9801,-1.7089,-2.2510,-0.5928,-2.2211,...,0.5274,-1.0740,-0.6108,-0.2185,0.0939,2.7476,-0.7957,1,0,57.98
1,-2.5289,-0.4229,-0.7856,0.1786,-1.0378,-0.2856,-0.8256,-1.3174,-0.2596,-0.6966,...,-0.2306,-0.3609,0.6905,1.8023,0.3047,-0.8814,-0.0503,0,0,26.68
2,-0.9824,-1.2408,1.1243,-0.2285,1.3105,-0.1549,-2.0963,-1.4500,-2.0049,0.1831,...,-1.2383,-0.8456,-1.8184,1.3100,-0.9642,-1.2401,0.4228,1,0,11.70
3,0.0122,0.3973,2.1612,-1.3275,-1.6946,0.8537,-1.1514,1.2557,-0.8907,0.1589,...,0.4294,0.4894,-0.0760,0.4733,0.4611,-0.6828,-0.4783,1,0,19.55
4,1.0339,-0.7171,-0.0049,0.2223,-1.6946,-0.2764,-0.4209,-1.7160,-0.2034,0.6382,...,-1.4298,-1.1947,1.3031,-0.6918,-0.4172,-0.0998,0.1977,1,0,11.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,0.0332,-0.3452,0.1883,0.6936,-0.4248,-0.1127,0.5221,0.1307,0.7730,-0.9549,...,-0.2239,1.0356,0.4915,0.2803,-0.8648,-1.9597,-0.8232,1,1,16.13
403,0.4787,0.0216,0.3006,0.7963,0.6076,-0.1100,-0.4227,0.0208,1.3625,0.5721,...,-0.9005,-1.1860,-1.5501,0.3959,-0.4625,-0.7030,-0.4662,1,0,27.53
404,-0.5134,0.7968,0.2220,0.5102,-0.3330,-0.3603,-1.0079,0.7964,0.3475,0.0812,...,-0.3561,-1.2356,-1.0319,1.8559,0.5439,0.2765,-0.5852,1,1,12.42
405,0.0491,1.4773,-0.5391,-0.0735,-0.2259,0.1212,-1.1962,0.0108,0.7556,0.6400,...,1.2759,0.4438,1.1157,-0.4324,0.7854,0.3452,1.0530,1,0,17.25


In [11]:
# Column-wise partition of the loaded frame: every column except the last three
# goes to `data`; the last three columns (Sex, OS_STATUS, OS_MONTHS in the preview)
# go to `label`.
# NOTE(review): `data` therefore still contains the leading "Unnamed: 0" column shown
# in the preview - presumably a saved row index; confirm it should be kept as a feature.
data, label = stad_df.iloc[:, :-3], stad_df.iloc[:, -3:]

In [12]:
# Cohort tag; the split loop interpolates it into the output CSV file names
# (TCGA_{data_type}_Train/Valid/Test_{experiment}.csv).
data_type = "STAD"

In [13]:
# Generate 10 stratified train / validation / test partitions of `data` / `label`
# (both defined in an earlier cell) and write every partition to CSV.
#
# For each repetition:
#   * 20% of the rows are held out as the test set, stratified on label["Sex"];
#   * 20% of the remaining rows become the validation set, stratified the same way;
#   * a StandardScaler is fitted on the training features only and then applied to
#     all three partitions;
#   * the scaled features (columns named 0..n-1, because the scaler returns a plain
#     array) are joined column-wise with the label columns and saved as
#     TCGA_{data_type}_{Train|Valid|Test}_{experiment}.csv in the working directory.
split_seed = 42  # fixed seed: without it every re-run silently produced different files
outer_splitter = StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = split_seed)
for experiment, (train, test) in enumerate(outer_splitter.split(data, label["Sex"]), start = 1):
    # `train` / `test` are positional row numbers, so rows are selected with .iloc;
    # .loc only gives the same rows while the index is the default 0..n-1 range.
    trainData, validData, trainLabel, validLabel = train_test_split(
        data.iloc[train],
        label.iloc[train],
        test_size = 0.2,
        shuffle = True,
        stratify = label["Sex"].iloc[train],
        random_state = split_seed + experiment,  # reproducible, but distinct per repetition
    )
    testData = data.iloc[test]
    testLabel = label.iloc[test]
    print("#######################  %d experiment  #######################\n" % experiment)
    ##################################################################################################
    # Scaling statistics come from the training partition only.
    data_scaler = StandardScaler()
    data_scaler.fit(trainData)
    scaled_tcga_train_data = data_scaler.transform(trainData)
    scaled_tcga_valid_data = data_scaler.transform(validData)
    scaled_tcga_test_data = data_scaler.transform(testData)
    ##################################################################################################
    ### entire data
    scaled_tcga_train_df = pd.DataFrame(scaled_tcga_train_data)
    scaled_tcga_valid_df = pd.DataFrame(scaled_tcga_valid_data)
    scaled_tcga_test_df = pd.DataFrame(scaled_tcga_test_data)
    ### entire label (index reset to 0..n-1 so it lines up row-for-row with the scaled frame)
    reindexed_tcga_train_label = trainLabel.reset_index(drop = True)
    reindexed_tcga_valid_label = validLabel.reset_index(drop = True)
    reindexed_tcga_test_label = testLabel.reset_index(drop = True)
    ### concat entire data & label
    tcga_train_df = pd.concat([scaled_tcga_train_df, reindexed_tcga_train_label], axis = 1)
    tcga_valid_df = pd.concat([scaled_tcga_valid_df, reindexed_tcga_valid_label], axis = 1)
    tcga_test_df = pd.concat([scaled_tcga_test_df, reindexed_tcga_test_label], axis = 1)
    ##################################################################################################
    tcga_train_df.to_csv(f"TCGA_{data_type}_Train_{experiment}.csv", index = False, header = True)
    tcga_valid_df.to_csv(f"TCGA_{data_type}_Valid_{experiment}.csv", index = False, header = True)
    tcga_test_df.to_csv(f"TCGA_{data_type}_Test_{experiment}.csv", index = False, header = True)

#######################  1 experiment  #######################

#######################  2 experiment  #######################

#######################  3 experiment  #######################

#######################  4 experiment  #######################

#######################  5 experiment  #######################

#######################  6 experiment  #######################

#######################  7 experiment  #######################

#######################  8 experiment  #######################

#######################  9 experiment  #######################

#######################  10 experiment  #######################



---

# LUAD

In [14]:
# Read TCGA_LUAD_gene_expression_data.csv (relative path, resolved against the
# kernel's working directory) into a DataFrame.
luad_df = pd.read_csv(filepath_or_buffer = "TCGA_LUAD_gene_expression_data.csv")

In [15]:
# Show the loaded frame as this cell's output (pandas abbreviates rows/columns with "...").
luad_df

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,-1.9057,-0.7328,-0.0599,-0.4275,-1.9441,0.3717,-1.9669,1.2450,-0.4371,0.5023,...,0.4099,0.0758,0.7671,1.9043,0.2518,1.0164,-0.4967,1,0,0.00
1,-0.2950,0.4726,0.4150,0.9225,-1.1738,-0.9029,0.3265,-0.0974,-0.0565,0.6207,...,0.6071,0.6687,0.0146,-1.1770,0.8868,0.7726,-0.8791,1,0,50.03
2,-1.9091,-0.0511,-0.1980,0.0388,-0.5402,-0.2299,0.0953,-0.0171,-0.8013,-1.2674,...,-0.5641,-0.6107,0.7475,0.4486,-0.1294,-0.7576,0.8248,0,1,3.98
3,-0.5333,0.3503,0.5821,-0.5648,-0.4509,-0.2470,-0.4774,-0.1876,-0.6723,-0.1605,...,-0.2524,0.4919,-1.4625,-0.8091,0.4065,0.5852,1.2704,1,0,19.94
4,-0.8895,1.2646,-0.3292,0.2626,-1.9441,-0.2941,0.3080,0.1383,1.1152,1.0806,...,1.4413,0.2123,-1.7023,0.6045,0.8609,0.2940,-0.1648,1,0,13.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,-0.0218,0.2995,-0.4215,-0.4627,-1.9441,0.2866,-0.6996,-0.0004,0.1431,-0.4726,...,-0.1856,1.0863,-1.7023,-0.8183,-1.2910,1.2719,0.2930,0,0,0.43
504,-0.8099,0.5823,2.7798,-0.0093,-0.0856,0.8445,-0.3696,0.7827,-0.1377,2.6733,...,-0.4354,0.5556,-0.2406,-0.2837,0.5286,0.9093,-0.3319,1,0,19.81
505,-0.4522,-1.7762,-0.7533,1.4659,-0.1499,0.4272,0.0790,-1.2886,1.3519,0.4950,...,-0.3309,-0.4957,-1.3037,0.8470,-1.2156,0.6139,-1.2127,1,0,20.27
506,-1.3473,0.4958,0.5675,0.9068,2.6611,0.4739,-0.6699,0.4860,0.9761,3.1634,...,0.7868,0.4064,-1.4625,1.3708,-0.6834,-0.2513,0.4326,0,1,59.07


In [16]:
# Column-wise partition of the loaded frame: every column except the last three
# goes to `data`; the last three columns (Sex, OS_STATUS, OS_MONTHS in the preview)
# go to `label`.
# NOTE(review): `data` therefore still contains the leading "Unnamed: 0" column shown
# in the preview - presumably a saved row index; confirm it should be kept as a feature.
data, label = luad_df.iloc[:, :-3], luad_df.iloc[:, -3:]

In [17]:
# Cohort tag; the split loop interpolates it into the output CSV file names
# (TCGA_{data_type}_Train/Valid/Test_{experiment}.csv).
data_type = "LUAD"

In [18]:
# Generate 10 stratified train / validation / test partitions of `data` / `label`
# (both defined in an earlier cell) and write every partition to CSV.
#
# For each repetition:
#   * 20% of the rows are held out as the test set, stratified on label["Sex"];
#   * 20% of the remaining rows become the validation set, stratified the same way;
#   * a StandardScaler is fitted on the training features only and then applied to
#     all three partitions;
#   * the scaled features (columns named 0..n-1, because the scaler returns a plain
#     array) are joined column-wise with the label columns and saved as
#     TCGA_{data_type}_{Train|Valid|Test}_{experiment}.csv in the working directory.
split_seed = 42  # fixed seed: without it every re-run silently produced different files
outer_splitter = StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = split_seed)
for experiment, (train, test) in enumerate(outer_splitter.split(data, label["Sex"]), start = 1):
    # `train` / `test` are positional row numbers, so rows are selected with .iloc;
    # .loc only gives the same rows while the index is the default 0..n-1 range.
    trainData, validData, trainLabel, validLabel = train_test_split(
        data.iloc[train],
        label.iloc[train],
        test_size = 0.2,
        shuffle = True,
        stratify = label["Sex"].iloc[train],
        random_state = split_seed + experiment,  # reproducible, but distinct per repetition
    )
    testData = data.iloc[test]
    testLabel = label.iloc[test]
    print("#######################  %d experiment  #######################\n" % experiment)
    ##################################################################################################
    # Scaling statistics come from the training partition only.
    data_scaler = StandardScaler()
    data_scaler.fit(trainData)
    scaled_tcga_train_data = data_scaler.transform(trainData)
    scaled_tcga_valid_data = data_scaler.transform(validData)
    scaled_tcga_test_data = data_scaler.transform(testData)
    ##################################################################################################
    ### entire data
    scaled_tcga_train_df = pd.DataFrame(scaled_tcga_train_data)
    scaled_tcga_valid_df = pd.DataFrame(scaled_tcga_valid_data)
    scaled_tcga_test_df = pd.DataFrame(scaled_tcga_test_data)
    ### entire label (index reset to 0..n-1 so it lines up row-for-row with the scaled frame)
    reindexed_tcga_train_label = trainLabel.reset_index(drop = True)
    reindexed_tcga_valid_label = validLabel.reset_index(drop = True)
    reindexed_tcga_test_label = testLabel.reset_index(drop = True)
    ### concat entire data & label
    tcga_train_df = pd.concat([scaled_tcga_train_df, reindexed_tcga_train_label], axis = 1)
    tcga_valid_df = pd.concat([scaled_tcga_valid_df, reindexed_tcga_valid_label], axis = 1)
    tcga_test_df = pd.concat([scaled_tcga_test_df, reindexed_tcga_test_label], axis = 1)
    ##################################################################################################
    tcga_train_df.to_csv(f"TCGA_{data_type}_Train_{experiment}.csv", index = False, header = True)
    tcga_valid_df.to_csv(f"TCGA_{data_type}_Valid_{experiment}.csv", index = False, header = True)
    tcga_test_df.to_csv(f"TCGA_{data_type}_Test_{experiment}.csv", index = False, header = True)

#######################  1 experiment  #######################

#######################  2 experiment  #######################

#######################  3 experiment  #######################

#######################  4 experiment  #######################

#######################  5 experiment  #######################

#######################  6 experiment  #######################

#######################  7 experiment  #######################

#######################  8 experiment  #######################

#######################  9 experiment  #######################

#######################  10 experiment  #######################



---

# LUSC

In [19]:
# Read TCGA_LUSC_gene_expression_data.csv (relative path, resolved against the
# kernel's working directory) into a DataFrame.
lusc_df = pd.read_csv(filepath_or_buffer = "TCGA_LUSC_gene_expression_data.csv")

In [20]:
# Show the loaded frame as this cell's output (pandas abbreviates rows/columns with "...").
lusc_df

Unnamed: 0,HMGB1P1,A2M,AACS,AADAT,AANAT,AARS2,AASDHPPT,AASDH,AASS,ABAT,...,RBSN,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,1.4899,0.2806,1.2640,-1.1910,-1.9130,-3.1176,1.5903,2.0014,-0.3019,-0.1855,...,-1.5068,-0.5023,0.0084,1.9210,0.5612,1.1802,-1.5127,1,1,12.19
1,0.5958,0.5284,0.7721,-0.1655,-0.8774,-0.8649,0.1732,0.7866,1.3584,1.3587,...,0.3832,0.3592,0.6552,0.0093,0.7833,-1.4013,0.1076,1,1,4.47
2,1.2436,0.8217,-1.8103,1.8121,-1.9130,-2.9060,1.1801,1.1786,0.1547,0.6600,...,-0.0217,0.5434,0.2672,0.1989,2.8537,-0.0450,-4.1585,0,1,75.69
3,0.1109,1.4265,-1.0411,0.8031,-1.9130,-0.5093,0.8880,1.2401,-0.1967,0.7333,...,2.0952,0.7145,-2.5363,0.9940,0.2669,0.3938,0.1869,1,0,123.09
4,0.6288,-0.5615,1.4504,0.0493,-0.3669,1.5842,0.6747,1.3838,0.1108,-0.9052,...,-0.5310,-0.3965,-1.9010,-1.2751,-0.3862,0.2625,-0.5724,1,1,4.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,0.2982,-0.7502,-0.6965,0.6680,-1.9130,0.0325,0.6547,-0.2071,-0.8912,-1.3756,...,0.3423,0.7051,-0.2711,-1.7719,0.3876,-0.9890,-0.8226,0,1,12.71
491,0.4682,-0.3021,-0.6245,-0.1061,0.2575,-0.2706,-0.3664,-1.0547,1.2710,1.1411,...,-1.4025,0.7756,0.3390,-0.2501,1.1855,-2.0965,0.8601,0,1,43.86
492,1.3014,-0.3246,0.1157,-0.3372,-0.7686,1.5318,-0.8293,-0.6591,0.5701,0.6028,...,-1.9555,-0.5944,1.4217,-0.5923,-1.3152,-0.5711,-0.9170,1,1,8.57
493,0.0438,0.9554,0.8300,2.0655,-1.2849,2.1550,0.0948,0.8630,-0.1724,-0.5642,...,0.6429,0.8057,-0.8409,1.7818,-0.4635,0.8710,-2.4961,0,1,11.17


In [21]:
# Column-wise partition of the loaded frame: every column except the last three
# goes to `data`; the last three columns (Sex, OS_STATUS, OS_MONTHS in the preview)
# go to `label`.
# NOTE(review): `data` therefore still contains the leading "Unnamed: 0" column shown
# in the preview - presumably a saved row index; confirm it should be kept as a feature.
data, label = lusc_df.iloc[:, :-3], lusc_df.iloc[:, -3:]

In [22]:
# Cohort tag; the split loop interpolates it into the output CSV file names
# (TCGA_{data_type}_Train/Valid/Test_{experiment}.csv).
data_type = "LUSC"

In [23]:
# Generate 10 stratified train / validation / test partitions of `data` / `label`
# (both defined in an earlier cell) and write every partition to CSV.
#
# For each repetition:
#   * 20% of the rows are held out as the test set, stratified on label["Sex"];
#   * 20% of the remaining rows become the validation set, stratified the same way;
#   * a StandardScaler is fitted on the training features only and then applied to
#     all three partitions;
#   * the scaled features (columns named 0..n-1, because the scaler returns a plain
#     array) are joined column-wise with the label columns and saved as
#     TCGA_{data_type}_{Train|Valid|Test}_{experiment}.csv in the working directory.
split_seed = 42  # fixed seed: without it every re-run silently produced different files
outer_splitter = StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = split_seed)
for experiment, (train, test) in enumerate(outer_splitter.split(data, label["Sex"]), start = 1):
    # `train` / `test` are positional row numbers, so rows are selected with .iloc;
    # .loc only gives the same rows while the index is the default 0..n-1 range.
    trainData, validData, trainLabel, validLabel = train_test_split(
        data.iloc[train],
        label.iloc[train],
        test_size = 0.2,
        shuffle = True,
        stratify = label["Sex"].iloc[train],
        random_state = split_seed + experiment,  # reproducible, but distinct per repetition
    )
    testData = data.iloc[test]
    testLabel = label.iloc[test]
    print("#######################  %d experiment  #######################\n" % experiment)
    ##################################################################################################
    # Scaling statistics come from the training partition only.
    data_scaler = StandardScaler()
    data_scaler.fit(trainData)
    scaled_tcga_train_data = data_scaler.transform(trainData)
    scaled_tcga_valid_data = data_scaler.transform(validData)
    scaled_tcga_test_data = data_scaler.transform(testData)
    ##################################################################################################
    ### entire data
    scaled_tcga_train_df = pd.DataFrame(scaled_tcga_train_data)
    scaled_tcga_valid_df = pd.DataFrame(scaled_tcga_valid_data)
    scaled_tcga_test_df = pd.DataFrame(scaled_tcga_test_data)
    ### entire label (index reset to 0..n-1 so it lines up row-for-row with the scaled frame)
    reindexed_tcga_train_label = trainLabel.reset_index(drop = True)
    reindexed_tcga_valid_label = validLabel.reset_index(drop = True)
    reindexed_tcga_test_label = testLabel.reset_index(drop = True)
    ### concat entire data & label
    tcga_train_df = pd.concat([scaled_tcga_train_df, reindexed_tcga_train_label], axis = 1)
    tcga_valid_df = pd.concat([scaled_tcga_valid_df, reindexed_tcga_valid_label], axis = 1)
    tcga_test_df = pd.concat([scaled_tcga_test_df, reindexed_tcga_test_label], axis = 1)
    ##################################################################################################
    tcga_train_df.to_csv(f"TCGA_{data_type}_Train_{experiment}.csv", index = False, header = True)
    tcga_valid_df.to_csv(f"TCGA_{data_type}_Valid_{experiment}.csv", index = False, header = True)
    tcga_test_df.to_csv(f"TCGA_{data_type}_Test_{experiment}.csv", index = False, header = True)

#######################  1 experiment  #######################

#######################  2 experiment  #######################

#######################  3 experiment  #######################

#######################  4 experiment  #######################

#######################  5 experiment  #######################

#######################  6 experiment  #######################

#######################  7 experiment  #######################

#######################  8 experiment  #######################

#######################  9 experiment  #######################

#######################  10 experiment  #######################



---

# GBM & LGG

In [24]:
# Read TCGA_GBM_&_LGG_gene_expression_data.csv (relative path, resolved against the
# kernel's working directory) into a DataFrame.
combined_gbm_lgg_df = pd.read_csv(filepath_or_buffer = "TCGA_GBM_&_LGG_gene_expression_data.csv")

In [25]:
# Show the loaded frame as this cell's output (pandas abbreviates rows/columns with "...").
combined_gbm_lgg_df

Unnamed: 0,A2M,AACS,AADAT,AANAT,AARS2,AASDH,AASDHPPT,AASS,ABAT,ABCA10,...,ZFYVE16,ZFYVE9,ZIC2,ZMAT2,ZMAT3,ZNF274,ZYX,Sex,OS_STATUS,OS_MONTHS
0,0.9457,0.2310,0.4380,0.5911,-0.5742,-0.3994,0.1820,-0.3404,1.0665,0.3920,...,0.8032,0.7179,-1.8458,-0.7945,0.8365,-0.3184,-0.9779,1,1,14.72
1,1.3224,-1.1385,-1.6921,-0.8731,-0.2268,0.1284,-1.8299,-1.3493,-3.2963,-0.5713,...,0.1094,-0.5526,-1.0222,-0.2260,-0.8981,0.5467,1.0307,0,1,2.50
2,0.1660,0.3063,0.6997,-1.1499,-0.1483,-0.5233,1.2384,-1.0504,-0.4303,-2.2010,...,-1.5679,0.5098,0.2495,0.0916,-2.6503,1.5333,-0.4795,1,0,15.31
3,-1.4495,-1.2758,-0.0801,-1.8551,-0.0765,-0.5345,-0.7625,0.3655,0.9717,1.4854,...,-0.0324,0.5402,1.2708,0.7992,-1.7368,-0.7350,-0.4707,1,0,15.44
4,1.2485,-0.4453,0.2542,0.4176,-0.1170,-0.3279,-0.0754,-0.8850,-1.8596,-0.6940,...,-1.1768,-0.2062,0.0956,1.6558,-0.6314,-0.4226,0.3169,1,1,20.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,-0.2927,-0.0071,1.3750,0.1424,-0.2869,0.6011,0.0011,0.2679,0.2561,-1.0454,...,0.8332,0.3819,0.3507,0.0267,0.0615,1.0639,-0.9482,1,0,43.36
689,0.8444,-0.8818,1.7158,-0.8667,0.6668,0.8342,0.8409,0.6754,-0.2144,-0.1674,...,2.1262,0.2318,-0.7200,0.5324,-1.0072,1.2301,-0.8485,1,0,45.76
690,-0.0160,-0.5032,1.7822,0.3375,0.6621,0.7183,1.2917,0.8670,0.1620,0.1908,...,1.7647,1.0268,1.2121,0.2512,-1.2149,2.0410,-0.4482,1,0,46.85
691,0.0948,0.2616,0.4672,1.5114,-0.1265,0.0345,-1.1829,0.2674,0.4877,-0.0156,...,0.6297,-0.0323,0.9821,0.4749,-1.0888,1.2793,-0.6900,1,0,37.68


In [26]:
# Column-wise partition of the loaded frame: every column except the last three
# goes to `data`; the last three columns (Sex, OS_STATUS, OS_MONTHS in the preview)
# go to `label`.
# NOTE(review): `data` therefore still contains the leading "Unnamed: 0" column shown
# in the preview - presumably a saved row index; confirm it should be kept as a feature.
data, label = combined_gbm_lgg_df.iloc[:, :-3], combined_gbm_lgg_df.iloc[:, -3:]

In [27]:
# Cohort tag; the split loop interpolates it into the output CSV file names
# (TCGA_{data_type}_Train/Valid/Test_{experiment}.csv).
data_type = "GBM_&_LGG"

In [28]:
# Generate 10 stratified train / validation / test partitions of `data` / `label`
# (both defined in an earlier cell) and write every partition to CSV.
#
# For each repetition:
#   * 20% of the rows are held out as the test set, stratified on label["Sex"];
#   * 20% of the remaining rows become the validation set, stratified the same way;
#   * a StandardScaler is fitted on the training features only and then applied to
#     all three partitions;
#   * the scaled features (columns named 0..n-1, because the scaler returns a plain
#     array) are joined column-wise with the label columns and saved as
#     TCGA_{data_type}_{Train|Valid|Test}_{experiment}.csv in the working directory.
split_seed = 42  # fixed seed: without it every re-run silently produced different files
outer_splitter = StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = split_seed)
for experiment, (train, test) in enumerate(outer_splitter.split(data, label["Sex"]), start = 1):
    # `train` / `test` are positional row numbers, so rows are selected with .iloc;
    # .loc only gives the same rows while the index is the default 0..n-1 range.
    trainData, validData, trainLabel, validLabel = train_test_split(
        data.iloc[train],
        label.iloc[train],
        test_size = 0.2,
        shuffle = True,
        stratify = label["Sex"].iloc[train],
        random_state = split_seed + experiment,  # reproducible, but distinct per repetition
    )
    testData = data.iloc[test]
    testLabel = label.iloc[test]
    print("#######################  %d experiment  #######################\n" % experiment)
    ##################################################################################################
    # Scaling statistics come from the training partition only.
    data_scaler = StandardScaler()
    data_scaler.fit(trainData)
    scaled_tcga_train_data = data_scaler.transform(trainData)
    scaled_tcga_valid_data = data_scaler.transform(validData)
    scaled_tcga_test_data = data_scaler.transform(testData)
    ##################################################################################################
    ### entire data
    scaled_tcga_train_df = pd.DataFrame(scaled_tcga_train_data)
    scaled_tcga_valid_df = pd.DataFrame(scaled_tcga_valid_data)
    scaled_tcga_test_df = pd.DataFrame(scaled_tcga_test_data)
    ### entire label (index reset to 0..n-1 so it lines up row-for-row with the scaled frame)
    reindexed_tcga_train_label = trainLabel.reset_index(drop = True)
    reindexed_tcga_valid_label = validLabel.reset_index(drop = True)
    reindexed_tcga_test_label = testLabel.reset_index(drop = True)
    ### concat entire data & label
    tcga_train_df = pd.concat([scaled_tcga_train_df, reindexed_tcga_train_label], axis = 1)
    tcga_valid_df = pd.concat([scaled_tcga_valid_df, reindexed_tcga_valid_label], axis = 1)
    tcga_test_df = pd.concat([scaled_tcga_test_df, reindexed_tcga_test_label], axis = 1)
    ##################################################################################################
    tcga_train_df.to_csv(f"TCGA_{data_type}_Train_{experiment}.csv", index = False, header = True)
    tcga_valid_df.to_csv(f"TCGA_{data_type}_Valid_{experiment}.csv", index = False, header = True)
    tcga_test_df.to_csv(f"TCGA_{data_type}_Test_{experiment}.csv", index = False, header = True)

#######################  1 experiment  #######################

#######################  2 experiment  #######################

#######################  3 experiment  #######################

#######################  4 experiment  #######################

#######################  5 experiment  #######################

#######################  6 experiment  #######################

#######################  7 experiment  #######################

#######################  8 experiment  #######################

#######################  9 experiment  #######################

#######################  10 experiment  #######################

