# Set environment

In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
# StratifiedShuffleSplit is imported from sklearn.model_selection: the legacy
# sklearn.cross_validation module was removed in scikit-learn 0.20, so importing
# from it makes this first cell fail on current releases.
# NOTE(review): the model_selection class has a different constructor from the
# legacy one (labels go to .split(), not __init__); it is not used in the cells
# visible here -- confirm nothing else relies on the old signature.
from sklearn.model_selection import train_test_split, KFold, StratifiedShuffleSplit

# Directory constants, each written with a trailing "/".
# FP_DAT: project data folder inside the git checkout.
# FP_LGG: LGG-segmentation dataset folder (holds data.csv and per-patient folders).
# FP_OUT: folder that receives the label CSV files written at the end.
FP_DAT = "/home/clint/GitRepo/Duke_BIOS824_Proj1/data/"
FP_LGG = "/data/Duke_BIOS824/LGG-segmentation/"
FP_OUT = "/data/Duke_BIOS824/res_proj01/"



# Import data

In [2]:
# Show what the LGG dataset folder contains: data.csv, README.md and one
# folder per patient scan (see the listing below).
os.listdir(FP_LGG)

['data.csv',
 'README.md',
 'TCGA_CS_4941_19960909',
 'TCGA_CS_4942_19970222',
 'TCGA_CS_4943_20000902',
 'TCGA_CS_4944_20010208',
 'TCGA_CS_5393_19990606',
 'TCGA_CS_5395_19981004',
 'TCGA_CS_5396_20010302',
 'TCGA_CS_5397_20010315',
 'TCGA_CS_6186_20000601',
 'TCGA_CS_6188_20010812',
 'TCGA_CS_6290_20000917',
 'TCGA_CS_6665_20010817',
 'TCGA_CS_6666_20011109',
 'TCGA_CS_6668_20011025',
 'TCGA_CS_6669_20020102',
 'TCGA_DU_5849_19950405',
 'TCGA_DU_5851_19950428',
 'TCGA_DU_5852_19950709',
 'TCGA_DU_5853_19950823',
 'TCGA_DU_5854_19951104',
 'TCGA_DU_5855_19951217',
 'TCGA_DU_5871_19941206',
 'TCGA_DU_5872_19950223',
 'TCGA_DU_5874_19950510',
 'TCGA_DU_6399_19830416',
 'TCGA_DU_6400_19830518',
 'TCGA_DU_6401_19831001',
 'TCGA_DU_6404_19850629',
 'TCGA_DU_6407_19860514',
 'TCGA_DU_6408_19860521',
 'TCGA_DU_7008_19830723',
 'TCGA_DU_7010_19860307',
 'TCGA_DU_7013_19860523',
 'TCGA_DU_7014_19860618',
 'TCGA_DU_7018_19911220',
 'TCGA_DU_7019_19940908',
 'TCGA_DU_7294_19890104',
 'TCGA_DU_7

In [3]:
# Read the label table that ships inside the LGG dataset folder.
labels_csv = os.path.join(FP_LGG, "data.csv")
labels = pd.read_csv(labels_csv)
# (rows, columns) of the label table
labels.shape

(110, 18)

In [4]:
# Preview the first rows of the label table; several columns contain missing values.
labels.head()

Unnamed: 0,Patient,RNASeqCluster,MethylationCluster,miRNACluster,CNCluster,RPPACluster,OncosignCluster,COCCluster,histological_type,neoplasm_histologic_grade,tumor_tissue_site,laterality,tumor_location,gender,age_at_initial_pathologic,race,ethnicity,death01
0,TCGA_CS_4941,2.0,4.0,2,2.0,,3.0,2,1.0,2.0,1.0,3.0,2.0,2.0,67.0,3.0,2.0,1.0
1,TCGA_CS_4942,1.0,5.0,2,1.0,1.0,2.0,1,1.0,2.0,1.0,3.0,2.0,1.0,44.0,2.0,,1.0
2,TCGA_CS_4943,1.0,5.0,2,1.0,2.0,2.0,1,1.0,2.0,1.0,1.0,2.0,2.0,37.0,3.0,,0.0
3,TCGA_CS_4944,,5.0,2,1.0,2.0,1.0,1,1.0,1.0,1.0,3.0,6.0,2.0,50.0,3.0,,0.0
4,TCGA_CS_5393,4.0,5.0,2,1.0,2.0,3.0,1,1.0,2.0,1.0,1.0,6.0,2.0,39.0,3.0,,0.0


# Train-test split

get the target columns

In [5]:
# Keep the patient identifier and the two target columns used for the split.
target_columns = ["Patient", "COCCluster", "neoplasm_histologic_grade"]
label_selected = labels.loc[:, target_columns]
label_selected.head(2)

Unnamed: 0,Patient,COCCluster,neoplasm_histologic_grade
0,TCGA_CS_4941,2,2.0
1,TCGA_CS_4942,1,2.0


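Train-test split: hold out 9 patients with complete labels as the test set, stratified jointly on both target columns.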

In [8]:
# Rows where every selected column is present are eligible for the split.
y_raw = label_selected.dropna().copy()
# Rows with any missing value are set aside. They are selected by null mask
# (the exact complement of dropna above) instead of assuming that the single
# incomplete row is the last row of the table.
y_na  = label_selected.loc[label_selected.isnull().any(axis=1), :]

# Stratify on both targets at once so the class mix of each is kept in the
# 9-row test set; random_state fixes the shuffle for reproducibility.
y = y_raw.loc[:,["COCCluster", "neoplasm_histologic_grade"]].values
y_train, y_test = train_test_split(y_raw, test_size=9, random_state=123, stratify=y)

# Class counts per target in train vs. test, to check the stratification.
print(Counter(y_train['COCCluster']))
print(Counter(y_test['COCCluster']))
print("===========")
print(Counter(y_train['neoplasm_histologic_grade']))
print(Counter(y_test['neoplasm_histologic_grade']))

Counter({1: 50, 3: 28, 2: 22})
Counter({1: 5, 2: 2, 3: 2})
Counter({2.0: 53, 1.0: 47})
Counter({2.0: 5, 1.0: 4})


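Train-validation-test split: build 10 cross-validation columns (cv01–cv10) that mark each patient as train, val, test, or remove.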

In [11]:
### set and prepare split
# No random_state here: without shuffling the seed has no effect, and recent
# scikit-learn versions raise ValueError when it is combined with shuffle=False.
# The folds are contiguous slices of y_train, whose row order was already
# shuffled by train_test_split.
kf = KFold(n_splits=10, shuffle=False)
y_train2 = y_train.loc[:, ["Patient"]]
y_test2  = y_test.loc[ :, ["Patient"]]
y_na2    = y_na.loc[   :, ["Patient"]]

### init: result table indexed by patient, one cvNN column is added per fold
y_new = label_selected.copy()
y_new = y_new.set_index("Patient")

### construct columns
for idx, (train_index, val_index) in enumerate(kf.split(y_train2), start=1):
    ### column name for this fold: cv01 ... cv10
    col = "cv" + str(idx).zfill(2)

    ### tag every patient with its role in this fold; assign() returns new
    ### frames, so the Patient-only tables are never modified
    y_tmp = pd.concat([
        y_train2.iloc[train_index, :].assign(**{col: "train"}),
        y_train2.iloc[  val_index, :].assign(**{col: "val"}),
        y_test2.assign(**{col: "test"}),
        y_na2.assign(**{col: "remove"}),
    ]).set_index("Patient")

    ### join the fold column onto the result table by patient
    y_new = y_new.join(y_tmp)

### arrange results
label_new = y_new.sort_index().reset_index()

In [12]:
# Preview the first rows only rather than dumping the whole table.
label_new.head(10)

Unnamed: 0,Patient,COCCluster,neoplasm_histologic_grade,cv01,cv02,cv03,cv04,cv05,cv06,cv07,cv08,cv09,cv10
0,TCGA_CS_4941,2,2.0,train,train,train,train,train,val,train,train,train,train
1,TCGA_CS_4942,1,2.0,train,train,val,train,train,train,train,train,train,train
2,TCGA_CS_4943,1,2.0,train,train,train,train,train,train,train,train,val,train
3,TCGA_CS_4944,1,1.0,train,val,train,train,train,train,train,train,train,train
4,TCGA_CS_5393,1,2.0,train,train,train,train,train,train,train,val,train,train
5,TCGA_CS_5395,2,1.0,train,train,train,train,train,train,train,val,train,train
6,TCGA_CS_5396,3,2.0,train,train,train,val,train,train,train,train,train,train
7,TCGA_CS_5397,2,2.0,train,train,train,train,train,train,train,train,train,val
8,TCGA_CS_6186,2,2.0,val,train,train,train,train,train,train,train,train,train
9,TCGA_CS_6188,2,2.0,train,train,val,train,train,train,train,train,train,train


# Store the results

In [13]:
# Build output paths with os.path.join (as done when reading data.csv) so the
# result does not depend on FP_OUT ending with a path separator.

# Selected targets only (patient id + the two label columns).
fp = os.path.join(FP_OUT, "label.csv")
label_selected.to_csv(fp, index=False)

# Same table plus the cv01..cv10 split-assignment columns.
fp = os.path.join(FP_OUT, "label_split.csv")
label_new.to_csv(fp, index=False)