In [1]:
# load the raw dataset
import numpy as np
import pandas as pd
from pandas import DataFrame

# Read the tab-separated expression table; the first column becomes the row index.
raw_expression_path = "unc.edu_PANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv"
rawdf = pd.read_csv(raw_expression_path, sep="\t", index_col=0)
# Preview the first rows of the raw table.
rawdf.head()

Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,TCGA-OR-A5JB-01A-11R-A29S-07,...,TCGA-VQ-AA6A-01A-11R-A414-31,TCGA-VQ-AA6B-01A-11R-A414-31,TCGA-VQ-AA6D-01A-11R-A414-31,TCGA-VQ-AA6F-01A-31R-A414-31,TCGA-VQ-AA6G-01A-11R-A414-31,TCGA-VQ-AA6I-01A-11R-A414-31,TCGA-VQ-AA6J-01A-11R-A414-31,TCGA-VQ-AA6K-01A-11R-A414-31,TCGA-ZA-A8F6-01A-23R-A36D-31,TCGA-ZQ-A9CR-01A-11R-A39E-31
0.0gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
?|100130426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.3189,0.0,0.0,0.0,0.0,0.2466,0.4134,0.0,0.5668
?|100133144,3.2661,2.6815,1.7301,0.0,0.0,1.1673,1.4422,0.0,4.4556,7.1293,...,26.1744,28.1937,32.4153,18.6091,33.7625,35.0189,27.677,28.6855,29.3939,19.8016
?|100134869,3.9385,8.9948,6.565,1.5492,4.4709,6.0529,2.2876,1.3599,5.0581,0.0,...,14.3662,16.5156,21.5482,17.2753,20.2653,30.1605,19.2494,21.1378,15.4703,8.5379
?|10357,149.135,81.0777,86.4879,53.9117,66.9063,103.506,94.9316,78.1955,69.2389,155.709,...,11.5541,10.0447,15.9546,21.9117,8.4115,21.6463,35.6665,11.5371,22.0386,29.3701
?|10431,2034.1,1304.93,1054.66,2350.89,1257.99,1866.43,995.027,1762.12,1213.53,2005.57,...,357.786,780.626,903.238,1033.03,733.743,519.993,709.643,702.473,561.984,1874.02


In [2]:
# Flip the raw table so that each row is a patient sample and each column is a gene.
processeddf = rawdf.T

In [3]:
# Missing-value check: evaluates to True if at least one cell of the table is null.
processeddf.isnull().values.any()

False

In [4]:
# Look for rows (i.e. samples) whose values are all zero, so that they could be dropped.
has_nonzero_value = processeddf.ne(0).any(axis=1)
removed_all_zero_rows_df = processeddf.loc[has_nonzero_value, :]
removed_all_zero_rows_df.shape
# Result: nothing is dropped; all 10471 samples have at least one nonzero value. Proceed to next.

(10471, 20531)

In [5]:
# Check whether any row label (sample ID) occurs more than once.
# Index.get_duplicates() was deprecated in pandas 0.23 and removed in pandas 1.0,
# so the duplicated labels are selected with Index.duplicated() instead.
# sorted(...) keeps the result a sorted plain list, as get_duplicates() produced here.
sample_ids = processeddf.index
sorted(sample_ids[sample_ids.duplicated()].unique())
# An empty list means there are no duplicate samples; move to next step.

[]

In [6]:
# Check whether any column label (feature/gene ID) occurs more than once.
# The column labels are read directly from processeddf.columns; transposing the
# whole table just to reach them through .index is unnecessary work.
# Index.get_duplicates() was deprecated in pandas 0.23 and removed in pandas 1.0,
# so the duplicated labels are selected with Index.duplicated() instead.
gene_ids = processeddf.columns
sorted(gene_ids[gene_ids.duplicated()].unique())
# An empty list means there are no duplicate features; move to next step.

[]

In [7]:
# create the data label file (original data)

# step 1: load tissue source site file
# keep_default_na=False together with na_values=[] keeps every cell as the literal
# text found in the file, so strings such as "NA" are not converted to NaN
# (presumably "NA" is a legitimate TSS code -- confirm against the file).
dfSampleSource = pd.read_csv("tissueSourceSite.tsv", sep="\t", keep_default_na=False, na_values=[])

# step 2: load disease study information file
dfSampleAbb = pd.read_csv("diseaseStudy.tsv", sep="\t", keep_default_na=False, na_values=[])
dfSampleAbb = dfSampleAbb.reset_index()

# step 3: add "Study Abbreviation" from the disease study file to the tissue source
# site file (default inner join on "Study Name")
dfSampleSourceAddOn = dfSampleSource.merge(dfSampleAbb[['Study Name', 'Study Abbreviation']], on=['Study Name'])

# step 4: build a TSS code -> study abbreviation lookup.
# Only the first row per TSS code is kept, which is the same row the previous
# per-sample search selected with .iloc[0].
tss_first_match = dfSampleSourceAddOn.drop_duplicates(subset='TSS Code', keep='first')
tss_to_study = dict(zip(tss_first_match['TSS Code'], tss_first_match['Study Abbreviation']))

# step 5: create the class label table (dflabels) with one dictionary lookup per sample.
# Characters 5:7 of the sample ID are compared with the TSS code, as before;
# this replaces a nested loop that used iterrows() over the full table and
# scanned every TSS code for every sample.
# Samples whose code is not in the lookup get NaN so that step 6 can detect them.
sample_labels = [tss_to_study.get(sample_id[5:7], np.nan) for sample_id in processeddf.index]
dflabels = DataFrame({'labels': sample_labels}, index=processeddf.index)

# step 6: check whether any sample was left without a tumor type (False = all filled)
dflabels.isnull().any().any()

False

In [8]:
# Tally how many samples carry each class label.
from collections import Counter
label_counts = Counter(dflabels['labels'])
print (label_counts)

Counter({'BRCA': 1218, 'KIRC': 606, 'LUAD': 576, 'THCA': 572, 'HNSC': 566, 'LUSC': 554, 'PRAD': 550, 'LGG': 534, 'SKCM': 474, 'STAD': 450, 'BLCA': 427, 'LIHC': 424, 'COAD': 329, 'KIRP': 323, 'CESC': 310, 'OV': 309, 'SARC': 265, 'UCEC': 201, 'ESCA': 196, 'PCPG': 187, 'PAAD': 183, 'GBM': 174, 'LAML': 173, 'TGCT': 156, 'THYM': 122, 'READ': 105, 'KICH': 91, 'MESO': 87, 'UVM': 80, 'ACC': 79, 'UCS': 57, 'DLBC': 48, 'CHOL': 45})


In [9]:
# Count the total number of distinct classes.
distinct_labels = Counter(dflabels['labels']).keys()
len(distinct_labels)

33

In [10]:
# Calculate the average number of samples per class.
# Both figures are taken from the data (number of rows in processeddf and number
# of distinct labels) instead of the hard-coded 10471/33, so the value stays
# correct if the input files change.
len(processeddf) / float(dflabels['labels'].nunique())
# With 10471 samples and 33 classes the answer is about 317.3; pick 300 samples per class.

317.3030303030303

In [11]:
# Randomly select exactly 300 samples per class.
# Classes with at least 300 samples are down-sampled without replacement;
# smaller classes are over-sampled WITH replacement, so the result contains
# repeated rows for those classes (keep this in mind for any later train/test split).
SAMPLES_PER_CLASS = 300
RANDOM_SEED = 42  # fixed seed so the same samples are drawn on every run
rng = np.random.RandomState(RANDOM_SEED)

# Derive the two class groups from the label counts instead of hard-coding them.
# value_counts() orders classes by descending sample count.
class_counts = dflabels['labels'].value_counts()
above300tumortype = class_counts[class_counts >= SAMPLES_PER_CLASS].index.tolist()
below300tumortype = class_counts[class_counts < SAMPLES_PER_CLASS].index.tolist()

# Collect the chosen sample IDs first and slice the table once at the end;
# growing a DataFrame with .append() inside a loop copies it on every iteration,
# and DataFrame.append was removed in pandas 2.0.
selected_sample_ids = []
for tumor_type in above300tumortype + below300tumortype:
    OnetypeAll = dflabels.loc[dflabels['labels'] == tumor_type].index
    needs_replacement = len(OnetypeAll) < SAMPLES_PER_CLASS
    OnetypeAll300 = rng.choice(OnetypeAll, SAMPLES_PER_CLASS, replace=needs_replacement)
    selected_sample_ids.extend(OnetypeAll300)

# A label-based lookup with repeated IDs returns one row per requested ID,
# in the requested order.
undersample300df = processeddf.loc[selected_sample_ids]

In [12]:
# Confirm the resampled table holds 300 rows for every class
# (9900 rows for the 33 classes counted earlier).
# The assertion makes a wrong row count stop the run instead of relying on
# someone reading the displayed shape.
expected_rows = 300 * dflabels['labels'].nunique()
assert undersample300df.shape[0] == expected_rows, (
    "expected %d resampled rows, found %d" % (expected_rows, undersample300df.shape[0])
)
undersample300df.shape

(9900, 20531)

In [13]:
# create the data label file for the resampled data (undersample300df)

# The TSS code / study abbreviation table (dfSampleSourceAddOn) was already built
# when the full data set was labelled, so it is reused here instead of re-reading
# and re-merging the same two TSV files.

# step 1: build a TSS code -> study abbreviation lookup.
# Only the first row per TSS code is kept, which is the same row the previous
# per-sample search selected with .iloc[0].
tss_first_match = dfSampleSourceAddOn.drop_duplicates(subset='TSS Code', keep='first')
tss_to_study = dict(zip(tss_first_match['TSS Code'], tss_first_match['Study Abbreviation']))

# step 2: create the class label table (dflabels) with one dictionary lookup per row.
# Characters 5:7 of the sample ID are compared with the TSS code, as before.
# The list is aligned with undersample300df.index by position, so repeated
# sample IDs (from sampling with replacement) each get their own label row.
# Rows whose code is not in the lookup get NaN so that step 3 can detect them.
resampled_labels = [tss_to_study.get(sample_id[5:7], np.nan) for sample_id in undersample300df.index]
dflabels = DataFrame({'labels': resampled_labels}, index=undersample300df.index)

# step 3: check whether any row was left without a tumor type (False = all filled)
dflabels.isnull().any().any()

False

In [14]:
# Write dflabels to a CSV file so it can be used later as the class label file.
class_labels_csv = 'project_class_labels_300.csv'
dflabels.to_csv(class_labels_csv)

In [15]:
# Write undersample300df (the resampled sample-by-gene table) to a CSV file so it
# can be used later as the data file that goes with the class labels.
resampled_data_csv = 'project_data_down_300.csv'
undersample300df.to_csv(resampled_data_csv)