<img src="../TCGA-barcode.jpg">

<img src="../sample_type.png">

## Pandas viewing options
http://songhuiming.github.io/pages/2017/04/02/jupyter-and-pandas-display/

Extract the "Extracted.tar.gz" archive so that its files are in `../firehose_data/Extracted` (or change the directory paths in the code below to wherever the files are located). The code should then run.

In [1]:
import numpy as np
import scipy.io
import pandas as pd
import os
import pickle
from sklearn.model_selection import train_test_split

In [2]:
# List the extracted files in alphabetical order. Hidden entries (names
# starting with '.', such as metadata files some archive tools leave behind)
# are skipped: they are not data files and would otherwise produce an empty
# cancer name below.
files = sorted(
    entry for entry in os.listdir('../firehose_data/Extracted')
    if not entry.startswith('.')
)

# The cancer name is the part of each file name before the first '.'.
# Duplicates are collapsed and the result is sorted alphabetically.
cancer_names = sorted({file.split('.')[0] for file in files})

In [3]:
# Read every extracted file into its own DataFrame (tab-separated, first
# column used as the row index, second line of the file skipped) and count
# how many sample columns each cancer type contributes.
cancer_dataframes = []
rows = []          # row index of every file, used by the sanity check below
num_samples = []   # number of sample columns per cancer type, in file order
last_cancer_type = None

for file in files:
    sample = pd.read_csv(
        '../firehose_data/Extracted/' + file,
        sep='\t', index_col=0, skiprows=[1],
    )
    cancer_dataframes.append(sample)
    rows.append(sample.index)

    # Consecutive files with the same name prefix belong to one cancer type:
    # add their columns to the running count instead of opening a new entry.
    cancer_type = file.split('.')[0]
    if cancer_type == last_cancer_type:
        num_samples[-1] += sample.shape[1]
    else:
        num_samples.append(sample.shape[1])
    last_cancer_type = cancer_type

if not cancer_dataframes:
    raise ValueError("No data files found in '../firehose_data/Extracted'")

# Sanity check for data uniformity: every file must have exactly the same
# row labels, in the same order, as the first file. The comparison is done
# per file so it does not depend on a fixed number of rows or files.
mismatched_files = [
    file for file, gene_index in zip(files, rows)
    if not gene_index.equals(rows[0])
]
if mismatched_files:
    raise ValueError(
        "Row labels differ from those of " + files[0] + " in: "
        + ", ".join(mismatched_files)
    )
print("All rows are the same gene")
rows = np.array(rows)

# Create single data frame for everything (files side by side, column-wise)
full_data = pd.concat(cancer_dataframes, axis=1)

In [4]:
# Walk through the columns of full_data one cancer type at a time
# (num_samples gives the number of columns of each type) and split the
# column names by the two characters at positions 13-14 (zero-based) of the
# name: codes '01'-'09' are kept, everything else is marked for dropping.
tumor_codes = ['01','02','03','04','05','06','07','08','09']
column_names = full_data.columns.values

tumor_samples = []   # number of kept columns per cancer type
sample_ids = []      # names of the kept columns, in column order
columns2drop = []    # names of the columns to discard
index = 0            # position of the first column of the current cancer type
for number in num_samples:
    group = [column_names[position] for position in range(index, index + number)]
    kept = [name for name in group if name[13:15] in tumor_codes]
    dropped = [name for name in group if name[13:15] not in tumor_codes]
    sample_ids.extend(kept)
    columns2drop.extend(dropped)
    tumor_samples.append(len(kept))
    index += number

In [5]:
# Discard every column listed in columns2drop, keeping only the selected
# sample columns, then show the resulting (rows, columns) shape.
tumor_data = full_data.drop(columns=columns2drop)
tumor_data.shape

(20531, 10354)

In [6]:
# Show each cancer name next to its number of kept samples (one pair per row).
names_and_counts = np.vstack((cancer_names, tumor_samples))
names_and_counts.transpose()

array([['ACC', '79'],
       ['BLCA', '408'],
       ['BRCA', '1100'],
       ['CESC', '306'],
       ['CHOL', '36'],
       ['COAD', '478'],
       ['DLBC', '48'],
       ['ESCA', '185'],
       ['GBM', '166'],
       ['HNSC', '522'],
       ['KICH', '66'],
       ['KIRC', '534'],
       ['KIRP', '291'],
       ['LAML', '173'],
       ['LGG', '530'],
       ['LIHC', '373'],
       ['LUAD', '517'],
       ['LUSC', '501'],
       ['MESO', '87'],
       ['OV', '307'],
       ['PAAD', '179'],
       ['PCPG', '184'],
       ['PRAD', '498'],
       ['READ', '167'],
       ['SARC', '263'],
       ['SKCM', '472'],
       ['STAD', '415'],
       ['TGCT', '156'],
       ['THCA', '509'],
       ['THYM', '120'],
       ['UCEC', '547'],
       ['UCS', '57'],
       ['UVM', '80']], dtype='<U21')

In [7]:
#Create one hot encoded data labels
labels = []
for i in range(33):
    labels.append(np.ones(tumor_samples[i])*i)
labels = np.hstack(labels).astype(int)
tumor_labels = (np.eye(labels.max()+1)[labels].astype(int)).T
tumor_labels.shape
tumor_labels_df = pd.DataFrame(tumor_labels, columns = sample_ids)

In [8]:
# Stack the label rows underneath the data rows (row-wise concatenation;
# columns are matched by name), then show the combined shape.
data_and_labels = pd.concat([tumor_data, tumor_labels_df], axis=0)
data_and_labels.shape

(20564, 10354)

In [9]:
#Partition to train and test
train_df = []
test_df = []
index = 0
for i in range(33):
    if i == 0:
        train, test = train_test_split(data_and_labels.iloc[:,0:tumor_samples[i]].T, test_size=0.2)
    else:
        train, test = train_test_split(data_and_labels.iloc[:,index:index+tumor_samples[i]].T, test_size=0.2)
#     print(index)
#     print(index+tumor_samples[i])
#     print(train.shape)
#     print(test.shape)
    index = index + tumor_samples[i]
    train_df.append(train)
    test_df.append(test)

In [10]:
# Merge the per-cancer-type splits back into single tables (samples as
# columns again) and separate the data rows from the label rows.
# NOTE: train_df / test_df are rebound here from lists to DataFrames, so the
# partition cell must be re-run before this cell is run a second time.

# The label rows are the last rows of the combined table; their count is the
# number of rows of the one-hot label frame.
n_label_rows = tumor_labels_df.shape[0]

train_df = pd.concat(train_df).T
train_data = train_df.iloc[:-n_label_rows, :]
train_labels = train_df.iloc[-n_label_rows:, :]

test_df = pd.concat(test_df).T
test_data = test_df.iloc[:-n_label_rows, :]
test_labels = test_df.iloc[-n_label_rows:, :]

In [11]:
#Save
tumor_data.to_pickle('../DATA/tumor_data.pkl')
tumor_labels_df.to_pickle('../DATA/tumor_labels_df.pkl')

train_data.to_pickle('../DATA/train_data.pkl')
train_labels.to_pickle('../DATA/train_labels.pkl')

test_data.to_pickle('../DATA/test_data.pkl')
test_labels.to_pickle('../DATA/test_labels.pkl')

#np.save('../firehose_data/tumor_labels.npy', tumor_labels)

In [12]:
#Convert and save to .mat
scipy.io.savemat('../DATA/tumor_data.mat', mdict={'tumor_data': tumor_data.values})
scipy.io.savemat('../DATA/tumor_labels.mat', mdict={'tumor_labels': tumor_labels_df.values})

scipy.io.savemat('../DATA/train_data.mat', mdict={'train_data': train_data.values})
scipy.io.savemat('../DATA/train_labels.mat', mdict={'train_labels': train_labels.values})

scipy.io.savemat('../DATA/test_data.mat', mdict={'test_data': test_data.values})
scipy.io.savemat('../DATA/test_labels.mat', mdict={'test_labels': test_labels.values})

In [17]:
#Load
tumor_data = pd.read_pickle('../firehose_data/tumor_data.pkl')
row_names = tumor_data.index
column_names = tumor_data.columns.values
values = tumor_data.values

In [13]:
#Normalizing values
values = np.log(values + 1)