In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm
import re

### Handling Data ###

In [2]:
# Import expression data
expression = pd.read_csv('Gene_expression_log2.csv')
expression.head()

Unnamed: 0,patient_id,?|100130426,?|100133144,?|100134869,?|10357,?|10431,?|136542,?|155060,?|26823,?|280660,...,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,sample_type
0,TCGA-05-4244,0.0,3.460913,3.618474,5.661048,9.731217,0.0,8.435591,1.033652,0.0,...,9.018679,5.350285,8.19732,9.90726,0.763921,10.088854,11.471137,9.768648,9.170596,1
1,TCGA-05-4249,0.0,3.034867,3.748848,6.515884,9.853335,0.0,7.191819,1.383939,0.0,...,8.172463,5.980428,8.950002,10.204975,4.41165,9.622978,11.199826,10.153704,9.433116,1
2,TCGA-05-4250,0.0,3.043572,2.811142,5.659257,10.156943,0.0,5.720508,0.0,0.0,...,10.033203,5.931168,8.517335,9.722642,4.782796,8.895339,12.40898,10.194166,9.060342,1
3,TCGA-05-4382,0.0,3.62423,3.099968,6.3894,9.65852,0.0,7.913021,0.564232,0.309525,...,9.558593,5.373036,8.441915,9.888267,6.041142,9.828389,12.725185,10.192589,9.376842,1
4,TCGA-05-4384,0.0,2.079088,2.168064,6.200361,9.137001,0.0,8.104766,0.687867,0.0,...,7.275566,6.340285,9.140127,10.36819,3.160501,9.607079,11.706703,10.763478,9.500392,1


## 1 Year

In [4]:
# Import 1-year exp data
yr_one = pd.read_csv('top_100_DEgenes_1yr.txt', delimiter='\t')
yr_one.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,GPR77|27202,57.363315,-2.412671,0.229693,-10.503893,8.29e-26,1.54e-21
1,MT1A|4489,46.043479,-3.527803,0.431637,-8.173073,3.01e-16,2.8e-12
2,SCGB3A1|92304,6620.702084,3.900022,0.486672,8.013647,1.11e-15,6.92e-12
3,AMDHD1|144193,31.390227,-2.605383,0.336389,-7.745141,9.55e-15,4.45e-11
4,TMEM213|155006,87.078696,3.285105,0.436324,7.529047,5.11e-14,1.9e-10


In [5]:
# Number of genes in both datasets
len(set(expression.columns).intersection(set(yr_one.gene)))

100

In [6]:
# How many genes are present in the one-year that aren't in the expression data?
# Should be 0
set(yr_one.gene) - set(expression.columns)

set()

In [7]:
common_genes = set(yr_one.gene).intersection(set(expression.columns))
common_genes.add('patient_id')

In [8]:
yr_one_exp = expression[list(common_genes)]

### Label data ###

Which patients have survived?

In [9]:
y = pd.read_csv('labels_1yr.csv')
y.head()

Unnamed: 0,patient_id,Label
0,TCGA-05-4249,1
1,TCGA-05-4382,1
2,TCGA-05-4384,1
3,TCGA-05-4389,1
4,TCGA-05-4390,1


In [10]:
y.Label.value_counts()

1    374
0     32
Name: Label, dtype: int64

In [11]:
patient_id = y['patient_id']
exp_file = y.merge(yr_one_exp, on='patient_id')
exp_file = exp_file.drop(['patient_id'],axis = 1)

In [12]:
exp_file.shape

(405, 101)

### Add metadata

In [13]:
# Add patient_id back
exp_file['patient_id'] = patient_id

# Add metadata to meta_file
metadata_final = pd.read_csv('metadata_final_no_os.csv')
meta_file = metadata_final.merge(exp_file, how='right', on='patient_id')

In [14]:
meta_file

Unnamed: 0,patient_id,age_at_initial_pathologic_diagnosis,gender_FEMALE,gender_MALE,tumor_stage_Early,tumor_stage_Late,tumor_stage_N/A,is_smoker_False,is_smoker_True,Label,...,UGT1A10|54575,AKAP12|9590,FCRL1|115350,LOC149620|149620,NRN1|51299,ORM1|5004,CTSH|1512,C12orf11|55726,CXCL5|6374,MT2A|4502
0,TCGA-05-4249,67,0,1,1,0,0,0,1,1,...,1.529421,7.650585,4.759076,3.276913,8.034012,6.908381,14.159714,8.134950,3.726221,8.853818
1,TCGA-05-4382,68,0,1,1,0,0,0,1,1,...,2.249354,8.729852,3.496986,1.419593,6.718868,3.985218,12.086441,9.358550,10.203752,10.740885
2,TCGA-05-4384,66,0,1,0,1,0,0,1,1,...,0.000000,9.184729,3.255833,6.372553,6.979900,6.753591,12.666242,8.568876,5.604312,9.317634
3,TCGA-05-4389,70,0,1,1,0,0,0,1,1,...,7.026756,6.867551,3.231018,0.000000,5.653315,9.159379,14.116133,9.280318,4.468642,9.138057
4,TCGA-05-4390,58,1,0,1,0,0,0,1,1,...,0.000000,9.023034,2.611031,0.000000,10.271685,6.662077,11.198077,8.716329,10.799492,9.640357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,TCGA-NJ-A4YG,65,0,1,1,0,0,0,1,1,...,0.000000,7.735915,6.800680,5.660837,7.830458,4.740798,12.632689,8.453908,2.590243,11.047894
401,TCGA-NJ-A4YQ,69,1,0,1,0,0,0,1,1,...,6.868934,8.901479,6.162762,1.547104,7.962624,8.971518,12.745334,8.843045,5.220438,9.980754
402,TCGA-NJ-A55R,67,0,1,1,0,0,1,0,1,...,1.576667,4.831659,2.699352,1.717912,4.578721,5.505224,13.987104,8.479012,5.066645,12.268884
403,TCGA-NJ-A7XG,49,0,1,0,1,0,1,0,1,...,0.000000,8.406320,1.284159,3.113617,5.330358,6.948227,16.699600,8.212832,2.532242,11.091660


In [42]:
NA = ((meta_file['tumor_stage_N/A'] == 1)).sum()
NA

6

In [31]:
meta_file.Label.value_counts()

1    373
0     32
Name: Label, dtype: int64

In [24]:
count_F_Live = ((meta_file['gender_FEMALE'] == 1) & (meta_file['Label'] == 1)).sum()
print(count_F_Live)
count_F_dead = ((meta_file['gender_FEMALE'] == 1) & (meta_file['Label'] == 0)).sum()
print(count_F_dead)
F = meta_file.gender_FEMALE.value_counts()
print(F)

210
15
1    225
0    180
Name: gender_FEMALE, dtype: int64


In [26]:
count_M_Live = ((meta_file['gender_MALE'] == 1) & (meta_file['Label'] == 1)).sum()
print(count_M_Live)
count_M_dead = ((meta_file['gender_MALE'] == 1) & (meta_file['Label'] == 0)).sum()
print(count_M_dead)
M = meta_file.gender_MALE.value_counts()
print(M)

163
17
0    225
1    180
Name: gender_MALE, dtype: int64


In [28]:
count_early_live = ((meta_file['tumor_stage_Early'] == 1) & (meta_file['Label'] == 1)).sum()
print(count_early_live)
count_early_dead = ((meta_file['tumor_stage_Early'] == 1) & (meta_file['Label'] == 0)).sum()
print(count_early_dead)
early = meta_file.tumor_stage_Early.value_counts()
print(early)

293
26
1    319
0     86
Name: tumor_stage_Early, dtype: int64


In [29]:
count_late_live = ((meta_file['tumor_stage_Late'] == 1) & (meta_file['Label'] == 1)).sum()
print(count_late_live)
count_late_dead = ((meta_file['tumor_stage_Late'] == 1) & (meta_file['Label'] == 0)).sum()
print(count_late_dead)
late = meta_file.tumor_stage_Late.value_counts()
print(late)

74
6
0    325
1     80
Name: tumor_stage_Late, dtype: int64


In [30]:
count_smoker_live = ((meta_file['is_smoker_True'] == 1) & (meta_file['Label'] == 1)).sum()
print(count_smoker_live)
count_smoker_dead = ((meta_file['is_smoker_True'] == 1) & (meta_file['Label'] == 0)).sum()
print(count_smoker_dead)
smoker = meta_file.is_smoker_True.value_counts()
print(smoker)

237
19
1    256
0    149
Name: is_smoker_True, dtype: int64


In [32]:
count_nonsmoker_live = ((meta_file['is_smoker_False'] == 1) & (meta_file['Label'] == 1)).sum()
print(count_nonsmoker_live)
count_nonsmoker_dead = ((meta_file['is_smoker_False'] == 1) & (meta_file['Label'] == 0)).sum()
print(count_nonsmoker_dead)
nonsmoker = meta_file.is_smoker_False.value_counts()
print(nonsmoker)

136
13
0    256
1    149
Name: is_smoker_False, dtype: int64


In [33]:
count_under65_live = ((meta_file['age_at_initial_pathologic_diagnosis'] < 65) & (meta_file['Label'] == 1)).sum()
print(count_under65_live)
count_under65_dead = ((meta_file['age_at_initial_pathologic_diagnosis'] < 65) & (meta_file['Label'] == 0)).sum()
print(count_under65_dead)
under65 = (meta_file['age_at_initial_pathologic_diagnosis'] < 65).sum()
print(under65)

163
16
179


In [36]:
count_over65_live = ((meta_file['age_at_initial_pathologic_diagnosis'] >= 65) & (meta_file['Label'] == 1)).sum()
print(count_over65_live)
count_over65_dead = ((meta_file['age_at_initial_pathologic_diagnosis'] >= 65) & (meta_file['Label'] == 0)).sum()
print(count_over65_dead)
over65 = (meta_file['age_at_initial_pathologic_diagnosis'] >= 65).sum()
print(over65)

210
16
226
