In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm
import re

### Handling Data ###

In [3]:
# Import expression data
expression = pd.read_csv('./Gene_expression_log2.csv')
expression.head()

Unnamed: 0,patient_id,?|100130426,?|100133144,?|100134869,?|10357,?|10431,?|136542,?|155060,?|26823,?|280660,...,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,sample_type
0,TCGA-05-4244,0.0,3.460913,3.618474,5.661048,9.731217,0.0,8.435591,1.033652,0.0,...,9.018679,5.350285,8.19732,9.90726,0.763921,10.088854,11.471137,9.768648,9.170596,1
1,TCGA-05-4249,0.0,3.034867,3.748848,6.515884,9.853335,0.0,7.191819,1.383939,0.0,...,8.172463,5.980428,8.950002,10.204975,4.41165,9.622978,11.199826,10.153704,9.433116,1
2,TCGA-05-4250,0.0,3.043572,2.811142,5.659257,10.156943,0.0,5.720508,0.0,0.0,...,10.033203,5.931168,8.517335,9.722642,4.782796,8.895339,12.40898,10.194166,9.060342,1
3,TCGA-05-4382,0.0,3.62423,3.099968,6.3894,9.65852,0.0,7.913021,0.564232,0.309525,...,9.558593,5.373036,8.441915,9.888267,6.041142,9.828389,12.725185,10.192589,9.376842,1
4,TCGA-05-4384,0.0,2.079088,2.168064,6.200361,9.137001,0.0,8.104766,0.687867,0.0,...,7.275566,6.340285,9.140127,10.36819,3.160501,9.607079,11.706703,10.763478,9.500392,1


## 3 Year

In [4]:
# Import 1-year exp data
yr_3 = pd.read_csv('top_100_DEgenes_3yr.txt', delimiter='\t')
yr_3.head()

Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,MAEL|84944,34.385374,3.815821,0.456804,8.353305,6.64e-17,1.2e-12
1,WIF1|11197,734.788524,3.071286,0.389516,7.884882,3.15e-15,2.84e-11
2,COL4A6|1288,38.42281,-2.225394,0.284721,-7.816044,5.45e-15,3.28e-11
3,CIDEC|63924,14.800624,-3.94203,0.510802,-7.717331,1.19e-14,5.36e-11
4,NTS|4922,176.357014,-3.859908,0.504927,-7.644489,2.1e-14,7.57e-11


In [5]:
# Number of genes in both datasets
len(set(expression.columns).intersection(set(yr_3.gene)))

100

In [6]:
# How many genes are present in the one-year that aren't in the expression data?
# Should be 0
set(yr_3.gene) - set(expression.columns)

set()

In [7]:
common_genes = set(yr_3.gene).intersection(set(expression.columns))
common_genes.add('patient_id')

In [8]:
yr_3_exp = expression[list(common_genes)]

### Label data ###

Which patients have survived?

In [9]:
y = pd.read_csv('labels_3yr.csv')
y.head()

Unnamed: 0,patient_id,Label
0,TCGA-05-4249,1
1,TCGA-05-4389,1
2,TCGA-05-4390,1
3,TCGA-05-4398,1
4,TCGA-05-4415,0


In [10]:
y.Label.value_counts()

1    119
0     90
Name: Label, dtype: int64

In [11]:
patient_id = y['patient_id']
exp_file = y.merge(yr_3_exp, on='patient_id')
exp_file = exp_file.drop(['patient_id'],axis = 1)

In [12]:
exp_file.shape

(209, 101)

### Add metadata

In [14]:
# Add patient_id back
exp_file['patient_id'] = patient_id

# Add metadata to meta_file
metadata_final = pd.read_csv('./metadata_final_no_os.csv')
meta_file = metadata_final.merge(exp_file, how='right', on='patient_id')

In [15]:
meta_file

Unnamed: 0,patient_id,age_at_initial_pathologic_diagnosis,gender_FEMALE,gender_MALE,tumor_stage_Early,tumor_stage_Late,tumor_stage_N/A,is_smoker_False,is_smoker_True,Label,...,ABAT|18,CDHR5|53841,CIDEC|63924,UPK1B|7348,LDLRAD3|143458,HNF4A|3172,WFDC12|128488,SLC2A1|6513,IL22RA1|58985,C18orf54|162681
0,TCGA-05-4249,67,0,1,1,0,0,0,1,1,...,8.906139,1.383939,0.464564,0.000000,5.474572,1.962586,0.000000,10.577164,3.369704,5.575424
1,TCGA-05-4389,70,0,1,1,0,0,0,1,1,...,7.908465,6.276312,1.270708,1.004825,6.959886,8.220693,1.004825,11.612035,7.404767,6.124988
2,TCGA-05-4390,58,1,0,1,0,0,0,1,1,...,9.087121,0.712552,0.000000,0.712552,5.123070,0.712552,5.149272,11.945268,5.494041,8.024037
3,TCGA-05-4398,47,1,0,0,1,0,0,1,1,...,8.183343,1.454123,0.728399,0.283922,6.172285,2.869299,0.283922,13.400613,5.629336,6.535726
4,TCGA-05-4415,57,0,1,0,1,0,0,1,0,...,5.005000,4.087463,2.321928,9.919029,9.721289,3.429991,0.000000,14.361128,7.144087,7.442023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,TCGA-MP-A5C7,76,1,0,1,0,0,0,1,1,...,11.312883,3.994516,0.562865,0.000000,3.956456,3.398432,0.500802,9.146235,3.280155,4.496910
205,TCGA-NJ-A4YF,50,1,0,1,0,0,0,1,1,...,5.559874,1.497025,0.733094,1.147372,5.226971,0.000000,0.000000,11.419834,4.855317,4.309867
206,TCGA-NJ-A4YG,65,0,1,1,0,0,0,1,1,...,9.474034,0.657549,0.657549,0.000000,4.955271,4.722696,2.158402,10.000944,2.987030,4.722696
207,TCGA-NJ-A4YQ,69,1,0,1,0,0,0,1,1,...,9.036635,1.527671,0.702835,0.000000,4.870296,1.173447,0.702835,12.011598,4.263380,6.591223


In [16]:
def format_columns(df):
    '''
    Description: This function aims to format all columns in a given dataset.
                 It removes special characters, replaces spaces with _, \
                 and converts to lowercase.
    Parameters: df, the DataFrame to format.
    '''
    import re
    
    df.columns = [re.sub('[^A-Za-z0-9 _]+', '', c).replace(' ', '_').lower() \
                    for c in df.columns]
    return df

In [17]:
# Make all columns lowercase and regex
meta_file = format_columns(meta_file)

### Permutations of Metadata for Feature Selection

In [18]:
meta_file

Unnamed: 0,patient_id,age_at_initial_pathologic_diagnosis,gender_female,gender_male,tumor_stage_early,tumor_stage_late,tumor_stage_na,is_smoker_false,is_smoker_true,label,...,abat18,cdhr553841,cidec63924,upk1b7348,ldlrad3143458,hnf4a3172,wfdc12128488,slc2a16513,il22ra158985,c18orf54162681
0,TCGA-05-4249,67,0,1,1,0,0,0,1,1,...,8.906139,1.383939,0.464564,0.000000,5.474572,1.962586,0.000000,10.577164,3.369704,5.575424
1,TCGA-05-4389,70,0,1,1,0,0,0,1,1,...,7.908465,6.276312,1.270708,1.004825,6.959886,8.220693,1.004825,11.612035,7.404767,6.124988
2,TCGA-05-4390,58,1,0,1,0,0,0,1,1,...,9.087121,0.712552,0.000000,0.712552,5.123070,0.712552,5.149272,11.945268,5.494041,8.024037
3,TCGA-05-4398,47,1,0,0,1,0,0,1,1,...,8.183343,1.454123,0.728399,0.283922,6.172285,2.869299,0.283922,13.400613,5.629336,6.535726
4,TCGA-05-4415,57,0,1,0,1,0,0,1,0,...,5.005000,4.087463,2.321928,9.919029,9.721289,3.429991,0.000000,14.361128,7.144087,7.442023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,TCGA-MP-A5C7,76,1,0,1,0,0,0,1,1,...,11.312883,3.994516,0.562865,0.000000,3.956456,3.398432,0.500802,9.146235,3.280155,4.496910
205,TCGA-NJ-A4YF,50,1,0,1,0,0,0,1,1,...,5.559874,1.497025,0.733094,1.147372,5.226971,0.000000,0.000000,11.419834,4.855317,4.309867
206,TCGA-NJ-A4YG,65,0,1,1,0,0,0,1,1,...,9.474034,0.657549,0.657549,0.000000,4.955271,4.722696,2.158402,10.000944,2.987030,4.722696
207,TCGA-NJ-A4YQ,69,1,0,1,0,0,0,1,1,...,9.036635,1.527671,0.702835,0.000000,4.870296,1.173447,0.702835,12.011598,4.263380,6.591223


In [21]:
count_F_Live = ((meta_file['gender_female'] == 1) & (meta_file['label'] == 1)).sum()
print(count_F_Live)
count_F_dead = ((meta_file['gender_female'] == 1) & (meta_file['label'] == 0)).sum()
print(count_F_dead)
F = meta_file.gender_female.value_counts()
print(F)

66
50
1    116
0     93
Name: gender_female, dtype: int64


In [22]:
count_M_Live = ((meta_file['gender_male'] == 1) & (meta_file['label'] == 1)).sum()
print(count_M_Live)
count_M_dead = ((meta_file['gender_male'] == 1) & (meta_file['label'] == 0)).sum()
print(count_M_dead)
M = meta_file.gender_male.value_counts()
print(M)

53
40
0    116
1     93
Name: gender_male, dtype: int64


In [23]:
count_early_live = ((meta_file['tumor_stage_early'] == 1) & (meta_file['label'] == 1)).sum()
print(count_early_live)
count_early_dead = ((meta_file['tumor_stage_early'] == 1) & (meta_file['label'] == 0)).sum()
print(count_early_dead)
early = meta_file.tumor_stage_early.value_counts()
print(early)

99
58
1    157
0     52
Name: tumor_stage_early, dtype: int64


In [24]:
count_late_live = ((meta_file['tumor_stage_late'] == 1) & (meta_file['label'] == 1)).sum()
print(count_late_live)
count_late_dead = ((meta_file['tumor_stage_late'] == 1) & (meta_file['label'] == 0)).sum()
print(count_late_dead)
late = meta_file.tumor_stage_late.value_counts()
print(late)

18
31
0    160
1     49
Name: tumor_stage_late, dtype: int64


In [25]:
count_smoker_live = ((meta_file['is_smoker_true'] == 1) & (meta_file['label'] == 1)).sum()
print(count_smoker_live)
count_smoker_dead = ((meta_file['is_smoker_true'] == 1) & (meta_file['label'] == 0)).sum()
print(count_smoker_dead)
smoker = meta_file.is_smoker_true.value_counts()
print(smoker)

78
56
1    134
0     75
Name: is_smoker_true, dtype: int64


In [26]:
count_nonsmoker_live = ((meta_file['is_smoker_false'] == 1) & (meta_file['label'] == 1)).sum()
print(count_nonsmoker_live)
count_nonsmoker_dead = ((meta_file['is_smoker_false'] == 1) & (meta_file['label'] == 0)).sum()
print(count_nonsmoker_dead)
nonsmoker = meta_file.is_smoker_false.value_counts()
print(nonsmoker)

41
34
0    134
1     75
Name: is_smoker_false, dtype: int64


In [27]:
count_under65_live = ((meta_file['age_at_initial_pathologic_diagnosis'] < 65) & (meta_file['label'] == 1)).sum()
print(count_under65_live)
count_under65_dead = ((meta_file['age_at_initial_pathologic_diagnosis'] < 65) & (meta_file['label'] == 0)).sum()
print(count_under65_dead)
under65 = (meta_file['age_at_initial_pathologic_diagnosis'] < 65).sum()
print(under65)

49
44
93


In [28]:
count_over65_live = ((meta_file['age_at_initial_pathologic_diagnosis'] >= 65) & (meta_file['label'] == 1)).sum()
print(count_over65_live)
count_over65_dead = ((meta_file['age_at_initial_pathologic_diagnosis'] >= 65) & (meta_file['label'] == 0)).sum()
print(count_over65_dead)
over65 = (meta_file['age_at_initial_pathologic_diagnosis'] >= 65).sum()
print(over65)

70
46
116
