In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [38]:
def _load_outcome_table(csv_path):
    """Read one per-regimen outcome CSV and normalise its column names.

    The first CSV column becomes the index. The status columns
    ``os_g_status`` / ``pfs_m_g_status`` are renamed to ``OS`` / ``PFS``;
    every other column receives an ``id_`` prefix.
    """
    table = pd.read_csv(csv_path, index_col=0)
    table = table.rename(columns={'os_g_status': 'OS', 'pfs_m_g_status': 'PFS'})
    # prefix everything except the two outcome status columns
    table.columns = [name if name in ('PFS', 'OS') else 'id_' + name for name in table.columns]
    return table


egfr_out = _load_outcome_table('../data/crc_egfr_out.csv')
folfox_out = _load_outcome_table('../data/crc_folfox_out.csv')
folfiri_out = _load_outcome_table('../data/crc_folfiri_out.csv')
ib_out = _load_outcome_table('../data/crc_ib_out.csv')
tri_out = _load_outcome_table('../data/crc_tri_out.csv')

# preview the EGFR outcome table
egfr_out.head()

Unnamed: 0,id_record_id,id_institution,id_drugs_list,OS,id_tt_os_g_mos,PFS,id_tt_pfs_m_g_mos,id_sample_id
0,GENIE-DFCI-000971,DFCI,"Bevacizumab, Cetuximab, Irinotecan Hydrochloride",1,11.546053,1.0,5.723684,GENIE-DFCI-000971-10958
1,GENIE-DFCI-001292,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin,...",1,10.493421,1.0,4.144737,GENIE-DFCI-001292-7406
2,GENIE-DFCI-001463,DFCI,Cetuximab,1,9.703947,0.0,5.756579,GENIE-DFCI-001463-9650
3,GENIE-DFCI-001463,DFCI,"Cetuximab, Irinotecan Hydrochloride",1,3.947368,1.0,2.927632,GENIE-DFCI-001463-9650
4,GENIE-DFCI-002507,DFCI,"Fluorouracil, Irinotecan Hydrochloride, Leucov...",0,35.427632,1.0,5.953947,GENIE-DFCI-002507-4815


In [39]:
def _read_prefixed(csv_path, prefix):
    """Load a CSV (first column as index) and prepend ``prefix`` to every column name."""
    return pd.read_csv(csv_path, index_col=0).add_prefix(prefix)


mut = _read_prefixed('../data/crc_mutoh_pertreat.csv', 'mut_')
cna = _read_prefixed('../data/crc_cna_pertreat.csv', 'cna_')
fusion = _read_prefixed('../data/crc_fusion.csv', 'fus_')

clin = pd.read_csv('../data/crc_clin_pub.csv', index_col=0)
# 'PFS', 'OS' and 'OS_time' keep their names; every other clinical column gets 'clin_'
clin.columns = [
    name if name in ('PFS', 'OS', 'OS_time') else 'clin_' + name
    for name in clin.columns
]
# drop the clinical table's own 'OS' / 'PFS' columns ('OS_time' is kept)
clin = clin.drop(columns=['OS', 'PFS'])

# Split the 'clin'-prefixed columns by cardinality: at most 10 distinct
# non-null values -> treated as categorical, otherwise as numeric.
_clin_cardinality = {
    col: clin[col].nunique() for col in clin.columns if col.startswith('clin')
}
cat_cols = [col for col, n_unique in _clin_cardinality.items() if n_unique <= 10]
num_cols = [col for col, n_unique in _clin_cardinality.items() if n_unique > 10]

In [40]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Replace each low-cardinality clinical column with integer codes.
# The same encoder object is re-fitted for every column.
le = LabelEncoder()
for cat_col in cat_cols:
    clin[cat_col] = le.fit_transform(clin[cat_col])

# Fill missing values in each high-cardinality clinical column with that
# column's median; the imputer expects a 2-D array, hence the reshape.
imp = SimpleImputer(strategy='median')
for num_col in num_cols:
    column_2d = clin[num_col].values.reshape(-1, 1)
    clin[num_col] = imp.fit_transform(column_2d)

In [41]:
# number of columns in the mutation, copy-number, fusion and clinical tables
print(*(table.shape[1] for table in (mut, cna, fusion, clin)))

224 224 257 22


In [42]:
# combined column count across the four feature tables
n_feature_columns = sum(table.shape[1] for table in (mut, cna, fusion, clin))
print('total columns:', n_feature_columns)

total columns: 727


In [43]:
# Re-index every outcome table by its 'id_sample_id' column.
egfr, ib, folfox, folfiri, tri = (
    outcomes.set_index('id_sample_id')
    for outcomes in (egfr_out, ib_out, folfox_out, folfiri_out, tri_out)
)


In [44]:
# Inner joins: keep only rows whose index is present in both the mutation
# and the copy-number tables.
egfr_mut = egfr.join(mut, how='inner')
egfr_mut_cna = egfr_mut.join(cna, how='inner')

# Left join: rows without a match in `fusion` are kept and their fusion
# columns (NaN after the join) are filled with 0.
egfr_mut_cna_fus = egfr_mut_cna.join(fusion, how='left')
fus_cols = list(filter(lambda name: name.startswith('fus'), egfr_mut_cna_fus.columns))
egfr_mut_cna_fus[fus_cols] = egfr_mut_cna_fus[fus_cols].fillna(0)

# report the table shape after each merge step
for merge_stage in (egfr_mut, egfr_mut_cna, egfr_mut_cna_fus):
    print(merge_stage.shape)

(359, 231)
(354, 455)
(354, 712)


In [45]:
# Inner joins: keep only rows whose index is present in both the mutation
# and the copy-number tables.
folfiri_mut = folfiri.join(mut, how='inner')
folfiri_mut_cna = folfiri_mut.join(cna, how='inner')

# Left join: rows without a match in `fusion` are kept and their fusion
# columns (NaN after the join) are filled with 0.
folfiri_mut_cna_fus = folfiri_mut_cna.join(fusion, how='left')
fus_cols = list(filter(lambda name: name.startswith('fus'), folfiri_mut_cna_fus.columns))
folfiri_mut_cna_fus[fus_cols] = folfiri_mut_cna_fus[fus_cols].fillna(0)

# report the table shape after each merge step
for merge_stage in (folfiri_mut, folfiri_mut_cna, folfiri_mut_cna_fus):
    print(merge_stage.shape)


(936, 231)
(909, 455)
(909, 712)


In [46]:
def _merge_genomics(outcomes):
    """Join one outcome table with the mutation, copy-number and fusion tables.

    `mut` and `cna` are inner-joined (rows missing from either are dropped);
    `fusion` is left-joined and the resulting NaNs in the 'fus'-prefixed
    columns are filled with 0.

    Returns the three intermediate frames (after mut, after cna, after fusion)
    and the list of fusion column names.
    """
    with_mut = outcomes.join(mut, how='inner')
    with_cna = with_mut.join(cna, how='inner')
    with_fus = with_cna.join(fusion, how='left')
    fusion_names = [name for name in with_fus.columns if name.startswith('fus')]
    with_fus[fusion_names] = with_fus[fusion_names].fillna(0)
    return with_mut, with_cna, with_fus, fusion_names


# Each print shows the shape after the mutation + copy-number joins,
# i.e. before the fusion columns are added.
tri_mut, tri_mut_cna, tri_mut_cna_fus, fus_cols = _merge_genomics(tri)
print(tri_mut_cna.shape)

ib_mut, ib_mut_cna, ib_mut_cna_fus, fus_cols = _merge_genomics(ib)
print(ib_mut_cna.shape)

folfox_mut, folfox_mut_cna, folfox_mut_cna_fus, fus_cols = _merge_genomics(folfox)
print(folfox_mut_cna.shape)

(148, 455)


(146, 455)
(1290, 455)


In [47]:
def _with_clinical(genomic):
    """Re-index a merged genomic table by 'id_record_id' and inner-join `clin` onto it."""
    return genomic.set_index('id_record_id').join(clin, how='inner')


egfr_mut_cna_clin_fus = _with_clinical(egfr_mut_cna_fus)
folfiri_mut_cna_clin_fus = _with_clinical(folfiri_mut_cna_fus)
tri_mut_cna_clin_fus = _with_clinical(tri_mut_cna_fus)
ib_mut_cna_clin_fus = _with_clinical(ib_mut_cna_fus)
folfox_mut_cna_clin_fus = _with_clinical(folfox_mut_cna_fus)

# final shapes, in the order: egfr, ib, folfox, folfiri, tri
print(*(
    merged.shape
    for merged in (
        egfr_mut_cna_clin_fus,
        ib_mut_cna_clin_fus,
        folfox_mut_cna_clin_fus,
        folfiri_mut_cna_clin_fus,
        tri_mut_cna_clin_fus,
    )
))

(354, 733) (146, 733) (1290, 733) (909, 733) (148, 733)


In [52]:
# how many rows of the merged EGFR table carry each value of the 'mut_KRAS' column
kras_flags = egfr_mut_cna_clin_fus['mut_KRAS']
kras_flags.value_counts()

mut_KRAS
0    321
1     33
Name: count, dtype: int64

In [30]:
# preview the first five rows of the fully merged EGFR table
egfr_mut_cna_clin_fus.head(n=5)

Unnamed: 0,id_institution,id_drugs_list,OS,id_tt_os_g_mos,PFS,id_tt_pfs_m_g_mos,mut_CDK4,mut_CCND3,mut_CDH1,mut_CDK8,...,clin_ca_first_dmets1,clin_ca_crc_td,clin_ca_crc_crm,clin_ca_crc_peri_inv,clin_crc_type,OS_time,clin_Histology Category,clin_Histology,clin_Derived Grade or Differentiation of Tumor,clin_CEA
GENIE-DFCI-000971,DFCI,"Bevacizumab, Cetuximab, Irinotecan Hydrochloride",1,11.546053,1.0,5.723684,0,0,0,0,...,0,1,0,0,1,814,0,0,4,9.3
GENIE-DFCI-001292,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin,...",1,10.493421,1.0,4.144737,0,0,0,0,...,5,2,3,0,2,637,0,0,4,20.0
GENIE-DFCI-001463,DFCI,Cetuximab,1,9.703947,0.0,5.756579,0,0,0,0,...,5,2,2,0,0,5468,0,0,1,0.5
GENIE-DFCI-001463,DFCI,"Cetuximab, Irinotecan Hydrochloride",1,3.947368,1.0,2.927632,0,0,0,0,...,5,2,2,0,0,5468,0,0,1,0.5
GENIE-DFCI-002507,DFCI,"Fluorouracil, Irinotecan Hydrochloride, Leucov...",0,35.427632,1.0,5.953947,0,0,0,0,...,0,2,2,0,0,2288,0,0,3,83.4


In [31]:
# preview the first five rows of the fully merged `ib` table
ib_mut_cna_clin_fus.head(n=5)

Unnamed: 0,id_institution,id_drugs_list,OS,PFS,id_tt_os_g_mos,id_tt_pfs_m_g_mos,mut_CDK4,mut_CCND3,mut_CDH1,mut_CDK8,...,clin_ca_first_dmets1,clin_ca_crc_td,clin_ca_crc_crm,clin_ca_crc_peri_inv,clin_crc_type,OS_time,clin_Histology Category,clin_Histology,clin_Derived Grade or Differentiation of Tumor,clin_CEA
GENIE-DFCI-000971,DFCI,Regorafenib,1,1.0,0.625,0.361842,0,0,0,0,...,0,1,0,0,1,814,0,0,4,9.3
GENIE-DFCI-001038,DFCI,Regorafenib,0,1.0,23.223684,3.157895,0,0,0,0,...,0,2,0,0,0,1465,0,0,4,100.0
GENIE-DFCI-002499,DFCI,Cabozantinib Smalate,1,0.0,8.881579,0.921053,0,0,0,0,...,1,1,1,1,0,1343,0,0,1,100.0
GENIE-DFCI-002507,DFCI,Regorafenib,1,1.0,4.802632,2.697368,0,0,0,0,...,0,2,2,0,0,2288,0,0,3,83.4
GENIE-DFCI-002561,DFCI,Regorafenib,1,1.0,10.789474,4.605263,0,0,0,0,...,0,1,2,1,0,2140,0,0,1,8.4


In [32]:
# preview the first five rows of the fully merged FOLFOX table
folfox_mut_cna_clin_fus.head(n=5)

Unnamed: 0,id_institution,id_drugs_list,OS,id_tt_os_g_mos,PFS,id_tt_pfs_m_g_mos,mut_CDK4,mut_CCND3,mut_CDH1,mut_CDK8,...,clin_ca_first_dmets1,clin_ca_crc_td,clin_ca_crc_crm,clin_ca_crc_peri_inv,clin_crc_type,OS_time,clin_Histology Category,clin_Histology,clin_Derived Grade or Differentiation of Tumor,clin_CEA
GENIE-DFCI-000233,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin",0,100.986842,0.0,31.578947,0,0,0,0,...,5,2,2,0,2,3303,0,0,1,1.0
GENIE-DFCI-000247,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin",1,35.789474,0.0,3.914474,0,0,0,0,...,0,2,0,0,2,1163,0,0,0,4.9
GENIE-DFCI-000306,DFCI,"bev, Fluorouracil, Leucovorin Calcium, Oxalipl...",0,38.125,0.0,12.960526,0,0,0,0,...,1,2,2,0,3,1230,0,0,1,0.7
GENIE-DFCI-000738,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin",0,51.578947,,,0,0,0,0,...,5,0,2,0,3,1623,0,1,1,1.8
GENIE-DFCI-000924,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin",0,65.361842,,,0,0,0,0,...,5,0,2,1,0,2039,0,0,1,0.6


In [33]:
# Persist every merged table (index included) next to the input data.
merged_outputs = {
    '../data/crc_egfr_mut_cna_fus_clin.csv': egfr_mut_cna_clin_fus,
    '../data/crc_folfiri_mut_cna_fus_clin.csv': folfiri_mut_cna_clin_fus,
    '../data/crc_tri_mut_cna_fus_clin.csv': tri_mut_cna_clin_fus,
    '../data/crc_ib_mut_cna_fus_clin.csv': ib_mut_cna_clin_fus,
    '../data/crc_folfox_mut_cna_fus_clin.csv': folfox_mut_cna_clin_fus,
}
for csv_path, merged in merged_outputs.items():
    merged.to_csv(csv_path)

In [61]:
# For every regimen, list the ten features with the lowest (most negative)
# correlation against the outcome column.
outcome = 'PFS'
feature_tags = ('mut_', 'cna_', 'clin_', 'fus_')
for drug in ['egfr', 'folfiri', 'tri', 'ib', 'folfox']:
    data = pd.read_csv('../data/crc_{}_mut_cna_fus_clin.csv'.format(drug))
    # a column is a feature when its name contains any of the feature tags
    feature_names = [col for col in data.columns if any(tag in col for tag in feature_tags)]
    X = data[feature_names]
    y = data[outcome]
    # correlation of each feature with the outcome, sorted ascending
    correlations = X.corrwith(y).sort_values()
    print(drug)
    print(correlations.head(10))

egfr
cna_NBN                     -0.174800
cna_FANCA                   -0.171731
clin_ca_crc_crm             -0.171293
mut_TP53                    -0.148662
cna_SRC                     -0.132919
cna_CDH1                    -0.127540
cna_BRAF                    -0.118618
cna_PALB2                   -0.114993
clin_ca_tx_pre_path_stage   -0.112702
cna_CREBBP                  -0.110553
dtype: float64
folfiri
clin_ca_crc_crm   -0.153231
mut_PTCH1         -0.107211
mut_IGF2          -0.078722
mut_DNMT3A        -0.078310
cna_NBN           -0.073271
cna_SMAD2         -0.070084
mut_ERCC2         -0.066617
cna_SRC           -0.066122
cna_CIC           -0.062343
mut_MCL1          -0.060092
dtype: float64
tri
mut_SMAD2                 -0.248602
mut_MLH1                  -0.202277
mut_XPO1                  -0.202277
mut_GNAS                  -0.202277
mut_CDKN1A                -0.202277
clin_Histology Category   -0.200237
mut_KIT                   -0.191606
cna_APC                   -0.182775
clin_

In [64]:
# Joint distribution of the OS and PFS status columns in the FOLFOX table.
# The merged FOLFOX file is loaded explicitly: previously this cell read
# `data`, which only held the FOLFOX table because 'folfox' was the last
# item of the correlation loop in an earlier cell (hidden kernel state).
folfox_merged = pd.read_csv('../data/crc_folfox_mut_cna_fus_clin.csv')
# normalize=True reports each cell as a fraction of all counted rows; rows
# with a missing value in either column are not counted (crosstab default).
pd.crosstab(folfox_merged['OS'], folfox_merged['PFS'], normalize=True)

PFS,0.0,1.0
OS,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.477858,0.088568
1,0.256437,0.177137
