In [None]:
# %%
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--drug', type=str, default='ib')
parser.add_argument('--outcome', type=str, default='OS')

args = parser.parse_args()

drug = args.drug
outcome = args.outcome

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [13]:
drug_list = ['ib', 'vegf', 'egfr']
out_list = ['PFS', 'OS']

In [14]:
res_df = pd.DataFrame()
for drug in drug_list:
    for outcome in out_list:
        data = pd.read_csv('../data/crc_{}_mut_cna_clin.csv'.format(drug), index_col=0)
        y_var = outcome
        data = data.dropna(subset=[y_var])
        #training data is id_institution == 'DFCI' or 'MSKCC', test is 'VICC'
        train = data[data['id_institution'].isin(['DFCI', 'MSKCC'])]
        test = data[data['id_institution'] == 'VICC']

        dtypes = ['mut', 'cna', 'clin', 'comb']

        train_dfs = {}
        test_dfs = {}

        for dtype in dtypes:
            if dtype != 'comb':
                train_dfs[dtype] = train[[col for col in data.columns if '{}_'.format(dtype) in col]]
                test_dfs[dtype] = test[[col for col in data.columns if '{}_'.format(dtype) in col]]
            else:
                train_dfs[dtype] = train[[col for col in data.columns if 'mut_' in col or 'cna_' in col or 'clin_' in col]]
                test_dfs[dtype] = test[[col for col in data.columns if 'mut_' in col or 'cna_' in col or 'clin_' in col]]

        
        
        for dtype in dtypes:
            rf = RandomForestClassifier(n_estimators=1000)
            X = train_dfs[dtype]
            y = train[y_var]
            rf.fit(X, y)
            auroc = roc_auc_score(test[y_var], rf.predict_proba(test_dfs[dtype])[:,1])
            #add the results to the dataframe
            # res_df = pd.concat(res_df, pd.DataFrame({'drug': drug, 'outcome': outcome, 'dtype': dtype, 'auroc': auroc}))
            #fix this error -  ValueError: If using all scalar values, you must pass an index
            res_df = pd.concat([res_df, pd.DataFrame({'drug': [drug], 'outcome': [outcome], 'dtype': [dtype], 'auroc': [auroc]})])


In [15]:
res_df

Unnamed: 0,drug,outcome,dtype,auroc
0,ib,PFS,mut,0.208333
0,ib,PFS,cna,0.598958
0,ib,PFS,clin,0.322917
0,ib,PFS,comb,0.416667
0,ib,OS,mut,0.718182
0,ib,OS,cna,0.672727
0,ib,OS,clin,0.618182
0,ib,OS,comb,0.681818
0,vegf,PFS,mut,0.5
0,vegf,PFS,cna,0.487731


In [16]:
#find the top performing model overall
res_df.groupby(['drug', 'outcome']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,dtype,auroc
drug,outcome,Unnamed: 2_level_1,Unnamed: 3_level_1
egfr,OS,mut,0.576149
egfr,PFS,mut,0.68125
ib,OS,mut,0.718182
ib,PFS,mut,0.598958
vegf,OS,mut,0.663441
vegf,PFS,mut,0.5


In [None]:
#get teh 

In [3]:
data = pd.read_csv('../data/crc_vegf_mut_cna_clin.csv', index_col=0)
data.head()

Unnamed: 0_level_0,institution,drugs_list,OS,PFS,mut_CDK4,mut_CCND3,mut_CDH1,mut_CDK8,mut_GNAS,mut_PRKAR1A,...,clin_ca_tx_pre_path_stage,clin_ca_first_dmets1,clin_ca_crc_td,clin_ca_crc_crm,clin_ca_crc_peri_inv,clin_crc_type,clin_Histology Category,clin_Histology,clin_Derived Grade or Differentiation of Tumor,clin_CEA
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GENIE-DFCI-000147,DFCI,"Bevacizumab, Fluorouracil, Irinotecan Hydrochl...",1,1.0,0,0,0,0,0,0,...,0,5,0,2,0,2,0,0,1,2.8
GENIE-DFCI-000233,DFCI,"Bevacizumab, Fluorouracil, Irinotecan Hydrochl...",1,0.0,0,0,0,0,0,0,...,1,5,2,2,0,2,0,0,1,1.0
GENIE-DFCI-000306,DFCI,"Bevacizumab, Fluorouracil, Leucovorin Calcium,...",1,1.0,0,0,0,0,0,0,...,0,1,2,2,0,3,0,0,1,0.7
GENIE-DFCI-000738,DFCI,"Bevacizumab, Fluorouracil, Irinotecan Hydrochl...",0,0.0,0,0,0,0,0,0,...,0,5,0,2,0,3,0,1,1,1.8
GENIE-DFCI-000946,DFCI,"Bevacizumab, Fluorouracil",1,0.0,0,0,0,0,0,0,...,0,5,2,3,1,0,0,0,1,5.5


In [4]:
data_mut = data[[col for col in data.columns if 'mut_' in col]]
data_cna = data[[col for col in data.columns if 'cna_' in col]]
data_clin = data[[col for col in data.columns if 'clin_' in col]]
os = data['OS']
pfs = data['PFS']
print(data.shape, data_mut.shape, data_cna.shape, data_clin.shape)

(922, 473) (922, 224) (922, 224) (922, 21)


In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
#get top 20 feature importances and plot as a horizontal bar chart
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
import matplotlib.pyplot as plt
plt.figure(figsize=(20,10))
plt.title("Feature importances")
plt.barh(range(20), importances[indices][:20], color="r", align="center")
plt.yticks(range(20), X.columns[indices][:20])
plt.ylim([-1, 20])
plt.savefig('../results/feature_importances.png')
