In [1]:
import argparse
# Run configuration, expressed as an argparse parser so the same options
# could be supplied as command-line flags.
parser = argparse.ArgumentParser()
parser.add_argument('--drug', default='folfox', type=str)
parser.add_argument('--outcome', default='OS', type=str)
parser.add_argument('--data_type', default='comb', type=str)

# Parse an explicitly empty argument list so that only the defaults above
# are used, regardless of what is in sys.argv.
args = parser.parse_args([])


In [2]:

# Unpack the parsed options into plain module-level names used by later cells.
drug, outcome, data_type = args.drug, args.outcome, args.data_type

import datetime
# Date stamp (MMDDYY) used to tag output folders and file names.
today = datetime.date.today()
today_str = '{:%m%d%y}'.format(today)


from sklearn.metrics import average_precision_score, precision_recall_curve, roc_auc_score, roc_curve, auc
from math import sqrt


In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import cross_val_score, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from tensorflow import keras
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from skopt import BayesSearchCV


2023-06-20 12:39:55.270729: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:

# Load the per-drug feature table. The first CSV column is read as the index
# and then turned back into a regular column named 'id'; rows with a missing
# value for the selected outcome are discarded.
data = (
    pd.read_csv('../../data/crc_{}_mut_cna_fus_clin.csv'.format(drug), index_col=0)
    .reset_index(drop=False, names=['id'])
    .dropna(subset=[outcome])
)
data.head()


Unnamed: 0,id,id_institution,id_drugs_list,OS,id_tt_os_g_mos,PFS,id_tt_pfs_m_g_mos,mut_CDK4,mut_CCND3,mut_CDH1,...,clin_ca_first_dmets1,clin_ca_crc_td,clin_ca_crc_crm,clin_ca_crc_peri_inv,clin_crc_type,OS_time,clin_Histology Category,clin_Histology,clin_Derived Grade or Differentiation of Tumor,clin_CEA
0,GENIE-DFCI-000233,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin",0,100.986842,0.0,31.578947,0,0,0,...,5,2,2,0,2,3303,0,0,1,1.0
1,GENIE-DFCI-000247,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin",1,35.789474,0.0,3.914474,0,0,0,...,0,2,0,0,2,1163,0,0,0,4.9
2,GENIE-DFCI-000306,DFCI,"bev, Fluorouracil, Leucovorin Calcium, Oxalipl...",0,38.125,0.0,12.960526,0,0,0,...,1,2,2,0,3,1230,0,0,1,0.7
3,GENIE-DFCI-000738,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin",0,51.578947,,,0,0,0,...,5,0,2,0,3,1623,0,1,1,1.8
4,GENIE-DFCI-000924,DFCI,"Fluorouracil, Leucovorin Calcium, Oxaliplatin",0,65.361842,,,0,0,0,...,5,0,2,1,0,2039,0,0,1,0.6


In [5]:
# Outer cross-validation: five shuffled, stratified, group-aware folds with a
# fixed seed.
skf = StratifiedGroupKFold(n_splits=5, random_state=1, shuffle=True)

# Feature matrix: every column whose name contains one of the modality tags.
X = data[[
    name for name in data.columns
    if any(tag in name for tag in ('mut_', 'cna_', 'clin_', 'fus_'))
]]

# Target vector and the grouping key handed to the splitter.
y = data[outcome]
groups = data['id']


In [6]:
# Outer-fold evaluation of an XGBoost classifier.
#
# For every outer split produced by `skf`:
#   1. test rows with clin_stage_dx_iv == 0 are moved into the training set and
#      replaced by an equally sized random sample of training rows with
#      clin_stage_dx_iv == 1, so the test set only holds rows flagged == 1;
#   2. hyper-parameters are tuned on the training set with BayesSearchCV
#      (3-fold CV, ROC-AUC);
#   3. the refit best estimator is scored on the test set (AUROC / AUPRC).
#
# Outputs:
#   * ../../results/hp_search/results_xgb_<drug>_<outcome>_<data_type>_<date>.csv
#     - search results of ALL folds (distinguished by a `fold` column);
#   * ../../results/runs/<date>/results_xgb_<drug>_<outcome>_<data_type>_<date>.csv
#     - one row of summary metrics per outer fold.

RANDOM_STATE = 1                      # same seed as the outer splitter
STAGE_COL = 'clin_stage_dx_iv'
FEATURE_TAGS = ('mut_', 'cna_', 'clin_', 'fus_')

# Columns fed to the model. This does not change between folds, so it is
# computed once. The stage column is still needed for the swap below, which is
# why the selection is applied only after the swap.
if data_type != 'comb':
    feature_cols = [col for col in data.columns if '{}_'.format(data_type) in col]
else:
    feature_cols = [col for col in data.columns if any(tag in col for tag in FEATURE_TAGS)]

# Hyper-parameter search space (each list is a set of discrete candidates).
param_dist = {'n_estimators': [20, 50, 100, 200, 400],
              'learning_rate': [0.03, 0.05, 0.075, 0.1, 0.3, 0.5],
              'subsample': [0.4, 0.6, 1.0],
              'max_depth': [6, 8, 12, 20],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'min_child_weight': [2, 4, 6],
              'reg_alpha': [0, 0.5, 1, 5],
              'reg_lambda': [0, 0.5, 1, 5],
              }

# Create the output locations up front so a missing directory is detected
# before any model is fitted rather than after.
hp_path = '../../results/hp_search/results_xgb_{}_{}_{}_{}.csv'.format(drug, outcome, data_type, today_str)
out_folder = '../../results/runs/{}'.format(today_str)
os.makedirs(os.path.dirname(hp_path), exist_ok=True)
os.makedirs(out_folder, exist_ok=True)

res_df = pd.DataFrame(columns=['fold', 'val_auroc_mean', 'test_auroc_mean', 'test_auprc_mean'])
hp_frames = []  # per-fold search results, accumulated so no fold is overwritten

for fold_count, (train_index, test_index) in enumerate(skf.split(X, y, groups)):
    # The splitter yields POSITIONS, so they must go through .iloc.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]

    # Move every test row with STAGE_COL == 0 into the training set ...
    moved_to_train = X_test[X_test[STAGE_COL] == 0]
    X_train = pd.concat([X_train, moved_to_train])
    X_test = X_test[X_test[STAGE_COL] == 1]

    # ... and back-fill the test set with the same number of training rows
    # that have STAGE_COL == 1 (seeded draw, without replacement).
    moved_to_test = X_train[X_train[STAGE_COL] == 1].sample(n=len(moved_to_train),
                                                            random_state=RANDOM_STATE)
    X_test = pd.concat([X_test, moved_to_test])
    X_train = X_train.drop(moved_to_test.index)

    # Derive the targets from the final row labels so X and y stay aligned by
    # construction, whatever values the stage column holds.
    y_train = y.loc[X_train.index]
    y_test = y.loc[X_test.index]
    assert X_train.index.intersection(X_test.index).empty, 'train/test rows overlap'

    X_train = X_train[feature_cols]
    X_test = X_test[feature_cols]

    # NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled
    # XGBoost build, and both it and use_label_encoder look deprecated in
    # newer XGBoost releases - confirm against the installed version.
    clf_xgb = XGBClassifier(tree_method='gpu_hist',
                            use_label_encoder=False,
                            random_state=RANDOM_STATE)

    bayes_xgb = BayesSearchCV(clf_xgb,
                              param_dist,
                              cv=3,
                              n_iter=10,
                              scoring='roc_auc',
                              error_score=0,
                              verbose=0,
                              n_jobs=-1,
                              random_state=RANDOM_STATE)  # reproducible search
    print('Fitting model...')
    bayes_xgb.fit(X_train, y_train)

    # Persist the search results of all folds seen so far. Rewriting the
    # cumulative table each fold keeps earlier folds if a later one fails.
    results = pd.DataFrame(bayes_xgb.cv_results_)
    hp_frames.append(results.sort_values(by='rank_test_score').assign(fold=fold_count))
    pd.concat(hp_frames).to_csv(hp_path)

    best_xgb = bayes_xgb.best_estimator_

    # Probability of the positive class on the test set.
    y_pred = best_xgb.predict_proba(X_test)[:, 1]
    test_auroc_mean = roc_auc_score(y_test, y_pred)
    test_auprc_mean = average_precision_score(y_test, y_pred)

    # Cross-validated ROC-AUC of the selected configuration, with +/-1 and
    # +/-2 standard-deviation bands rendered as 'low-high' strings. The bands
    # are computed for inspection only; they are not written to res_df.
    val_auroc = bayes_xgb.best_score_
    val_auroc_mean = val_auroc
    val_std = results['std_test_score'][bayes_xgb.best_index_]
    val_auroc_std = str(val_auroc - val_std) + '-' + str(val_auroc + val_std)
    val_auroc_ci = str(val_auroc - 2*val_std) + '-' + str(val_auroc + 2*val_std)

    res_df.loc[fold_count] = [fold_count, val_auroc_mean, test_auroc_mean, test_auprc_mean]

    # Drop the references to the fitted search object and model before the
    # next fold starts.
    del bayes_xgb
    del best_xgb

# Summary of the validation AUROC across the outer folds.
ave_val_auroc = res_df['val_auroc_mean'].mean()
variance = res_df['val_auroc_mean'].var()

res_df.to_csv('{}/results_xgb_{}_{}_{}_{}.csv'.format(out_folder, drug, outcome, data_type, today_str))


Fitting model...




Fitting model...




Fitting model...




Fitting model...




Fitting model...


