In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from ipywidgets import IntProgress
from tqdm import tqdm_notebook as tqdm

# silences package warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
def prepare_df(df, target_var, cont_vars=[], cat_vars=[]):
    total_vars = cont_vars + cat_vars + [target_var]
    model_df = df[total_vars]
    cleaned_df = model_df.dropna(subset=total_vars)

    # turns categorical variables into dummy variables
    for var in cat_vars:
        temp_dummy = pd.get_dummies(cleaned_df[var], drop_first=True)
        cleaned_df = pd.concat([cleaned_df.drop([var], axis=1), temp_dummy], axis=1)

    # normalize the data
    for var in cont_vars:
        cleaned_df[var] = preprocessing.scale(cleaned_df[var])

    return cleaned_df

In [5]:
def xgbclassify(df, model, params=None, trainCV='none', target='txgot_binary', folds=5, iterations=100, print_id=1):
    
    feat_vars = [var for var in list(df.columns) if var != target]
    X = df[feat_vars].values
    y = df[target].values
    
    if trainCV=='grid':
        param_bins = {key: {i:0 for i in params[key]} for key in params.keys()}
    
    avg_train_auc = 0
    avg_test_auc = 0
    avg_train_acc = 0
    avg_test_acc = 0
    
    rskf = RepeatedStratifiedKFold(n_splits=folds, n_repeats=iterations)
    for train_index, test_index in tqdm(rskf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        train_auc, test_auc, train_acc, test_acc, best_params = xgbmodel(model, params, 
                                                                         X_train, X_test, 
                                                                         y_train, y_test, 
                                                                         trainCV, print_id)
        
        if trainCV=='grid':
            for key, value in best_params.items():
                param_bins[key][value] += 1
        
        avg_train_auc += train_auc
        avg_test_auc += test_auc
        avg_train_acc += train_acc
        avg_test_acc += test_acc
        
    avg_train_auc /= (folds * iterations)
    avg_test_auc /= (folds * iterations)
    avg_train_acc /= (folds * iterations)
    avg_test_acc /= (folds * iterations)
    
    print('Train vs. Test')
    print('Training AUC: {}'.format(round(avg_train_auc, 3)))
    print('Testing AUC: {}'.format(round(avg_test_auc, 3)))
    print()
    print('Training Accuracy: {}'.format(round(avg_train_acc, 3)))
    print('Testing Accuracy: {}'.format(round(avg_test_acc, 3)))
    
    if trainCV=='grid':
        print()
        for key, value in param_bins.items():
            print('{}: {}'.format(key, value))
    
    print()
    print('Feature Importance:')
    if not (trainCV is 'random' or trainCV is 'grid'):
        sorted_idx = np.argsort(model.feature_importances_)[::-1]
        for index in sorted_idx:
            print([feat_vars[index], model.feature_importances_[index]])
        
    return avg_train_auc, avg_test_auc    

In [6]:
def xgbmodel(model, params, X_train, X_test, y_train, y_test, trainCV, print_id):
    if trainCV is 'random':
        skf = StratifiedKFold(n_splits=5, shuffle = True, random_state=print_id)
        
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((y_train, y_test), axis=0)
        
        my_model = RandomizedSearchCV(model, param_distributions=params, n_iter=5, 
                           scoring='roc_auc', n_jobs=4, 
                           cv=skf.split(X, Y), verbose=3)
        
        my_model.fit(X, Y)
        best_params = my_model.best_params_
        # print(best_params)
        
    elif trainCV is 'grid':
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((y_train, y_test), axis=0)

        skf = StratifiedKFold(n_splits=5, shuffle = True)
        
        my_model = GridSearchCV(model, params, scoring='roc_auc', cv=skf.split(X,Y))
                
        my_model.fit(X, Y)
        best_params = my_model.best_params_
        # print(best_params)
        
    else:
        my_model = model
        
        my_model.fit(X_train, y_train)

#         eval_set=[(X_train, y_train), (X_test, y_test)]
#         my_model.fit(X_train, y_train, eval_set=eval_set, 
#                      eval_metric="auc", early_stopping_rounds=15, verbose=False)
        
        best_params = params
    
    train_pred = my_model.predict(X_train)
    test_pred = my_model.predict(X_test)
    
    fpr, tpr, _ = roc_curve(y_train, train_pred, pos_label=1)
    train_auc_score = auc(fpr, tpr)
    
    fpr, tpr, _ = roc_curve(y_test, test_pred, pos_label=1)
    test_auc_score = auc(fpr, tpr)
    
    train_auc = train_auc_score
    test_auc = test_auc_score
    
    train_acc = my_model.score(X_train, y_train)
    test_acc = my_model.score(X_test, y_test)
    
    test_metrics = confusion_matrix(y_test, test_pred).ravel()
    train_metrics = confusion_matrix(y_train, train_pred).ravel()

    return train_auc, test_auc, train_acc, test_acc, best_params

## Original Features

In [55]:
cat1 = ['gleason', 'UroRec_AS', 'UroRec_AS', 'UroRec_AS']
cat2 = ['white_binary', 'edu_binary', 'marry_binary']
cont1 = ['age', 'DVD_UroTalk_AS', 'DVD_UroTalk_surgery', 'DVD_UroTalk_rad']
topics = ['appt_topic', 'surgery_topic', 'radiation_topic', 'active_surveillance_topic']

In [58]:
topics_df = pd.read_csv('../../DataPlus/topics_dataframe.csv')

In [59]:
topics_df.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Tx3,Advice1,Anx11,Anx111,Anx112,Anx113,Anx12,...,Convo_1_lemmatized,Convo_1_corp,radiation_topic,active_surveillance_topic,appt_topic,surgery_topic,edu_binary,combined_race,white_binary,marry_binary
0,0,263,263,1.0,SR,,,0.0,0.0,0.0,...,"['interview_length', 'significant', 'saw', 'su...","[(0, 1), (1, 2), (2, 1), (3, 4), (4, 31), (5, ...",0.380425,0.013018,0.0,0.50084,No College Degree,white,White,Married
1,1,264,264,2.0,A,0.0,1.0,,0.0,,...,"['interview_length', 'significant', 'person', ...","[(2, 1), (4, 2), (7, 1), (8, 3), (12, 2), (16,...",0.107739,0.0,0.172553,0.718799,College Degree,black,Not White,Married
2,2,265,265,2.0,A,2.0,3.0,2.0,2.0,3.0,...,"['interview_length', 'significant', 'significa...","[(4, 1), (9, 1), (12, 1), (17, 1), (19, 2), (2...",0.173313,0.119528,0.189936,0.390042,No College Degree,white,White,Married


## Original Features with Topics

In [57]:
topics_df = prepare_df(topics_df, 'txgot_binary', cat1+cat2, cont1+topics)

KeyError: "['UroRec_AS' 'UroRec_AS' 'UroRec_AS' 'DVD_UroTalk_AS'\n 'DVD_UroTalk_surgery' 'DVD_UroTalk_rad'] not in index"

In [17]:
xgb2 = XGBClassifier()

In [12]:
xgbclassify(original_w_topics_df, xgb2, trainCV=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train vs. Test
Training AUC: 0.987
Testing AUC: 0.684

Training Accuracy: 0.994
Testing Accuracy: 0.798

Feature Importance:
['surgery_topic', 0.2462203]
['age', 0.18574513]
['active_surveillance_topic', 0.116630666]
['appt_topic', 0.10151188]
['radiation_topic', 0.09287257]
[7.0, 0.088552915]
['R', 0.051835854]
['SR', 0.03887689]
['No College Degree', 0.034557234]
['ASR', 0.025917927]
['Not Married', 0.017278617]
['S', 0.0]
['AS', 0.0]
['AR', 0.0]
['White', 0.0]


(0.9869839988525552, 0.6841071428571428)

## Grid Search

In [46]:
grid_params = {
    'min_child_weight': [1],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.5, 0.6],
    'colsample_bytree': [0.1, 0.2, 0.3],
    'max_depth': [1, 2],
    'n_estimators': [150, 175]
}

In [47]:
xgb4 = XGBClassifier(learning_rate=0.02, objective='binary:logistic',
                    silent=True, nthread=1)

In [48]:
xgbclassify(original_w_topics_df2, xgb4, params=grid_params, 
            trainCV='grid', iterations=40, print_id=3)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train vs. Test
Training AUC: 0.605
Testing AUC: 0.598

Training Accuracy: 0.936
Testing Accuracy: 0.936

min_child_weight: {1: 200}
gamma: {0: 108, 0.1: 43, 0.2: 49}
subsample: {0.5: 70, 0.6: 130}
colsample_bytree: {0.1: 85, 0.2: 71, 0.3: 44}
max_depth: {1: 115, 2: 85}
n_estimators: {150: 119, 175: 81}

Feature Importance:


(0.604767598967298, 0.5978273809523807)

In [51]:
xgb5=XGBClassifier(max_depth=1, gamma=0, min_child_weight=1, subsample=0.6,
                   colsample_bytree=0.1, n_estimators=150)
xgbclassify(original_w_topics_df2, xgb5, trainCV='none', folds=5, iterations=100)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train vs. Test
Training AUC: 0.715
Testing AUC: 0.641

Training Accuracy: 0.849
Testing Accuracy: 0.806

Feature Importance:
[7.0, 0.12631579]
['SR', 0.11578947]
['radiation_topic', 0.11578947]
['surgery_topic', 0.11578947]
['appt_topic', 0.094736844]
['age', 0.094736844]
['R', 0.08421053]
['ASR', 0.07368421]
['Not Married', 0.06315789]
['active_surveillance_topic', 0.06315789]
['No College Degree', 0.042105265]
['White', 0.010526316]
['S', 0.0]
['AS', 0.0]
['AR', 0.0]


(0.7150352983362013, 0.6409095238095239)

In [100]:
cat_vars = ['gleason', 'Advice1']
cont_vars = ['age', 'active_surveillance_topic', 'surgery_topic']

In [21]:
original_w_topics_df2 = prepare_df(topics_df, 'txgot_binary', cont_vars, cat_vars)

In [102]:
xgb5=XGBClassifier(max_depth=2, gamma=0.5, min_child_weight=1, subsample=0.6,
                   colsample_bytree=0.5, n_estimators=200)
xgbclassify(original_w_topics_df2, xgb5, trainCV='none', folds=5, iterations=100)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Train vs. Test
Training AUC: 0.9
Testing AUC: 0.72

Training Accuracy: 0.946
Testing Accuracy: 0.823

Feature Importance:
['surgery_topic', 0.325]
['age', 0.2375]
['active_surveillance_topic', 0.23125]
[7.0, 0.075]
['SR', 0.0625]
['R', 0.05625]
['ASR', 0.0125]
['S', 0.0]
['AS', 0.0]
['AR', 0.0]


(0.899512630522086, 0.7196071428571433)

In [None]:
grid_params = {
    'min_child_weight': [1],
    'gamma': [0.5, 0.55],
    'subsample': [0.5, 0.6],
    'colsample_bytree': [0.5, 0.55, 0.6],
    'max_depth': [2, 3],
    'n_estimators': [180, 200]
}

## Random Search

In [244]:
params = {
        'n_estimators': [50, 100, 150, 200],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [245]:
xgb3 = XGBClassifier(learning_rate=0.02, objective='binary:logistic',
                    silent=True, nthread=1)

In [246]:
xgbclassify(original_w_topics_df, xgb3, params=params, 
            trainCV='random', iterations=1, print_id=2)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    3.7s finished


{'subsample': 0.8, 'n_estimators': 50, 'min_child_weight': 1, 'max_depth': 4, 'gamma': 2, 'colsample_bytree': 0.6}
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    3.9s finished


{'subsample': 0.8, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 2, 'colsample_bytree': 0.8}
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    3.8s finished


{'subsample': 0.8, 'n_estimators': 150, 'min_child_weight': 1, 'max_depth': 4, 'gamma': 1, 'colsample_bytree': 0.6}
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    3.8s finished


{'subsample': 1.0, 'n_estimators': 50, 'min_child_weight': 1, 'max_depth': 4, 'gamma': 2, 'colsample_bytree': 0.6}
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    3.8s finished


{'subsample': 0.8, 'n_estimators': 50, 'min_child_weight': 1, 'max_depth': 4, 'gamma': 1.5, 'colsample_bytree': 0.6}
Train vs. Test
Training AUC: 0.777
Testing AUC: 0.769

Training Accuracy: 0.957
Testing Accuracy: 0.967

Feature Importance:


(0.7769279977051061, 0.7688095238095238)

ID's for Random CV Results: 1, 2

## Grid Search

In [50]:
grid_params = {
    'min_child_weight': [1, 5],
    'gamma': [0.5, 1],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'max_depth': [5, 4],
    'n_estimators': [175, 200, 225]
}

In [51]:
xgb4 = XGBClassifier(learning_rate=0.02, objective='binary:logistic',
                    silent=True, nthread=1)

In [52]:
xgbclassify(original_w_topics_df, xgb4, params=grid_params, 
            trainCV='grid', iterations=1, print_id=3)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Train vs. Test
Training AUC: 0.816
Testing AUC: 0.862

Training Accuracy: 0.968
Testing Accuracy: 0.967

min_child_weight: {1: 4, 5: 1}
gamma: {0.5: 3, 1: 2}
subsample: {0.6: 4, 0.8: 1}
colsample_bytree: {0.6: 2, 0.7: 1, 0.8: 2}
max_depth: {5: 1, 4: 4}
n_estimators: {175: 2, 200: 2, 225: 1}

Feature Importance:


(0.8160043029259898, 0.8619047619047621)

ID's for Grid CV Results: 3, 4, 5, 6

## Trying out Hyperparameters from Search

In [227]:
xgb5=XGBClassifier(max_depth=3, gamma=0.5, min_child_weight=1, subsample=0.6,
                   colsample_bytree=0.7, n_estimators=150)

In [228]:
xgbclassify(original_w_topics_df, xgb5, trainCV='none', iterations=100)

Train vs. Test
Training AUC: 0.952
Testing AUC: 0.682

Training Accuracy: 0.976
Testing Accuracy: 0.804

Feature Importance:
['surgery_topic', 0.23579545]
['active_surveillance_topic', 0.1903409]
['age', 0.16193181]
['radiation_topic', 0.11931818]
['appt_topic', 0.08238637]
[7.0, 0.0625]
['SR', 0.048295453]
['R', 0.045454547]
['ASR', 0.025568182]
['No College Degree', 0.025568182]
['Not Married', 0.0028409092]
['S', 0.0]
['AS', 0.0]
['AR', 0.0]
['White', 0.0]


(0.9521177423981624, 0.6820309523809523)

Don't go below 0.7 colsample_bytree