In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from ipywidgets import IntProgress
from tqdm import tqdm_notebook as tqdm

# silences package warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def prepare_df(df, target_var, cont_vars=[], cat_vars=[]):
    total_vars = cont_vars + cat_vars + [target_var]
    model_df = df[total_vars]
    cleaned_df = model_df.dropna(subset=total_vars)

    # turns categorical variables into dummy variables
    for var in cat_vars:
        temp_dummy = pd.get_dummies(cleaned_df[var], drop_first=True)
        cleaned_df = pd.concat([cleaned_df.drop([var], axis=1), temp_dummy], axis=1)

    # normalize the data
    for var in cont_vars:
        cleaned_df[var] = preprocessing.scale(cleaned_df[var])

    return cleaned_df

In [116]:
def xgbclassify(df, model, params=None, trainCV='none', target='txgot_binary', folds=5, iterations=100, print_id=1):
    
    feat_vars = [var for var in list(df.columns) if var != target]
    
    X = df[feat_vars].values
    y = df[target].values
    
    if trainCV=='grid':
        param_bins = {key: {i:0 for i in params[key]} for key in params.keys()}
    
    avg_train_auc = 0
    avg_test_auc = 0
    avg_train_acc = 0
    avg_test_acc = 0
    
    rskf = RepeatedStratifiedKFold(n_splits=folds, n_repeats=iterations)
    for train_index, test_index in tqdm(rskf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        train_auc, test_auc, train_acc, test_acc, best_params, feats = xgbmodel(model, params, 
                                                                         X_train, X_test, 
                                                                         y_train, y_test, 
                                                                         trainCV, print_id)
        
        if trainCV=='grid':
            for key, value in best_params.items():
                param_bins[key][value] += 1
        
        avg_train_auc += train_auc
        avg_test_auc += test_auc
        avg_train_acc += train_acc
        avg_test_acc += test_acc
        
    avg_train_auc /= (folds * iterations)
    avg_test_auc /= (folds * iterations)
    avg_train_acc /= (folds * iterations)
    avg_test_acc /= (folds * iterations)
    
    print('Train vs. Test')
    print('Training AUC: {}'.format(round(avg_train_auc, 3)))
    print('Testing AUC: {}'.format(round(avg_test_auc, 3)))
    print()
    print('Training Accuracy: {}'.format(round(avg_train_acc, 3)))
    print('Testing Accuracy: {}'.format(round(avg_test_acc, 3)))
    
    if trainCV=='grid':
        print()
        for key, value in param_bins.items():
            print('{}: {}'.format(key, value))
    
    print()
    print('Feature Importance:')
    if not (trainCV is 'random' or trainCV is 'grid'):
        print(feats)
        
        sorted_idx = np.argsort(feats)[::-1]
        print(sorted_idx)
        for index in sorted_idx:
            print([feat_vars[index], feats[index]])
        
    return avg_train_auc, avg_test_auc    

In [126]:
def xgbmodel(model, params, X_train, X_test, y_train, y_test, trainCV, print_id):
    if trainCV is 'random':
        skf = StratifiedKFold(n_splits=5, shuffle = True, random_state=print_id)
        
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((y_train, y_test), axis=0)
        
        my_model = RandomizedSearchCV(model, param_distributions=params, n_iter=5, 
                           scoring='roc_auc', n_jobs=4, 
                           cv=skf.split(X, Y), verbose=3)
        
        my_model.fit(X, Y)
        best_params = my_model.best_params_
        # print(best_params)
        
    elif trainCV is 'grid':
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((y_train, y_test), axis=0)

        skf = StratifiedKFold(n_splits=5, shuffle = True)
        
        my_model = GridSearchCV(model, params, scoring='roc_auc', cv=skf.split(X,Y))
                
        my_model.fit(X, Y)
        
        feats = None
        best_params = my_model.best_params_
        # print(best_params)
        
    else:
        my_model = model
        
        my_model.fit(X_train, y_train)

#         eval_set=[(X_train, y_train), (X_test, y_test)]
#         my_model.fit(X_train, y_train, eval_set=eval_set, 
#                      eval_metric="auc", early_stopping_rounds=15, verbose=False)
        
        feats = my_model.feature_importances_
        best_params = params
    
    train_pred = my_model.predict(X_train)
    test_pred = my_model.predict(X_test)
    
    fpr, tpr, _ = roc_curve(y_train, train_pred, pos_label=1)
    train_auc_score = auc(fpr, tpr)
    
    fpr, tpr, _ = roc_curve(y_test, test_pred, pos_label=1)
    test_auc_score = auc(fpr, tpr)
    
    train_auc = train_auc_score
    test_auc = test_auc_score
    
    train_acc = my_model.score(X_train, y_train)
    test_acc = my_model.score(X_test, y_test)
    
    test_metrics = confusion_matrix(y_test, test_pred).ravel()
    train_metrics = confusion_matrix(y_train, train_pred).ravel()

    return train_auc, test_auc, train_acc, test_acc, best_params, feats

In [5]:
feat_df = pd.read_csv('../../DataPlus/dvd_split.csv')

In [90]:
cat1 = ['gleason', 'DVD_UroRec_AS', 'DVD_UroRec_AS', 'DVD_UroRec_AS']
cont1 = ['age', 'DVD_UroTalk_AS', 'DVD_UroTalk_surgery', 'DVD_UroTalk_rad']

In [97]:
first_df = prepare_df(feat_df, 'txgot_binary', cont1, cat1)

In [98]:
second_df = prepare_df(feat_df, 'txgot_binary', ['age'], ['gleason'])

In [95]:
first_df.head(5)

Unnamed: 0,age,DVD_UroTalk_AS,DVD_UroTalk_surgery,DVD_UroTalk_rad,txgot_binary,7.0,DVD_UroRec_AS,DVD_UroRec_AS.1,DVD_UroRec_AS.2
5,0.843002,-0.77582,-0.495508,1.215406,0.0,1,0.0,0.0,0.0
9,0.420208,-0.815134,1.221288,-0.743932,0.0,1,1.0,1.0,1.0
12,-0.566311,-0.815134,0.877928,-0.464027,0.0,1,0.0,0.0,0.0
14,0.56114,1.150565,-0.152149,-1.023838,1.0,1,1.0,1.0,1.0
16,0.279277,-0.028854,-1.182227,0.37569,1.0,0,1.0,1.0,1.0


## Grid Search

In [129]:
grid_params = {
    'min_child_weight': [2, 3],
    'gamma': [0],
    'subsample': [0.5, 0.6, 0.7],
    'colsample_bytree': [0.2, 0.3, 0.4],
    'max_depth': [1, 2],
    'n_estimators': [180, 190]
}

In [131]:
xgb4 = XGBClassifier(learning_rate=0.02, objective='binary:logistic',
                    silent=True, nthread=1)
xgbclassify(first_df, xgb4, params=grid_params, 
            trainCV='grid', iterations=40, print_id=3)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Train vs. Test
Training AUC: 0.888
Testing AUC: 0.889

Training Accuracy: 0.961
Testing Accuracy: 0.963

min_child_weight: {2: 121, 3: 79}
gamma: {0: 200}
subsample: {0.5: 70, 0.6: 46, 0.7: 84}
colsample_bytree: {0.2: 100, 0.3: 36, 0.4: 64}
max_depth: {1: 118, 2: 82}
n_estimators: {180: 157, 190: 43}

Feature Importance:


(0.8875756848438305, 0.8888601190476189)

In [132]:
xgb5=XGBClassifier(max_depth=1, gamma=0, min_child_weight=2, subsample=0.7,
                   colsample_bytree=0.2, n_estimators=180)
xgbclassify(first_df, xgb5, trainCV='none', folds=5, iterations=100)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Train vs. Test
Training AUC: 0.889
Testing AUC: 0.853

Training Accuracy: 0.905
Testing Accuracy: 0.869

Feature Importance:
[0.20833333 0.18333334 0.14166667 0.2        0.01666667 0.025
 0.06666667 0.05       0.03333334 0.025      0.00833333 0.01666667
 0.01666667 0.00833333]
[ 0  3  1  2  6  7  8  9  5 12 11  4 13 10]
['age', 0.20833333]
['DVD_UroTalk_rad', 0.2]
['DVD_UroTalk_AS', 0.18333334]
['DVD_UroTalk_surgery', 0.14166667]
['DVD_UroRec_AS', 0.06666667]
['DVD_UroRec_AS', 0.05]


IndexError: list index out of range