# XGBoost Model

In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from tqdm import tqdm_notebook as tqdm

In [4]:
import GeneralModel as gm

In [5]:
def prepare_df(df, target_var, cont_vars=[], cat_vars=[]):
    total_vars = cont_vars + cat_vars + [target_var]
    model_df = df[total_vars]
    cleaned_df = model_df.dropna(subset=total_vars)

    # turns categorical variables into dummy variables
    for var in cat_vars:
        temp_dummy = pd.get_dummies(cleaned_df[var], drop_first=True)
        cleaned_df = pd.concat([cleaned_df.drop([var], axis=1), temp_dummy], axis=1)

    # normalize the data
    for var in cont_vars:
        cleaned_df[var] = preprocessing.scale(cleaned_df[var])

    return cleaned_df

In [53]:
va_split = pd.read_csv('../../DataPlus/va_split.csv')
dvd_split = pd.read_csv('../../DataPlus/dvd_split.csv')
dvd_topics = pd.read_csv('../../DataPlus/topics_dataframe.csv')

In [55]:
cont_vars=['age']
cat_vars=['edu_binary', 'marry_binary', 'white_binary', 'Advice1', 'gleason']
topic_vars = ['appt_topic', 'radiation_topic', 'surgery_topic', 'active_surveillance_topic']

In [56]:
va = prepare_df(va_split, 'txgot_binary', cont_vars, cat_vars)
dvd = prepare_df(dvd_split, 'txgot_binary', cont_vars, cat_vars)
topics = prepare_df(dvd_topics, 'txgot_binary', cont_vars + topic_vars, cat_vars)

In [47]:
def xgbclassify(df, model, trainCV=True, target='txgot_binary', folds=5, iterations=5):
    
    feat_vars = [var for var in list(df.columns) if var != target]
    X = df[feat_vars].values
    y = df[target].values
    
    avg_pos_prec = 0
    avg_pos_rec = 0
    avg_neg_prec = 0
    avg_neg_rec = 0
    avg_auc = 0
    
    rskf = RepeatedStratifiedKFold(n_splits=folds, n_repeats=iterations)
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        auc, (tn, fp, fn, tp) = xgbmodel(model, X_train, X_test, y_train, y_test, trainCV=True)
        avg_auc += auc
        
        avg_pos_prec += tp / (tp + fp)
        avg_pos_rec += tp / (tp + fn)
        avg_neg_prec += tn / (tn + fn)
        avg_neg_rec += tn / (tn + fp)
        
    avg_auc /= (folds*iterations)
    avg_pos_prec /= (folds * iterations)
    avg_pos_rec /= (folds * iterations)
    avg_neg_prec /= (folds * iterations)
    avg_neg_rec /= (folds * iterations)
    
    print('Average Metrics:')
    print('Positive Class Precision: {}'.format(round(avg_pos_prec, 3)))
    print('Positive Class Recall: {}'.format(round(avg_pos_rec, 3)))
    print('Negative Class Precision: {}'.format(round(avg_neg_prec, 3)))
    print('Negative Class Recall: {}'.format(round(avg_neg_rec, 3)))
    
    print()
    print('Feature Importance:')
    sorted_idx = np.argsort(model.feature_importances_)[::-1]
    for index in sorted_idx:
        print([feat_vars[index], model.feature_importances_[index]])
        
    return avg_auc
        

In [48]:
def xgbmodel(model, X_train, X_test, y_train, y_test, trainCV=True):
    model.fit(X_train, y_train)
    
    pred = my_model.predict(X_test)
    
    fpr, tpr, _ = roc_curve(y_test, pred, pos_label=1)
    auc_score = auc(fpr, tpr)
    
    metrics = confusion_matrix(y_test, pred).ravel()

    return auc_score, metrics

In [49]:
xgb1 = XGBClassifier()

In [50]:
xgbclassify(va, xgb1)

Average Metrics:
Positive Class Precision: 0.773
Positive Class Recall: 0.788
Negative Class Precision: 0.822
Negative Class Recall: 0.794

Feature Importance:
['age', 0.4704797]
[7.0, 0.08856089]
['S', 0.08856089]
['SR', 0.08118081]
['Not Married', 0.062730625]
['R', 0.055350553]
['No College Degree', 0.04797048]
['ASR', 0.042435423]
['AS', 0.042435423]
['White', 0.020295203]
['AR', 0.0]


0.7913680396643784

In [51]:
xgb2 = XGBClassifier()

In [52]:
xgbclassify(dvd, xgb2)

Average Metrics:
Positive Class Precision: 0.533
Positive Class Recall: 0.463
Negative Class Precision: 0.834
Negative Class Recall: 0.863

Feature Importance:
['age', 0.51167727]
[7.0, 0.13800424]
['No College Degree', 0.10403397]
['ASR', 0.07430998]
['R', 0.070063695]
['SR', 0.05732484]
['Not Married', 0.044585988]
['S', 0.0]
['AS', 0.0]
['AR', 0.0]
['White', 0.0]


0.6629870129870131

In [57]:
xgb3 = XGBClassifier()

In [59]:
xgbclassify(topics, xgb3)

Average Metrics:
Positive Class Precision: 0.623
Positive Class Recall: 0.466
Negative Class Precision: 0.854
Negative Class Recall: 0.906

Feature Importance:
['surgery_topic', 0.26056337]
['active_surveillance_topic', 0.16901408]
['age', 0.16666667]
['appt_topic', 0.103286386]
[7.0, 0.08215962]
['radiation_topic', 0.07511737]
['SR', 0.06338028]
['R', 0.03521127]
['Not Married', 0.02112676]
['ASR', 0.016431924]
['No College Degree', 0.0070422534]
['S', 0.0]
['AS', 0.0]
['AR', 0.0]
['White', 0.0]


0.6857142857142858