# XGBoost Model

In [61]:
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from tqdm import tqdm_notebook as tqdm

In [15]:
import GeneralModel as gm

In [18]:
def prepare_df(df, target_var, cont_vars=[], cat_vars=[]):
    total_vars = cont_vars + cat_vars + [target_var]
    model_df = df[total_vars]
    cleaned_df = model_df.dropna(subset=total_vars)

    # turns categorical variables into dummy variables
    for var in cat_vars:
        temp_dummy = pd.get_dummies(cleaned_df[var], drop_first=True)
        cleaned_df = pd.concat([cleaned_df.drop([var], axis=1), temp_dummy], axis=1)

    # normalize the data
    for var in cont_vars:
        cleaned_df[var] = preprocessing.scale(cleaned_df[var])

    return cleaned_df

In [19]:
va_split = pd.read_csv('../../DataPlus/va_split.csv')
dvd_split = pd.read_csv('../../DataPlus/dvd_split.csv')

In [20]:
cont_vars=['age']
cat_vars=['edu_binary', 'marry_binary', 'white_binary', 'Advice1', 'gleason']

In [21]:
va = prepare_df(va_split, 'txgot_binary', cont_vars, cat_vars)
dvd = prepare_df(va_split, 'txgot_binary', cont_vars, cat_vars)

In [74]:
def xgbclassify(df, model, trainCV=True, target='txgot_binary', folds=5, iterations=5):
    
    feat_vars = [var for var in list(df.columns) if var != target]
    X = df[feat_vars].values
    y = df[target].values
    
    avg_pos_prec = 0
    avg_pos_rec = 0
    avg_neg_prec = 0
    avg_neg_rec = 0
    avg_auc = 0
    
    rskf = RepeatedStratifiedKFold(n_splits=folds, n_repeats=iterations)
    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        auc, (tn, fp, fn, tp) = xgbmodel(model, X_train, X_test, y_train, y_test, trainCV=True)
        avg_auc += auc
        
        avg_pos_prec += tp / (tp + fp)
        avg_pos_rec += tp / (tp + fn)
        avg_neg_prec += tn / (tn + fn)
        avg_neg_rec += tn / (tn + fp)
        
    avg_auc /= (folds*iterations)
    avg_pos_prec /= (folds * iterations)
    avg_pos_rec /= (folds * iterations)
    avg_neg_prec /= (folds * iterations)
    avg_neg_rec /= (folds * iterations)
    
    print('Average Metrics:')
    print('Positive Class Precision: {}'.format(round(avg_pos_prec, 3)))
    print('Positive Class Recall: {}'.format(round(avg_pos_rec, 3)))
    print('Negative Class Precision: {}'.format(round(avg_neg_prec, 3)))
    print('Negative Class Recall: {}'.format(round(avg_neg_rec, 3)))
    
    print()
    print('Feature Importance:')
    sorted_idx = np.argsort(model.feature_importances_)[::-1]
    for index in sorted_idx:
        print([train.columns[index], model.feature_importances_[index]])
        
    return avg_auc
        

In [75]:
def xgbmodel(model, X_train, X_test, y_train, y_test, trainCV=True):
    my_model = model
    my_model.fit(X_train, y_train)
    pred = my_model.predict(X_test)
    
    fpr, tpr, _ = roc_curve(y_test, pred, pos_label=1)
    auc_score = auc(fpr, tpr)
    
    metrics = confusion_matrix(y_test, pred).ravel()

    return auc_score, metrics

In [76]:
xgb1 = XGBClassifier()

In [78]:
xgbclassify(va, xgb1)

Average Metrics:
Positive Class Precision: 0.751
Positive Class Recall: 0.789
Negative Class Precision: 0.818
Negative Class Recall: 0.772


AttributeError: 'XGBClassifier' object has no attribute 'plot_importance'

In [56]:
xgbclassify(dvd, xgb)

Average Metrics:
Positive Class Precision: 0.758
Positive Class Recall: 0.778
Negative Class Precision: 0.814
Negative Class Recall: 0.788


0.7832917620137301