In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from ipywidgets import IntProgress
from tqdm import tqdm_notebook as tqdm

# silences package warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def prepare_df(df, target_var, cont_vars=None, cat_vars=None):
    """Build a modelling frame: select columns, drop incomplete rows, encode, scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified; the result is a new frame.
    target_var : str
        Name of the target column; carried through unchanged.
    cont_vars : list of str, optional
        Continuous columns. Each one is replaced by the output of
        ``preprocessing.scale`` (standardisation).
    cat_vars : list of str, optional
        Categorical columns. Each one is replaced by its dummy columns with
        the first level dropped. Dummy columns are named after the category
        values themselves (no prefix is added).

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with no missing value in any selected column, holding
        the scaled continuous columns, the target and the dummy columns.
    """
    # None defaults instead of shared mutable list defaults.
    cont_vars = list(cont_vars) if cont_vars is not None else []
    cat_vars = list(cat_vars) if cat_vars is not None else []
    total_vars = cont_vars + cat_vars + [target_var]

    # Explicit copy: the column assignments below must act on an independent
    # frame, not on something pandas may still treat as a view of ``df``.
    cleaned_df = df[total_vars].dropna(subset=total_vars).copy()

    # turns categorical variables into dummy variables
    for var in cat_vars:
        # Pin the dummy dtype: pandas >= 2.0 defaults to bool, and a frame
        # mixing bool and float columns yields an object array from
        # ``.values``, which numeric estimators cannot consume.
        temp_dummy = pd.get_dummies(cleaned_df[var], drop_first=True, dtype='uint8')
        cleaned_df = pd.concat([cleaned_df.drop([var], axis=1), temp_dummy], axis=1)

    # normalize the data
    for var in cont_vars:
        cleaned_df[var] = preprocessing.scale(cleaned_df[var])

    return cleaned_df

In [3]:
def xgbclassify(df, model, params=None, trainCV='none', target='txgot_binary', folds=5, iterations=100, print_id=1,
                random_state=None):
    """Run repeated stratified K-fold evaluation of ``model`` and print a summary.

    Every column of ``df`` other than ``target`` is used as a feature. For
    each of the ``folds * iterations`` splits the work is delegated to
    ``xgbmodel``; the per-split AUC and accuracy values are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling frame containing the features and the ``target`` column.
    model : estimator
        Classifier passed through to ``xgbmodel``.
    params : dict, optional
        Parameter grid / distributions. Required when ``trainCV == 'grid'``
        (its keys and candidate values are used to count winning settings).
    trainCV : str
        ``'grid'``, ``'random'`` or anything else for a plain fit.
    target : str
        Name of the target column.
    folds : int
        Number of stratified folds per repetition.
    iterations : int
        Number of repetitions.
    print_id : int
        Passed through to ``xgbmodel``.
    random_state : int or None, optional
        Seed for the repeated split; ``None`` (default) keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(avg_train_auc, avg_test_auc)``.
    """
    feat_vars = [var for var in list(df.columns) if var != target]
    X = df[feat_vars].values
    y = df[target].values

    # Compare by value: ``is`` on strings depends on interning and silently
    # fails for strings built at runtime.
    grid_mode = trainCV == 'grid'
    search_mode = grid_mode or trainCV == 'random'

    if grid_mode:
        # per-parameter histogram of how often each candidate value wins
        param_bins = {key: {i: 0 for i in params[key]} for key in params.keys()}

    avg_train_auc = 0
    avg_test_auc = 0
    avg_train_acc = 0
    avg_test_acc = 0

    n_splits = folds * iterations
    rskf = RepeatedStratifiedKFold(n_splits=folds, n_repeats=iterations, random_state=random_state)
    # ``split`` is a generator, so give the progress bar its length explicitly.
    for train_index, test_index in tqdm(rskf.split(X, y), total=n_splits):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        train_auc, test_auc, train_acc, test_acc, best_params = xgbmodel(model, params,
                                                                         X_train, X_test,
                                                                         y_train, y_test,
                                                                         trainCV, print_id)

        if grid_mode:
            for key, value in best_params.items():
                param_bins[key][value] += 1

        avg_train_auc += train_auc
        avg_test_auc += test_auc
        avg_train_acc += train_acc
        avg_test_acc += test_acc

    avg_train_auc /= n_splits
    avg_test_auc /= n_splits
    avg_train_acc /= n_splits
    avg_test_acc /= n_splits

    print('Train vs. Test')
    print('Training AUC: {}'.format(round(avg_train_auc, 3)))
    print('Testing AUC: {}'.format(round(avg_test_auc, 3)))
    print()
    print('Training Accuracy: {}'.format(round(avg_train_acc, 3)))
    print('Testing Accuracy: {}'.format(round(avg_test_acc, 3)))

    if grid_mode:
        print()
        for key, value in param_bins.items():
            print('{}: {}'.format(key, value))

    print()
    print('Feature Importance:')
    # Importances are read from ``model`` itself, so they are only reported
    # for the plain-fit mode; in the search modes they are skipped.
    if not search_mode:
        importances = model.feature_importances_
        for index in np.argsort(importances)[::-1]:
            print([feat_vars[index], importances[index]])

    return avg_train_auc, avg_test_auc

In [4]:
def _positive_class_scores(fitted, X, pos_label=1):
    """Return the predicted probability of ``pos_label`` for every row of ``X``."""
    proba = fitted.predict_proba(X)
    classes = getattr(fitted, 'classes_', None)
    # Without ``classes_`` fall back to the last column, which is the
    # positive class for the usual {0, 1} encoding.
    column = list(classes).index(pos_label) if classes is not None else -1
    return proba[:, column]


def xgbmodel(model, params, X_train, X_test, y_train, y_test, trainCV, print_id):
    """Fit ``model`` on one train/test split and score it on both parts.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    params : dict
        Search space for the ``'random'`` / ``'grid'`` modes; returned
        unchanged as ``best_params`` in the plain-fit mode.
    X_train, X_test : numpy.ndarray
        Feature matrices of the training and held-out parts.
    y_train, y_test : numpy.ndarray
        Matching label vectors; class ``1`` is treated as positive.
    trainCV : str
        ``'random'`` -> ``RandomizedSearchCV`` (5 candidates, 5 shuffled
        stratified folds seeded with ``print_id``);
        ``'grid'`` -> ``GridSearchCV`` (5 shuffled stratified folds);
        anything else -> ``model`` itself is fitted in place.
    print_id : int
        Seed for the inner folds of the randomized search.

    Returns
    -------
    tuple
        ``(train_auc, test_auc, train_acc, test_acc, best_params)``.
    """
    # Value comparison: ``is`` on strings only works by interning accident.
    if trainCV == 'random':
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=print_id)

        my_model = RandomizedSearchCV(model, param_distributions=params, n_iter=5,
                                      scoring='roc_auc', n_jobs=4,
                                      cv=skf, verbose=3)

        # The search is fitted on the training part only. Including the
        # held-out rows would leak them into the refitted model and make the
        # "test" metrics below training metrics in disguise.
        my_model.fit(X_train, y_train)
        best_params = my_model.best_params_

    elif trainCV == 'grid':
        skf = StratifiedKFold(n_splits=5, shuffle=True)

        my_model = GridSearchCV(model, params, scoring='roc_auc', cv=skf)

        # Same as above: tune and refit without touching the held-out part.
        my_model.fit(X_train, y_train)
        best_params = my_model.best_params_

    else:
        # Plain fit; ``model`` itself is fitted, so the caller can inspect it.
        my_model = model
        my_model.fit(X_train, y_train)
        best_params = params

    # ROC AUC needs a continuous score; hard 0/1 predictions collapse the
    # curve to a single operating point.
    fpr, tpr, _ = roc_curve(y_train, _positive_class_scores(my_model, X_train), pos_label=1)
    train_auc = auc(fpr, tpr)

    fpr, tpr, _ = roc_curve(y_test, _positive_class_scores(my_model, X_test), pos_label=1)
    test_auc = auc(fpr, tpr)

    # Accuracy is computed from the predictions directly: ``score`` on a
    # search object fitted with scoring='roc_auc' returns ROC AUC instead.
    train_pred = np.asarray(my_model.predict(X_train))
    test_pred = np.asarray(my_model.predict(X_test))
    train_acc = float(np.mean(train_pred == np.asarray(y_train)))
    test_acc = float(np.mean(test_pred == np.asarray(y_test)))

    return train_auc, test_auc, train_acc, test_acc, best_params

## Grid Search