# A custom xgboost classifier pipeline which does early stopping within cross validation, feature selection, and probability calibration

In [1]:
# Import modules

import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
from matplotlib import markers
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, auc, classification_report, f1_score, log_loss, precision_recall_curve, roc_curve, roc_auc_score, precision_score, recall_score, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.feature_selection import mutual_info_classif, SelectFromModel, SelectKBest
from sklearn.isotonic import IsotonicRegression
from sklearn.preprocessing import LabelBinarizer
from xgboost.sklearn import XGBClassifier
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from random import uniform
from bayes_opt import BayesianOptimization

In [2]:
# Set plot style
plt.style.use('dark_background')

In [3]:
# Set pandas preferences:
pd.options.display.max_columns=500
pd.options.display.max_colwidth=1000

# Load some easy sample data for testing

In [4]:
from sklearn import datasets

In [5]:
cancer = datasets.load_breast_cancer()

In [6]:
X = cancer.data

In [7]:
y = cancer.target

In [8]:
# There's not a lot of data here...
from sklearn.datasets import make_classification

In [9]:
data, target = make_classification(n_samples=10000,
                                   n_features=50,
                                   n_informative=30,
                                   n_redundant=10)

In [16]:
# This is a balanced dataset already.

In [17]:
# Split up the data as needed.
train_X, calibrate_X, train_y, calibrate_y = train_test_split(data, target, test_size=0.1)

In [18]:
train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size=0.2)

# Some useful functions

In [56]:
def make_roc_curves_array(true_vals, scores_vals, figsize=(10,10)):
    """Function for making nice-ish looking roc curves."""
    fpr, tpr, threshold = roc_curve(y_true=true_vals, y_score=scores_vals)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=figsize)
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc='lower right')
    plt.show()
    
    return fpr, tpr, threshold, roc_auc

In [57]:
def make_roc_curves_df(df, true_col, scores_col, figsize=(10,10)):
    """Function for making nice-ish looking roc curves."""
    fpr, tpr, threshold = roc_curve(y_true=df[true_col].values, y_score=df[scores_col].values)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=figsize)
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc='lower right')
    plt.show()
    
    return fpr, tpr, threshold, roc_auc

In [58]:
def calibration_func_array(true_vals, probability_vals_list, legend_labels, title, n_bins=20, figsize=(10,10)):
    """Function for making reliability plots"""
    y_vals = []
    x_vals = []
    for vals in probability_vals_list:
        y_array, x_array = calibration_curve(true_vals, vals, n_bins=n_bins)
        y_vals.append(y_array)
        x_vals.append(x_array)
    
    fig, ax = plt.subplots(figsize=figsize)
    for i in range(0, len(legend_labels)):
        plt.plot(x_vals[i], y_vals[i], linewidth=1, label=legend_labels[i], marker=markers.MarkerStyle.filled_markers[i])
    
    # Reference line, legends, and axis labels
    line = mlines.line2D([0, 1], [0, 1], color='white')
    transform = ax.transAxes
    line.set_transform(transform)
    ax.add_line(line)
    fig.suptitle(title)
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('True probability in each bin')
    plt.legend()
    plt.show()

# Custom xgboost classifier class object

In [61]:
class EarlyStoppingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, classifier, evaluation_metric, num_rounds, feature_selector=None, validation_size=None):
        self.classifier = classifier
        self.evaluation_metric = evaluation_metric
        self.num_rounds = num_rounds
        self.feature_selector = feature_selector
        self.validation_size = validation_size
        self.classes_ = None
        self.classifier_ = None
        self.feature_selector_ = None
        
    def fit(self, X, y):
        """Function for fitting the base estimator.  Implements feature selection and also auto generates a validation set for early stopping.
        Also supports probability calibration."""
        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        
        # Do feature selection if desired
        if self.feature_selector:
            X_use = self.feature_select(X, y)
        else:
            X_use = X
        
        # Make the validation set:
        X_train, X_valid, y_train, y_valid = self.make_validation_set(X_use, y)

        # Fit the base estimator
        self.classifier_ = clone(self.classifier)
        self.classifier = self.classifier_.fit(X_train,
                                               y_train,
                                               eval_metric=self.evaluation_metric,
                                               eval_set=[[X_valid, y_valid]],
                                               early_stopping_rounds=self.num_rounds)
        
        return self
    
    def predict(self, X):
        """Makes predictions on input data from trained classifier.  
        If feature selection is used, it is applied automatically to the input data."""
        # The predict function changes in XGBoost when early stopping is performed.
        if self.feature_selector:
            X_select = self.feature_selector.transform(X)
        else:
            X_select = X
            
        # This method should only be called once the model is fit anyway, and since we are always using early stopping, 
        # we can set ntree_limit by default.
        return self.classifier.predict(X_select, ntree_limit=self.classifier.best_ntree_limit)
    
    def predict_proba(self, X):
        """Makes probability predictions on input data from trained classifier.  If feature selection is used,
        it is applied automatically to the input data.  Since this method should only be called after the 
        classifier is fit, and since we are always using early stopping, the best_ntree_limit should be defined by default."""
        if self.feature_selector:
            X_select = self.feature_selector.transform(X)
        else:
            X_select = X
        
        return self.classifier.predict_proba(X_select, ntree_limit=self.classifier.best_ntree_limit)

    def feature_select(self, X, y):
        """Implements feature selection if so desired."""
        # TODO: Get tree based feature selection working.
        self.feature_selector_ = clone(self.feature_selector)
        self.feature_selector = self.feature_selector_.fit(X, y)
        
        return self.feature_selector.transform(X)
        
    def make_validation_set(self, X, y):
        """Makes the validation set."""
        if self.validation_size:
            val_size = self.validation_size
        else:
            val_size = 0.1
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=val_size)
        return X_train, X_valid, y_train, y_valid

In [13]:
cv = 5

In [28]:
params_feature_select_xgb = EarlyStoppingClassifier(classifier=XGBClassifier(objective='binary:logistic', booster='gbtree', silent=True),
                                                    feature_selector=SelectKBest(score_func=mutual_info_classif),
                                                    evaluation_metric='logloss',
                                                    num_rounds=5)

In [29]:
param_grid = {'feature_selector__k': sp_randint(20, 45),
              'classifier__max_depth': sp_randint(3, 15),
              'classifier__min_child_weight': sp_randint(1, 5),
              'classifier__learning_rate': np.random.uniform(low=0.01, high=0.2, size=(100,)),
              'classifier__n_estimators': [1000],
              'classifier__colsample_bytree': np.random.uniform(low=0.7, high=1, size=(100,)),
              'classifier__subsample': np.random.uniform(low=0.7, high=1, size=(100,)),
              'classifier__seed': [0]}

In [30]:
# Let's try something a bit different.  First a random gridsearch.
model_gen_random = RandomizedSearchCV(estimator=params_feature_select_xgb, 
                                      param_distributions=param_grid, 
                                      cv=cv, 
                                      scoring='recall', 
                                      verbose=10, 
                                      iid=True, 
                                      refit=False, 
                                      n_iter=15)

In [31]:
model_gen_random.fit(train_X, train_y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] classifier__colsample_bytree=0.717886834273, classifier__subsample=0.910590715578, classifier__learning_rate=0.119746200149, classifier__seed=0, classifier__n_estimators=1000, feature_selector__k=42, classifier__min_child_weight=1, classifier__max_depth=12 
[0]	validation_0-logloss:0.646533
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.604991
[2]	validation_0-logloss:0.571073
[3]	validation_0-logloss:0.542483
[4]	validation_0-logloss:0.5168
[5]	validation_0-logloss:0.489674
[6]	validation_0-logloss:0.469428
[7]	validation_0-logloss:0.448473
[8]	validation_0-logloss:0.431707
[9]	validation_0-logloss:0.418417
[10]	validation_0-logloss:0.405729
[11]	validation_0-logloss:0.396978
[12]	validation_0-logloss:0.386223
[13]	validation_0-logloss:0.373424
[14]	validation_0-logloss:0.36407
[15]	validation_0-logloss:0.353852
[16]	validation_0-logloss:0.346229
[17]	validation_0-logloss:0

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.3s remaining:    0.0s


[0]	validation_0-logloss:0.641505
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.602332
[2]	validation_0-logloss:0.56523
[3]	validation_0-logloss:0.539956
[4]	validation_0-logloss:0.513317
[5]	validation_0-logloss:0.489683
[6]	validation_0-logloss:0.466173
[7]	validation_0-logloss:0.44469
[8]	validation_0-logloss:0.42513
[9]	validation_0-logloss:0.407707
[10]	validation_0-logloss:0.392545
[11]	validation_0-logloss:0.378505
[12]	validation_0-logloss:0.365773
[13]	validation_0-logloss:0.35269
[14]	validation_0-logloss:0.344334
[15]	validation_0-logloss:0.331564
[16]	validation_0-logloss:0.32158
[17]	validation_0-logloss:0.310596
[18]	validation_0-logloss:0.301421
[19]	validation_0-logloss:0.292124
[20]	validation_0-logloss:0.284054
[21]	validation_0-logloss:0.278036
[22]	validation_0-logloss:0.272051
[23]	validation_0-logloss:0.266199
[24]	validation_0-logloss:0.261095
[25]	validation_0-logloss:0.255636
[26]	validation_0-logloss:0.249569
[27

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.5s remaining:    0.0s


[0]	validation_0-logloss:0.648173
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.605366
[2]	validation_0-logloss:0.576094
[3]	validation_0-logloss:0.543885
[4]	validation_0-logloss:0.514356
[5]	validation_0-logloss:0.487688
[6]	validation_0-logloss:0.464895
[7]	validation_0-logloss:0.44999
[8]	validation_0-logloss:0.43435
[9]	validation_0-logloss:0.419401
[10]	validation_0-logloss:0.406251
[11]	validation_0-logloss:0.394076
[12]	validation_0-logloss:0.382434
[13]	validation_0-logloss:0.374062
[14]	validation_0-logloss:0.365135
[15]	validation_0-logloss:0.357222
[16]	validation_0-logloss:0.349949
[17]	validation_0-logloss:0.342774
[18]	validation_0-logloss:0.335569
[19]	validation_0-logloss:0.327897
[20]	validation_0-logloss:0.320269
[21]	validation_0-logloss:0.31347
[22]	validation_0-logloss:0.30702
[23]	validation_0-logloss:0.303139
[24]	validation_0-logloss:0.298019
[25]	validation_0-logloss:0.292961
[26]	validation_0-logloss:0.288348
[2

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   16.1s remaining:    0.0s


[0]	validation_0-logloss:0.645637
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.603846
[2]	validation_0-logloss:0.568399
[3]	validation_0-logloss:0.536803
[4]	validation_0-logloss:0.51034
[5]	validation_0-logloss:0.491271
[6]	validation_0-logloss:0.469005
[7]	validation_0-logloss:0.448393
[8]	validation_0-logloss:0.428974
[9]	validation_0-logloss:0.413103
[10]	validation_0-logloss:0.397812
[11]	validation_0-logloss:0.38343
[12]	validation_0-logloss:0.371692
[13]	validation_0-logloss:0.360918
[14]	validation_0-logloss:0.348625
[15]	validation_0-logloss:0.341072
[16]	validation_0-logloss:0.330106
[17]	validation_0-logloss:0.321088
[18]	validation_0-logloss:0.316909
[19]	validation_0-logloss:0.307508
[20]	validation_0-logloss:0.29949
[21]	validation_0-logloss:0.292933
[22]	validation_0-logloss:0.286339
[23]	validation_0-logloss:0.280827
[24]	validation_0-logloss:0.276675
[25]	validation_0-logloss:0.273517
[26]	validation_0-logloss:0.267068
[

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   21.3s remaining:    0.0s


[0]	validation_0-logloss:0.646443
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.603239
[2]	validation_0-logloss:0.569398
[3]	validation_0-logloss:0.540776
[4]	validation_0-logloss:0.511291
[5]	validation_0-logloss:0.487355
[6]	validation_0-logloss:0.466487
[7]	validation_0-logloss:0.448006
[8]	validation_0-logloss:0.431457
[9]	validation_0-logloss:0.415592
[10]	validation_0-logloss:0.400519
[11]	validation_0-logloss:0.38719
[12]	validation_0-logloss:0.376269
[13]	validation_0-logloss:0.36642
[14]	validation_0-logloss:0.35728
[15]	validation_0-logloss:0.345565
[16]	validation_0-logloss:0.335021
[17]	validation_0-logloss:0.328332
[18]	validation_0-logloss:0.319573
[19]	validation_0-logloss:0.311952
[20]	validation_0-logloss:0.304275
[21]	validation_0-logloss:0.298977
[22]	validation_0-logloss:0.294025
[23]	validation_0-logloss:0.289736
[24]	validation_0-logloss:0.283315
[25]	validation_0-logloss:0.280206
[26]	validation_0-logloss:0.276129
[

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   26.0s remaining:    0.0s


[0]	validation_0-logloss:0.637258
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.587678
[2]	validation_0-logloss:0.553636
[3]	validation_0-logloss:0.528014
[4]	validation_0-logloss:0.504723
[5]	validation_0-logloss:0.482295
[6]	validation_0-logloss:0.464447
[7]	validation_0-logloss:0.446586
[8]	validation_0-logloss:0.431104
[9]	validation_0-logloss:0.422617
[10]	validation_0-logloss:0.413997
[11]	validation_0-logloss:0.404722
[12]	validation_0-logloss:0.3967
[13]	validation_0-logloss:0.387343
[14]	validation_0-logloss:0.382786
[15]	validation_0-logloss:0.376893
[16]	validation_0-logloss:0.368948
[17]	validation_0-logloss:0.364996
[18]	validation_0-logloss:0.361991
[19]	validation_0-logloss:0.359764
[20]	validation_0-logloss:0.352561
[21]	validation_0-logloss:0.346794
[22]	validation_0-logloss:0.34201
[23]	validation_0-logloss:0.336784
[24]	validation_0-logloss:0.33466
[25]	validation_0-logloss:0.330564
[26]	validation_0-logloss:0.326249
[2

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   29.2s remaining:    0.0s


[0]	validation_0-logloss:0.633293
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.590326
[2]	validation_0-logloss:0.548195
[3]	validation_0-logloss:0.51956
[4]	validation_0-logloss:0.495378
[5]	validation_0-logloss:0.475008
[6]	validation_0-logloss:0.452978
[7]	validation_0-logloss:0.436522
[8]	validation_0-logloss:0.417922
[9]	validation_0-logloss:0.403781
[10]	validation_0-logloss:0.391443
[11]	validation_0-logloss:0.382553
[12]	validation_0-logloss:0.374834
[13]	validation_0-logloss:0.36647
[14]	validation_0-logloss:0.360169
[15]	validation_0-logloss:0.35244
[16]	validation_0-logloss:0.346402
[17]	validation_0-logloss:0.340521
[18]	validation_0-logloss:0.333544
[19]	validation_0-logloss:0.326814
[20]	validation_0-logloss:0.323047
[21]	validation_0-logloss:0.318268
[22]	validation_0-logloss:0.315324
[23]	validation_0-logloss:0.312072
[24]	validation_0-logloss:0.308858
[25]	validation_0-logloss:0.304543
[26]	validation_0-logloss:0.30087
[2

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   32.5s remaining:    0.0s


[0]	validation_0-logloss:0.623725
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.575036
[2]	validation_0-logloss:0.535718
[3]	validation_0-logloss:0.500963
[4]	validation_0-logloss:0.474575
[5]	validation_0-logloss:0.449229
[6]	validation_0-logloss:0.433578
[7]	validation_0-logloss:0.414969
[8]	validation_0-logloss:0.402446
[9]	validation_0-logloss:0.386474
[10]	validation_0-logloss:0.3742
[11]	validation_0-logloss:0.364409
[12]	validation_0-logloss:0.35642
[13]	validation_0-logloss:0.348079
[14]	validation_0-logloss:0.339215
[15]	validation_0-logloss:0.334161
[16]	validation_0-logloss:0.330229
[17]	validation_0-logloss:0.324668
[18]	validation_0-logloss:0.320433
[19]	validation_0-logloss:0.315252
[20]	validation_0-logloss:0.310243
[21]	validation_0-logloss:0.307497
[22]	validation_0-logloss:0.30338
[23]	validation_0-logloss:0.299763
[24]	validation_0-logloss:0.296555
[25]	validation_0-logloss:0.292035
[26]	validation_0-logloss:0.28703
[27

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   36.0s remaining:    0.0s


[0]	validation_0-logloss:0.636632
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.593185
[2]	validation_0-logloss:0.560096
[3]	validation_0-logloss:0.531901
[4]	validation_0-logloss:0.50457
[5]	validation_0-logloss:0.487273
[6]	validation_0-logloss:0.467988
[7]	validation_0-logloss:0.452586
[8]	validation_0-logloss:0.437543
[9]	validation_0-logloss:0.428843
[10]	validation_0-logloss:0.41854
[11]	validation_0-logloss:0.406731
[12]	validation_0-logloss:0.399009
[13]	validation_0-logloss:0.388112
[14]	validation_0-logloss:0.379105
[15]	validation_0-logloss:0.37329
[16]	validation_0-logloss:0.365544
[17]	validation_0-logloss:0.358165
[18]	validation_0-logloss:0.349617
[19]	validation_0-logloss:0.343713
[20]	validation_0-logloss:0.339172
[21]	validation_0-logloss:0.337115
[22]	validation_0-logloss:0.335956
[23]	validation_0-logloss:0.333405
[24]	validation_0-logloss:0.328572
[25]	validation_0-logloss:0.322644
[26]	validation_0-logloss:0.320092
[

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   39.1s remaining:    0.0s


[0]	validation_0-logloss:0.629068
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.578235
[2]	validation_0-logloss:0.548716
[3]	validation_0-logloss:0.516501
[4]	validation_0-logloss:0.490322
[5]	validation_0-logloss:0.46839
[6]	validation_0-logloss:0.456523
[7]	validation_0-logloss:0.438915
[8]	validation_0-logloss:0.421217
[9]	validation_0-logloss:0.410472
[10]	validation_0-logloss:0.398915
[11]	validation_0-logloss:0.389076
[12]	validation_0-logloss:0.378472
[13]	validation_0-logloss:0.37035
[14]	validation_0-logloss:0.364805
[15]	validation_0-logloss:0.36051
[16]	validation_0-logloss:0.354985
[17]	validation_0-logloss:0.3493
[18]	validation_0-logloss:0.346732
[19]	validation_0-logloss:0.342883
[20]	validation_0-logloss:0.339728
[21]	validation_0-logloss:0.335367
[22]	validation_0-logloss:0.333068
[23]	validation_0-logloss:0.329562
[24]	validation_0-logloss:0.324283
[25]	validation_0-logloss:0.322841
[26]	validation_0-logloss:0.321743
[27

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  8.4min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=EarlyStoppingClassifier(classifier=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective=...c=<function mutual_info_classif at 0x7f4456eed320>),
            num_rounds=5, validation_size=None),
          fit_params=None, iid=True, n_iter=15, n_jobs=1,
          param_distributions={'feature_selector__k': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f4455ee3610>, 'classifier__colsample_bytree': array([ 0.78492,  0.76086, ...,  0.87338,  0.7685 ]), 'classifier__subsample': array([ 0.82708,  0.97825, ...,  0.86918,  0.77545]), 'classifier__min_ch...0], 'classifier__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f4455b4a690>},
          pre_dispatch='2*n_jobs', random_state=None, refit=False,

In [32]:
model_gen_random.best_score_

0.94055285554248291

In [33]:
model_gen_random.best_params_

{'classifier__colsample_bytree': 0.87906681060131531,
 'classifier__learning_rate': 0.061278019458606885,
 'classifier__max_depth': 12,
 'classifier__min_child_weight': 2,
 'classifier__n_estimators': 1000,
 'classifier__seed': 0,
 'classifier__subsample': 0.72729098712745366,
 'feature_selector__k': 35}

In [35]:
best_params_random = model_gen_random.best_params_

In [36]:
final_model_ran = EarlyStoppingClassifier(classifier=XGBClassifier(objective='binary:logistic', booster='gbtree', silent=True),
                                          feature_selector=SelectKBest(score_func=mutual_info_classif),
                                          evaluation_metric='logloss',
                                          num_rounds=5)

In [38]:
final_model_ran.set_params(**best_params_random)

EarlyStoppingClassifier(classifier=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.87906681060131531, gamma=0,
       learning_rate=0.061278019458606885, max_delta_step=0, max_depth=12,
       min_child_weight=2, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.72729098712745366),
            evaluation_metric='logloss',
            feature_selector=SelectKBest(k=35, score_func=<function mutual_info_classif at 0x7f4456eed320>),
            num_rounds=5, validation_size=None)

In [39]:
final_model_ran.fit(train_X, train_y)

[0]	validation_0-logloss:0.667832
Will train until validation_0-logloss hasn't improved in 5 rounds.
[1]	validation_0-logloss:0.644545
[2]	validation_0-logloss:0.622953
[3]	validation_0-logloss:0.602403
[4]	validation_0-logloss:0.584092
[5]	validation_0-logloss:0.565051
[6]	validation_0-logloss:0.546774
[7]	validation_0-logloss:0.532419
[8]	validation_0-logloss:0.517509
[9]	validation_0-logloss:0.505117
[10]	validation_0-logloss:0.491454
[11]	validation_0-logloss:0.479078
[12]	validation_0-logloss:0.466153
[13]	validation_0-logloss:0.455591
[14]	validation_0-logloss:0.444289
[15]	validation_0-logloss:0.43483
[16]	validation_0-logloss:0.425112
[17]	validation_0-logloss:0.416366
[18]	validation_0-logloss:0.406434
[19]	validation_0-logloss:0.397519
[20]	validation_0-logloss:0.389874
[21]	validation_0-logloss:0.381938
[22]	validation_0-logloss:0.374608
[23]	validation_0-logloss:0.367232
[24]	validation_0-logloss:0.360513
[25]	validation_0-logloss:0.353945
[26]	validation_0-logloss:0.348236

EarlyStoppingClassifier(classifier=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.87906681060131531, gamma=0,
       learning_rate=0.061278019458606885, max_delta_step=0, max_depth=12,
       min_child_weight=2, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.72729098712745366),
            evaluation_metric='logloss',
            feature_selector=SelectKBest(k=35, score_func=<function mutual_info_classif at 0x7f4456eed320>),
            num_rounds=5, validation_size=None)

In [40]:
final_model_ran.classifier.best_ntree_limit

276

In [41]:
ran_preds = final_model_ran.predict(test_X)

In [43]:
ran_preds_proba = final_model_ran.predict_proba(test_X)

In [44]:
accuracy_score(y_true=test_y, y_pred=ran_preds)

0.94833333333333336

In [45]:
recall_score(y_true=test_y, y_pred=ran_preds)

0.95394736842105265

In [46]:
precision_score(y_true=test_y, y_pred=ran_preds)

0.94462540716612375

In [47]:
# Shockingly good.

In [48]:
# Let's do a baseline...

In [49]:
from sklearn.linear_model import LogisticRegression

In [50]:
lg_model = LogisticRegression(penalty='l1')

In [51]:
lg_model.fit(train_X, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [52]:
lg_preds = lg_model.predict(test_X)
lg_preds_proba = lg_model.predict(test_X)

In [53]:
accuracy_score(y_true=test_y, y_pred=lg_preds)

0.80277777777777781

In [54]:
recall_score(y_true=test_y, y_pred=lg_preds)

0.80372807017543857

In [55]:
precision_score(y_true=test_y, y_pred=lg_preds)

0.80638063806380633

In [62]:
# hmm, nice.  
# tpr, fpr, threshold, roc_auc_ = make_roc_curves_array(true_vals=test_y, scores_vals=ran_preds_proba)