In [1]:
# imports
import pandas as pd
import numpy as np

from impute_transform import ImputeTransform
#import model_metrics

from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV

from fancyimpute import *

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

Using TensorFlow backend.


In [2]:
# Load the training set from the repository-relative data directory.
train_data = pd.read_csv('data/train_data.csv')

In [3]:
# Predictors: every column except `DX` and `DXSUB`.
X = train_data.drop(columns=['DX', 'DXSUB'])

# Binary target built from `DX`: code 3 -> 1, code 1 -> 0.
# Series.map leaves any DX value that is not a key of the dict as NaN.
y = train_data['DX'].map({3: 1, 1: 0})

## Metrics
The metrics for a default, cross-validated XGBoost model are as follows:

Test Accuracy: 0.927  
Test Log Loss: -0.212  
Test ROC AUC: 0.96998  

The code for this classifier is:

```python
xgb_clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()),
                        XGBClassifier(max_depth=3, learning_rate=0.1,
                                      n_estimators=100,
                                      random_state=56, n_jobs=-1))
```

# Parameter Tuning for XGBoost

This section follows the guidelines set out in [Complete Guide to Parameter Tuning in XGBoost](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/).

In [4]:
# Imputation step. MatrixFactorization is presumably supplied by the
# `fancyimpute` star import -- confirm.
impute = ImputeTransform(strategy=MatrixFactorization())

# Baseline booster used as the starting point for tuning.
# An alternative starting configuration was tried and dropped:
# max_depth=5, min_child_weight=1, gamma=0, subsample=0.8,
# colsample_bytree=0.8, scale_pos_weight=1, n_jobs=-1.
clf = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=3,
    n_jobs=6,
    random_state=56,
)

In [5]:
# Two-stage pipeline: impute first, then classify. The step names are the
# prefixes used in the grid-search parameter keys ('xgboost__...').
steps = [
    ('impute_transform', impute),
    ('xgboost', clf),
]
pipeline = Pipeline(steps=steps)

## Step 1: Fix learning rate and number of estimators for tuning tree-based parameters

In [6]:
# Candidate tree counts: 20, 30, ..., 300.
param_test1 = {
    'xgboost__n_estimators': range(20, 301, 10),
}

# 3-fold grid search over the whole pipeline, scored by ROC AUC.
# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
# it will need to be dropped when the environment is upgraded.
gsearch1 = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test1,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [7]:
%%capture
# Run the n_estimators search; %%capture swallows the output this cell emits.
gsearch1.fit(X, y)

In [8]:
# Report the winning tree count and its mean cross-validated ROC AUC.
print(gsearch1.best_params_, gsearch1.best_score_)
# Carry the winner forward: later searches reuse this mutated pipeline.
best_n_estimators = gsearch1.best_params_['xgboost__n_estimators']
# Assigning to `_` keeps the returned pipeline repr out of the cell output.
_ = pipeline.set_params(xgboost__n_estimators=best_n_estimators)

{'xgboost__n_estimators': 90} 0.978607842939


## Step 2: Tune max_depth and min_child_weight

In [9]:
# Coarse grid: max_depth in {1, 3, 5, 7, 9}, min_child_weight in {1, 3, 5}.
param_test2 = {
    'xgboost__max_depth': range(1, 10, 2),
    'xgboost__min_child_weight': range(1, 6, 2),
}

# 3-fold grid search over the pipeline, scored by ROC AUC.
gsearch2 = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test2,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [10]:
%%capture
# Run the coarse max_depth / min_child_weight search with output suppressed.
gsearch2.fit(X, y)

In [11]:
# Best (max_depth, min_child_weight) pair and its mean cross-validated ROC AUC.
print(gsearch2.best_params_, gsearch2.best_score_)

{'xgboost__max_depth': 5, 'xgboost__min_child_weight': 5} 0.977429623099


### Step 2b: Find optimum values

In [None]:
# Follow-up grid for max_depth / min_child_weight.
# NOTE(review): the 0 in the max_depth list looks like a typo (9?). XGBoost
# documents max_depth=0 as "no limit", which the exact tree method does not
# accept -- confirm the intended value before running this cell.
param_test2a = {
    'xgboost__max_depth': [6, 7, 8, 0],
    'xgboost__min_child_weight': [4, 5, 6],
}

# 3-fold grid search over the pipeline, scored by ROC AUC.
gsearch2a = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test2a,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [None]:
%%capture
# Run the Step 2b search with output suppressed.
gsearch2a.fit(X, y)

In [None]:
# Best Step 2b setting and its mean cross-validated ROC AUC.
print(gsearch2a.best_params_, gsearch2a.best_score_)

In [None]:
# Set the new parameters: write the Step 2b winners into the shared pipeline
# so the following searches start from them.
best_max_depth = gsearch2a.best_params_['xgboost__max_depth']
best_child_weight = gsearch2a.best_params_['xgboost__min_child_weight']
# Assigning to `_` keeps the returned pipeline repr out of the cell output.
_ = pipeline.set_params(xgboost__max_depth=best_max_depth,
                        xgboost__min_child_weight=best_child_weight)

### Step 2c: Find optimum values

In [None]:
# Fine grid: max_depth in {4, 5, 6}, min_child_weight in {3, 4, 5}.
param_test2b = {
    'xgboost__max_depth': [4, 5, 6],
    'xgboost__min_child_weight': [3, 4, 5],
}

# 3-fold grid search over the pipeline, scored by ROC AUC.
gsearch2b = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test2b,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [None]:
%%capture
# Run the Step 2c search with output suppressed.
gsearch2b.fit(X, y)

In [None]:
# Best Step 2c setting and its mean cross-validated ROC AUC. Both values must
# come from gsearch2b; the score was previously read from gsearch2a.
print(gsearch2b.best_params_, gsearch2b.best_score_)

In [None]:
# Set the new parameters: write the Step 2c winners into the shared pipeline.
best_max_depth = gsearch2b.best_params_['xgboost__max_depth']
best_child_weight = gsearch2b.best_params_['xgboost__min_child_weight']
# Assigning to `_` keeps the returned pipeline repr out of the cell output.
_ = pipeline.set_params(xgboost__max_depth=best_max_depth,
                        xgboost__min_child_weight=best_child_weight)

## Step 3: Tune gamma

In [None]:
# Candidate gamma values: 0.0, 0.1, 0.2, 0.3, 0.4.
param_test3 = {
    'xgboost__gamma': [tenth / 10.0 for tenth in range(0, 5)],
}

# 3-fold grid search over the pipeline, scored by ROC AUC.
gsearch3 = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test3,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [None]:
%%capture
# Run the gamma search with output suppressed.
gsearch3.fit(X, y)

In [None]:
# Best gamma and its mean cross-validated ROC AUC.
print(gsearch3.best_params_, gsearch3.best_score_)

In [None]:
# Set the new parameters: write the tuned gamma into the shared pipeline.
best_gamma = gsearch3.best_params_['xgboost__gamma']
# Pass best_gamma here. The cell previously passed best_max_depth, which set
# gamma to the tree depth and left best_gamma unused.
# Assigning to `_` keeps the returned pipeline repr out of the cell output.
_ = pipeline.set_params(xgboost__gamma=best_gamma)

### Step 3b: Re-calibrate n_estimators for updated parameters

In [None]:
# Candidate tree counts: 10, 20, ..., 100.
param_test3b = {
    'xgboost__n_estimators': range(10, 101, 10),
}

# 3-fold grid search over the pipeline, scored by ROC AUC.
gsearch3b = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test3b,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [None]:
%%capture
# Run the Step 3b n_estimators search with output suppressed.
gsearch3b.fit(X, y)

In [None]:
# Best Step 3b tree count and its mean cross-validated ROC AUC.
print(gsearch3b.best_params_, gsearch3b.best_score_)

In [None]:
# Write the Step 3b tree count into the shared pipeline.
best_n_estimators = gsearch3b.best_params_['xgboost__n_estimators']
# Assigning to `_` keeps the returned pipeline repr out of the cell output.
_ = pipeline.set_params(xgboost__n_estimators=best_n_estimators)

### Step 3c: Re-calibrate n_estimators for updated parameters

In [None]:
# Finer tree-count grid in steps of 5: 40 through 60.
param_test3c = {
    'xgboost__n_estimators': [40, 45, 50, 55, 60],
}

# 3-fold grid search over the pipeline, scored by ROC AUC.
gsearch3c = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test3c,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [None]:
%%capture
# Run the Step 3c n_estimators search with output suppressed.
gsearch3c.fit(X, y)

In [None]:
# Best Step 3c tree count and its mean cross-validated ROC AUC.
print(gsearch3c.best_params_, gsearch3c.best_score_)

In [None]:
# Write the Step 3c tree count into the shared pipeline.
best_n_estimators = gsearch3c.best_params_['xgboost__n_estimators']
# Assigning to `_` keeps the returned pipeline repr out of the cell output.
_ = pipeline.set_params(xgboost__n_estimators=best_n_estimators)

## Step 4: Tune subsample and colsample_bytree

In [None]:
# probably skippable

## Step 5: Tuning Regularization Parameters

In [None]:
# probably skippable

## Step 6: Reducing Learning Rate, Add Trees

In [None]:
# Lower the learning rate from the 0.1 set on `clf` to 0.01; the searches that
# follow re-tune n_estimators for the smaller step size.
_ = pipeline.set_params(xgboost__learning_rate=0.01)

In [None]:
# Candidate tree counts: 40, 50, ..., 190.
param_test6 = {
    'xgboost__n_estimators': range(40, 200, 10),
}

# 3-fold grid search over the pipeline, scored by ROC AUC.
gsearch6 = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test6,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [None]:
%%capture
# Run the Step 6 n_estimators search with output suppressed.
gsearch6.fit(X, y)

In [None]:
# Best Step 6 tree count and its mean cross-validated ROC AUC.
print(gsearch6.best_params_, gsearch6.best_score_)

In [None]:
# Write the Step 6 tree count into the shared pipeline.
best_n_estimators = gsearch6.best_params_['xgboost__n_estimators']
# Assigning to `_` keeps the returned pipeline repr out of the cell output.
_ = pipeline.set_params(xgboost__n_estimators=best_n_estimators)

### Step 6b: Optimize n_estimators (again)

In [None]:
# Candidate tree counts in steps of 5: 100, 105, ..., 195.
param_test6b = {
    'xgboost__n_estimators': range(100, 200, 5),
}

# 3-fold grid search over the pipeline, scored by ROC AUC.
gsearch6b = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test6b,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [None]:
%%capture
# Run the Step 6b n_estimators search with output suppressed.
gsearch6b.fit(X, y)

In [None]:
# Best Step 6b tree count and its mean cross-validated ROC AUC.
print(gsearch6b.best_params_, gsearch6b.best_score_)

In [None]:
# Write the Step 6b tree count into the shared pipeline.
best_n_estimators = gsearch6b.best_params_['xgboost__n_estimators']
# Assigning to `_` keeps the returned pipeline repr out of the cell output.
_ = pipeline.set_params(xgboost__n_estimators=best_n_estimators)

### Step 6c: Optimize n_estimators (again)

In [None]:
# Explicit tree-count candidates in steps of 5: 120 through 145.
param_test6c = {
    'xgboost__n_estimators': [120, 125, 130, 135, 140, 145],
}

# 3-fold grid search over the pipeline, scored by ROC AUC.
gsearch6c = GridSearchCV(
    estimator=pipeline,
    param_grid=param_test6c,
    scoring='roc_auc',
    iid=False,
    cv=3,
)

In [None]:
%%capture
# Run the Step 6c n_estimators search with output suppressed.
gsearch6c.fit(X, y)

In [None]:
# Best Step 6c tree count and its mean cross-validated ROC AUC.
print(gsearch6c.best_params_, gsearch6c.best_score_)

In [None]:
# Write the Step 6c tree count into the shared pipeline. The value must come
# from gsearch6c; reading gsearch6b here silently discarded the 6c result.
best_n_estimators = gsearch6c.best_params_['xgboost__n_estimators']
# Assigning to `_` keeps the returned pipeline repr out of the cell output.
_ = pipeline.set_params(xgboost__n_estimators=best_n_estimators)

## Step 7: Fit full model, examine feature importances

In [None]:
# Fit the pipeline, with whatever parameters the cells above have set on it,
# on the full training data.
pipeline.fit(X, y)

In [None]:
# Rank the fitted booster's feature importances, highest first. The booster is
# looked up by its step name rather than by position.
# Labelling by X.columns assumes the impute step keeps the column count and
# order unchanged -- TODO confirm.
feat_imp = pd.Series(
    pipeline.named_steps['xgboost'].feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')