# Boosting

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn import model_selection as ms, ensemble

%matplotlib inline

In [2]:
# Source CSV for the white-wine quality data, hosted on the UCI ML archive.
WHITES_URL = (
    'http://archive.ics.uci.edu/ml/machine-learning-databases/'
    'wine-quality/winequality-white.csv'
)

Read in the white-wine file of the Wine Quality dataset.

In [3]:
# Fetch and parse the CSV; fields in this file are separated by ';', not ','.
whites = pd.read_csv(WHITES_URL, delimiter=';')

Define a new boolean variable 'good_quality' that is true for whites with quality >= 7.

In [4]:
# Boolean label: True where the quality score is 7 or higher.
whites['good_quality'] = whites['quality'].ge(7)

Prepare the data: separate the predictors `X` from the binary target `y`.

In [5]:
# Predictors: every column except the raw score and the label derived from it.
X = whites.drop(labels=['quality', 'good_quality'], axis='columns')
# Target: the boolean good-quality flag.
y = whites.good_quality

## Gradient tree boosting

Train a gradient tree boosting classifier with 20 decision trees.

In [6]:
# Only the number of trees is set explicitly; every other hyperparameter
# keeps the library default.
gtb1 = ensemble.GradientBoostingClassifier(n_estimators=20)
# Fit on the full dataset; the call is the cell's last expression, so the
# fitted estimator is displayed as the cell output.
gtb1.fit(X=X, y=y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=20,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

Investigate importances of predictors (the higher, the more important).

In [7]:
# Display the fitted model's importance scores as the cell output
# (presumably one score per column of X, in column order -- confirm
# against X.columns).
gtb1.feature_importances_

array([0.02465249, 0.09244135, 0.0392292 , 0.05886643, 0.08529957,
       0.08485939, 0.007621  , 0.01461353, 0.06358805, 0.04802948,
       0.48079951])

Evaluate performance through cross-validation.

In [8]:
# Stratified 10-fold splitter with shuffling. The seed is pinned so the fold
# assignment -- and therefore every AUC computed with this splitter -- is
# repeatable across kernel restarts, and so each model evaluated with
# `ten_fold_cv` is scored on exactly the same folds.
ten_fold_cv = ms.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# One ROC AUC per fold for the 20-tree gradient boosting model.
aucs = ms.cross_val_score(gtb1, X, y, scoring='roc_auc', cv=ten_fold_cv)
# Mean AUC over the ten folds, shown as the cell output.
np.mean(aucs)

0.8365674250681477

What happens when we increase the number of trees?

In [9]:
# Cross-validate a fresh gradient boosting model at each ensemble size and
# report the mean ROC AUC over the folds.
for n_trees in (2, 5, 10, 20, 50, 100):
    aucs = ms.cross_val_score(
        estimator=ensemble.GradientBoostingClassifier(n_estimators=n_trees),
        X=X,
        y=y,
        scoring='roc_auc',
        cv=ten_fold_cv,
    )
    print('{:>3} trees: mean AUC {:.2%}'.format(n_trees, aucs.mean()))

  2 trees: mean AUC 79.51%
  5 trees: mean AUC 80.78%
 10 trees: mean AUC 81.94%
 20 trees: mean AUC 83.55%
 50 trees: mean AUC 85.24%
100 trees: mean AUC 86.32%


## XGBoost

In [10]:
# XGBoost counterpart of the 20-tree model: only the number of trees is set
# explicitly.
xgb1 = xgb.XGBClassifier(n_estimators=20)
# Fit on the full dataset; the fitted estimator is displayed as the cell output.
xgb1.fit(X=X, y=y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=20,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [11]:
# Display the fitted XGBoost model's importance scores as the cell output
# (presumably one score per column of X, in column order -- confirm
# against X.columns).
xgb1.feature_importances_

array([0.05714286, 0.16428572, 0.02142857, 0.10714286, 0.06428572,
       0.10714286, 0.00714286, 0.01428571, 0.13571429, 0.06428572,
       0.25714287], dtype=float32)

In [12]:
# One ROC AUC per fold for the 20-tree XGBoost model, using the shared splitter.
aucs = ms.cross_val_score(
    estimator=xgb1,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=ten_fold_cv,
)
# Mean AUC over the folds, shown as the cell output.
aucs.mean()

0.8314101737616385

In [13]:
# Cross-validate a fresh XGBoost model at each ensemble size and report the
# mean ROC AUC over the folds.
for n_trees in (2, 5, 10, 20, 50, 100):
    aucs = ms.cross_val_score(
        estimator=xgb.XGBClassifier(n_estimators=n_trees),
        X=X,
        y=y,
        scoring='roc_auc',
        cv=ten_fold_cv,
    )
    print('{:>3} trees: mean AUC {:.2%}'.format(n_trees, aucs.mean()))

  2 trees: mean AUC 79.07%
  5 trees: mean AUC 81.07%
 10 trees: mean AUC 81.50%
 20 trees: mean AUC 82.86%
 50 trees: mean AUC 85.14%
100 trees: mean AUC 86.33%
