In [2]:
import pandas as pd
import numpy as np

from datetime import datetime
import xgboost as xgb



In [6]:
from sklearn.grid_search import GridSearchCV

In [3]:
%run import_preprocessing.ipynb

In [28]:
def submission_to_csv(y_pred, filename, ids = ids):
    """Save predictions as a CSV file with an ``id`` column and a ``loss`` column.

    y_pred   -- predicted values, used as the single data column
    filename -- path of the CSV file to write
    ids      -- row index written under the ``id`` label; defaults to the
                notebook-level ``ids`` as it was bound when this cell ran
    """
    submission = pd.DataFrame(y_pred, index = ids)
    submission.to_csv(filename, sep = ',', index_label = 'id', header = ['loss'])

def make_submission_gridsearch(gsearch, y_pred):
    """Write the predictions of a fitted grid search to a descriptively named CSV.

    The file is placed in ``submissions/`` (created if missing). Its name
    encodes the current day.month and hour:minute, the absolute value of the
    best CV score rounded to two decimals, the base estimator's
    ``n_estimators`` and the best parameter values.

    Parameters
    ----------
    gsearch : fitted grid-search object
        Must provide ``get_params()`` with an ``estimator__n_estimators``
        entry, plus ``best_params_`` and ``best_score_``.
    y_pred : iterable of numbers
        Predictions; each value is written with one decimal place.
    """
    import os  # local import keeps this cell runnable on its own

    # Predictions are stored as strings with a single decimal.
    formatted = ['{0:.1f}'.format(p) for p in y_pred]

    n_trees = gsearch.get_params()['estimator__n_estimators']
    # Sort by parameter name so the same search always produces the same
    # file name, independent of dict iteration order.
    params = '_'.join('%s_%s' % (k, v) for k, v in sorted(gsearch.best_params_.items()))
    # NOTE(review): the ':' in the time part is not a legal file-name
    # character on Windows; kept as-is to preserve the existing naming scheme.
    datename = datetime.now().strftime('%d.%m_%H:%M')
    folder = 'submissions'
    filename = '%s/%s_mae%s_%strees_%s.csv' % (folder, datename, round(abs(gsearch.best_score_), 2), n_trees, params)

    # Writing the CSV does not create missing directories, so do it here.
    if not os.path.isdir(folder):
        os.makedirs(folder)

    print('saving to %s...' % filename)

    submission_to_csv(formatted, filename)

# `xgboost` tuning

In [5]:
# Native xgboost matrices for the train and test sets.
# Both must use the same `missing` marker: if only the training matrix treats
# 0.0 as missing, zeros in the test set are routed through the trees
# differently from how the model saw them during training.
dtrain = xgb.DMatrix(X, label = y, missing = 0.0)
dtest = xgb.DMatrix(X_test, missing = 0.0)

In [9]:
def tune_xgb(param_grid, n_trees = 100):
    """Grid-search an ``XGBRegressor`` over ``param_grid`` and predict the test set.

    The search is fitted on the notebook-level ``X`` and ``y`` using 3-fold
    cross-validation scored by negative mean absolute error. The grid scores,
    the best parameters and the best score are printed after fitting.

    Parameters
    ----------
    param_grid : dict
        Passed to ``GridSearchCV`` unchanged.
    n_trees : int
        ``n_estimators`` of the base regressor.

    Returns
    -------
    (gsearch, pred)
        The fitted search object and the predictions of its
        ``best_estimator_`` for the notebook-level ``X_test``.
    """
    base_model = xgb.XGBRegressor(seed = 27, n_estimators = n_trees, silent = False)
    gsearch = GridSearchCV(
        estimator = base_model,
        param_grid = param_grid,
        scoring = 'neg_mean_absolute_error',
        cv = 3,
        iid = False,
        verbose = 3
    )
    gsearch.fit(X, y)

    # Report every candidate's score, then the winning combination.
    for attr in ('grid_scores_', 'best_params_', 'best_score_'):
        print(getattr(gsearch, attr))

    return gsearch, gsearch.best_estimator_.predict(X_test)

## `min_child_weight`

```
[mean: -1229.63925, std: 1.27339, params: {'min_child_weight': 1}, mean: -1229.11425, std: 1.62145, params: {'min_child_weight': 3}, mean: -1229.98570, std: 1.59661, params: {'min_child_weight': 5}]
{'min_child_weight': 3}
-1229.11424698
CPU times: user 15min 35s, sys: 7.27 s, total: 15min 42s
Wall time: 4min 48s
```

In [29]:
%%time
# Refine around 3, the best value of the coarser 1 / 3 / 5 search recorded above.
param_test = {
    'min_child_weight': [2, 3, 4],
}

# Run the search with 500 trees, then write the best model's predictions.
gsearch, pred = tune_xgb(param_test, n_trees = 500)
make_submission_gridsearch(gsearch, pred)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] min_child_weight=2 ..............................................
[CV] ................. min_child_weight=2, score=-1207.062086 - 1.5min
[CV] min_child_weight=2 ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s


[CV] ................. min_child_weight=2, score=-1203.322119 - 1.9min
[CV] min_child_weight=2 ..............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.3min remaining:    0.0s


[CV] ................. min_child_weight=2, score=-1205.412069 - 1.5min
[CV] min_child_weight=3 ..............................................
[CV] ................. min_child_weight=3, score=-1207.645791 - 1.4min
[CV] min_child_weight=3 ..............................................
[CV] ................. min_child_weight=3, score=-1203.142366 - 1.7min
[CV] min_child_weight=3 ..............................................
[CV] ................. min_child_weight=3, score=-1204.780227 - 1.7min
[CV] min_child_weight=4 ..............................................
[CV] ................. min_child_weight=4, score=-1206.182961 - 1.8min
[CV] min_child_weight=4 ..............................................
[CV] ................. min_child_weight=4, score=-1202.282733 - 1.6min
[CV] min_child_weight=4 ..............................................
[CV] ................. min_child_weight=4, score=-1204.917152 - 2.0min


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 15.0min finished


[mean: -1205.26542, std: 1.53035, params: {'min_child_weight': 2}, mean: -1205.18946, std: 1.86115, params: {'min_child_weight': 3}, mean: -1204.46095, std: 1.62461, params: {'min_child_weight': 4}]
{'min_child_weight': 4}
-1204.46094856
saving to submissions/11.10_18:27_mae1204.46_500trees_min_child_weight_4.csv...
CPU times: user 55min 6s, sys: 10.8 s, total: 55min 17s
Wall time: 17min 53s


## `max_depth`

`[default=6]`
* The maximum depth of a tree, same as GBM.
* Used to control over-fitting, as a higher depth allows the model to learn relations very specific to a particular sample.
* Should be tuned using CV.
* Typical values: 3-10

In [10]:
%%time
# Candidate maximum tree depths for the grid search.
param_test = { 'max_depth': [8, 9, 10] }

# Search currently disabled; uncomment both lines to run it and save a submission.
# gsearch, pred = tune_xgb(param_test)
# make_submission_gridsearch(gsearch, pred)

## `gamma` 

`[default=0]`
* A node is split only when the resulting split gives a positive reduction in the loss function. Gamma specifies the minimum loss reduction required to make a split.
* Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.

In [14]:
%%time
# Candidate minimum loss reductions required to make a split.
param_test = { 'gamma': [0, 0.1, 0.3] }

# Search currently disabled; uncomment both lines to run it and save a submission.
# gsearch, pred = tune_xgb(param_test)
# make_submission_gridsearch(gsearch, pred)

CPU times: user 9 µs, sys: 3 µs, total: 12 µs
Wall time: 32.9 µs


## `subsample` 

`[default=1]`
* Same as the subsample of GBM. Denotes the fraction of observations to be randomly sampled for each tree.
* Lower values make the algorithm more conservative and prevent overfitting, but values that are too small might lead to under-fitting.
* Typical values: 0.5-1

In [15]:
%%time
# Candidate row-sampling fractions per tree.
param_test = { 'subsample': [0.5, 0.75, 1.0] }

# Search currently disabled; uncomment both lines to run it and save a submission.
# gsearch, pred = tune_xgb(param_test)
# make_submission_gridsearch(gsearch, pred)

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 12.2 µs


## `subsample * max_depth` 

In [16]:
%%time
# Joint grid over depth and row-sampling fraction: 3 x 3 = 9 candidates.
param_test = { 'max_depth': [8, 9, 10], 'subsample': [0.5, 0.75, 1.0] }

# Search currently disabled; uncomment both lines to run it and save a submission.
# gsearch, pred = tune_xgb(param_test)
# make_submission_gridsearch(gsearch, pred)

CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 15 µs


## `eta`  / `learning_rate`

`[default=0.3]`

* Analogous to learning rate in GBM
* Makes the model more robust by shrinking the weights on each step
* Typical final values to be used: 0.01-0.2

In [17]:
%%time
# Candidate learning rates, with the tree depth held fixed at 10.
param_test = { 'learning_rate': [0.01, 0.2, 0.3], 'max_depth': [10] }

# Search currently disabled; uncomment both lines to run it and save a submission.
# gsearch, pred = tune_xgb(param_test)
# make_submission_gridsearch(gsearch, pred)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


## `colsample_bytree`

`[default=1]`

* Similar to `max_features` in GBM. Denotes the fraction of columns to be randomly sampled for each tree.
* Typical values: 0.5-1


In [21]:
%%time
# Candidate column-sampling fractions per tree, with the tree depth held fixed at 10.
param_test = { 'colsample_bytree': [0.5, 0.75, 1], 'max_depth': [10] }

# Search currently disabled; uncomment both lines to run it and save a submission.
# NOTE(review): the output stored under this cell shows a completed 9-fit search,
# so it was produced while these calls were still active. Its scores are positive
# (~0.95), unlike the negative MAE values of the min_child_weight run, so it
# presumably predates the `scoring` argument in tune_xgb. Confirm before comparing.
# gsearch, pred = tune_xgb(param_test)
# make_submission_gridsearch(gsearch, pred)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] max_depth=10, colsample_bytree=0.5 ..............................
[CV] ..... max_depth=10, colsample_bytree=0.5, score=0.951584 - 4.4min
[CV] max_depth=10, colsample_bytree=0.5 ..............................
[CV] ..... max_depth=10, colsample_bytree=0.5, score=0.961536 - 4.3min
[CV] max_depth=10, colsample_bytree=0.5 ..............................
[CV] ..... max_depth=10, colsample_bytree=0.5, score=0.952211 - 4.4min
[CV] max_depth=10, colsample_bytree=0.75 .............................
[CV] .... max_depth=10, colsample_bytree=0.75, score=0.945259 - 5.7min
[CV] max_depth=10, colsample_bytree=0.75 .............................
[CV] .... max_depth=10, colsample_bytree=0.75, score=0.966289 - 5.4min
[CV] max_depth=10, colsample_bytree=0.75 .............................
[CV] .... max_depth=10, colsample_bytree=0.75, score=0.960061 - 4.9min
[CV] max_depth=10, colsample_bytree=1 ................................
[CV] ....... max_d

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 48.6min finished


[mean: 0.95511, std: 0.00455, params: {'max_depth': 10, 'colsample_bytree': 0.5}, mean: 0.95720, std: 0.00882, params: {'max_depth': 10, 'colsample_bytree': 0.75}, mean: 0.95638, std: 0.01238, params: {'max_depth': 10, 'colsample_bytree': 1}]
{'max_depth': 10, 'colsample_bytree': 0.75}
0.957203107604
CPU times: user 54min 41s, sys: 49 s, total: 55min 30s
Wall time: 57min 30s


In [None]:
%%time
# Joint grid over column-sampling fraction and learning rate at depth 10: 2 x 1 x 2 = 4 candidates.
param_test = { 'colsample_bytree': [0.8, 1], 'max_depth': [10],  'learning_rate': [0.1, 0.2] }

# Search currently disabled; uncomment both lines to run it and save a submission.
# gsearch, pred = tune_xgb(param_test)
# make_submission_gridsearch(gsearch, pred)