In [None]:
import os
from datetime import datetime

import numpy as np
import xgboost as xgb
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

# from rmse import rmse

%run load_encoded.ipynb

In [None]:
def submission_to_csv(cible, filename, id = id):
    """Write predictions to a ';'-separated CSV file with columns `id` and `cible`.

    Parameters
    ----------
    cible : sequence
        Predicted values, one per row. Floats are written with one decimal;
        values that are already strings are written unchanged.
    filename : str
        Destination path, handed straight to `DataFrame.to_csv`.
    id : sequence, optional
        Row identifiers, used as the frame index and written under the `id`
        header. The default is whatever the name `id` is bound to when this
        cell is executed -- presumably the test ids loaded by
        load_encoded.ipynb (TODO confirm). If nothing has rebound the name,
        the default is Python's builtin `id` function, which is not a usable
        index, so pass `id` explicitly in that case.
    """
    submission = pd.DataFrame(cible, id)
    submission.to_csv(
        filename,
        sep=';',
        index_label='id',
        header=['cible'],
        # pandas applies a string float_format with the `%` operator, so it
        # must be printf-style; the previous '{.1f}' raised TypeError as soon
        # as a float column was written.
        float_format='%.1f',
    )
def make_submission_gridsearch(gsearch, pred):
    """Save grid-search predictions under a filename describing the best parameters.

    Parameters
    ----------
    gsearch : object
        Fitted search object; only its `best_params_` mapping is read here.
    pred : iterable of numbers
        Predictions to save; each one is formatted with one decimal place.

    The file is written to
    ``submissions/xgb_tuning_<param>_<value>_..._<ddmm_HHMM>.csv`` through
    `submission_to_csv`, using that function's default `id`.
    """
    cible = ['{0:.1f}'.format(p) for p in pred]

    # Sort by parameter name so the filename does not depend on dict
    # iteration order.
    best_params = sorted(gsearch.best_params_.items(), key=lambda kv: kv[0])
    params = '_'.join('%s_%s' % (k, v) for k, v in best_params)
    datename = datetime.now().strftime('%d%m_%H%M')
    folder = 'submissions'
    # Create the output folder on first use so the write below cannot fail
    # only because the folder is missing.
    if not os.path.isdir(folder):
        os.makedirs(folder)
    filename = folder + '/' + 'xgb_tuning_' + params + '_' + datename + '.csv'
    print('saving to %s...' % filename)

    submission_to_csv(cible, filename)

# %import make_submission.ipynb

# `xgboost` tuning

In [7]:
# Build the xgboost matrices. X, y and X_test are not defined in this notebook;
# presumably they come from load_encoded.ipynb (TODO confirm).
# 0.0 is declared as the missing-value marker for the training matrix, so the
# test matrix must use the same marker; otherwise the same feature value is
# interpreted differently at fit and predict time.
dtrain = xgb.DMatrix(X, y, missing = 0.0)
dtest = xgb.DMatrix(X_test, missing = 0.0)

In [20]:
def tune_xgb(param_grid, n_trees = 100, scoring = None, cv = 3, seed = 27):
    """Grid-search `XGBRegressor` hyper-parameters and predict on the test set.

    Fits on the notebook-level `X` and `y`, and predicts on the notebook-level
    `X_test`; none of these are passed as arguments.

    Parameters
    ----------
    param_grid : dict
        Grid handed to `GridSearchCV` as `param_grid`.
    n_trees : int, optional
        Value of `n_estimators` for the regressor (default 100).
    scoring : str or callable or None, optional
        Forwarded to `GridSearchCV`. `None` (default) leaves the choice to
        `GridSearchCV`, as the original call did. Pass e.g.
        'neg_mean_squared_error' instead of editing this function.
    cv : int, optional
        Forwarded to `GridSearchCV` as `cv` (default 3).
    seed : int, optional
        Random seed given to the regressor (default 27).

    Returns
    -------
    (gsearch, pred)
        The fitted search object and the best estimator's predictions for
        `X_test`.
    """
    estimator = xgb.XGBRegressor(seed = seed, n_estimators = n_trees, silent = False)
    gsearch = GridSearchCV(estimator = estimator,
                           param_grid = param_grid,
                           scoring = scoring,
                           iid = False,
                           cv = cv,
                           verbose = 3)

    gsearch.fit(X, y)

    # Summary of the search: per-candidate scores, then the winner.
    print(gsearch.grid_scores_)
    print(gsearch.best_params_)
    print(gsearch.best_score_)

    pred = gsearch.best_estimator_.predict(X_test)
    return gsearch, pred

## `min_child_weight`

In [9]:
# %%time
# param_test_1 = { 'min_child_weight': [1, 3, 5] }

# gsearch, pred = tune_xgb(param_test_1, n_trees = 150)
# make_submission_gridsearch(gsearch, pred)

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 14.1 µs


## `max_depth`

`[default=6]`
* The maximum depth of a tree, same as GBM.
* Used to control over-fitting, as a higher depth allows the model to learn relations very specific to a particular sample.
* Should be tuned using CV.
* Typical values: 3-10

```python
[mean: 0.95449, std: 0.00880, params: {'max_depth': 8}, mean: 0.95939, std: 0.00803, params: {'max_depth': 9}, mean: 0.96333, std: 0.00698, params: {'max_depth': 10}]
{'max_depth': 10}
0.963330452626
CPU times: user 2h 2min 27s, sys: 9.29 s, total: 2h 2min 36s
Wall time: 35min 52s
```

In [10]:
# %%time
# param_test_max_depth = { 'max_depth': [8, 9, 10] }

# gsearch, pred = tune_xgb(param_test_max_depth, n_trees = 150)
# make_submission_gridsearch(gsearch, pred)

## `gamma` 

`[default=0]`
* A node is split only when the resulting split gives a positive reduction in the loss function. Gamma specifies the minimum loss reduction required to make a split.
* Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.

In [14]:
# %%time
# param_test_gamma = { 'gamma': [0, 0.1, 0.3] }

# gsearch, pred = tune_xgb(param_test_gamma, n_trees = 200)
# make_submission_gridsearch(gsearch, pred)

CPU times: user 9 µs, sys: 3 µs, total: 12 µs
Wall time: 32.9 µs


## `subsample` 

`[default=1]`
* Same as the subsample of GBM. Denotes the fraction of observations to be randomly sampled for each tree.
* Lower values make the algorithm more conservative and prevent over-fitting, but values that are too small might lead to under-fitting.
* Typical values: 0.5-1

```python
[mean: 0.91839, std: 0.00824, params: {'subsample': 0.5}, mean: 0.91845, std: 0.00792, params: {'subsample': 0.75}, mean: 0.91801, std: 0.00807, params: {'subsample': 1.0}]
{'subsample': 0.75}
0.918446100422
CPU times: user 56min 17s, sys: 7.14 s, total: 56min 24s
Wall time: 19min 26s
```

In [15]:
# %%time
# param_test_subsample = { 'subsample': [0.5, 0.75, 1.0] }

# gsearch, pred = tune_xgb(param_test_subsample, n_trees = 200)
# make_submission_gridsearch(gsearch, pred)

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 12.2 µs


## `subsample * max_depth` 

```python
[mean: -0.00118, std: 0.00045, params: {'subsample': 0.5, 'max_depth': 8}, mean: -0.00120, std: 0.00048, params: {'subsample': 0.75, 'max_depth': 8}, mean: -0.00121, std: 0.00047, params: {'subsample': 1.0, 'max_depth': 8}, mean: -0.00106, std: 0.00041, params: {'subsample': 0.5, 'max_depth': 9}, mean: -0.00106, std: 0.00042, params: {'subsample': 0.75, 'max_depth': 9}, mean: -0.00107, std: 0.00042, params: {'subsample': 1.0, 'max_depth': 9}, mean: -0.00098, std: 0.00040, params: {'subsample': 0.5, 'max_depth': 10}, mean: -0.00099, std: 0.00038, params: {'subsample': 0.75, 'max_depth': 10}, mean: -0.00098, std: 0.00037, params: {'subsample': 1.0, 'max_depth': 10}]
{'subsample': 1.0, 'max_depth': 10}
-0.000976894278568
CPU times: user 8h 20min 10s, sys: 48.7 s, total: 8h 20min 58s
Wall time: 2h 39min 30s
```

In [16]:
# %%time
# param_test = { 'max_depth': [8, 9, 10], 'subsample': [0.5, 0.75, 1.0] }

# gsearch, pred = tune_xgb(param_test, n_trees = 200)
# make_submission_gridsearch(gsearch, pred)

CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 15 µs


## `eta`  / `learning_rate`

`[default=0.3]`

* Analogous to learning rate in GBM
* Makes the model more robust by shrinking the weights on each step
* Typical final values to be used: 0.01-0.2

```python
[mean: -0.00148, std: 0.00024, params: {'learning_rate': 0.01, 'max_depth': 10}, mean: -0.00109, std: 0.00025, params: {'learning_rate': 0.2, 'max_depth': 10}, mean: -0.00114, std: 0.00029, params: {'learning_rate': 0.3, 'max_depth': 10}]
{'learning_rate': 0.2, 'max_depth': 10}
-0.00108544710169
```

In [17]:
# %%time
# param_test_lr = { 'learning_rate': [0.01, 0.2, 0.3], 'max_depth': [10] }

# gsearch, pred = tune_xgb(param_test_lr, n_trees = 1000)
# make_submission_gridsearch(gsearch, pred)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


## `colsample_bytree`

`[default=1]`

* Similar to `max_features` in GBM. Denotes the fraction of columns to be randomly sampled for each tree.
* Typical values: 0.5-1

```python
[mean: 0.95511, std: 0.00455, params: {'max_depth': 10, 'colsample_bytree': 0.5}, mean: 0.95720, std: 0.00882, params: {'max_depth': 10, 'colsample_bytree': 0.75}, mean: 0.95638, std: 0.01238, params: {'max_depth': 10, 'colsample_bytree': 1}]
{'max_depth': 10, 'colsample_bytree': 0.75}
0.957203107604
CPU times: user 54min 41s, sys: 49 s, total: 55min 30s
Wall time: 57min 30s
```

In [21]:
# %%time
# param_test_cs = { 'colsample_bytree': [0.5, 0.75, 1], 'max_depth': [10] }

# gsearch, pred = tune_xgb(param_test_cs, n_trees = 200)
# make_submission_gridsearch(gsearch, pred)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] max_depth=10, colsample_bytree=0.5 ..............................
[CV] ..... max_depth=10, colsample_bytree=0.5, score=0.951584 - 4.4min
[CV] max_depth=10, colsample_bytree=0.5 ..............................
[CV] ..... max_depth=10, colsample_bytree=0.5, score=0.961536 - 4.3min
[CV] max_depth=10, colsample_bytree=0.5 ..............................
[CV] ..... max_depth=10, colsample_bytree=0.5, score=0.952211 - 4.4min
[CV] max_depth=10, colsample_bytree=0.75 .............................
[CV] .... max_depth=10, colsample_bytree=0.75, score=0.945259 - 5.7min
[CV] max_depth=10, colsample_bytree=0.75 .............................
[CV] .... max_depth=10, colsample_bytree=0.75, score=0.966289 - 5.4min
[CV] max_depth=10, colsample_bytree=0.75 .............................
[CV] .... max_depth=10, colsample_bytree=0.75, score=0.960061 - 4.9min
[CV] max_depth=10, colsample_bytree=1 ................................
[CV] ....... max_d

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 48.6min finished


[mean: 0.95511, std: 0.00455, params: {'max_depth': 10, 'colsample_bytree': 0.5}, mean: 0.95720, std: 0.00882, params: {'max_depth': 10, 'colsample_bytree': 0.75}, mean: 0.95638, std: 0.01238, params: {'max_depth': 10, 'colsample_bytree': 1}]
{'max_depth': 10, 'colsample_bytree': 0.75}
0.957203107604
CPU times: user 54min 41s, sys: 49 s, total: 55min 30s
Wall time: 57min 30s


```python
[mean: 0.95482, std: 0.00912, params: {'learning_rate': 0.1, 'colsample_bytree': 0.8, 'max_depth': 10}, mean: 0.95957, std: 0.00996, params: {'learning_rate': 0.2, 'colsample_bytree': 0.8, 'max_depth': 10}, mean: 0.95623, std: 0.01258, params: {'learning_rate': 0.1, 'colsample_bytree': 1, 'max_depth': 10}, mean: 0.95941, std: 0.01505, params: {'learning_rate': 0.2, 'colsample_bytree': 1, 'max_depth': 10}]
{'learning_rate': 0.2, 'colsample_bytree': 0.8, 'max_depth': 10}
```

In [None]:
# %%time
# param_test_cs = { 'colsample_bytree': [0.8, 1], 'max_depth': [10],  'learning_rate': [0.1, 0.2] }

# gsearch, pred = tune_xgb(param_test_cs, n_trees = 200)
# make_submission_gridsearch(gsearch, pred)