In [10]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
# NOTE(review): `sklearn.cross_validation` and `sklearn.grid_search` are the
# pre-0.18 scikit-learn module names (superseded by `sklearn.model_selection`);
# this notebook needs an old scikit-learn release to run as written.
from sklearn import cross_validation, metrics   # Additional sklearn functions
from sklearn.grid_search import GridSearchCV # Performing grid search
from sklearn.preprocessing import LabelEncoder

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

from datetime import datetime

-----
## Preprocessing

In [11]:
# Load the training and test samples; both files are ';'-separated and indexed by `id`.
train = pd.read_csv("data/ech_apprentissage.csv", sep=';', index_col='id', low_memory=False)
X_test = pd.read_csv('data/ech_test.csv', sep=';', index_col='id', low_memory=False)

# Separate the target column from the training features.
y = train.prime_tot_ttc
X = train.drop('prime_tot_ttc', axis=1)
print(X.shape, X_test.shape)

# Stack train and test features so the preprocessing below is applied to both at once.
df = pd.concat([X, X_test], axis=0)
# df.dtypes

# Columns excluded from modelling; the disabled alternatives carry the original
# author's notes on how they compared.
vars_to_drop = ['var1', 'var3', 'var11', 'var14', 'codepostal']
# vars_to_drop = ['var1', 'var3', 'var11', 'codepostal'] # bad idea: +0.8%
# vars_to_drop = ['var1', 'var11', 'codepostal'] # bad idea: +1.06%
df.drop(vars_to_drop, axis=1, inplace=True)

(300000, 32) (30000, 32)


In [12]:
# Map every 'NR' cell in the frame to 0.
# NOTE(review): 'NR' is presumably a "not provided" placeholder — confirm against the data dictionary.
df.replace(to_replace='NR', value=0, inplace=True)

In [13]:
# Map the 'N' code in `var6` to 0.
# The result is assigned back to the column on purpose: calling an in-place
# method on the Series returned by `df.var6` is chained assignment, which pandas
# warns about and which does not update `df` when copy-on-write is enabled.
df['var6'] = df['var6'].replace('N', 0)

In [14]:
# Collect the names of all object-dtype columns; these are label-encoded next.
cat_variables = []
for column_name in df.columns:
    if df[column_name].dtype == 'O':
        cat_variables.append(column_name)
print(cat_variables)

['marque', 'energie_veh', 'profession', 'var6', 'var7', 'var8', 'var16']


In [15]:
# Fit one LabelEncoder per categorical column (on train and test rows together)
# and keep the fitted encoders in `encods`, in the same order as `cat_variables`.
encods = []
for col in cat_variables:
    encoder = LabelEncoder()
    # Cast to str first so mixed-type columns (e.g. strings plus the 0
    # placeholders inserted earlier) are encoded uniformly.
    df[col] = encoder.fit_transform(df[col].astype(str))
    encods.append(encoder)

In [16]:
# Replace the remaining NaNs with 0.
df = df.fillna(0)

# Split the combined frame back into train and test parts. The boundary is
# derived from the number of training labels instead of a hard-coded row count,
# so the cell keeps working if the input files change size.
n_train = len(y)
X = df.iloc[:n_train]
X_test = df.iloc[n_train:]

# The training rows must still line up with the target after preprocessing.
assert X.index.equals(y.index), "training features are not aligned with the target"

------
## XGBoost Tuning

In [17]:
# Human-readable metric
def mape2(preds, dtrain):
    """Mean absolute percentage error in the xgboost custom-metric format.

    Parameters
    ----------
    preds : predicted values.
    dtrain : object exposing ``get_label()`` that returns the true values
        (e.g. an ``xgb.DMatrix``).

    Returns
    -------
    tuple
        ``('mape2', value)`` where ``value`` is the mean of
        ``|y_true - preds| / |y_true|``. A true value of 0 leads to a
        division by zero.
    """
    labels = dtrain.get_label()
    relative_errors = np.abs((labels - preds) / labels)
    return 'mape2', np.mean(relative_errors)

In [18]:
# Preprocessing replaced NaNs with 0, and 0.0 is declared here as the
# missing-value marker. Note that this also makes genuine zeros (including
# label-encoded class 0) count as missing.
dtrain = xgb.DMatrix(X, y, missing=0.0)

# The test matrix must use the same missing-value marker as the training
# matrix; otherwise zeros are treated as missing while fitting but as real
# values while predicting, and rows are routed through the trees differently.
dtest = xgb.DMatrix(X_test, missing=0.0)

In [None]:
# param = { }
# num_round = 100
# bst = xgb.train(param, dtrain, num_round)

### `min_child_weight`

In [19]:
%%time

# Candidate values for `min_child_weight`. Wider grids, currently disabled:
#   'max_depth': [3, 5, 7, 9]
#   'min_child_weight': [1, 3, 5]
param_test_1 = {'min_child_weight': [1, 3]}

# 5-fold grid search. No `scoring` is passed, so the estimator's own `score`
# method ranks the candidates. `scoring = mape2` stays disabled: `mape2` takes
# (preds, dtrain), which is not the signature of an sklearn scorer.
gsearch_1 = GridSearchCV(
    estimator=XGBRegressor(seed=27),
    param_grid=param_test_1,
    iid=False,
    cv=5,
)
gsearch_1.fit(X, y)

# Bare tuple as the final expression: per-candidate scores, best setting, best score.
gsearch_1.grid_scores_, gsearch_1.best_params_, gsearch_1.best_score_

### `max_depth`

`[default=6]`
* The maximum depth of a tree, same as GBM.
* Used to control over-fitting, since a higher depth allows the model to learn relations that are very specific to a particular sample.
* Should be tuned using CV.
* Typical values: 3-10

In [35]:
%%time

# Candidate tree depths to compare.
param_test_2 = {'max_depth': [6, 9]}

# 5-fold grid search using the estimator's default `score`
# (`scoring = mape2` is left disabled).
gsearch_2 = GridSearchCV(
    estimator=XGBRegressor(seed=11),
    param_grid=param_test_2,
    iid=False,
    cv=5,
)

# The scores are reported by the following cell.
gsearch_2.fit(X, y)

CPU times: user 20min 48s, sys: 4.29 s, total: 20min 52s
Wall time: 21min


In [36]:
# Print the per-candidate CV scores, then the best setting and its mean score.
for result in (gsearch_2.grid_scores_, gsearch_2.best_params_, gsearch_2.best_score_):
    print(result)

# Output kept from a run over max_depth in [3, 9]:
# [mean: 0.84584, std: 0.00293, params: {'max_depth': 3}, mean: 0.89360, std: 0.00240, params: {'max_depth': 9}]
# {'max_depth': 9}
# 0.893603987388

[mean: 0.88464, std: 0.00254, params: {'max_depth': 6}, mean: 0.89360, std: 0.00240, params: {'max_depth': 9}]
{'max_depth': 9}
0.893603987388


### `subsample`

`[default=1]`

* Same as the subsample of GBM. Denotes the fraction of observations to be randomly sampled for each tree.
* Lower values make the algorithm more conservative and prevent overfitting, but values that are too small might lead to under-fitting.
* Typical values: 0.5-1

In [28]:
%%time

# Candidate row-subsampling fractions.
param_test_3 = {'subsample': [0.5, 0.8, 1]}

# 5-fold grid search using the estimator's default `score`
# (`scoring = mape2` is left disabled).
gsearch_3 = GridSearchCV(
    estimator=XGBRegressor(seed=11),
    param_grid=param_test_3,
    iid=False,
    cv=5,
)
gsearch_3.fit(X, y)

# Per-candidate CV scores, then the best setting and its mean score.
for result in (gsearch_3.grid_scores_, gsearch_3.best_params_, gsearch_3.best_score_):
    print(result)

[mean: 0.84781, std: 0.00181, params: {'subsample': 0.5}, mean: 0.84681, std: 0.00167, params: {'subsample': 0.8}, mean: 0.84584, std: 0.00293, params: {'subsample': 1}]
{'subsample': 0.5}
0.847808192227
CPU times: user 11min 50s, sys: 4.41 s, total: 11min 54s
Wall time: 11min 58s


------

In [29]:
# Booster settings: the best `subsample` and `max_depth` reported by the grid
# searches above (each was tuned separately, with the other left at its default).
param = {'subsample': 0.5, 'max_depth': 9}
num_round = 100
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

# Predict on the test matrix and scale every prediction by 0.99, which the
# original author describes as "a nice hack"; the notebook gives no further
# justification for the factor.
pred = 0.99 * bst.predict(dtest)

In [31]:
# Build a timestamped file name (day, month, hour, minute) for the submission.
timestamp = datetime.now().strftime(format='%d%m_%H%M')
filename = 'submissions/xgb_tuning_' + timestamp + '.csv'
print('saving to %s...' % filename)

# Two columns, `id` and `pred`, written ';'-separated without the frame's own index.
submission = pd.DataFrame({'id': X_test.index, 'pred': pred})
submission.to_csv(filename, index=False, sep=';')

saving to submissions/xgb_tuning_2008_2112.csv...


------

### `learning_rate`

In [32]:
%%time

# Candidate learning rates.
param_test = {'learning_rate': [0.1, 0.2, 0.05]}

# 5-fold grid search using the estimator's default `score`.
gsearch = GridSearchCV(
    estimator=XGBRegressor(seed=11),
    param_grid=param_test,
    iid=False,
    cv=5,
)
gsearch.fit(X, y)

# Per-candidate CV scores, then the best setting and its mean score.
for result in (gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_):
    print(result)

[mean: 0.84584, std: 0.00293, params: {'learning_rate': 0.1}, mean: 0.86764, std: 0.00225, params: {'learning_rate': 0.2}, mean: 0.80411, std: 0.00186, params: {'learning_rate': 0.05}]
{'learning_rate': 0.2}
0.867641998714
CPU times: user 11min 38s, sys: 4.66 s, total: 11min 43s
Wall time: 11min 46s


In [37]:
%%time

# Follow-up run with one additional, larger learning rate.
param_test = {'learning_rate': [0.3]}

# 5-fold cross-validation of that single candidate using the estimator's default `score`.
gsearch = GridSearchCV(
    estimator=XGBRegressor(seed=11),
    param_grid=param_test,
    iid=False,
    cv=5,
)
gsearch.fit(X, y)

# CV score of the candidate, then the "best" setting and its mean score.
for result in (gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_):
    print(result)

[mean: 0.87495, std: 0.00209, params: {'learning_rate': 0.3}]
{'learning_rate': 0.3}
0.874945101343
CPU times: user 4min 30s, sys: 2 s, total: 4min 32s
Wall time: 4min 35s


------

In [38]:
# Booster settings: the best `subsample`, `max_depth` and `learning_rate`
# reported by the grid searches above (each was tuned separately).
param = {'subsample': 0.5, 'max_depth': 9, 'learning_rate': 0.3}
num_round = 100
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

# Predict on the test matrix and scale every prediction by 0.99, which the
# original author describes as "a nice hack"; the notebook gives no further
# justification for the factor.
pred = 0.99 * bst.predict(dtest)

In [39]:
# Build a timestamped file name (day, month, hour, minute) for the submission.
timestamp = datetime.now().strftime(format='%d%m_%H%M')
filename = 'submissions/xgb_tuning_' + timestamp + '.csv'
print('saving to %s...' % filename)

# Two columns, `id` and `pred`, written ';'-separated without the frame's own index.
submission = pd.DataFrame({'id': X_test.index, 'pred': pred})
submission.to_csv(filename, index=False, sep=';')

saving to submissions/xgb_tuning_2008_2240.csv...


------