In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

In [2]:
# Read the three CSV inputs; the "timestamp" column is parsed to datetimes in each.
# train and test use their "id" column as the index, macro keeps the default index.
# NOTE(review): macro is not referenced again in the visible part of the notebook.
train = pd.read_csv(
    "data/train_without_noise.csv",
    parse_dates=["timestamp"],
    index_col="id",
)
test = pd.read_csv(
    "data/test.csv",
    parse_dates=["timestamp"],
    index_col="id",
)
macro = pd.read_csv(
    "data/macro.csv",
    parse_dates=["timestamp"],
)

In [3]:
# Features: every training column except the raw timestamp and the target.
x_train = train.drop(["timestamp", "price_doc"], axis="columns")
# Target: the price_doc column.
y_train = train.loc[:, "price_doc"]

In [4]:
# Label-encode the training features: every object-dtype column is replaced,
# in place, by the integer codes of a LabelEncoder fitted on that column.
for c in x_train.columns:
    if x_train[c].dtype != "object":
        continue  # already numerical, nothing to encode
    lbl = preprocessing.LabelEncoder()
    raw_values = list(x_train[c].values)
    x_train[c] = lbl.fit_transform(raw_values)

In [5]:
# Test features: the test frame without its raw timestamp column.
x_test = test.drop("timestamp", axis="columns")

In [6]:
# Label-encode the test features with the SAME label -> integer mapping that
# the training features received.
#
# Previously a fresh LabelEncoder was fitted on each x_test column. An encoder
# numbers labels by their sorted position among the labels it was fitted on,
# so whenever train and test do not contain exactly the same label set, one
# category ends up with different codes in x_train and x_test and a model
# trained on x_train would misread the test column.
for c in x_test.columns:
    if x_test[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        # `train` still holds the raw labels (x_train was encoded on a copy
        # returned by train.drop), so this reproduces the classes that were
        # used to encode x_train[c].
        lbl.fit(list(train[c].values))
        code_by_label = {label: code for code, label in enumerate(lbl.classes_)}
        # Convert through a plain list exactly as the training encoder did, so
        # the labels go through the same array conversion before the lookup.
        test_labels = np.asarray(list(x_test[c].values))
        # Labels never seen in training get the sentinel code -1 rather than
        # an arbitrary code that collides with a real training category.
        x_test[c] = [code_by_label.get(label, -1) for label in test_labels]

In [7]:
# Baseline booster parameters, written with native xgboost parameter names.
# NOTE(review): no other visible cell reads this dict, and the xgb1 estimator
# sets colsample_bylevel=0.7 where this dict sets colsample_bytree=0.7 -
# confirm which one was intended.
xgb_params = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective="reg:linear",
    eval_metric="rmse",
    silent=1,
    seed=42,
)

In [7]:
# Starting estimator for the max_depth / min_child_weight searches.
# NOTE(review): this samples columns per tree level (colsample_bylevel), while
# the xgb_params dict and the later xgb3..xgb6 estimators use colsample_bytree
# - confirm that the difference is intentional.
xgb1 = XGBRegressor(
    objective="reg:linear",
    learning_rate=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bylevel=0.7,
    seed=42,
)

In [8]:
# Display the full parameter set xgb1 resolves to, including the values that
# were left at their defaults when the estimator was constructed.
xgb1.get_xgb_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 0.7,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.05,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'objective': 'reg:linear',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 42,
 'silent': 1,
 'subsample': 0.7}

In [10]:
# Coarse grid over tree size: max_depth in {3, 5, 7, 9} crossed with
# min_child_weight in {1, 3, 5}, scored by R^2 using GridSearchCV's default
# cross-validation.
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 6, 2),
)
gsearch1 = GridSearchCV(
    estimator=xgb1,
    param_grid=param_test1,
    scoring="r2",
    verbose=1,
)

In [11]:
# Run the coarse depth / child-weight search on the raw NumPy arrays
# (36 fits, about 15.5 minutes in the recorded run).
gsearch1.fit(X=x_train.values, y=y_train.values)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 15.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_child_weight': range(1, 6, 2), 'max_depth': range(3, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [12]:
# Dump the per-candidate CV results, then the best mean R^2 and its parameters.
print(gsearch1.cv_results_, gsearch1.best_score_, gsearch1.best_params_, sep="\n")

{'mean_score_time': array([ 0.06701175,  0.06393933,  0.06573129,  0.08408968,  0.08918341,
        0.08569034,  0.11710993,  0.11174297,  0.11425265,  0.13856006,
        0.13878004,  0.13838251]), 'split1_train_score': array([ 0.6832496 ,  0.68139135,  0.68042287,  0.7805569 ,  0.77216189,
        0.76830724,  0.85269423,  0.8432753 ,  0.83369414,  0.90689041,
        0.896589  ,  0.88596617]), 'mean_test_score': array([ 0.59756906,  0.59920533,  0.59746256,  0.60988449,  0.61355777,
        0.6161175 ,  0.6177487 ,  0.61778345,  0.62144769,  0.6043361 ,
        0.6050557 ,  0.60500439]), 'split1_test_score': array([ 0.62661742,  0.62747537,  0.6311931 ,  0.66397505,  0.65987904,
        0.66573214,  0.66774137,  0.66796597,  0.66766953,  0.65825125,
        0.66715402,  0.66651737]), 'std_test_score': array([ 0.05736395,  0.0549439 ,  0.05571959,  0.08756236,  0.0805866 ,
        0.08246068,  0.08173085,  0.08564962,  0.07723884,  0.09030452,
        0.09037289,  0.0912571 ]), 'rank

In [16]:
# Finer grid around the region explored by the coarse search:
# max_depth in {6, 7, 8} crossed with min_child_weight in {4, 5, 6}.
param_test2 = dict(
    max_depth=[6, 7, 8],
    min_child_weight=[4, 5, 6],
)
gsearch2 = GridSearchCV(
    estimator=xgb1,
    param_grid=param_test2,
    scoring="r2",
    verbose=1,
)

In [17]:
# Run the fine depth / child-weight search
# (27 fits, about 12.6 minutes in the recorded run).
gsearch2.fit(X=x_train.values, y=y_train.values)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 12.6min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_child_weight': [4, 5, 6], 'max_depth': [6, 7, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [18]:
# Dump the per-candidate CV results, then the best mean R^2 and its parameters.
print(gsearch2.cv_results_, gsearch2.best_score_, gsearch2.best_params_, sep="\n")

{'mean_score_time': array([ 0.09515031,  0.09352493,  0.09306359,  0.10771243,  0.10707641,
        0.10636965,  0.12179669,  0.1200006 ,  0.11703531]), 'split1_train_score': array([ 0.804579  ,  0.80184064,  0.79810338,  0.8392366 ,  0.83369414,
        0.82997376,  0.86772108,  0.86288497,  0.85916154]), 'mean_test_score': array([ 0.61279463,  0.61706851,  0.61888388,  0.61979564,  0.62144769,
        0.62404684,  0.60346076,  0.60608769,  0.61016511]), 'split1_test_score': array([ 0.66250872,  0.66436937,  0.66476746,  0.6675491 ,  0.66766953,
        0.66509784,  0.6639709 ,  0.66494551,  0.66695904]), 'std_test_score': array([ 0.08587164,  0.07994363,  0.0795714 ,  0.08108955,  0.07723884,
        0.07185241,  0.09401714,  0.09467049,  0.09233051]), 'rank_test_score': array([6, 5, 4, 3, 2, 1, 9, 8, 7], dtype=int32), 'split2_test_score': array([ 0.68389624,  0.6823512 ,  0.68493141,  0.68621649,  0.6840515 ,
        0.68402484,  0.67574122,  0.68080497,  0.68359669]), 'mean_fit_tim

In [17]:
# Estimator for the gamma search. max_depth=7 / min_child_weight=6 is the
# top-ranked candidate in the recorded gsearch2 results (presumably copied
# from gsearch2.best_params_); gamma is set explicitly to its starting value.
xgb2 = XGBRegressor(
    objective="reg:linear",
    learning_rate=0.05,
    max_depth=7,
    min_child_weight=6,
    gamma=0,
    subsample=0.7,
    colsample_bylevel=0.7,
    seed=42,
)

In [18]:
# Grid over the minimum split loss: gamma from 0 to 1 in steps of 0.2.
# NOTE(review): in the recorded run every gamma value produced identical
# scores, so this range had no effect on the model - presumably because the
# split gains on the unscaled price target are far larger than 1; confirm
# before relying on the chosen value.
param_test3 = dict(gamma=np.arange(0, 1.2, 0.2))
gsearch3 = GridSearchCV(
    estimator=xgb2,
    param_grid=param_test3,
    scoring="r2",
    verbose=1,
)

In [19]:
# Run the gamma search (18 fits, about 9.1 minutes in the recorded run).
gsearch3.fit(X=x_train.values, y=y_train.values)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  9.1min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'gamma': array([ 0. ,  0.2,  0.4,  0.6,  0.8,  1. ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [20]:
# Dump the per-candidate CV results, then the best mean R^2 and its parameters.
print(gsearch3.cv_results_, gsearch3.best_score_, gsearch3.best_params_, sep="\n")

{'mean_fit_time': array([ 30.24002806,  30.48722537,  29.1691583 ,  30.44474498,
        30.30343501,  29.50940537]), 'split1_train_score': array([ 0.82997376,  0.82997376,  0.82997376,  0.82997376,  0.82997376,
        0.82997376]), 'std_test_score': array([ 0.07185241,  0.07185241,  0.07185241,  0.07185241,  0.07185241,
        0.07185241]), 'mean_score_time': array([ 0.11473529,  0.11179773,  0.10508498,  0.1105334 ,  0.11938731,
        0.11219994]), 'split1_test_score': array([ 0.66509784,  0.66509784,  0.66509784,  0.66509784,  0.66509784,
        0.66509784]), 'split2_test_score': array([ 0.68402484,  0.68402484,  0.68402484,  0.68402484,  0.68402484,
        0.68402484]), 'std_score_time': array([ 0.00217996,  0.00314583,  0.00381724,  0.0059813 ,  0.01124935,
        0.00417286]), 'std_train_score': array([ 0.01309754,  0.01309754,  0.01309754,  0.01309754,  0.01309754,
        0.01309754]), 'split0_test_score': array([ 0.52302407,  0.52302407,  0.52302407,  0.52302407,  0.523

In [52]:
# Estimator for the row / column subsampling search.
# Unlike xgb1 and xgb2 it subsamples columns per tree (colsample_bytree);
# colsample_bylevel is no longer set here.
xgb3 = XGBRegressor(
    objective="reg:linear",
    learning_rate=0.05,
    max_depth=7,
    min_child_weight=6,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=42,
)

In [53]:
# 5 x 5 grid over the sampling fractions: subsample and colsample_bytree each
# take the values 0.5, 0.6, 0.7, 0.8 and 0.9 (the stop value 1 is excluded).
param_test4 = dict(
    subsample=np.arange(0.5, 1, 0.1),
    colsample_bytree=np.arange(0.5, 1, 0.1),
)
gsearch4 = GridSearchCV(
    estimator=xgb3,
    param_grid=param_test4,
    scoring="r2",
    verbose=1,
)

In [54]:
# Run the subsample / colsample_bytree search
# (75 fits, about 37.2 minutes in the recorded run).
gsearch4.fit(X=x_train.values, y=y_train.values)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 37.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'subsample': array([ 0.5,  0.6,  0.7,  0.8,  0.9]), 'colsample_bytree': array([ 0.5,  0.6,  0.7,  0.8,  0.9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [55]:
# Dump the per-candidate CV results, then the best mean R^2 and its parameters.
print(gsearch4.cv_results_, gsearch4.best_score_, gsearch4.best_params_, sep="\n")

{'mean_fit_time': array([ 21.97196261,  22.55792165,  22.82459633,  21.99888261,
        22.22140702,  25.26790325,  25.72357694,  25.84685032,
        25.39442937,  24.98320524,  28.40661867,  29.47806629,
        29.57766366,  29.06855226,  28.74974394,  32.99740299,
        33.93686136,  32.69801434,  33.21947956,  33.254445  ,
        36.5933876 ,  37.35075307,  37.35155725,  37.19543401,  36.50886933]), 'split1_train_score': array([ 0.80285941,  0.81573785,  0.82427744,  0.83262439,  0.83767667,
        0.80436458,  0.81773754,  0.82874569,  0.83695997,  0.84032511,
        0.80645391,  0.81876318,  0.83107344,  0.83783258,  0.84478245,
        0.80947238,  0.820264  ,  0.83041896,  0.83815992,  0.84423326,
        0.81129656,  0.82248296,  0.83095846,  0.83519809,  0.84571339]), 'param_colsample_bytree': masked_array(data = [0.5 0.5 0.5 0.5 0.5 0.59999999999999998 0.59999999999999998
 0.59999999999999998 0.59999999999999998 0.59999999999999998
 0.69999999999999996 0.6999999999999

In [56]:
# Estimator for the learning-rate searches, with subsample=0.9 and
# colsample_bytree=0.8 (presumably the gsearch4 optimum - the recorded output
# is truncated, so confirm against gsearch4.best_params_).
xgb4 = XGBRegressor(
    objective="reg:linear",
    learning_rate=0.05,
    max_depth=7,
    min_child_weight=6,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.8,
    seed=42,
)

In [57]:
# Coarse learning-rate grid: 0.01, 0.03, ..., 0.11.
# The number of trees stays at the estimator's n_estimators while the rate
# varies, so small rates are compared at the same tree count as large ones.
param_test5 = dict(learning_rate=np.arange(0.01, 0.12, 0.02))
gsearch5 = GridSearchCV(
    estimator=xgb4,
    param_grid=param_test5,
    scoring="r2",
    verbose=1,
)

In [58]:
# Run the coarse learning-rate search
# (18 fits, about 9.8 minutes in the recorded run).
gsearch5.fit(X=x_train.values, y=y_train.values)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  9.8min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.9),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'learning_rate': array([ 0.01,  0.03,  0.05,  0.07,  0.09,  0.11])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [59]:
# Dump the per-candidate CV results, then the best mean R^2 and its parameters.
print(gsearch5.cv_results_, gsearch5.best_score_, gsearch5.best_params_, sep="\n")

{'mean_fit_time': array([ 34.23959025,  34.2434996 ,  31.95830027,  31.46197629,
        31.30107164,  31.03376762]), 'split1_train_score': array([ 0.30940852,  0.80281028,  0.84423326,  0.86204105,  0.87536992,
        0.88139915]), 'std_test_score': array([ 0.06727064,  0.07700247,  0.09114049,  0.10448471,  0.12765139,
        0.10479015]), 'mean_score_time': array([ 0.10457738,  0.11623812,  0.11232471,  0.10873866,  0.10951837,
        0.10567331]), 'split1_test_score': array([ 0.19489699,  0.64886758,  0.6671072 ,  0.65912031,  0.6649731 ,
        0.65853678]), 'split2_test_score': array([ 0.18976284,  0.66537266,  0.68683571,  0.69682493,  0.69373271,
        0.69153157]), 'std_score_time': array([ 0.00422106,  0.00333164,  0.00174697,  0.00452629,  0.00743915,
        0.00196409]), 'std_train_score': array([ 0.00573531,  0.01381136,  0.01073932,  0.01087661,  0.00937434,
        0.00840055]), 'split0_test_score': array([ 0.33496212,  0.49440077,  0.48439083,  0.4587464 ,  0.409

In [60]:
# Fine learning-rate grid around 0.05, still using the xgb4 estimator.
param_test6 = dict(learning_rate=[0.04, 0.05, 0.06])
gsearch6 = GridSearchCV(
    estimator=xgb4,
    param_grid=param_test6,
    scoring="r2",
    verbose=1,
)

In [61]:
# Run the fine learning-rate search
# (9 fits, about 5.0 minutes in the recorded run).
gsearch6.fit(X=x_train.values, y=y_train.values)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.9),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.04, 0.05, 0.06]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [62]:
# Dump the per-candidate CV results, then the best mean R^2 and its parameters.
print(gsearch6.cv_results_, gsearch6.best_score_, gsearch6.best_params_, sep="\n")

{'mean_fit_time': array([ 33.27052132,  33.16510908,  31.805921  ]), 'split1_train_score': array([ 0.82878414,  0.84423326,  0.85414104]), 'std_test_score': array([ 0.0895207 ,  0.09114049,  0.10798384]), 'mean_score_time': array([ 0.1187044 ,  0.10999537,  0.11767554]), 'split1_test_score': array([ 0.66483569,  0.6671072 ,  0.66251147]), 'split2_test_score': array([ 0.67581682,  0.68683571,  0.6920061 ]), 'std_score_time': array([ 0.00586206,  0.00478827,  0.00750173]), 'std_train_score': array([ 0.01207467,  0.01073932,  0.01089907]), 'split0_test_score': array([ 0.4806638 ,  0.48439083,  0.44962024]), 'std_fit_time': array([ 0.47589441,  1.45608282,  0.55955369]), 'split0_train_score': array([ 0.84990347,  0.86198947,  0.86991835]), 'param_learning_rate': masked_array(data = [0.04 0.05 0.06],
             mask = [False False False],
       fill_value = ?)
, 'params': ({'learning_rate': 0.04}, {'learning_rate': 0.05}, {'learning_rate': 0.06}), 'mean_test_score': array([ 0.60710306,  

In [63]:
# Estimator for the L1-regularisation search: the xgb4 settings with reg_alpha
# set explicitly to 0 as the starting value.
xgb5 = XGBRegressor(
    objective="reg:linear",
    learning_rate=0.05,
    max_depth=7,
    min_child_weight=6,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_alpha=0,
    seed=42,
)

In [64]:
# Grid over the L1 penalty: reg_alpha = 0.01, 0.03, ..., 0.09.
# NOTE(review): in the recorded run all five values produced identical
# scores, so this range did not influence the model; a much wider
# (e.g. logarithmic) range is probably needed - confirm.
param_test7 = dict(reg_alpha=np.arange(0.01, 0.11, 0.02))
gsearch7 = GridSearchCV(
    estimator=xgb5,
    param_grid=param_test7,
    scoring="r2",
    verbose=1,
)

In [65]:
# Run the reg_alpha search (15 fits, about 8.5 minutes in the recorded run).
gsearch7.fit(X=x_train.values, y=y_train.values)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  8.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.9),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'reg_alpha': array([ 0.01,  0.03,  0.05,  0.07,  0.09])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=1)

In [66]:
# Dump the per-candidate CV results, then the best mean R^2 and its parameters.
print(gsearch7.cv_results_, gsearch7.best_score_, gsearch7.best_params_, sep="\n")

{'mean_fit_time': array([ 33.35795204,  33.54191502,  33.4405396 ,  33.59721828,  33.59879231]), 'split1_train_score': array([ 0.84423326,  0.84423326,  0.84423326,  0.84423326,  0.84423326]), 'std_test_score': array([ 0.09114049,  0.09114049,  0.09114049,  0.09114049,  0.09114049]), 'mean_score_time': array([ 0.10843492,  0.10982132,  0.11626943,  0.10852631,  0.10972071]), 'split1_test_score': array([ 0.6671072,  0.6671072,  0.6671072,  0.6671072,  0.6671072]), 'split2_test_score': array([ 0.68683571,  0.68683571,  0.68683571,  0.68683571,  0.68683571]), 'std_score_time': array([ 0.0017089 ,  0.00453805,  0.00562956,  0.00307045,  0.00395006]), 'std_train_score': array([ 0.01073932,  0.01073932,  0.01073932,  0.01073932,  0.01073932]), 'split0_test_score': array([ 0.48439082,  0.48439082,  0.48439082,  0.48439082,  0.48439082]), 'std_fit_time': array([ 0.76139841,  0.77667015,  0.46750333,  1.28307987,  1.01757436]), 'split0_train_score': array([ 0.86198947,  0.86198947,  0.86198947,

In [None]:
# Estimator carrying all values chosen so far, now with reg_alpha=0.01.
# NOTE(review): the recorded gsearch7 scores are identical for every reg_alpha
# tried, so 0.01 is not a demonstrated improvement over 0 - it is simply the
# first value of that grid.
xgb6 = XGBRegressor(
    objective="reg:linear",
    learning_rate=0.05,
    max_depth=7,
    min_child_weight=6,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_alpha=0.01,
    seed=42,
)

### To-Do
- Learning rate 0.01 - 0.2
- Subsample 0.5 - 1
- colsample_bytree 0.5 - 1
- Seed

In [15]:
# Scratch check: expand range(3, 10, 2), the max_depth grid of param_test1.
[depth for depth in range(3, 10, 2)]

[3, 5, 7, 9]