In [1]:
"""
You need to run this cell for the code in following cells to work.
"""

# Enable module reloading
%load_ext autoreload
%autoreload 2

import os
os.chdir("..")

In [2]:
import math

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_log_error
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor

import xgboost as xgb

from src.data.preprocessing.utils import filter_data, merge_data
from src.data.feature_unions import merged_fu, merged_fu_without_outliers

In [3]:
def cv_mean_std_score(cv_result, scoring):
    result = {}
    for key, value in cv_result.items():
        if 'test_' in key or 'train_' in key:
            result[key] = {'mean': value.mean(), 'std': value.std()}
    return result

def rmse(y_true, y_pred):
    y_pred = [0.0 if x < 0 else x for x in y_pred]
    try:
        res = mean_squared_error(y_true, y_pred, squared=False)
    except ValueError as e:
        print(e)
        res = -1000
    return res

def nrmse(y_true, y_pred):
    return rmse(y_true, y_pred) / (max(y_true) - min(y_true)) * 100

def rmsle(y_true, y_pred):
    y_pred = [0.0 if x < 0 else x for x in y_pred]
    try:
        res = math.sqrt(mean_squared_log_error(y_true, y_pred))
    except ValueError as e:
        print(e)
        res = -1000
    return res

scoring = {
    'nrmse': make_scorer(rmse, greater_is_better=False),
    'nnrmse': make_scorer(nrmse, greater_is_better=False),
    'nrmsle': make_scorer(rmsle, greater_is_better=False)
}

In [4]:
%%time
buildings_data = pd.read_csv('data/building_metadata.csv')
weather_data = pd.read_csv('data/weather_train.csv')
meter_data = pd.read_csv('data/train.csv')

CPU times: user 8.23 s, sys: 670 ms, total: 8.9 s
Wall time: 8.9 s


# Experiments

##  Experiments with `site_id` 0

In [5]:
%%time
buildings_data, weather_data, meter_data = filter_data(
    buildings_data, weather_data, meter_data, site_id=0, meter=0, meter_reading=200
)
data = merge_data(buildings_data, weather_data, meter_data)

CPU times: user 3.26 s, sys: 344 ms, total: 3.6 s
Wall time: 3.6 s


In [6]:
y = data[['meter_reading']].values.ravel()
len(y), y

(605288,
 array([  0.    ,   0.    ,   0.    , ...,  47.2332, 142.245 , 132.802 ]))

In [17]:
%%time
x = merged_fu.union_features(data)
x

CPU times: user 1.46 s, sys: 544 ms, total: 2 s
Wall time: 2min 29s


Unnamed: 0,Education,Entertainment/public assembly,Lodging/residential,Office,Other,square_feet,meter_reading_scaled_1,meter_reading_scaled_2,meter_reading_scaled_3,meter_reading_scaled_4,meter_reading_scaled_5,air_temperature,dew_temperature,wind_direction,wind_speed
0,1.0,0.0,0.0,0.0,0.0,-0.779911,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
1,1.0,0.0,0.0,0.0,0.0,-0.755156,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
2,1.0,0.0,0.0,0.0,0.0,-0.584502,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
3,1.0,0.0,0.0,0.0,0.0,0.281602,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
4,1.0,0.0,0.0,0.0,0.0,-0.730698,-0.657943,-0.657943,-0.657943,-0.657943,-0.657943,0.358819,0.488281,-1.332590,-1.567109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605283,0.0,0.0,0.0,1.0,0.0,-0.784199,-0.261517,-0.305532,-0.205217,-0.176896,-0.196572,0.822506,1.086596,-0.139979,-0.127903
605284,0.0,0.0,1.0,0.0,0.0,-0.577315,1.926144,2.053517,1.989830,2.080827,1.971634,0.822506,1.086596,-0.139979,-0.127903
605285,0.0,0.0,0.0,1.0,0.0,-0.629474,0.292889,0.242847,0.306538,0.374780,0.361132,0.822506,1.086596,-0.139979,-0.127903
605286,1.0,0.0,0.0,0.0,0.0,-0.603404,1.741883,1.710039,1.662266,1.653168,1.500767,0.822506,1.086596,-0.139979,-0.127903


### Extremely  Randomized  Trees

In this section we experiment with **Extremely Randomized Trees Regression** algorithm.

####  With outliers

In [7]:
pipeline = Pipeline([
    ('features', merged_fu),
    ('classifier', ExtraTreesRegressor(
        n_jobs=-1
    ))
])

In [8]:
%%time
result = cross_validate(pipeline, data, y, cv=5, scoring=scoring)

CPU times: user 1min 8s, sys: 16.1 s, total: 1min 24s
Wall time: 16min 57s


In [9]:
result

{'fit_time': array([219.8563323 , 188.39170861, 178.42246723, 200.68005228,
        190.4943397 ]),
 'score_time': array([ 4.13445902,  6.01872849,  8.24122906, 13.21035266,  7.60145688]),
 'test_neg_root_mean_squared_error': array([-30.41032752, -34.67210464, -59.32714906, -24.16964255,
        -25.32649584]),
 'test_neg_normalized_root_mean_squared_error': array([-15.6437359 , -17.76125681, -29.66505778, -12.08542555,
        -12.66343787]),
 'test_neg_root_mean_squared_log_error': array([-2.39625613, -2.55994908, -3.11067965, -0.66217591, -0.56967891])}

In [10]:
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -34.781143923057456,
  'std': 12.834458504648522},
 'test_neg_normalized_root_mean_squared_error': {'mean': -17.56378278145746,
  'std': 6.390496239371232},
 'test_neg_root_mean_squared_log_error': {'mean': -1.8597479365086678,
  'std': 1.0432075249847157}}

####  Without outliers

In [7]:
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('classifier', ExtraTreesRegressor(
        n_jobs=-1
    ))
])

In [8]:
%%time
result = cross_validate(pipeline, data, y, cv=5, scoring=scoring)

CPU times: user 1min 6s, sys: 16.1 s, total: 1min 22s
Wall time: 16min 4s


In [9]:
result

{'fit_time': array([184.97140241, 183.02505994, 171.70970035, 180.8711586 ,
        204.79703856]),
 'score_time': array([ 3.95679188,  7.4586904 ,  8.17715549, 10.08739924,  9.48864102]),
 'test_neg_root_mean_squared_error': array([-29.28447081, -35.14438758, -60.24574366, -24.1713793 ,
        -25.33199547]),
 'test_neg_normalized_root_mean_squared_error': array([-15.06457064, -18.00319016, -30.12437805, -12.08629397,
        -12.66618773])}

In [10]:
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -34.83559536458503,
  'std': 13.271246272136471},
 'test_neg_normalized_root_mean_squared_error': {'mean': -17.588924109668923,
  'std': 6.605964793249112}}

### Gradient Boosting

In this section we experiment with **Gradient Boosting Regression** algorithm.

####  With outliers

In [8]:
pipeline = Pipeline([
    ('features', merged_fu),
    ('classifier', GradientBoostingRegressor())
])

In [12]:
%%time
result = cross_validate(pipeline, data, y, cv=5, scoring=scoring)

CPU times: user 11min 47s, sys: 779 ms, total: 11min 48s
Wall time: 18min 15s


In [13]:
result

{'fit_time': array([229.45887518, 235.7858367 , 212.88394046, 189.19264197,
        194.28082752]),
 'score_time': array([3.67976141, 6.59404635, 9.59670663, 9.030231  , 4.92878103]),
 'test_neg_root_mean_squared_error': array([ -8.7156448 ,  -7.73164527, -16.99786573, -29.69650662,
        -32.12221959]),
 'test_neg_normalized_root_mean_squared_error': array([ -4.48351782,  -3.96064037,  -8.49935783, -14.84899576,
        -16.06135072])}

In [14]:
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -19.052776404173873,
  'std': 10.230799539832114},
 'test_neg_normalized_root_mean_squared_error': {'mean': -9.570772500169276,
  'std': 5.0692981468776255}}

####  Without outliers

In [40]:
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('classifier', GradientBoostingRegressor())
])

In [41]:
%%time
result = cross_validate(pipeline, data, y, cv=5, scoring=scoring, return_train_score=True)

CPU times: user 11min 27s, sys: 1.85 s, total: 11min 29s
Wall time: 23min 1s


In [42]:
result

{'fit_time': array([213.0756371 , 224.50695515, 212.17134857, 182.44044185,
        192.51071668]),
 'score_time': array([ 3.44237852,  5.39619136,  7.78681874, 10.21395588,  5.97975063]),
 'test_neg_root_mean_squared_error': array([ -7.830535  ,  -7.6937339 , -17.08321472, -29.73671728,
        -32.15608398]),
 'train_neg_root_mean_squared_error': array([-18.54917346, -18.40827548, -18.00550728, -17.32643597,
        -16.72149528]),
 'test_neg_normalized_root_mean_squared_error': array([ -4.02819803,  -3.94121975,  -8.54203446, -14.86910209,
        -16.07828316]),
 'train_neg_normalized_root_mean_squared_error': array([-9.27472585, -9.2042758 , -9.00288868, -8.66334793, -8.3611657 ]),
 'test_neg_root_mean_squared_log_error': array([-1.19600217, -1.31780551, -1.8350073 , -0.71665458, -0.72805533]),
 'train_neg_root_mean_squared_log_error': array([-0.993806  , -0.97470993, -0.96503882, -1.057342  , -1.02068838])}

In [43]:
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -18.900056974555284,
  'std': 10.436209732735954},
 'train_neg_root_mean_squared_error': {'mean': -17.80217749299934,
  'std': 0.687154247227817},
 'test_neg_normalized_root_mean_squared_error': {'mean': -9.49176750000829,
  'std': 5.174148827378152},
 'train_neg_normalized_root_mean_squared_error': {'mean': -8.901280793871448,
  'std': 0.3434902367915776},
 'test_neg_root_mean_squared_log_error': {'mean': -1.1587049798858826,
  'std': 0.4159222436065244},
 'train_neg_root_mean_squared_log_error': {'mean': -1.0023170247762228,
  'std': 0.0334350370038346}}

We can see that Gradient Boosting algorithm outperforms Extremely Randomized Trees algorithm. Also, GB with replaced outliers achieve better results.

Try to use 200 estimators.

In [13]:
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('classifier', GradientBoostingRegressor(n_estimators=200))
])

In [14]:
%%time
result = cross_validate(pipeline, data, y, cv=5, scoring=scoring, return_train_score=True)

CPU times: user 22min 30s, sys: 2.24 s, total: 22min 32s
Wall time: 35min 19s


In [15]:
result

{'fit_time': array([363.85885859, 361.26309967, 352.70529771, 305.29713702,
        311.60204792]),
 'score_time': array([ 6.97142267,  6.82749963, 10.10379553, 11.2027626 , 10.73599458]),
 'test_neg_root_mean_squared_error': array([ -7.94522843,  -7.79030336, -17.67358869, -29.10359223,
        -31.47425378]),
 'train_neg_root_mean_squared_error': array([-17.91152067, -17.85680315, -17.41504872, -16.8178865 ,
        -16.29680614]),
 'test_neg_normalized_root_mean_squared_error': array([ -4.08719883,  -3.99068877,  -8.83723621, -14.55252374,
        -15.73736295]),
 'train_neg_normalized_root_mean_squared_error': array([-8.95589468, -8.9285355 , -8.70765497, -8.40906939, -8.14881051])}

In [20]:
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -18.797393297676003,
  'std': 10.07083885469336},
 'train_neg_root_mean_squared_error': {'mean': -17.259613038396687,
  'std': 0.620950769761718},
 'test_neg_normalized_root_mean_squared_error': {'mean': -9.441002100205653,
  'std': 4.990099628824158},
 'train_neg_normalized_root_mean_squared_error': {'mean': -8.629993010767093,
  'std': 0.31039160391687437}}

Try to use deeper trees - max depth equals to 6.

In [21]:
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('classifier', GradientBoostingRegressor(max_depth=6))
])

In [22]:
%%time
result = cross_validate(pipeline, data, y, cv=5, scoring=scoring, return_train_score=True)

CPU times: user 21min 51s, sys: 2.12 s, total: 21min 53s
Wall time: 33min 52s


In [23]:
result

{'fit_time': array([355.61827588, 359.0064435 , 339.57385325, 295.07676649,
        302.99209166]),
 'score_time': array([4.86790609, 7.01241064, 9.39163494, 9.26880312, 5.64262557]),
 'test_neg_root_mean_squared_error': array([ -9.80987984,  -8.95157648, -23.27720904, -27.3468737 ,
        -29.64113574]),
 'train_neg_root_mean_squared_error': array([-15.79689211, -15.73661244, -15.01281957, -15.0873088 ,
        -14.45512188]),
 'test_neg_normalized_root_mean_squared_error': array([ -5.0464162 ,  -4.58556671, -11.63918648, -13.67412056,
        -14.82079018]),
 'train_neg_normalized_root_mean_squared_error': array([-7.89856453, -7.86842425, -7.50652238, -7.54376756, -7.22792233])}

In [24]:
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -19.805334961971397,
  'std': 8.756539149201132},
 'train_neg_root_mean_squared_error': {'mean': -15.217750958403467,
  'std': 0.4990464618945322},
 'test_neg_normalized_root_mean_squared_error': {'mean': -9.953216025452852,
  'std': 4.319010275782475},
 'train_neg_normalized_root_mean_squared_error': {'mean': -7.609040210261233,
  'std': 0.24944966264885923}}

Try to use shallower trees - max depth equals to 1.

In [25]:
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('classifier', GradientBoostingRegressor(max_depth=1))
])

In [26]:
%%time
result = cross_validate(pipeline, data, y, cv=5, scoring=scoring, return_train_score=True)

CPU times: user 3min 47s, sys: 1.28 s, total: 3min 49s
Wall time: 15min 33s


In [27]:
result

{'fit_time': array([125.62971902, 118.04174852, 113.83048368,  98.93992829,
        113.23761392]),
 'score_time': array([3.54683805, 6.67581725, 8.09305763, 8.9606626 , 5.87607765]),
 'test_neg_root_mean_squared_error': array([ -8.37750939,  -7.94671181, -16.79778876, -36.23817497,
        -37.45160994]),
 'train_neg_root_mean_squared_error': array([-22.70235254, -22.7396642 , -22.58782099, -20.21723075,
        -19.9085798 ]),
 'test_neg_normalized_root_mean_squared_error': array([ -4.30957359,  -4.07081112,  -8.39931434, -18.11999348,
        -18.72608586]),
 'train_neg_normalized_root_mean_squared_error': array([-11.35134654, -11.37000265, -11.29407991, -10.10876701,
         -9.95478764])}

In [28]:
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -21.36235897384346,
  'std': 13.035145621444501},
 'train_neg_root_mean_squared_error': {'mean': -21.6311296549707,
  'std': 1.285139194195418},
 'test_neg_normalized_root_mean_squared_error': {'mean': -10.725155681050214,
  'std': 6.47376309159671},
 'train_neg_normalized_root_mean_squared_error': {'mean': -10.815796747950035,
  'std': 0.6424858416806662}}

We can see that Gradient Boosting algorithm performs best with default parameters. Maybe more exhausted optimization using grid or random search can help to find better parameters.

### Extreme Gradient Boosting

In this section we experiment with **Extreme Gradient Boosting Regression** algorithm.

In [22]:
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('classifier', xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1,
        n_estimators=100, max_depth=3
    ))
])

In [15]:
%%time
result = cross_validate(pipeline, data, y, cv=5, scoring=scoring, return_train_score=True)

CPU times: user 3min 34s, sys: 1.91 s, total: 3min 36s
Wall time: 15min 43s


In [16]:
result

{'fit_time': array([126.54070187, 130.06502461, 102.55759263, 102.1102519 ,
        103.10842228]),
 'score_time': array([5.35396862, 6.61203384, 7.96159005, 9.19847941, 5.83580422]),
 'test_neg_root_mean_squared_error': array([ -7.73127129,  -8.23167394, -19.20983062, -28.88278977,
        -31.09197004]),
 'train_neg_root_mean_squared_error': array([-17.45806904, -17.51365218, -17.20145872, -16.55938095,
        -16.08478017]),
 'test_neg_normalized_root_mean_squared_error': array([ -3.97713461,  -4.21678685,  -9.60539558, -14.44211699,
        -15.54621822]),
 'train_neg_normalized_root_mean_squared_error': array([-8.72916546, -8.75695745, -8.60085837, -8.27981467, -8.04279223]),
 'test_neg_root_mean_squared_log_error': array([-1.12651746, -1.30856695, -1.91859027, -0.72005466, -0.719049  ]),
 'train_neg_root_mean_squared_log_error': array([-0.95164027, -0.94778474, -0.95826666, -1.04847284, -1.0125743 ])}

In [17]:
cv_mean_std_score(result, scoring)

{'test_neg_root_mean_squared_error': {'mean': -19.0295071322903,
  'std': 9.8677863954361},
 'train_neg_root_mean_squared_error': {'mean': -16.96346821419632,
  'std': 0.5548605329505107},
 'test_neg_normalized_root_mean_squared_error': {'mean': -9.557530449119401,
  'std': 4.886516506666996},
 'train_neg_normalized_root_mean_squared_error': {'mean': -8.481917635408273,
  'std': 0.277345278114628},
 'test_neg_root_mean_squared_log_error': {'mean': -1.1585556669804962,
  'std': 0.4442142873014233},
 'train_neg_root_mean_squared_log_error': {'mean': -0.983747761897391,
  'std': 0.039984551043543023}}

#### Hyperparameter optimization

In [9]:
%%time
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('clf', xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, n_estimators=100
    ))
])
hyperparameters = {
    'clf__booster': ['gbtree', 'gblinear', 'dart']
}

clf = GridSearchCV(
    pipeline, hyperparameters, cv=5, return_train_score=True, scoring=scoring, refit=False
)
clf.fit(data, y)

CPU times: user 22min 47s, sys: 4.67 s, total: 22min 52s
Wall time: 52min 52s


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        CustomFeatureUnion(n_jobs=-1,
                                                           transformer_list=[('primary '
                                                                              'use',
                                                                              Pipeline(memory=None,
                                                                                       steps=[('merge '
                                                                                               'categories',
                                                                                               <src.data.preprocessing.transformers.PrimaryUseTransformer object at 0x7f06e58feb90>),
                                                                                              ('one '
                          

In [11]:
clf.cv_results_

{'mean_fit_time': array([132.27734184,  73.1740406 , 227.60065475]),
 'std_fit_time': array([19.40847609,  5.73734186, 15.76975361]),
 'mean_score_time': array([6.54718237, 5.78775206, 6.37015796]),
 'std_score_time': array([1.63907923, 1.45576894, 1.56696783]),
 'param_clf__booster': masked_array(data=['gbtree', 'gblinear', 'dart'],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__booster': 'gbtree'},
  {'clf__booster': 'gblinear'},
  {'clf__booster': 'dart'}],
 'split0_test_neg_root_mean_squared_error': array([-14.7530574 ,  -7.90385589, -14.75305778]),
 'split1_test_neg_root_mean_squared_error': array([-12.72545995,  -7.84608358, -12.72545996]),
 'split2_test_neg_root_mean_squared_error': array([-27.81656585, -16.75444661, -27.81656666]),
 'split3_test_neg_root_mean_squared_error': array([-27.22806669, -36.44696688, -27.22806667]),
 'split4_test_neg_root_mean_squared_error': array([-29.19846224, -37.74981882, -29.198462

In [24]:
df = pd.DataFrame(clf.cv_results_)
df
df[[
    'param_clf__booster',
    'mean_test_neg_root_mean_squared_error',
    'mean_test_neg_normalized_root_mean_squared_error',
    'mean_test_neg_root_mean_squared_log_error',
    'std_test_neg_root_mean_squared_error',
    'std_test_neg_normalized_root_mean_squared_error',
    'std_test_neg_root_mean_squared_log_error',
    'mean_train_neg_root_mean_squared_error',
    'mean_train_neg_normalized_root_mean_squared_error',
    'mean_train_neg_root_mean_squared_error',
]]

Unnamed: 0,param_clf__booster,mean_test_neg_root_mean_squared_error,mean_test_neg_normalized_root_mean_squared_error,mean_test_neg_root_mean_squared_log_error,std_test_neg_root_mean_squared_error,std_test_neg_normalized_root_mean_squared_error,std_test_neg_root_mean_squared_log_error,mean_train_neg_root_mean_squared_error,mean_train_neg_normalized_root_mean_squared_error,mean_train_neg_root_mean_squared_error.1
0,gbtree,-22.344322,-11.246245,-1.26159,7.084144,3.454443,0.503343,-14.140296,-7.070301,-14.140296
1,gblinear,-21.340234,-10.712482,-1.255605,13.275132,6.595243,0.424841,-21.875688,-10.938078,-21.875688
2,dart,-22.344323,-11.246246,-1.26159,7.084144,3.454444,0.503343,-14.140296,-7.070301,-14.140296


We can see that `gblinear` booster performs the best. Next we try optimize its hyperparemeters.

##### gblinear booster

In [7]:
%%time
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('clf', xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, n_estimators=100, booster='gblinear'
    ))
])
hyperparameters = {
    'clf__feature_selector': ['cyclic', 'shuffle', 'random', 'greedy', 'thrifty'],
    'clf__learning_rate': [0.1, 0.01],
    'clf__n_estimators': [100, 200]
}

clf = GridSearchCV(
    pipeline, hyperparameters, cv=5, return_train_score=True, scoring=scoring, refit=False
)
_ = clf.fit(data, y)

xgboost.core.XGBoostError: [21:11:23] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:16:09] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:20:49] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:25:30] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:29:20] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:33:06] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:36:50] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:40:36] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:44:31] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:48:25] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:52:18] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:56:00] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [21:59:47] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [22:03:42] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

xgboost.core.XGBoostError: [22:07:32] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

CPU times: user 20min 37s, sys: 20.1 s, total: 20min 57s
Wall time: 3h 40s


xgboost.core.XGBoostError: [22:10:16] /workspace/src/linear/updater_shotgun.cc:21: Unsupported feature selector for shotgun updater.
Supported options are: {cyclic, shuffle}
Stack trace:
  [bt] (0) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f59381b2614]
  [bt] (1) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::linear::ShotgunUpdater::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x4e7) [0x7f59382b5597]
  [bt] (2) /home/denis/projects/school/oznal/fiit_knowledge_discovery_project/.venv/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBLinear::Configure(std::vector<std::pair<std::string, std::string>, std::allocator<std::pair<std::string, std::string> > > const&)+0x5

In [8]:
clf.cv_results_

{'mean_fit_time': array([124.38588758, 120.07462497,  76.87793536,  92.39500909,
         77.36332879, 110.61193624,  98.13134007,  98.08683228,
         71.32000084,  70.44380441,  63.68758254,  56.54914675,
         57.51182418,  56.75926042,  57.25250239,  58.49934578,
         57.51864829,  57.03219137,  56.95549064,  57.1865335 ]),
 'std_fit_time': array([15.94297589, 34.46741729,  4.76276334,  4.21289386,  5.07880913,
        12.71471044, 15.48142237,  8.43078823, 15.95567852,  7.2189119 ,
        12.03577026,  4.3026833 ,  4.28216237,  3.74966691,  4.5868129 ,
         4.51266303,  4.88950858,  4.95547592,  5.0551593 ,  3.94321038]),
 'mean_score_time': array([13.40793228,  9.44268742,  6.40514927,  6.48463812,  6.3151979 ,
         7.68261719,  8.16495137,  7.14517093,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]),
 'std_score_time': array([5.50990827, 

In [10]:
df = pd.DataFrame(clf.cv_results_)
df
df[[
    'param_clf__feature_selector',
    'param_clf__learning_rate',
    'param_clf__n_estimators',
    'mean_test_nrmse', 'mean_test_nnrmse', 'mean_test_nrmsle',
    'std_test_nrmse', 'std_test_nnrmse', 'std_test_nrmsle',
    'mean_train_nrmse', 'mean_train_nnrmse', 'mean_train_nrmsle',
]]

Unnamed: 0,param_clf__feature_selector,param_clf__learning_rate,param_clf__n_estimators,mean_test_nrmse,mean_test_nnrmse,mean_test_nrmsle,std_test_nrmse,std_test_nnrmse,std_test_nrmsle,mean_train_nrmse,mean_train_nnrmse,mean_train_nrmsle
0,cyclic,0.1,100,-21.461394,-10.772867,-1.280603,13.386678,6.651076,0.454155,-22.003115,-11.001793,-1.199948
1,cyclic,0.1,200,-21.355072,-10.719754,-1.262811,13.300629,6.608088,0.432794,-21.891782,-10.946126,-1.177355
2,cyclic,0.01,100,-22.996317,-11.530307,-0.892592,16.511718,8.223398,0.147344,-25.079233,-12.539883,-0.78996
3,cyclic,0.01,200,-22.590684,-11.33561,-1.262139,14.793565,7.356326,0.429894,-23.615477,-11.807991,-1.19048
4,shuffle,0.1,100,-21.489124,-10.786657,-1.282371,13.421594,6.668591,0.455414,-22.042062,-11.021267,-1.2017
5,shuffle,0.1,200,-21.360816,-10.722607,-1.264113,13.306553,6.611058,0.433872,-21.898561,-10.949515,-1.17865
6,shuffle,0.01,100,-23.027234,-11.545736,-0.890464,16.545385,8.24026,0.150272,-25.11911,-12.559822,-0.789032
7,shuffle,0.01,200,-22.612138,-11.346293,-1.261839,14.820913,7.37004,0.429621,-23.646433,-11.823469,-1.190219
8,random,0.1,100,,,,,,,,,
9,random,0.1,200,,,,,,,,,


`cyclic` feature selector performs the best. Next optimization is performed using this feature selector.

In [11]:
%%time
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('clf', xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, booster='gblinear', feature_selector='cyclic'
    ))
])
hyperparameters = {
    'clf__learning_rate': [0.1, 0.3, 0.6, 0.9],
    'clf__n_estimators': [200, 300]
}

clf = GridSearchCV(
    pipeline, hyperparameters, cv=5, return_train_score=True, scoring=scoring, refit=False
)
_ = clf.fit(data, y)

CPU times: user 26min 9s, sys: 10.6 s, total: 26min 19s
Wall time: 1h 42min 39s


In [12]:
clf.cv_results_

{'mean_fit_time': array([86.76692972, 97.8038538 , 84.36827512, 97.33289647, 84.45016856,
        97.75361953, 84.29658999, 98.21990409]),
 'std_fit_time': array([6.40645921, 4.66699879, 4.99994263, 4.0610579 , 4.84523708,
        4.29916907, 4.52852822, 4.85324068]),
 'mean_score_time': array([5.87271614, 5.81142783, 5.80090356, 5.84348507, 5.76602798,
        5.75075235, 5.78940983, 5.77719703]),
 'std_score_time': array([1.58063653, 1.48558016, 1.50846396, 1.5746939 , 1.46576235,
        1.48441559, 1.54266633, 1.54218906]),
 'param_clf__learning_rate': masked_array(data=[0.1, 0.1, 0.3, 0.3, 0.6, 0.6, 0.9, 0.9],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_clf__n_estimators': masked_array(data=[200, 300, 200, 300, 200, 300, 200, 300],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__learning_rat

In [14]:
df = pd.DataFrame(clf.cv_results_)
df
df[[
    'param_clf__learning_rate',
    'param_clf__n_estimators',
    'mean_test_nrmse', 'mean_test_nnrmse', 'mean_test_nrmsle',
    'std_test_nrmse', 'std_test_nnrmse', 'std_test_nrmsle',
    'mean_train_nrmse', 'mean_train_nnrmse', 'mean_train_nrmsle',
]]

Unnamed: 0,param_clf__learning_rate,param_clf__n_estimators,mean_test_nrmse,mean_test_nnrmse,mean_test_nrmsle,std_test_nrmse,std_test_nnrmse,std_test_nrmsle,mean_train_nrmse,mean_train_nnrmse,mean_train_nrmsle
0,0.1,200,-21.355072,-10.719754,-1.262811,13.300629,6.608088,0.432794,-21.891782,-10.946126,-1.177355
1,0.1,300,-21.342296,-10.713439,-1.258108,13.28388,6.599669,0.427528,-21.878409,-10.939439,-1.171908
2,0.3,200,-21.340173,-10.71245,-1.255612,13.275236,6.595296,0.424845,-21.87569,-10.938079,-1.169182
3,0.3,300,-21.340184,-10.712458,-1.255532,13.27499,6.595172,0.424759,-21.875687,-10.938078,-1.1691
4,0.6,200,-21.340187,-10.71246,-1.255525,13.274967,6.59516,0.424753,-21.875687,-10.938078,-1.169094
5,0.6,300,-21.340187,-10.71246,-1.255525,13.274968,6.59516,0.424753,-21.875687,-10.938078,-1.169094
6,0.9,200,-21.340187,-10.71246,-1.255525,13.274968,6.59516,0.424753,-21.875687,-10.938078,-1.169094
7,0.9,300,-21.340187,-10.71246,-1.255525,13.274968,6.59516,0.424753,-21.875687,-10.938078,-1.169094


We can see that best parameters are `learning_rate = 0.3` and `n_estimators = 200`. Achieved results equal to the results achieved with default hyperparameters, so there is no improvement.

Next we try optimize hyperparemeters of `gbtree` booster.

##### gbtree booster

In [7]:
%%time
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('clf', xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, booster='gbtree'
    ))
])
hyperparameters = {
    'clf__eta': [0.01, 0.1],
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [2, 3, 6],
    'clf__lambda': [0, 1]
}

clf = GridSearchCV(
    pipeline, hyperparameters, cv=5, return_train_score=True, scoring=scoring, refit=False
)
_ = clf.fit(data, y)

CPU times: user 2h 19min 17s, sys: 34.1 s, total: 2h 19min 51s
Wall time: 6h 34min 47s


In [8]:
clf.cv_results_

{'mean_fit_time': array([102.54567809, 124.48679938, 105.06580558, 137.97431078,
        138.57706189, 207.59084625,  94.66609702, 121.52993531,
        106.65824423, 141.38808999, 137.62112021, 217.28763471,
         96.60370846, 104.31164861,  86.22533951, 114.83817501,
        115.43199782, 174.45477009,  78.14464498,  99.70094872,
         87.02250519, 116.1571857 , 116.63107696, 175.34276023]),
 'std_fit_time': array([11.27848793, 10.410943  , 13.96892409,  6.20386445,  7.96166942,
         8.45143404,  8.30130076,  5.52521771,  7.13611558,  9.6864664 ,
        12.86263163, 10.04233337, 10.90383503,  7.92476307,  5.86575793,
         5.24975016,  5.94945338,  6.81032987,  5.11434628,  4.70316485,
         5.16353774,  5.37203997,  5.66528394,  6.76256297]),
 'mean_score_time': array([8.46586375, 7.74398303, 7.62831235, 7.51378922, 7.38301673,
        8.25762119, 6.70856938, 7.06256762, 7.62100587, 7.15504513,
        6.88855305, 8.8269556 , 6.82310219, 6.66885095, 6.00417361,
    

In [9]:
df = pd.DataFrame(clf.cv_results_)
df
df[[
    'param_clf__eta',
    'param_clf__n_estimators',
    'param_clf__max_depth',
    'param_clf__lambda',
    'mean_test_nrmse', 'mean_test_nnrmse', 'mean_test_nrmsle',
    'std_test_nrmse', 'std_test_nnrmse', 'std_test_nrmsle',
    'mean_train_nrmse', 'mean_train_nnrmse', 'mean_train_nrmsle',
]]

Unnamed: 0,param_clf__eta,param_clf__n_estimators,param_clf__max_depth,param_clf__lambda,mean_test_nrmse,mean_test_nnrmse,mean_test_nrmsle,std_test_nrmse,std_test_nnrmse,std_test_nrmsle,mean_train_nrmse,mean_train_nnrmse,mean_train_nrmsle
0,0.01,100,2,0,-28.556026,-14.317901,-1.364015,20.920651,10.420883,0.325351,-33.297038,-16.648869,-1.235453
1,0.01,200,2,0,-21.939678,-11.014394,-1.380857,13.692693,6.80265,0.485427,-22.888035,-11.444261,-1.296906
2,0.01,100,3,0,-27.402676,-13.738747,-1.196499,20.221404,10.073714,0.22055,-32.016461,-16.008567,-1.073924
3,0.01,200,3,0,-20.534255,-10.307393,-1.223503,12.983189,6.451828,0.359661,-21.399909,-10.700183,-1.118333
4,0.01,100,6,0,-26.644256,-13.357004,-1.041116,19.611438,9.770623,0.151756,-30.809921,-15.405284,-0.904439
5,0.01,200,6,0,-19.735051,-9.904806,-1.086818,12.145317,6.033941,0.292944,-19.544877,-9.772648,-0.93331
6,0.01,100,2,1,-28.553795,-14.316778,-1.362362,20.918281,10.419702,0.324477,-33.298089,-16.649394,-1.234547
7,0.01,200,2,1,-21.935652,-11.012372,-1.380851,13.691272,6.801953,0.485137,-22.882346,-11.441417,-1.296285
8,0.01,100,3,1,-27.404011,-13.739419,-1.195789,20.222515,10.074267,0.219963,-32.017081,-16.008877,-1.073097
9,0.01,200,3,1,-20.533297,-10.306924,-1.222851,12.983639,6.452054,0.359232,-21.398742,-10.699599,-1.118594


We can see that best results are achieved using `max_depth = 3` and without regularization. There is no overfitting even without regularization. Therefore, we fix these hyperparameters in next experiments.

Next we try to optimize `eta (learning rate)` and `n_estimators`.

In [8]:
%%time
pipeline = Pipeline([
    ('features', merged_fu_without_outliers),
    ('clf', xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, booster='gbtree',
        max_depth=3,
    ))
])
hyperparameters = {
    'clf__eta': [0.1, 0.3, 0.6, 1.0],
    'clf__n_estimators': [200, 300, 500],
    'clf__lambda': [0]
}

clf = GridSearchCV(
    pipeline, hyperparameters, cv=5, return_train_score=True, scoring=scoring, refit=False
)
_ = clf.fit(data, y)

CPU times: user 1h 48min 26s, sys: 17.5 s, total: 1h 48min 43s
Wall time: 3h 41min 38s


In [9]:
clf.cv_results_

{'mean_fit_time': array([121.50026503, 145.0880806 , 202.7440011 , 115.1519495 ,
        145.24151344, 203.35412807, 115.07521272, 143.98969216,
        202.07889743, 115.02987809, 145.51915288, 203.01352768]),
 'std_fit_time': array([13.45785577,  7.00090555,  7.30443941,  6.1700072 ,  6.042259  ,
         7.51437771,  4.88749177,  5.93845748,  7.63920352,  5.56020724,
         5.56631053,  7.41336395]),
 'mean_score_time': array([6.36917248, 6.728968  , 7.36143913, 6.27391701, 6.64523077,
        7.33760934, 6.30112038, 6.60255198, 7.35945234, 6.19664402,
        6.64127674, 7.35332766]),
 'std_score_time': array([1.52886623, 1.53333474, 1.72184101, 1.53590226, 1.63555267,
        1.75434319, 1.60320282, 1.58824131, 1.65497026, 1.54175355,
        1.7023421 , 1.78096666]),
 'param_clf__eta': masked_array(data=[0.1, 0.1, 0.1, 0.3, 0.3, 0.3, 0.6, 0.6, 0.6, 1.0, 1.0,
                    1.0],
              mask=[False, False, False, False, False, False, False, False,
                   

In [10]:
df = pd.DataFrame(clf.cv_results_)
df
df[[
    'param_clf__eta',
    'param_clf__n_estimators',
    'param_clf__lambda',
    'mean_test_nrmse', 'mean_test_nnrmse', 'mean_test_nrmsle',
    'std_test_nrmse', 'std_test_nnrmse', 'std_test_nrmsle',
    'mean_train_nrmse', 'mean_train_nnrmse', 'mean_train_nrmsle',
]]

Unnamed: 0,param_clf__eta,param_clf__n_estimators,param_clf__lambda,mean_test_nrmse,mean_test_nnrmse,mean_test_nrmsle,std_test_nrmse,std_test_nnrmse,std_test_nrmsle,mean_train_nrmse,mean_train_nnrmse,mean_train_nrmsle
0,0.1,200,0,-18.718843,-9.401302,-1.149857,10.109039,5.009824,0.423522,-17.249028,-8.6247,-0.99153
1,0.1,300,0,-18.682738,-9.383672,-1.148589,9.937515,4.923317,0.42525,-16.938468,-8.469417,-0.983202
2,0.1,500,0,-18.673072,-9.379525,-1.155147,9.705899,4.806297,0.428654,-16.59363,-8.296995,-0.973028
3,0.3,200,0,-19.090848,-9.589993,-1.157026,9.546887,4.724016,0.44265,-16.440923,-8.220639,-0.967409
4,0.3,300,0,-19.461151,-9.779794,-1.185285,9.073332,4.481422,0.453115,-16.163223,-8.081786,-0.959953
5,0.3,500,0,-19.93805,-10.023428,-1.209093,8.611272,4.24468,0.469148,-15.82136,-7.910851,-0.948271
6,0.6,200,0,-20.249413,-10.179854,-1.200475,8.836155,4.357321,0.449617,-16.057079,-8.028713,-0.963881
7,0.6,300,0,-20.497184,-10.305373,-1.21172,8.683028,4.277948,0.457761,-15.754243,-7.877292,-0.950514
8,0.6,500,0,-21.021235,-10.571735,-1.241618,8.331742,4.09678,0.482588,-15.40474,-7.702537,-0.940207
9,1.0,200,0,-20.901946,-10.503337,-1.223177,9.404343,4.641417,0.452526,-15.993725,-7.997035,-0.969244


As we can see, model performs best with `eta = 0.1` and `n_estimators = 500`. We expect that using more estimators would result in better performance, but this improvement would not be large, so we end up with hyperparameter optimization and consider this as **the best achieved results**.