In [1]:
import numpy as np
import pandas as pd 
import sklearn
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

import catboost as cb
import xgboost as xgb

# Train/test split

For the sake of the programming assignment, let's artificially split the data into train and test sets. We will treat the last month of data as the test set.

In [2]:
# Read the full dataset from a Parquet file; the path is relative to the working directory.
all_data_path = 'all_data.parquet'
all_data = pd.read_parquet(all_data_path)

In [7]:
# Columns that are removed from `all_data` before fitting:
# the target columns and the month index used for splitting.
to_drop_cols = [
    'target_item',
    'target',
    'target_shop',
    'date_block_num',
]

In [8]:
# `date_block_num` is not used as a feature, but the dataset is split on it,
# so keep it around as a standalone Series.
dates = all_data['date_block_num']

# The test block is the one immediately before the newest block in the data;
# the newest block itself is kept apart from train and test.
newest_block = dates.max()
last_block = newest_block - 1
print('Test `date_block_num` is %d' % last_block)

Test `date_block_num` is 33


In [16]:
# Row masks, computed once and reused for every split below.
is_train = dates < last_block        # every block before the test block
is_test = dates == last_block        # the test block
is_target = dates == dates.max()     # the newest block in the data

dates_train = dates[is_train]
dates_test = dates[is_test]

# Feature matrices: same rows as the masks, minus the non-feature columns.
X_train = all_data.loc[is_train].drop(columns=to_drop_cols)
X_test = all_data.loc[is_test].drop(columns=to_drop_cols)
X_target = all_data.loc[is_target].drop(columns=to_drop_cols)

# Targets as plain NumPy arrays (only train and test get one here).
y_train = all_data.loc[is_train, 'target'].values
y_test = all_data.loc[is_test, 'target'].values

In [10]:
# Inspect the training feature matrix (the rendered output is truncated by pandas).
X_train

Unnamed: 0,shop_id,item_id,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,target_item_lag_2,target_shop_lag_2,target_lag_3,target_item_lag_3,...,target_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,54,10297,3.0,42.0,10055.0,0.0,2.0,7978.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
1,54,10296,0.0,24.0,10055.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38
2,54,10298,21.0,369.0,10055.0,119.0,1309.0,7978.0,7.0,144.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40
3,54,10300,1.0,54.0,10055.0,31.0,361.0,7978.0,0.0,53.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
4,54,10284,0.0,4.0,10055.0,0.0,3.0,7978.0,0.0,5.0,...,0.0,3.0,7827.0,0.0,10.0,7792.0,0.0,0.0,0.0,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6186917,27,21279,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,3786.0,0.0,0.0,0.0,0.0,0.0,0.0,61
6186918,27,21283,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,3518.0,0.0,4.0,4026.0,61
6186919,27,21352,0.0,0.0,0.0,1.0,2.0,2478.0,0.0,0.0,...,1.0,1.0,3786.0,0.0,0.0,0.0,0.0,2.0,4026.0,37
6186920,27,21284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61


# First level models 

You need to implement a basic stacking scheme. We have a time component here, so we will use ***scheme f)*** from the reading material. Recall that we always use first-level models to build two datasets: test meta-features and 2nd-level train meta-features. Let's see how we get the test meta-features first.

### Test meta-features

First, we will run *linear regression* on the numeric columns and get predictions for the last month.

In [11]:
# First-level model: ordinary least squares fitted on the raw NumPy values
# of the training features, then scored on the held-out test block.
model_lr = LinearRegression().fit(X_train.values, y_train)
pred_lr = model_lr.predict(X_test.values)

r2_lr = r2_score(y_test, pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, pred_lr))

print('Test R-squared for linreg is %f' % r2_lr)
print('Test RMSE linreg is %f' % rmse_lr)

Test R-squared for linreg is 0.481407
Test RMSE linreg is 1.941356


And then we run *LightGBM*.

In [None]:
%%time
# LightGGM
model_lgb = lgb.LGBMRegressor(
    n_estimators=1000,
    n_jobs=-1,
    objective='mse',
    random_state=17
)
model_lgb.fit(X_train, y_train, 
              eval_set=(X_test, y_test), 
              early_stopping_rounds=100,
              verbose=100)
pred_lgb = model_lgb.predict(X_test)

print('Test R-squared for LightGBM is %f' % r2_score(y_test, pred_lgb))
print('Test RMSE LightGBM is %f' % np.sqrt(mean_squared_error(y_test, pred_lgb)))

Training until validation scores don't improve for 100 rounds


In [13]:
%%time
# Catboost
cat_features = ['shop_id', 'item_id', 'item_category_id']
cb_train = cb.Pool(X_train, label=y_train, cat_features=cat_features)
cb_test = cb.Pool(X_test, label=y_test, cat_features=cat_features)

model_cbr = cb.CatBoostRegressor(loss_function='RMSE')
model_cbr.fit(cb_train, 
              eval_set=cb_test, 
              verbose=100, 
              early_stopping_rounds=100)
pred_cb = model_cbr.predict(X_test)

print('Test R-squared for CatBoost is %f' % r2_score(y_test, pred_cb))
print('Test RMSE CatBoost is %f' % np.sqrt(mean_squared_error(y_test, pred_cb)))

Learning rate set to 0.239925
0:	learn: 3.1898125	test: 2.4052391	best: 2.4052391 (0)	total: 5.16s	remaining: 1h 25m 55s
100:	learn: 1.7579511	test: 2.3472933	best: 1.9260270 (8)	total: 8m 27s	remaining: 1h 15m 16s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1.92602704
bestIteration = 8

Shrink model to first 9 iterations.
Test R-squared for CatBoost is 0.489565
Test RMSE CatBoost is 1.926027
CPU times: user 15min 19s, sys: 1min 10s, total: 16min 29s
Wall time: 9min 21s


In [14]:
%%time
# XGBM
model_xgb = xgb.XGBRegressor(n_estimators=500, n_jobs=4, random_state=17)
model_xgb.fit(X_train, y_train, 
              eval_set=[(X_test, y_test)], 
              eval_metric="rmse", 
              verbose=10, 
              early_stopping_rounds = 30)
pred_xgb = model_xgb.predict(X_test)

print('Test R-squared for XGBoost is %f' % r2_score(y_test, pred_xgb))
print('Test RMSE XGBoost is %f' % np.sqrt(mean_squared_error(y_test, pred_xgb)))

[0]	validation_0-rmse:2.36626
Will train until validation_0-rmse hasn't improved in 30 rounds.
[10]	validation_0-rmse:2.06379
[20]	validation_0-rmse:2.06608
[30]	validation_0-rmse:2.14833
Stopping. Best iteration:
[5]	validation_0-rmse:2.04074

Test R-squared for XGBoost is 0.427007
Test RMSE XGBoost is 2.040642
CPU times: user 15min 7s, sys: 17.9 s, total: 15min 25s
Wall time: 8min 35s


Finally, concatenate the first-level model predictions column-wise to get the meta-features for the second level.

In [None]:
# Level-2 *training* meta-features: one column per first-level model and one
# row per training row, so the matrix lines up with `y_train` when the
# second-level model is fitted.
# (Previously this was built from `pred_*`, i.e. predictions for `X_test`,
# whose row count does not match `y_train`.)
# NOTE(review): these are in-sample predictions of models fitted on the same
# rows, so they are optimistic; a time-based out-of-fold scheme such as the
# "scheme f)" mentioned in the text would avoid that - confirm the intended design.
X_train_level2 = np.c_[
    model_lr.predict(X_train.values),   # linreg was fitted on `.values`
    model_lgb.predict(X_train),
    model_cbr.predict(X_train),
    model_xgb.predict(X_train),
]

In [None]:
# Display `X_train_level2` together with its shape.
X_train_level2, X_train_level2.shape

In [None]:
%%time
# Make predictions for test data
test_lr  = model_lr.predict(X_target)
test_lgb = model_lgb.predict(X_target)
test_cb  = model_cb.predict(X_target)
test_xgb = model_xgb.predict(X_target)

X_test_level2 = np.c_[test_lr, test_lgb, test_cb, test_xgb]

In [None]:
# Display `X_test_level2` together with its shape.
X_test_level2, X_test_level2.shape

In [None]:
%%time
# Make prediction for target data
target_lr  = model_lr.predict(X_target)
target_lgb = model_lgb.predict(X_target)
target_cb  = model_cb.predict(X_target)
target_xgb = model_xgb.predict(X_target)

X_target_level2 = np.c_[target_lr, target_lgb, target_cb, target_xgb]

In [None]:
# Display `X_target_level2` together with its shape.
X_target_level2, X_target_level2.shape

In [None]:
# Second-level (meta) model: CatBoost regressor fitted on the first-level
# meta-features and evaluated on the test meta-features during training.
# Fixes: the arguments were missing their separating commas (SyntaxError), and
# the target is passed as `y` (`labels` is not a parameter of `fit`).
# `X_train_level2` must have one row per entry of `y_train`, and
# `X_test_level2` one row per entry of `y_test`.
l2_model = cb.CatBoostRegressor(loss_function='RMSE')
l2_model.fit(
    X_train_level2,
    y=y_train,
    eval_set=(X_test_level2, y_test),
    verbose=100,
)
# NOTE(review): this rebinds `pred_cb`, which earlier held the first-level
# CatBoost predictions for `X_test`; re-running earlier cells after this one
# will pick up the new value. Consider a dedicated name for the final output.
pred_cb = l2_model.predict(X_target_level2)