In [1]:
import numpy as np
import pandas as pd 
import sklearn
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score
# from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

import catboost as cb
import xgboost as xgb

# Train/test split

For a sake of the programming assignment, let's artificially split the data into train and test. We will treat last month data as the test set.

In [2]:
all_data = pd.read_parquet('data/all_data.parquet')
# print('Columns:', all_data.columns)
all_data.info()
all_data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6639294 entries, 0 to 6639293
Data columns (total 84 columns):
 #   Column                                 Dtype  
---  ------                                 -----  
 0   shop_id                                int32  
 1   item_id                                int32  
 2   date_block_num                         int32  
 3   item_category_id                       int32  
 4   target                                 float32
 5   target_shop                            float32
 6   target_item                            float32
 7   target_item_shop                       float32
 8   target_item_category                   float32
 9   target_shop_item_category              float32
 10  target_mean                            float32
 11  target_shop_mean                       float32
 12  target_item_mean                       float32
 13  target_item_shop_mean                  float32
 14  target_item_category_mean              float32
 15

Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,target,target_shop,target_item,target_item_shop,target_item_category,target_shop_item_category,...,target_shop_item_category_mean_lag_12,target_shop_mean_lag_12,city_code,type_code,subtype_code,month_num,item_first_sale,item_shop_first_sale,days,delta_price_lag
0,59,944,12,37,1.0,1491.0,23.0,1.0,7511.0,104.0,...,8.988200,2017.0,30,11,1,0,0,0,31,0.000000
1,59,21861,12,37,1.0,1491.0,62.0,1.0,7511.0,104.0,...,8.988200,2017.0,30,11,1,0,0,0,31,0.000000
2,59,21455,12,37,1.0,1491.0,20.0,1.0,7511.0,104.0,...,8.988200,2017.0,30,11,1,0,0,0,31,0.000000
3,59,33,12,37,2.0,1491.0,42.0,2.0,7511.0,104.0,...,8.988200,2017.0,30,11,1,0,0,0,31,0.000000
4,59,19750,12,37,1.0,1491.0,8.0,1.0,7511.0,104.0,...,8.988200,2017.0,30,11,1,0,0,0,31,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6639289,45,18454,34,55,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,20,13,2,10,11,11,30,-0.475305
6639290,45,16188,34,64,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,20,14,42,10,2,2,30,0.081167
6639291,45,15757,34,55,0.0,0.0,0.0,0.0,0.0,0.0,...,12.416455,1251.0,20,13,2,10,22,22,30,0.155905
6639292,45,19648,34,40,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,20,11,4,10,11,11,30,-0.091709


In [3]:
list(all_data.columns)

['shop_id',
 'item_id',
 'date_block_num',
 'item_category_id',
 'target',
 'target_shop',
 'target_item',
 'target_item_shop',
 'target_item_category',
 'target_shop_item_category',
 'target_mean',
 'target_shop_mean',
 'target_item_mean',
 'target_item_shop_mean',
 'target_item_category_mean',
 'target_shop_item_category_mean',
 'target_lag_1',
 'target_item_lag_1',
 'target_item_category_lag_1',
 'target_item_category_mean_lag_1',
 'target_item_mean_lag_1',
 'target_item_shop_lag_1',
 'target_item_shop_mean_lag_1',
 'target_mean_lag_1',
 'target_shop_lag_1',
 'target_shop_item_category_lag_1',
 'target_shop_item_category_mean_lag_1',
 'target_shop_mean_lag_1',
 'target_lag_2',
 'target_item_lag_2',
 'target_item_category_lag_2',
 'target_item_category_mean_lag_2',
 'target_item_mean_lag_2',
 'target_item_shop_lag_2',
 'target_item_shop_mean_lag_2',
 'target_mean_lag_2',
 'target_shop_lag_2',
 'target_shop_item_category_lag_2',
 'target_shop_item_category_mean_lag_2',
 'target_shop_m

In [4]:
# List of all lagged features

# We will drop these at fitting stage
to_drop_cols = ['target_shop', 'target_item', 'target_item_mean', 'target_mean', 
                'target_item_shop', 'target', 'target_shop_mean', 'target_item_shop_mean', 
                'date_block_num']

# Categorical features
cat_features = ['shop_id', 'item_id', 'item_category_id', 'city_code', 
                'type_code', 'subtype_code']

In [5]:
# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts 
dates = all_data['date_block_num']

last_block = dates.max()-1
print('Test `date_block_num` is %d' % last_block)

Test `date_block_num` is 33


In [6]:
dates_train = dates[dates <  last_block]
dates_test  = dates[dates == last_block]

X_train =  all_data.loc[dates <  last_block].drop(to_drop_cols, axis=1)
X_test =   all_data.loc[dates == last_block].drop(to_drop_cols, axis=1)
X_target = all_data.loc[dates == dates.max()].drop(to_drop_cols, axis=1)

y_train = all_data.loc[dates <  last_block, 'target'].values
y_test =  all_data.loc[dates == last_block, 'target'].values

In [7]:
X_train

Unnamed: 0,shop_id,item_id,item_category_id,target_item_category,target_shop_item_category,target_item_category_mean,target_shop_item_category_mean,target_lag_1,target_item_lag_1,target_item_category_lag_1,...,target_shop_item_category_mean_lag_12,target_shop_mean_lag_12,city_code,type_code,subtype_code,month_num,item_first_sale,item_shop_first_sale,days,delta_price_lag
0,59,944,37,7511.0,104.0,8.909845,8.909845,0.0,24.0,9959.0,...,8.9882,2017.0,30,11,1,0,0,0,31,0.000000
1,59,21861,37,7511.0,104.0,8.909845,8.909845,0.0,28.0,9959.0,...,8.9882,2017.0,30,11,1,0,0,0,31,0.000000
2,59,21455,37,7511.0,104.0,8.909845,8.909845,1.0,12.0,9959.0,...,8.9882,2017.0,30,11,1,0,0,0,31,0.000000
3,59,33,37,7511.0,104.0,8.909845,8.909845,0.0,42.0,9959.0,...,8.9882,2017.0,30,11,1,0,0,0,31,0.000000
4,59,19750,37,7511.0,104.0,8.909845,8.909845,0.0,15.0,9959.0,...,8.9882,2017.0,30,11,1,0,0,0,31,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6186917,55,5230,26,9.0,9.0,4.500000,4.500000,1.0,1.0,6.0,...,0.0000,0.0,27,6,61,7,8,8,31,0.058935
6186918,55,2757,26,9.0,9.0,4.500000,4.500000,4.0,4.0,6.0,...,0.0000,0.0,27,6,61,7,8,8,31,-0.031505
6186919,55,7115,27,1.0,1.0,1.000000,1.000000,0.0,0.0,0.0,...,0.0000,0.0,27,7,61,7,0,0,31,0.000000
6186920,55,13095,36,5.0,5.0,2.500000,2.500000,0.0,0.0,0.0,...,0.0000,0.0,27,9,15,7,0,0,31,0.000000


# First level models 

You need to implement a basic stacking scheme. We have a time component here, so we will use ***scheme f)*** from the reading material. Recall, that we always use first level models to build two datasets: test meta-features and 2-nd level train-metafetures. Let's see how we get test meta-features first. 

### Test meta-features

Firts, we will run *linear regression* on numeric columns and get predictions for the last month.

In [8]:
model_lr = LinearRegression()
model_lr.fit(X_train.values, y_train)
pred_lr = model_lr.predict(X_test.values)

print('Test R-squared for linreg is %f' % r2_score(y_test, pred_lr))
print('Test RMSE linreg is %f' % np.sqrt(mean_squared_error(y_test, pred_lr)))

Test R-squared for linreg is 0.507441
Test RMSE linreg is 1.891999


And the we run *LightGBM*.

In [None]:
%%time
# LightGGM
model_lgb = lgb.LGBMRegressor(
    n_estimators=1000,
    n_jobs=-1,
    objective='mse',
    random_state=17
)
model_lgb.fit(X_train, y_train, 
              eval_set=(X_test, y_test), 
              early_stopping_rounds=100,
              verbose=100)
pred_lgb = model_lgb.predict(X_test)

print('Test R-squared for LightGBM is %f' % r2_score(y_test, pred_lgb))
print('Test RMSE LightGBM is %f' % np.sqrt(mean_squared_error(y_test, pred_lgb)))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 2.31428


In [None]:
# Plot LightGBM features importance
import lightgbm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1,1,figsize=(10, 10))
lightgbm.plot_importance(booster=model_lgb, ax=ax)

In [None]:
%%time
# Catboost

model_cbr = cb.CatBoostRegressor(loss_function='RMSE',
                                 cat_features=cat_features,
                                 iterations=1000,
                                task_type='CPU')
model_cbr.fit(X_train, y_train, 
              eval_set=(X_test, y_test), 
              verbose=20, 
              early_stopping_rounds=50,
              plot=True)
pred_cb = model_cbr.predict(X_test)

print('Test R-squared for CatBoost is %f' % r2_score(y_test, pred_cb))
print('Test RMSE CatBoost is %f' % np.sqrt(mean_squared_error(y_test, pred_cb)))

In [None]:
f_importance = model_cbr.get_feature_importance(prettified=True)
f_importance.sort_values(by='Importances', ascending=True).set_index('Feature Id').plot(
    kind='barh', figsize=(10,10), legend=False, grid=True, title="Feature importances");

In [None]:
# SVM
# model_svr = SVR()
# model_svr.fit(X_train.values, y_train)
# pred_svr = model_svr.predict(X_test.values, verbose=True)

# print('Test R-squared for SVR is %f' % r2_score(y_test, pred_svr))
# print('Test RMSE SVR is %f' % np.sqrt(mean_squared_error(y_test, pred_svr)))

In [None]:
%%time
# XGBM
model_xgb = xgb.XGBRegressor(n_estimators=500, n_jobs=4, random_state=17)
model_xgb.fit(X_train, y_train, 
              eval_set=[(X_test, y_test)], 
              eval_metric="rmse", 
              verbose=10, 
              early_stopping_rounds = 50)
pred_xgb = model_xgb.predict(X_test)

print('Test R-squared for XGBoost is %f' % r2_score(y_test, pred_xgb))
print('Test RMSE XGBoost is %f' % np.sqrt(mean_squared_error(y_test, pred_xgb)))

In [None]:
# Plot XGBoost features importance
from xgboost import plot_importance
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1,1,figsize=(10, 10))
plot_importance(booster=model_xgb, ax=ax)

Finally, concatenate test predictions to get test meta-features.

In [None]:
%%time
# Make predictions for test data
train_lr  = model_lr.predict(X_train)
train_lgb = model_lgb.predict(X_train)
train_cbr = model_cbr.predict(X_train)
train_xgb = model_xgb.predict(X_train)

X_train_level2 = np.c_[train_lr, train_lgb, train_cbr, train_xgb] 
# X_test_level2.tofile('X_test_level2')

In [None]:
X_train_level2, X_train_level2.shape

In [None]:
%%time
# Make predictions for test data (Kaggle's test dataset)
test_lr  = model_lr.predict(X_test)
test_lgb = model_lgb.predict(X_test)
test_cbr = model_cbr.predict(X_test)
test_xgb = model_xgb.predict(X_test)

X_test_level2 = np.c_[test_lr, test_lgb, test_cbr, test_xgb]

In [None]:
X_test_level2, X_test_level2.shape

In [None]:
%%time
# Make prediction for target data
target_lr  = model_lr.predict(X_target)
target_lgb = model_lgb.predict(X_target)
target_cbr  = model_cbr.predict(X_target)
target_xgb = model_xgb.predict(X_target)

X_target_level2 = np.c_[target_lr, target_lgb, target_cbr, target_xgb]

In [None]:
X_target_level2, X_target_level2.shape

In [None]:
l2_model_lr = LinearRegression()
l2_model_lr.fit(X_train_level2, y_train)
test_l2_lr = l2_model_lr.predict(X_test_level2)

print(f'Test RMSE for LinearRegression L2 is {np.sqrt(mean_squared_error(y_test, test_l2_lr))}')

In [None]:
l2_model_lor = LogisticRegression(random_state=17, n_jobs=4, solver='saga', verbose=1)
l2_model_lor.fit(X_train_level2, y_train)
test_l2_lor = l2_model_lor.predict(X_test_level2)

print(f'Test RMSE for LogisticRegression L2 is {np.sqrt(mean_squared_error(y_test, test_l2_lor))}')

In [None]:
l2_model_cb = cb.CatBoostRegressor(loss_function='RMSE',
                               task_type='CPU')
l2_model_cb.fit(X_train_level2, y=y_train,
             eval_set=(X_test_level2, y_test),
             early_stopping_rounds=50,
             verbose=20,
             plot=True)
test_l2_cbr = l2_model_cb.predict(X_test_level2)

print(f'Test RMSE for CatBoost L2 is {np.sqrt(mean_squared_error(y_test, test_l2_cbr))}')