In [1]:
import gc
import pickle

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import catboost as cb
import lightgbm as lgb
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
def model_performance_sc_plot(predictions, labels, title):
    """Plot predictions against labels with a regression fit and a y = x reference line.

    Parameters
    ----------
    predictions : array-like
        Model outputs, one value per sample.
    labels : array-like
        Ground-truth values, same length as ``predictions``.
    title : str
        Text passed to ``plt.title`` for the figure.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Overall value range of both series; the dashed diagonal spans it end to end.
    # (The names used to be swapped: `min_val` held the maximum and vice versa.)
    min_val = min(np.min(predictions), np.min(labels))
    max_val = max(np.max(predictions), np.max(labels))
    # Create dataframe with predictions and labels.
    performance_df = pd.DataFrame({"Label": labels})
    performance_df["Prediction"] = predictions
    # Scatter + regression fit with marginal distributions.
    sns.jointplot(y="Label", x="Prediction", data=performance_df, kind="reg", height=7)
    # Perfect-prediction reference: points on this line have Prediction == Label.
    plt.plot([min_val, max_val], [min_val, max_val], 'm--')
    plt.title(title, fontsize=9)
    plt.show()

# Train/test split

For the sake of the programming assignment, let's artificially split the data into train and test. We will treat the last month with known targets as the test set; the final `date_block_num` is kept aside as the month we have to predict.

In [3]:
# Toggle for persisting fitted models: the training cells below pickle each
# model under `data/` only when this is True, and the "Load pickled predictors"
# cell is guarded by the same flag.
save_models = True

In [4]:
# Read the prepared feature table, then show its column list and the
# dtype/memory summary before previewing the frame itself.
all_data = pd.read_parquet('data/all_data.parquet')
print(f"all_data coulumns = {list(all_data.columns)}")
all_data.info()
all_data

all_data coulumns = ['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'target', 'target_shop_date', 'target_item_date', 'target_item_shop_date', 'target_item-cat_date', 'target_shop_item-cat_date', 'target_mean', 'target_shop_date_mean', 'target_item_date_mean', 'target_item_shop_date_mean', 'target_item-cat_date_mean', 'target_shop_item-cat_date_mean', 'target_lag_1', 'target_item-cat_date_lag_1', 'target_item-cat_date_mean_lag_1', 'target_item_date_lag_1', 'target_item_date_mean_lag_1', 'target_item_shop_date_lag_1', 'target_item_shop_date_mean_lag_1', 'target_mean_lag_1', 'target_shop_date_lag_1', 'target_shop_date_mean_lag_1', 'target_shop_item-cat_date_lag_1', 'target_shop_item-cat_date_mean_lag_1', 'target_lag_2', 'target_item-cat_date_lag_2', 'target_item-cat_date_mean_lag_2', 'target_item_date_lag_2', 'target_item_date_mean_lag_2', 'target_item_shop_date_lag_2', 'target_item_shop_date_mean_lag_2', 'target_mean_lag_2', 'target_shop_date_lag_2', 'target_shop_date_mean_

Unnamed: 0,shop_id,item_id,date_block_num,item_category_id,target,target_shop_date,target_item_date,target_item_shop_date,target_item-cat_date,target_shop_item-cat_date,...,target_shop_item-cat_date_lag_12,target_shop_item-cat_date_mean_lag_12,city_code,type_code,subtype_code,month_num,item_first_sale,item_shop_first_sale,days,delta_price_lag
0,59,944,12,37,1.0,1491.0,23.0,1.0,7511.0,104.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
1,59,21861,12,37,1.0,1491.0,62.0,1.0,7511.0,104.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
2,59,21455,12,37,1.0,1491.0,20.0,1.0,7511.0,104.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
3,59,33,12,37,2.0,1491.0,42.0,2.0,7511.0,104.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
4,59,19750,12,37,1.0,1491.0,8.0,1.0,7511.0,104.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6639289,45,18454,34,55,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,20,13,2,10,11,11,30,-0.475305
6639290,45,16188,34,64,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,20,14,42,10,2,2,30,0.081167
6639291,45,15757,34,55,0.0,0.0,0.0,0.0,0.0,0.0,...,124.0,124.0,20,13,2,10,22,22,30,0.155905
6639292,45,19648,34,40,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,20,11,4,10,11,11,30,-0.091709


In [5]:
# list(all_data.columns)

In [6]:
# Columns removed from the feature matrices before fitting: the current-month
# target and its aggregates (only their `_lag_N` versions stay as features),
# plus `date_block_num`, which is used solely to split the data.
to_drop_cols = [
    'target_shop_date',
    'target_item_shop_date_mean',
    'target_shop_item-cat_date_mean',
    'target_shop_item-cat_date',
    'target_item_date_mean',
    'target_item-cat_date_mean',
    'target_item-cat_date',
    'target_item_shop_date',
    'target_item_date',
    'target',
    'target_shop_date_mean',
    'target_mean',
    'date_block_num',
]

# Integer-coded categorical columns.
cat_features = [
    'shop_id',
    'item_id',
    'item_category_id',
    'city_code',
    'type_code',
    'subtype_code',
]

In [7]:
# `date_block_num` is not used as a feature, but the train/test split is keyed
# on it, so keep the column around on its own.
dates = all_data['date_block_num']

# The block just before the maximum one serves as the test month.
last_block = dates.max() - 1
print('Test `date_block_num` is {:d}'.format(int(last_block)))

Test `date_block_num` is 33


In [8]:
# Row selectors: months before the test month train the models, the test month
# evaluates them, and the maximum block is the one predicted at the end.
is_train = dates < last_block
is_test = dates == last_block
is_target = dates == dates.max()

dates_train = dates[is_train]
dates_test = dates[is_test]

# Feature matrices without the current-month target columns.
X_train = all_data.loc[is_train].drop(columns=to_drop_cols)
X_test = all_data.loc[is_test].drop(columns=to_drop_cols)
X_target = all_data.loc[is_target].drop(columns=to_drop_cols)

# Targets as plain numpy arrays.
y_train = all_data.loc[is_train, 'target'].values
y_test = all_data.loc[is_test, 'target'].values

In [9]:
# Preview the training feature matrix (target columns and `date_block_num` already dropped).
X_train

Unnamed: 0,shop_id,item_id,item_category_id,target_lag_1,target_item-cat_date_lag_1,target_item-cat_date_mean_lag_1,target_item_date_lag_1,target_item_date_mean_lag_1,target_item_shop_date_lag_1,target_item_shop_date_mean_lag_1,...,target_shop_item-cat_date_lag_12,target_shop_item-cat_date_mean_lag_12,city_code,type_code,subtype_code,month_num,item_first_sale,item_shop_first_sale,days,delta_price_lag
0,59,944,37,0.0,9959.0,9959.0,24.0,24.0,0.0,0.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
1,59,21861,37,0.0,9959.0,9959.0,28.0,28.0,0.0,0.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
2,59,21455,37,1.0,9959.0,9959.0,12.0,12.0,1.0,1.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
3,59,33,37,0.0,9959.0,9959.0,42.0,42.0,0.0,0.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
4,59,19750,37,0.0,9959.0,9959.0,15.0,15.0,0.0,0.0,...,60.0,60.0,30,11,1,0,0,0,31,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6186917,55,5230,26,1.0,6.0,6.0,1.0,1.0,1.0,1.0,...,0.0,0.0,27,6,61,7,8,8,31,0.058935
6186918,55,2757,26,4.0,6.0,6.0,4.0,4.0,4.0,4.0,...,0.0,0.0,27,6,61,7,8,8,31,-0.031505
6186919,55,7115,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,27,7,61,7,0,0,31,0.000000
6186920,55,13095,36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,27,9,15,7,0,0,31,0.000000


In [10]:
# `all_data` is not referenced again below; drop it and run a garbage-collection
# pass so only the split frames stay in memory.
del all_data
gc.collect()

44

# First level models 

You need to implement a basic stacking scheme. We have a time component here, so we will use ***scheme f)*** from the reading material. Recall that we always use first-level models to build two datasets: test meta-features and 2nd-level train meta-features. Let's see how we get test meta-features first.

### Test meta-features

First, we will run *linear regression* on the numeric lag columns (plus one-hot encoded categorical columns) and get predictions for the last month.

In [11]:
cat_columns = ['shop_id', 'item_id', 'item_category_id', 'city_code', 'type_code', 'subtype_code']

print('cat_features =', cat_features)
print('cat_columns =', cat_columns)
# Report how many distinct values each categorical column takes in the training split.
for column in cat_columns:
    n_unique = len(X_train[column].unique())
    print('for category ', column, 'found unique values:', n_unique)

cat_features = ['shop_id', 'item_id', 'item_category_id', 'city_code', 'type_code', 'subtype_code']
cat_columns = ['shop_id', 'item_id', 'item_category_id', 'city_code', 'type_code', 'subtype_code']
for category  shop_id found unique values: 54
for category  item_id found unique values: 16561
for category  item_category_id found unique values: 79
for category  city_code found unique values: 31
for category  type_code found unique values: 19
for category  subtype_code found unique values: 62


In [None]:
%%time
linear_features = ['target_lag_1', 'target_item-cat_date_mean_lag_1', 
                   'target_item_date_mean_lag_1', 
                   'target_item_shop_date_mean_lag_1', 'target_mean_lag_1',
                   'target_shop_date_mean_lag_1', 'target_shop_item-cat_date_mean_lag_1', 
                   'target_lag_2', 'target_item-cat_date_mean_lag_2', 
                   'target_item_date_mean_lag_2', 'target_item_shop_date_mean_lag_2', 
                   'target_mean_lag_2', 'target_shop_date_mean_lag_2', 
                   'target_shop_item-cat_date_mean_lag_2',
                   'target_lag_3', 'target_item-cat_date_mean_lag_3', 'target_item_date_mean_lag_3', 
                   'target_item_shop_date_mean_lag_3', 'target_mean_lag_3', 'target_shop_date_mean_lag_3', 
                   'target_shop_item-cat_date_mean_lag_3',
                   'days', 'delta_price_lag']



model_lrc = make_pipeline(
    ColumnTransformer([
         ('one_hot_shop_id', OneHotEncoder(), ['shop_id']),
         ('one_hot_item_category_id', OneHotEncoder(), ['item_category_id']),
         ('one_hot_city_code', OneHotEncoder(), ['city_code']),
         ('one_hot_type_code', OneHotEncoder(), ['type_code']),
         ('one_hot_subtype_code', OneHotEncoder(), ['subtype_code']),
         ('to_pass', 'passthrough', linear_features)
        ], remainder='drop', n_jobs=-1, 
        sparse_threshold=0.3,
        verbose=True),
    MaxAbsScaler(),
    LinearRegression(n_jobs=-1),
    #ElasticNetCV(n_jobs=-1, random_state=43, max_iter=1000),
    #BayesianRidge(),
    verbose=True,
)
model_lrc.fit(X_train, y_train)
pred_lrc = model_lrc.predict(X_test)

print('\nTest R-squared for linreg is %f' % r2_score(y_test, pred_lrc))
print('Test RMSE linreg is %f' % np.sqrt(mean_squared_error(y_test, pred_lrc)))

if save_models:
    pickle.dump(model_lrc, open('data/sa-model_lrc.pkl', "wb"))

In [None]:
%%time
from sklearn.neural_network import MLPRegressor

model_mlp = make_pipeline(
    MinMaxScaler(),
    MLPRegressor(
        random_state=47, 
        verbose=True,
        early_stopping=True,
        n_iter_no_change=5,
        solver='adam',
#         hidden_layer_sizes=(300, 200 ,100), alpha=0.03, learning_rate='adaptive', learning_rate_init=0.0005, 
#         max_iter=200, momentum=0.9, nesterovs_momentum=True,
    ),
    verbose=True,
)
model_mlp.fit(X_train, y_train)
pred_mlp = model_mlp.predict(X_test)
train_mlp = model_mlp.predict(X_train)

print('Test R-squared is %f' % r2_score(y_test, pred_mlp))
print('Train R-squared is %f' % r2_score(y_train, train_mlp))
print(f'Test RMSE is {np.sqrt(mean_squared_error(y_test, pred_mlp)):.6}')
print(f'Train RMSE is {np.sqrt(mean_squared_error(y_train, train_mlp)):.6}')

if save_models:
    pickle.dump(model_mlp, open('data/sa-model_mlp.pkl', "wb"))

[Pipeline] ...... (step 1 of 2) Processing minmaxscaler, total=   3.3s
Iteration 1, loss = 0.40504097
Validation score: 0.433181
Iteration 2, loss = 0.39058312
Validation score: 0.446674
Iteration 3, loss = 0.38460675
Validation score: 0.451215
Iteration 4, loss = 0.38063824
Validation score: 0.458092
Iteration 5, loss = 0.37769591
Validation score: 0.461678
Iteration 6, loss = 0.37499981
Validation score: 0.465535
Iteration 7, loss = 0.37289355
Validation score: 0.463071
Iteration 8, loss = 0.37144070
Validation score: 0.468284
Iteration 9, loss = 0.37017903
Validation score: 0.470307
Iteration 10, loss = 0.36908898
Validation score: 0.470663
Iteration 11, loss = 0.36808530
Validation score: 0.473723
Iteration 12, loss = 0.36730323
Validation score: 0.470249
Iteration 13, loss = 0.36659489
Validation score: 0.475908
Iteration 14, loss = 0.36602787
Validation score: 0.477453
Iteration 15, loss = 0.36541763
Validation score: 0.479127
Iteration 16, loss = 0.36496405
Validation score: 0.4

In [None]:
%%time
# BayesianRidge regression
from sklearn.linear_model import BayesianRidge

model_br = make_pipeline(
    MinMaxScaler(), 
    BayesianRidge() #n_iter=1000, tol=0.0001)
)

model_br.fit(X_train, y_train)
pred_br = model_br.predict(X_test)

print(f'Test BayesianRidge linreg is {np.sqrt(mean_squared_error(y_test, pred_br)):.6f}' )

if save_models:
    pickle.dump(model_br, open('data/sa-model_br.pkl', "wb"))

In [None]:
%%time
# ElasticNetCV
from sklearn.linear_model import ElasticNetCV

model_en = make_pipeline(
    StandardScaler(), 
    ElasticNetCV(n_jobs=-1, random_state=43, max_iter=1000)
)
model_en.fit(X_train.values, y_train)
pred_en = model_en.predict(X_test.values)

print('Test ElasticNet linreg is %f' % np.sqrt(mean_squared_error(y_test, pred_en)))

if save_models:
    pickle.dump(model_en, open('data/sa-model_en.pkl', "wb"))

In [None]:
%%time
# LassoCV
from sklearn.linear_model import LassoCV

model_lcv =  make_pipeline(
    StandardScaler(),
    LassoCV(n_jobs=4, random_state=43)
)
model_lcv.fit(X_train.values, y_train)
pred_lcv = model_lcv.predict(X_test.values)

print('Test LassoCV is %f' % np.sqrt(mean_squared_error(y_test, pred_lcv)))

if save_models:
    pickle.dump(model_lcv, open('data/sa-model_lcv.pkl', "wb"))

In [None]:
%%time
# RidgeCV
from sklearn.linear_model import RidgeCV

model_rcv =  make_pipeline(
    StandardScaler(),
    RidgeCV(scoring='neg_mean_squared_error')
)
model_rcv.fit(X_train.values, y_train)
pred_rcv = model_rcv.predict(X_test.values)

print('Test RidgeCV is %f' % np.sqrt(mean_squared_error(y_test, pred_rcv)))

if save_models:
    pickle.dump(model_rcv, open('data/sa-model_rcv.pkl', "wb"))

## Tree-based models

In [None]:
%%time
# LightGBM
model_lgb = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=10,
    #num_leaves=60,
    n_jobs=-1,
    objective='mse',
    random_state=43
)

model_lgb.fit(
    X_train, y_train, 
    eval_set=(X_test, y_test), 
    early_stopping_rounds=100,
    verbose=100
)
pred_lgb = model_lgb.predict(X_test)

print('\nTest R-squared for LightGBM is %f' % r2_score(y_test, pred_lgb))
print('Test RMSE LightGBM is %f' % np.sqrt(mean_squared_error(y_test, pred_lgb)))
print()

# Save model to disk
if save_models:
    pickle.dump(model_lgb, open('data/sa-model_lgb.pkl', "wb"))

# Plot LightGBM features importance
fig, ax = plt.subplots(1,1,figsize=(10, 10))
lgb.plot_importance(booster=model_lgb, ax=ax)

In [None]:
#model_lgb.get_params()

In [None]:
%%time
# Catboost

model_cbr = cb.CatBoostRegressor(
    loss_function='RMSE',
    #cat_features=cat_features,
    #l2_leaf_reg=6,
    depth=8,
    #learning_rate=0.1,
    #iterations=10000,
    iterations=1000,
    task_type='CPU',
    random_state=43,
)
model_cbr.fit(
    X_train, y_train, 
    eval_set=(X_test, y_test), 
    verbose=10, 
    #early_stopping_rounds=500,
    early_stopping_rounds=30,
    plot=True,
)
pred_cb = model_cbr.predict(X_test)

print('\nTest R-squared for CatBoost is %f' % r2_score(y_test, pred_cb))
print('Test RMSE CatBoost is %f' % np.sqrt(mean_squared_error(y_test, pred_cb)))
print()

# Save model to disk
if save_models:
    pickle.dump(model_cbr, open('data/sa-model_cbr.pkl', "wb"))

# Plot CatBoost features importance
f_importance = model_cbr.get_feature_importance(prettified=True)
f_importance[f_importance['Importances']>0].sort_values(by='Importances', ascending=True).set_index('Feature Id').plot(
    kind='barh', figsize=(14,14), legend=False, grid=True, title="Feature importances");

In [None]:
# model_cbr.get_all_params()

In [None]:
%%time
# xgboost
model_xgb = xgb.XGBRegressor(
    n_jobs=4, 
    random_state=43,
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,
)

model_xgb.fit(
    X_train, y_train, 
    eval_set=[(X_train, y_train), (X_test, y_test)], 
    eval_metric="rmse", 
    verbose=5, 
    early_stopping_rounds = 20
)
pred_xgb = model_xgb.predict(X_test)

print('\nTest R-squared for XGBoost is %f' % r2_score(y_test, pred_xgb))
print('Test RMSE XGBoost is %f' % np.sqrt(mean_squared_error(y_test, pred_xgb)))
print()

# Save model to disk
if save_models:
    pickle.dump(model_xgb, open('data/sa-model_xgb.pkl', "wb"))

# Plot XGBoost features importance
fig, ax = plt.subplots(1,1,figsize=(14, 14))
xgb.plot_importance(booster=model_xgb, ax=ax)

In [None]:
%%time
# RandomForest
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(
    criterion='mse',
    n_jobs=-1,
    n_estimators=50, 
    max_depth=8,
    random_state=47,
    verbose=10,
)

model_rf.fit(
    X_train, y_train, 
)
pred_rf = model_rf.predict(X_test)

print('\nTest R-squared for RandomForestRegressor is %f' % r2_score(y_test, pred_rf))
print('Test RMSE XGBoost is %f' % np.sqrt(mean_squared_error(y_test, pred_rf)))
print()

# Save model to disk
if save_models:
    pickle.dump(model_rf, open('data/sa-model_rf.pkl', "wb"))

In [None]:
%%time
# KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
knn_features = ['target_item_date_lag_1', 'delta_price_lag', 
                'target_lag_1', 'target_lag_2', 'item_category_id', 'subtype_code', 'target_mean_lag_1',
                'target_item_date_mean_lag_1',  'target_item_shop_date_lag_1',
               'target_shop_date_mean_lag_1']

# Subsample train set (using the whole data was taking too long).
X_train_sampled = X_train[:400000]
y_train_sampled = y_train[:400000]

model_knr =  make_pipeline(
    ColumnTransformer([
         ('to_pass', 'passthrough', knn_features)
        ], remainder='drop', n_jobs=-1, verbose=True),
    MinMaxScaler(),
    KNeighborsRegressor(algorithm='auto', leaf_size=13, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=9, p=2,
          weights='uniform')
)
model_knr.fit(X_train_sampled, y_train_sampled)
print('Model fitting done')
pred_knr = model_knr.predict(X_test)

print(f'Test KNeighborsRegressor is {np.sqrt(mean_squared_error(y_test, pred_knr)):.6f}')

if save_models:
    pickle.dump(model_knr, open('data/sa-model_knr.pkl', "wb"))

In [None]:
# Scatter the KNN predictions against the true test-month targets.
model_performance_sc_plot(pred_knr, y_test, 'Test')

## Load pickled predictors 

In [None]:
# Load pickled predictors
# SECURITY: unpickling executes code stored in the file; only load files that
# were written by this notebook.
def _load_model(path):
    """Unpickle one fitted model from `path`, closing the file afterwards."""
    with open(path, "rb") as model_file:
        return pickle.load(model_file)

if save_models:
    # The linear-regression pipeline is saved above as `sa-model_lrc.pkl`
    # (variable `model_lrc`); it is used below under the name `model_lr`.
    model_lr   = _load_model('data/sa-model_lrc.pkl')   # LinearRegression
    model_lgb  = _load_model('data/sa-model_lgb.pkl')   # LightGBM
    model_cbr  = _load_model('data/sa-model_cbr.pkl')   # CatBoost
    model_xgb  = _load_model('data/sa-model_xgb.pkl')   # xgboost
    model_br   = _load_model('data/sa-model_br.pkl')    # BayesianRidge
    model_en   = _load_model('data/sa-model_en.pkl')    # ElasticNetCV
    # SGDRegressor (decrease quality): no cell in this notebook writes this
    # file and its predictions are commented out below, so a missing file must
    # not abort the run.
    try:
        model_sgdr = _load_model('data/sa-model_sgdr.pkl')
    except FileNotFoundError:
        model_sgdr = None
else:
    # Nothing was pickled: reuse the models fitted in this session so that the
    # cells below still find `model_lr`.
    model_lr = model_lrc
    model_sgdr = None

In [None]:
%%time
# Make predictions for test data
train_lr  = model_lr.predict(X_train)
train_lgb = model_lgb.predict(X_train)
train_cbr = model_cbr.predict(X_train)
train_xgb = model_xgb.predict(X_train)
train_br  = model_br.predict(X_train)
# train_sgdr= model_sgdr.predict(X_train)
train_en  = model_en.predict(X_train)

X_train_level2 = np.c_[train_lr, train_lgb, train_cbr, train_xgb, train_br, train_en] 
# X_train_level2 = np.c_[train_lr, train_cbr, train_xgb] 
# X_test_level2.tofile('X_test_level2')
print('Train L2 data shape:', X_train_level2.shape)

In [None]:
%%time
# Make predictions for test data (Kaggle's test dataset)
test_lr  = model_lr.predict(X_test)
test_lgb = model_lgb.predict(X_test)
test_cbr = model_cbr.predict(X_test)
test_xgb = model_xgb.predict(X_test)
test_br  = model_br.predict(X_test)
# test_sgdr= model_sgdr.predict(X_test)
test_en  = model_en.predict(X_test)

X_test_level2 = np.c_[test_lr, test_lgb, test_cbr, test_xgb, test_br, test_en]
# X_test_level2 = np.c_[test_lr, test_cbr, test_xgb]
print('Test L2 data shape:', X_test_level2.shape)

In [None]:
%%time
# Make prediction for target data
target_lr  = model_lr.predict(X_target)
target_lgb = model_lgb.predict(X_target)
target_cbr = model_cbr.predict(X_target)
target_xgb = model_xgb.predict(X_target)
target_br  = model_br.predict(X_target)
# target_sgdr= model_sgdr.predict(X_target)
target_en  = model_en.predict(X_target)

X_target_level2 = np.c_[target_lr, target_lgb, target_cbr, target_xgb, target_br, target_en]
# X_target_level2 = np.c_[target_lr, target_cbr, target_xgb]
print('Target L2 data shape:', X_target_level2.shape)

# Let's make L2 predictions

In [None]:
%%time
l2_model_lr = LinearRegression()
l2_model_lr.fit(X_train_level2, y_train)
test_l2_lr = l2_model_lr.predict(X_test_level2)

print(f'Test RMSE for LinearRegression L2 is {np.sqrt(mean_squared_error(y_test, test_l2_lr)):.6f}')

In [None]:
# Second-level ElasticNetCV over the first-level predictions (default settings).
# `ElasticNetCV` is already imported earlier in the notebook, so the redundant
# re-import was dropped from this cell.
l2_model_en = ElasticNetCV()
l2_model_en.fit(X_train_level2, y_train)
test_l2_en = l2_model_en.predict(X_test_level2)

print(f'Test RMSE for ElasticNet L2 is {np.sqrt(mean_squared_error(y_test, test_l2_en)):.6f}')

In [None]:
%%time
# BayesianRidge regression
from sklearn.linear_model import BayesianRidge
l2_model_br = BayesianRidge()
l2_model_br.fit(X_train_level2, y_train)
test_l2_br = l2_model_br.predict(X_test_level2)

# print('Test R-squared for linreg is %f' % r2_score(y_test, pred_lr))
print('Test BayesianRidge L2 is %f' % np.sqrt(mean_squared_error(y_test, test_l2_br)))

In [None]:
%%time
# LassoCV
from sklearn.linear_model import LassoCV

l2_model_lcv = LassoCV(n_jobs=4, random_state=43)

l2_model_lcv.fit(X_train_level2, y_train)
test_l2_lcv = l2_model_lcv.predict(X_test_level2)

print('Test LassoCV L2 is %f' % np.sqrt(mean_squared_error(y_test, test_l2_lcv)))

In [None]:
%%time
l2_model_cb = cb.CatBoostRegressor(
    loss_function='RMSE',
    task_type='CPU',
    #depth=8,
    #learning_rate=0.001,
    #l2_leaf_reg=6,
)
l2_model_cb.fit(X_train_level2, y=y_train,
                eval_set=(X_test_level2, y_test),
                early_stopping_rounds=20,
                verbose=20,
                use_best_model=True,
                plot=True)
test_l2_cbr = l2_model_cb.predict(X_test_level2)

print()
print(f'Test RMSE for CatBoost L2 is {np.sqrt(mean_squared_error(y_test, test_l2_cbr))}')
print(f'Test RMSE for CatBoost L2 is {np.sqrt(mean_squared_error(y_test, test_l2_cbr))}')

In [None]:
# l2_model_cb.get_all_params()

## Prepare data to Kaggle

In [None]:
# Make submission

# Insert your model here
pred_model = l2_model_en # ElasticNetCV

tests = pd.read_csv('../readonly/final_project_data/test.csv.gz')
# export_pred = l2_model_lr.predict(X_target_level2)

export_pred = pred_model.predict(X_target_level2)
# export_pred = np.round(export_pred)

# Attach predictions to their (shop_id, item_id) keys. `assign` adds the array
# positionally and keeps the key columns in their original integer dtype
# (stacking them with the float predictions via np.c_ turned the keys into floats).
target_predictions = X_target[['shop_id', 'item_id']].assign(item_cnt_month=export_pred)

# Inner merge on the shared key columns (`shop_id`, `item_id`) gives each test ID its prediction.
# NOTE(review): a test pair missing from X_target is silently dropped here —
# verify that kaggle_data has as many rows as `tests`.
kaggle_data = tests.merge(target_predictions)

# Clip by assignment: calling `.clip(..., inplace=True)` on a column selection
# operates on a temporary object and is not guaranteed to modify `kaggle_data`.
kaggle_data['item_cnt_month'] = kaggle_data['item_cnt_month'].clip(0, 20)

kaggle_data[['ID', 'item_cnt_month']].to_csv('simple_ensemble.csv', index=False)

print('kaggle_data shape:', kaggle_data.shape)
kaggle_data