# Ensemble

In [1]:
import numpy as np
import pandas as pd
import gc
import time

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

I use stacking on the base models' predictions for the validation set and the test set. The predictions for the validation set become X_train, the predictions for the test set become X_test, and Y_train is the target for the validation set.

My base models:
* xgb (15 depth) LB: 0.96734
* xgb (12 depth) LB: 0.96676
* xgb (7 depth) LB: 0.98534
* catboost LB: 1.00913
* random_forest LB: 1.08290

In [2]:
# Base-model prediction files, listed in the column order of the stacked
# matrices. The train and test lists must keep the same model order so that
# each column means the same thing in X_train and X_test.
train_prediction_files = [
    'xgb15/xgb_train.pickle',
    'xgb12/xgb_train.pickle',
    'xgb7/xgb_train.pickle',
    'cat/cat_train.pickle',
    'rf/rf_train.pickle',
]
test_prediction_files = [
    'xgb15/xgb_test.pickle',
    'xgb12/xgb_test.pickle',
    'xgb7/xgb_test.pickle',
    'cat/cat_test.pickle',
    'rf/rf_test.pickle',
]

# One column per base model: validation-set predictions become the
# meta-model's training features, test-set predictions its inference input.
X_train = np.column_stack([pd.read_pickle(path) for path in train_prediction_files])
X_test = np.column_stack([pd.read_pickle(path) for path in test_prediction_files])

Load the feature matrix and extract Y_train, the target for the validation set.

In [3]:
data = pd.read_pickle('feature_matrix.pickle')
# Keep only the target column for the rows of month block 33, the block used
# as the validation set whose predictions form X_train.
Y_train = data.loc[data['date_block_num'] == 33, 'item_cnt_month']

In [4]:
# Y_train has been extracted, so the feature matrix is dropped here and a
# garbage-collection pass is run before the meta-model is fitted.
del data
# The trailing ';' keeps the notebook from displaying gc.collect()'s return value.
gc.collect();

In [5]:
ts = time.time()
# Level-2 (meta) model: a linear blend of the base models' predictions,
# fitted on the validation-set predictions against the validation target.
meta_model = LinearRegression().fit(X_train, Y_train)
# Blend the test-set predictions and clip the result to the [0, 20] range.
Y_test = np.clip(meta_model.predict(X_test), 0, 20)
# Elapsed seconds for fit + predict, displayed as the cell's output.
time.time() - ts

0.05100297927856445

In [6]:
# Sanity check of the blend on the data it was fitted on. This is an
# in-sample score (X_train / Y_train are what meta_model was trained with),
# so it is an optimistic estimate rather than a held-out validation score.
# NOTE(review): the LB figures quoted in this notebook look like RMSE —
# confirm; if so, compare them against the square root of this value.
Y_pred = meta_model.predict(X_train).clip(0,20)
# sklearn's signature is mean_squared_error(y_true, y_pred): ground truth first.
# MSE is symmetric, so the value is the same as with the arguments swapped.
mean_squared_error(Y_train, Y_pred)

0.73292181114528809

In [7]:
# Read the test set and index it by ID; only the IDs are used below.
test = pd.read_csv('test.csv.gz', compression='gzip')
test = test.set_index('ID')

# NOTE(review): assumes the rows of Y_test are in the same order as the rows
# of test.csv — confirm against the notebooks that produced the *_test pickles.
submission_columns = {
    "ID": test.index,
    "item_cnt_month": Y_test,
}
submission = pd.DataFrame(submission_columns)

# index=False: the row index is not written, so the file has only the two columns.
submission.to_csv('ensemble_submission.csv', index=False)

LB: 0.96290