# Modelling

In this notebook we're going to try to model our data using stacked regressors. We will use LightGBM both as our meta-learner and as a baseline model, to see whether stacking improves our accuracy. The Symmetric Mean Absolute Percentage Error (SMAPE) is the evaluation metric specified for this task.

---
### Load libraries and data

In [1]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import lightgbm as lgb
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso, BayesianRidge
from sklearn.model_selection import cross_val_score, KFold, train_test_split

In [2]:
# Load the training CSV (path is relative to the notebook's working directory).
train = pd.read_csv(filepath_or_buffer='data/final_train.csv')

### Split data into training and validation set
We will use the first three months of 2017 as the validation set to mirror the task at hand (predicting the first three months of 2018). We will remove the rest of the 2017 data to prevent any data leakage.

In [3]:
# Build a derived frame instead of mutating `train` in place, so this cell is
# safe to re-run: an in-place drop of `id` raises KeyError on a second run and
# re-assigning `sales` would apply the log transform twice. `train` stays raw.
# Sales are modelled in log1p space.
train_log = train.drop('id', axis=1).assign(
    sales=lambda df: np.log1p(df['sales'].values)
)

# Training set: every year except 2017. Validation set: Jan-Mar 2017 only.
# The remaining 2017 months are intentionally in neither set.
is_2017 = train_log['year'] == 2017
train_data = train_log[~is_2017]
val_data = train_log[is_2017 & train_log['month'].isin([1, 2, 3])]

train_X = train_data.drop('sales', axis=1)
train_y = train_data['sales']

val_X = val_data.drop('sales', axis=1)
val_y = val_data['sales']

---
### SMAPE loss functions

In [4]:
def calc_smape(pred, target):
    """Symmetric mean absolute percentage error (0-200 scale) for log1p-space values.

    Both inputs are expected in log1p space; they are mapped back to the
    original scale with ``expm1`` before the error is computed.

    Parameters
    ----------
    pred, target : array-like of equal length
        Predicted and true values in log1p space. Lists, ndarrays and pandas
        Series are all accepted; values are paired by position, so pandas
        index labels are ignored.

    Returns
    -------
    float
        ``(200 / n) * sum(|p - t| / (|p| + |t|))`` where ``n`` is the total
        number of pairs. Pairs in which both values are exactly 0 are left out
        of the sum (avoiding 0/0) but still count towards ``n``, i.e. they
        contribute zero error. Returns 0.0 for empty input.
    """
    # Coerce to plain arrays so boolean masking and arithmetic are positional.
    pred = np.asarray(pred)
    target = np.asarray(target)

    n = len(pred)
    if n == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    keep = ~((pred == 0) & (target == 0))
    pred, target = np.expm1(pred[keep]), np.expm1(target[keep])
    return (200.0 / n) * np.sum(np.abs(pred - target) / (np.abs(pred) + np.abs(target)))


def lgbm_smape(preds, train_data):
    """Custom LightGBM evaluation function reporting SMAPE.

    Parameters
    ----------
    preds : array-like
        Predictions passed in by LightGBM.
    train_data : object exposing ``get_label()``
        The dataset being evaluated; its labels are the targets.

    Returns
    -------
    tuple
        ``('SMAPE', score, False)`` - metric name, value computed by
        ``calc_smape``, and ``False`` to tell LightGBM that lower is better.
    """
    labels = train_data.get_label()
    score = calc_smape(preds, labels)
    return 'SMAPE', score, False

---
Function for creating stacked predictions for training and validation set

In [5]:
def create_pred_matrix(rgrs, t_X, t_y, v_X, v_y, n_splits=7):
    """Build first-layer (stacking) prediction matrices for training and validation data.

    The training rows are walked in consecutive chunks of
    ``step = len(t_X) // n_splits`` rows. For every regressor factory, a fresh
    model is fitted on one chunk and used to predict the chunk that follows it
    (including a shorter trailing chunk, if any), so each stored training
    prediction comes from a model that was not fitted on that row. The first
    ``step`` rows have no preceding chunk and stay at 0 in the training
    matrix; callers should skip them. Validation predictions come from a fresh
    model fitted on all of ``t_X`` / ``t_y``.

    Progress, per-chunk scores, the overall training / validation score and
    the elapsed time are printed for every regressor.

    Parameters
    ----------
    rgrs : sequence of callables
        Each is called with no arguments and must return an unfitted model
        exposing ``fit(X, y)`` and ``predict(X)``.
    t_X, t_y : sliceable, same length
        Training features and targets; sliced positionally with ``[a:b]``.
    v_X, v_y : same length
        Validation features and targets (``v_y`` is used for sizing and scoring only).
    n_splits : int, default 7
        Divisor used to derive the chunk size. Must be at least 2.

    Returns
    -------
    (t_pred_mat, v_pred_mat) : tuple of ndarray
        Shapes ``(len(t_X), len(rgrs))`` and ``(len(v_y), len(rgrs))``; column
        ``i`` holds the predictions of ``rgrs[i]``.

    Raises
    ------
    ValueError
        If ``n_splits < 2`` or there are fewer training rows than ``n_splits``
        (which would give an empty chunk).
    """
    dsize = len(t_X)
    if n_splits < 2:
        raise ValueError('n_splits must be at least 2, got {}'.format(n_splits))
    step = dsize // n_splits
    if step == 0:
        raise ValueError(
            'need at least n_splits={} training rows, got {}'.format(n_splits, dsize))

    t_pred_mat = np.zeros((dsize, len(rgrs)))
    v_pred_mat = np.zeros((len(v_y), len(rgrs)))

    # Start offsets of the chunks that are fitted on; identical for every
    # regressor, so computed once.
    fit_starts = range(0, dsize - step, step)

    for i, rgrc in enumerate(rgrs):
        print('\ntraining {}...'.format(i))
        start = time.perf_counter()
        for idx in fit_starts:
            mid, end = idx + step, idx + (step * 2)

            # Fit on chunk [idx, mid), predict the following chunk [mid, end).
            rgr = rgrc()
            rgr.fit(t_X[idx:mid], t_y[idx:mid])

            next_y = t_y[mid:end]
            pred_y = rgr.predict(t_X[mid:end])
            print('nxt score: ', calc_smape(pred_y, next_y))
            t_pred_mat[mid:end, i] = pred_y

        # The first `step` rows were never predicted, so leave them out of the score.
        print('trn score: ', calc_smape(t_pred_mat[step:, i], t_y[step:]))

        # Validation predictions use a model fitted on the full training data.
        rgr = rgrc()
        rgr.fit(t_X, t_y)
        v_pred = rgr.predict(v_X)

        print('val score: ', calc_smape(v_pred, v_y))
        v_pred_mat[:, i] = v_pred
        print('time taken: ', time.perf_counter() - start)

    return t_pred_mat, v_pred_mat

---
creating first layer predictions

In [6]:
# Restrict the first layer to the most recent `dsize` training rows.
# NOTE(review): 183000 presumably corresponds to one year of rows (two years
# in total), and taking the tail assumes the rows are in chronological
# order - confirm both against the data preparation step.
dsize = 183000*2
t_X = train_X[-dsize:].reset_index(drop=True)
t_y = train_y[-dsize:].reset_index(drop=True)

ab = AdaBoostRegressor
rf = RandomForestRegressor
ls = Lasso
br = BayesianRidge
et = ExtraTreesRegressor

# Fixed seed for the randomised ensembles so a Restart & Run All reproduces
# the same first-layer predictions.
SEED = 42

# Zero-argument factories, in the column order of the returned matrices.
rgrs = [
    br,
    ls,
    lambda: ab(random_state=SEED),
    lambda: rf(random_state=SEED),
    lambda: et(random_state=SEED),
]

# The scikit-learn regressors cannot handle NaN, so missing values are
# replaced with -1 in BOTH the training and the validation features.
t_pred_mat, val_pred_mat = create_pred_matrix(
    rgrs, t_X.fillna(-1), t_y, val_X.fillna(-1), val_y
)

In [10]:
# simple average score:
# Row-wise unweighted mean of the first-layer predictions.
t_avg_pred = t_pred_mat.mean(axis=1)
v_avg_pred = val_pred_mat.mean(axis=1)

# Rows before `step` are excluded from scoring here and from training in the
# cells below. The leading len(t_X) // 7 rows of t_pred_mat are all zeros
# (create_pred_matrix ran with its default n_splits=7 and never predicts the
# first chunk); // 5 skips a slightly larger prefix, which safely covers them.
# NOTE(review): consider deriving this from the n_splits actually used.
step = len(t_X) // 5

print('trn score: ', calc_smape(t_avg_pred[step:], t_y[step:]))
print('val score: ', calc_smape(v_avg_pred, val_y))

---
### Baseline 1 lgbm

In [11]:
# Baseline: LightGBM fitted directly on the original features, using the
# training rows from `step` onwards and the validation set for monitoring.
lgbtrain = lgb.Dataset(data=t_X[step:], label=t_y[step:])
lgbval = lgb.Dataset(data=val_X, label=val_y, reference=lgbtrain)

lgb_params_ = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'rmse'},
    num_leaves=50,
    learning_rate=0.05,
    feature_fraction=0.8,
    max_depth=8,
    verbose=-1,
    num_boost_round=1000,
    early_stopping_rounds=70,
    nthread=-1,
)

# SMAPE is reported alongside RMSE through the custom eval function; the
# first five feature columns are declared categorical by position.
rgr = lgb.train(
    lgb_params_,
    lgbtrain,
    valid_sets=[lgbtrain, lgbval],
    feval=lgbm_smape,
    categorical_feature=[0, 1, 2, 3, 4],
    verbose_eval=50,
)

---
### LightGBM predicting on stacked predictions 

In [14]:
# Meta-learner: LightGBM fitted on the first-layer prediction matrix only.
# Rows before `step` are skipped because they include the rows that never
# received an out-of-fold prediction (all zeros).
lgbtrain = lgb.Dataset(data=t_pred_mat[step:], label=t_y[step:])
lgbval = lgb.Dataset(data=val_pred_mat, label=val_y, reference=lgbtrain)

lgb_params_ = {'task':'train', 'boosting_type':'gbdt', 'objective':'regression', 
              'metric': {'rmse'}, 'num_leaves': 50, 'learning_rate': 0.05, 
              'feature_fraction': 0.8, 'max_depth': 8, 'verbose': -1,
              'num_boost_round':1000, 'early_stopping_rounds':70, 'nthread':-1}

# No categorical_feature here: every column is a continuous model prediction.
# Declaring them categorical would make LightGBM treat the float predictions
# as integer category codes and discard their ordering and magnitude.
rgr = lgb.train(lgb_params_, lgbtrain, valid_sets=[lgbtrain, lgbval], 
                  feval=lgbm_smape, verbose_eval=50)

---
### LightGBM predicting on combined data and predictions

In [15]:
# Original features plus one appended column per first-layer model.
pred_cols = ['br_pred', 'ls_pred', 'ab_pred', 'rf_pred', 'et_pred']

# Build the prediction frames on the SAME index as the features they are
# attached to. val_X keeps its original row labels, so attaching a frame with
# a fresh 0..n-1 index would align on labels and yield NaN / misplaced values.
combined_t_X = pd.concat(
    [t_X, pd.DataFrame(t_pred_mat, columns=pred_cols, index=t_X.index)], axis=1
)
combined_val_X = pd.concat(
    [val_X, pd.DataFrame(val_pred_mat, columns=pred_cols, index=val_X.index)], axis=1
)

# Train on rows from `step` onwards: the leading rows of t_pred_mat never
# received an out-of-fold prediction and are all zeros.
lgbtrain = lgb.Dataset(data=combined_t_X[step:], label=t_y[step:])
lgbval = lgb.Dataset(data=combined_val_X, label=val_y, reference=lgbtrain)

lgb_params_ = {'task':'train', 'boosting_type':'gbdt', 'objective':'regression', 
              'metric': {'rmse'}, 'num_leaves': 50, 'learning_rate': 0.05, 
              'feature_fraction': 0.8, 'max_depth': 8, 'verbose': -1,
              'num_boost_round':1000, 'early_stopping_rounds':70, 'nthread':-1}

# Positions 0-4 still refer to the first five original feature columns,
# because the prediction columns are appended after them.
rgr = lgb.train(lgb_params_, lgbtrain, valid_sets=[lgbtrain, lgbval], 
                  feval=lgbm_smape, categorical_feature=[0, 1, 2, 3, 4], verbose_eval=50)