# Modules

In [4]:
# Load IPython's autoreload extension and use mode 2 so that edits to imported
# project modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import sys
import gc
import random

import urllib.request
import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot
import statsmodels.api as sm

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, StrMethodFormatter
import matplotlib
import seaborn as sns

import lightgbm as lgb
import optuna
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

sys.path.append('..')
from src import config, dataset, models

# Plot styling. Order matters: first reset every rcParam to matplotlib's
# defaults, then apply the 'ggplot' and 'bmh' style sheets (in that order),
# and finally override the default figure size with a 10 x 10 figure.
plt.rcParams.update(plt.rcParamsDefault)
plt.style.use(['ggplot', 'bmh'])
matplotlib.rcParams['figure.figsize'] = (10.0, 10.0)

# Pandas display options; each key is registered below under the "display."
# option namespace (e.g. "display.max_rows").
display_settings = dict(
    max_columns=999,
    expand_frame_repr=True,
    max_rows=999,
    precision=4,
    show_dimensions=True,
)

# Apply every option in one pass.
for op, value in display_settings.items():
    pd.set_option(f"display.{op}", value)


def fix_seed(seed=0):
    """Seed the global random number generators used in this notebook.

    Seeds both the standard-library ``random`` module and NumPy's legacy
    global generator with the same value.

    Args:
        seed: Integer seed passed to both generators. Defaults to 0.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

fix_seed(1234)
%matplotlib inline

# Model

A separate LightGBM model is tuned for each store. Inference is then run on each store's ids, and the per-store predictions are concatenated at the end.
The idea is twofold: because the dataset is quite large, training one store at a time allows more data to be used, and each model can specialize in a specific store, which reduces the variance of the data it is trained on.

In [6]:
# Settings shared by the per-store training loop below.

# Processed parquet file; the loop re-reads it for every store.
path = config.get_processed_filename('data_merged.parquet')
# Stores to iterate over (one model per store).
stores = config.STORES
# Columns used to build the hierarchy features and later cast to the pandas
# 'category' dtype before training.
categories = ['item_id', 'dept_id', 'cat_id', 'wday', 'month', 'snap']

# prediction horizon (presumably in days, matching the day_ind column — confirm)
horizon = 28

# The dataset is quite large with 46 million records. We use only a fraction of it for training.
# Only rows with day_ind >= start_day are kept, i.e. from day 1913 - 6 * 28 = 1745 onwards.
last_day = 1913
start_day = last_day - 6 * horizon

# Best optimisation loss per store, keyed by store id.
losses = {}
# Accumulates the submission-format predictions of every store.
df_predictions = pd.DataFrame()

# Debug switch: uncomment to restrict the run to a single store.
#stores = ['CA_1']

# Train one model per store and append its predictions to df_predictions.
#
# Uses names defined earlier in this cell: path, stores, categories, start_day,
# losses and df_predictions. For every store the loop
#   1. loads the store's rows from start_day onwards,
#   2. builds hierarchy / rolling / lag features,
#   3. splits rows into train (sold_qty known) and test (sold_qty is NaN),
#   4. tunes a models.LGBModel and records its best loss in losses[store],
#   5. predicts the test rows and pivots them to one row per id with columns
#      F1..Fn (one column per distinct day_ind, in ascending order).
for store in stores:
    print(f'Store: {store}')
    print('Load and filter store transactions...')
    # filter: keep only this store's rows, starting at start_day
    df_train = pd.read_parquet(path)
    df_train = df_train[df_train.store_id == store]
    df_train = df_train.drop('store_id', axis=1)
    df_train = df_train[df_train.day_ind >= start_day]

    print(f'   dataset dim: {df_train.shape}')

    print('Prepare features...')
    # convert categories to ordinal
    df_train = dataset.categorical_to_ordinal(df_train)
    df_train = df_train.sort_values('date')

    # time dependent features
    # NOTE(review): shift=28 presumably mirrors the prediction horizon so that
    # features only rely on values known at forecast time — confirm in src.dataset.
    for c in categories:
        df_train = dataset.hierarchy_stats(df_train, c, ['sold_qty', 'sell_price'])

        df_grouped = df_train.groupby([c, 'date'])
        df_train = dataset.hierarchy_rolling_mean(df_train, c, ['sold_qty', 'sell_price'], window=7, shift=28, df_grouped=df_grouped)
        df_train = dataset.hierarchy_rolling_std(df_train, c, ['sold_qty', 'sell_price'], window=7, shift=28, df_grouped=df_grouped)
        df_train = dataset.hierarchy_rolling_mean(df_train, c, ['sold_qty', 'sell_price'], window=28, shift=28, df_grouped=df_grouped)
        df_train = dataset.hierarchy_rolling_std(df_train, c, ['sold_qty', 'sell_price'], window=28, shift=28, df_grouped=df_grouped)

    # lag per id
    df_grouped = df_train.groupby('id', as_index=False)
    df_train = dataset.item_lag(df_train, ['sold_qty'], n_most=2, after=28, df_grouped=df_grouped)

    # exclude rows with nan due to lagged features; sold_qty is left out of the
    # check because its NaN rows are exactly the rows to predict
    feature_columns = [col for col in df_train.columns if col != 'sold_qty']
    df_train = df_train.dropna(subset=feature_columns)

    # remove columns with zero variance
    col_std = df_train.select_dtypes([np.number]).std()
    cols_to_drop = col_std[col_std == 0].index
    df_train = df_train.drop(cols_to_drop, axis=1)

    del df_grouped, feature_columns, col_std, cols_to_drop
    gc.collect()

    print(f'   dataset dim: {df_train.shape}')

    print('Prepare training, validation and test sets...')
    # prepare training and test set
    for c in categories:
        df_train[c] = df_train[c].astype('category')

    # drop useless columns
    useless_columns = ['id', 'day_ind', 'date', 'event_type_1']

    # rows without a sold quantity are the ones to predict; compute the mask once
    is_test = df_train.sold_qty.isna()

    # the day range must be read BEFORE df_train is deleted below
    # (the original printed it after the `del`, which raised NameError)
    day_min = df_train.day_ind.min()
    day_max = df_train.day_ind.max()

    # test set
    test_ids = df_train.loc[is_test, 'id']
    test_day_ind = df_train.loc[is_test, 'day_ind']
    X_test = df_train.loc[is_test].drop(useless_columns + ['sold_qty'], axis=1)

    # train set
    y_train = df_train.loc[~is_test, 'sold_qty']
    X_train = df_train.loc[~is_test].drop(useless_columns + ['sold_qty'], axis=1)

    del df_train, is_test
    gc.collect()

    print(f'   day {day_min} to day {day_max}')
    print(f'   train set dim: {X_train.shape}')
    print(f'   test set dim: {X_test.shape}')

    print('Train model...')
    # train model
    model = models.LGBModel()
    model.optimize(X_train, y_train, n_trials=15, metric='mape')

    # evaluate
    loss = model.best_value
    losses[store] = loss

    print('Make predictions...')
    # predictions on test set
    y_hat_test = model.predict(X_test)
    df_predictions_store = pd.DataFrame({'id': test_ids, 'd': test_day_ind, 'sold_qty': y_hat_test})

    # transform to the required format for submissions:
    # one row per id, one column per day, renamed F1..Fn in ascending day order
    df_predictions_store = df_predictions_store.pivot(index='id', columns='d', values='sold_qty')
    df_predictions_store.columns = [f'F{i + 1}' for i in range(len(df_predictions_store.columns))]
    df_predictions_store = df_predictions_store.reset_index()

    df_predictions = pd.concat([df_predictions, df_predictions_store], axis=0)

    # free per-store data before the next iteration; only names that actually
    # exist are deleted (the original also listed X_val / y_val, which are never
    # created and made this statement raise NameError)
    del X_train, y_train, df_predictions_store
    gc.collect()

    print('\n')


# The submission also needs rows whose ids end in 'evaluation' instead of
# 'validation'. The same forecasts are reused for them (presumably because the
# evaluation-period data is not available yet — confirm).
df_predictions_eval = df_predictions.copy()
df_predictions_eval['id'] = df_predictions_eval['id'].str.replace('validation', 'evaluation')

# Stack validation rows on top of evaluation rows with a fresh 0..n-1 index.
df_predictions = pd.concat(
    [df_predictions, df_predictions_eval],
    axis=0,
    ignore_index=True,
)
del df_predictions_eval


# save to directory
path = config.get_submission_filename('submit_model_per_store.csv')
df_predictions.to_csv(path, index=False)

df_predictions.head()

Store: CA_1
Load and filter store transactions...
   dataset dim: (600391, 19)
Prepare features...
   dataset dim: (419709, 73)
Prepare training, validation and test sets...
   train set dim: (321132, 68)
   val set dim: (80283, 68)
   test set dim: (18294, 68)
Train model...
   loss: tweedie
   trial 5 - best running loss: 3.843



Mean of empty slice



   trial 10 - best running loss: 3.843
   trial 15 - best running loss: 3.843
   best loss: 3.843
Make predictions...


Store: CA_2
Load and filter store transactions...
   dataset dim: (600240, 19)
Prepare features...
   dataset dim: (419407, 73)
Prepare training, validation and test sets...
   train set dim: (320890, 68)
   val set dim: (80223, 68)
   test set dim: (18294, 68)
Train model...
   loss: tweedie
   trial 5 - best running loss: 3.894
   trial 10 - best running loss: 3.892
   trial 15 - best running loss: 3.888
   trial 20 - best running loss: 3.888
   best loss: 3.888
Make predictions...


Store: CA_3
Load and filter store transactions...
   dataset dim: (600328, 19)
Prepare features...
   dataset dim: (419414, 73)
Prepare training, validation and test sets...
   train set dim: (320896, 68)
   val set dim: (80224, 68)
   test set dim: (18294, 68)
Train model...
   loss: tweedie
   trial 5 - best running loss: 4.547
   trial 10 - best running loss: 4.547
   trial 15 - best

Unnamed: 0,id,F1,F2,F3,F4,F5,F6
0,FOODS_1_001_CA_1_validation,0.5195,0.4767,0.4846,0.4942,0.5041,0.5618
1,FOODS_1_002_CA_1_validation,0.3291,0.2948,0.2942,0.3031,0.2792,0.3333
2,FOODS_1_003_CA_1_validation,0.601,0.4818,0.4956,0.5017,0.5525,0.59
3,FOODS_1_004_CA_1_validation,1.0281,0.9003,0.917,0.9389,1.0656,1.0442
4,FOODS_1_005_CA_1_validation,1.0131,0.7899,0.8097,0.8553,0.9966,1.188
