# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import gc
import random

import urllib.request
import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot
import statsmodels.api as sm

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, StrMethodFormatter
import matplotlib
import seaborn as sns

import lightgbm as lgb
import optuna
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

sys.path.append('..')
from src import config, dataset, models

# Reset matplotlib to its default rcParams, then layer the two style sheets on top.
plt.rcParams.update(plt.rcParamsDefault)
plt.style.use(['ggplot', 'bmh'])
matplotlib.rcParams['figure.figsize'] = (10.0, 10.0)

# pandas display options, keyed by their name inside the "display." namespace.
display_settings = dict(
    max_columns=999,
    expand_frame_repr=True,
    max_rows=999,
    precision=4,
    show_dimensions=True,
)

# Apply every option one by one.
for option_name, option_value in display_settings.items():
    pd.set_option(f"display.{option_name}", option_value)


def fix_seed(seed=0):
    """Seed the global random number generators used in this notebook.

    Parameters
    ----------
    seed : int, default 0
        Value handed to both ``random.seed`` and ``np.random.seed``.

    Returns
    -------
    None
    """
    # Python's generator first, then NumPy's legacy global generator.
    for seed_generator in (random.seed, np.random.seed):
        seed_generator(seed)


fix_seed(1234)
%matplotlib inline

# Model

A separate LightGBM model is tuned for each store. Predictions are made for each store's ids and concatenated at the end.
Because the full dataset is quite large, splitting it by store allows more data to be used for training. It also lets each model specialize in a single store, which reduces the variance of the data it is trained on.

In [4]:
# Train one tuned model per store, predict the rows whose target is missing,
# and write all predictions to a single submission file.

path = config.get_processed_filename('data_merged.parquet')
stores = config.STORES
categories = ['item_id', 'dept_id', 'cat_id', 'wday', 'snap']

# prediction horizon
horizon = 28

# The dataset is quite large with 46 million records. We use only a fraction of it for training.
last_day = 1913
start_day = last_day - 3 * horizon

# Rolling and lag features are shifted by this many days.
# NOTE(review): taken to be tied to the horizon (both are 28) - confirm before changing one of them.
lag_days = horizon

# Columns kept for bookkeeping only; they are removed before the model sees the data.
useless_columns = ['id', 'day_ind', 'date', 'event_type_1']
non_feature_columns = useless_columns + ['sold_qty']

# Columns that later steps access by name. They are never removed by the
# zero-variance filter, otherwise the category cast / column drop below would
# fail with a KeyError for a store in which one of them happens to be constant.
protected_columns = set(categories) | set(non_feature_columns)

losses = {}
# One submission-shaped frame per store; concatenated once after the loop
# instead of growing a DataFrame on every iteration.
store_predictions = []

for store in stores:
    print(f'Store: {store}')
    print('Load and filter store transactions...')
    # NOTE(review): the whole file is re-read for every store. This is presumably
    # done to keep peak memory low (the frame is released at the end of each pass);
    # consider parquet row filters if I/O becomes the bottleneck.
    df_train = pd.read_parquet(path)
    # Single combined mask: avoids materialising an intermediate per-store copy.
    keep_rows = (df_train.store_id == store) & (df_train.day_ind >= start_day)
    df_train = df_train[keep_rows].drop('store_id', axis=1)
    del keep_rows

    print(f'   dataset dim: {df_train.shape}')

    print('Prepare features...')
    # convert categories to ordinal
    df_train = dataset.categorical_to_ordinal(df_train)
    df_train = df_train.sort_values('date')

    # time dependent features
    for c in categories:
        df_train = dataset.hierarchy_stats(df_train, c, ['sold_qty', 'sell_price'])

        # The same grouping is shared by all rolling features of this category.
        df_grouped = df_train.groupby([c, 'date'])
        # Order (mean then std, 7 then 28) is kept so the feature columns are
        # created in the same order as before.
        for window in (7, 28):
            df_train = dataset.hierarchy_rolling_mean(
                df_train, c, ['sold_qty', 'sell_price'],
                window=window, shift=lag_days, df_grouped=df_grouped)
            df_train = dataset.hierarchy_rolling_std(
                df_train, c, ['sold_qty', 'sell_price'],
                window=window, shift=lag_days, df_grouped=df_grouped)

    # lag per id
    df_grouped = df_train.groupby('id', as_index=False)
    df_train = dataset.item_lag(df_train, ['sold_qty'], n_most=2, after=lag_days, df_grouped=df_grouped)

    # exclude rows with nan due to lagged features; a missing target is kept
    # on purpose because it marks the rows to predict
    feature_columns = [col for col in df_train.columns if col != 'sold_qty']
    df_train = df_train.dropna(subset=feature_columns)

    # remove numeric columns with zero variance (except the protected ones)
    col_std = df_train.select_dtypes(include=[np.number]).std()
    cols_to_drop = [col for col in col_std[col_std == 0].index if col not in protected_columns]
    df_train = df_train.drop(columns=cols_to_drop)

    del df_grouped, col_std, cols_to_drop, feature_columns
    gc.collect()

    print(f'   day {df_train.day_ind.min()} to day {df_train.day_ind.max()}')
    print(f'   dataset dim: {df_train.shape}')

    print('Prepare training, validation and test sets...')
    # the model should treat these columns as categorical
    df_train = df_train.astype({c: 'category' for c in categories})

    # rows without a target are the ones to predict; compute the mask once
    is_test = df_train.sold_qty.isna()

    # test set
    test_ids = df_train.loc[is_test, 'id']
    test_day_ind = df_train.loc[is_test, 'day_ind']
    X_test = df_train.loc[is_test].drop(non_feature_columns, axis=1)

    # train set
    y_train = df_train.loc[~is_test, 'sold_qty']
    X_train = df_train.loc[~is_test].drop(non_feature_columns, axis=1)

    del df_train, is_test
    gc.collect()

    print(f'   train set dim: {X_train.shape}')
    print(f'   test set dim: {X_test.shape}')

    print('Train model...')
    # hyper-parameter search for this store's model
    model = models.LGBModel()
    model.optimize(X_train, y_train, n_trials=15, metric='rmse')

    # keep the best loss reported by the search
    loss = model.best_value
    losses[store] = loss

    print('Make predictions...')
    # predictions on test set
    y_hat_test = model.predict(X_test)
    df_predictions_store = pd.DataFrame({'id': test_ids, 'd': test_day_ind, 'sold_qty': y_hat_test})

    # transform to the required format for submissions:
    # one row per id, one column F1..Fn per predicted day (days in ascending order)
    df_predictions_store = df_predictions_store.pivot(index='id', columns='d', values='sold_qty')
    df_predictions_store.columns = [f'F{i}' for i in range(1, df_predictions_store.shape[1] + 1)]
    df_predictions_store = df_predictions_store.reset_index()

    store_predictions.append(df_predictions_store)

    del X_train, y_train, df_predictions_store
    gc.collect()

    print('\n')

# stack the per-store predictions
df_predictions = pd.concat(store_predictions, axis=0)
del store_predictions

# Duplicate the predictions under 'evaluation' ids so the submission also
# contains rows for the evaluation period.
# NOTE(review): the original comment gave "validation period is not known yet" as
# the reason; presumably the evaluation period was meant - confirm.
df_predictions_eval = df_predictions.copy()
# literal (non-regex) replacement, independent of the pandas version default
df_predictions_eval['id'] = df_predictions_eval['id'].str.replace('validation', 'evaluation', regex=False)

df_predictions = pd.concat([df_predictions, df_predictions_eval], axis=0).reset_index(drop=True)
del df_predictions_eval


# save to directory
path = config.get_submission_filename('submit_model_per_store.csv')
df_predictions.to_csv(path, index=False)

df_predictions.head()

Store: CA_1
Load and filter store transactions...
   dataset dim: (344524, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (232022, 62)
Prepare training, validation and test sets...
   train set dim: (146650, 57)
   test set dim: (85372, 57)
Train model...
   metric: rmse
   trial 5 - best running loss: 2.057



Mean of empty slice



   trial 10 - best running loss: 2.057
   best loss: 2.057
Make predictions...


Store: CA_2
Load and filter store transactions...
   dataset dim: (344511, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (231665, 62)
Prepare training, validation and test sets...
   train set dim: (146293, 57)
   test set dim: (85372, 57)
Train model...
   metric: rmse
   trial 5 - best running loss: 1.967



Mean of empty slice



   trial 10 - best running loss: 1.967
   best loss: 1.967
Make predictions...


Store: CA_3
Load and filter store transactions...
   dataset dim: (344511, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (231350, 62)
Prepare training, validation and test sets...
   train set dim: (145978, 57)
   test set dim: (85372, 57)
Train model...
   metric: rmse
   trial 5 - best running loss: 2.757



Mean of empty slice



   best loss: 2.757
Make predictions...


Store: CA_4
Load and filter store transactions...
   dataset dim: (344524, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (232193, 62)
Prepare training, validation and test sets...
   train set dim: (146821, 57)
   test set dim: (85372, 57)
Train model...
   metric: rmse
   trial 5 - best running loss: 1.368



Mean of empty slice



   trial 10 - best running loss: 1.351
   best loss: 1.351
Make predictions...


Store: TX_1
Load and filter store transactions...
   dataset dim: (344537, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (231644, 62)
Prepare training, validation and test sets...
   train set dim: (146272, 57)
   test set dim: (85372, 57)
Train model...
   metric: rmse
   trial 5 - best running loss: 1.684
   trial 10 - best running loss: 1.684
   best loss: 1.672
Make predictions...


Store: TX_2
Load and filter store transactions...
   dataset dim: (344537, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (231431, 62)
Prepare training, validation and test sets...
   train set dim: (146059, 57)
   test set dim: (85372, 57)
Train model...
   metric: rmse
   trial 5 - best running loss: 1.888
   best loss: 1.888
Make predictions...


Store: TX_3
Load and filter store transactions...
   dataset dim: (344537, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (23


Mean of empty slice



   best loss: 1.803
Make predictions...


Store: WI_1
Load and filter store transactions...
   dataset dim: (344524, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (231388, 62)
Prepare training, validation and test sets...
   train set dim: (146016, 57)
   test set dim: (85372, 57)
Train model...
   metric: rmse
   trial 5 - best running loss: 1.694
   best loss: 1.682
Make predictions...


Store: WI_2
Load and filter store transactions...
   dataset dim: (344524, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (231055, 62)
Prepare training, validation and test sets...
   train set dim: (145683, 57)
   test set dim: (85372, 57)
Train model...
   metric: rmse
   trial 5 - best running loss: 2.793



Mean of empty slice



   best loss: 2.771
Make predictions...


Store: WI_3
Load and filter store transactions...
   dataset dim: (344537, 19)
Prepare features...
   day 1859 to day 1941
   dataset dim: (231683, 62)
Prepare training, validation and test sets...
   train set dim: (146311, 57)
   test set dim: (85372, 57)
Train model...
   metric: rmse
   trial 5 - best running loss: 2.015
   trial 10 - best running loss: 2.015
   best loss: 2.015
Make predictions...




Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,1.0765,1.0018,1.0005,0.9787,1.0561,1.1663,1.0721,1.0861,1.0388,0.9384,1.0074,1.0523,1.1285,1.0871,1.0992,0.9828,1.0078,1.0185,1.0746,1.0183,1.114,1.0236,1.0794,0.9674,1.0334,1.038,1.1746,1.0747
1,FOODS_1_002_CA_1_validation,0.4601,0.4062,0.4475,0.4537,0.4411,0.4425,0.4807,0.4729,0.4314,0.4405,0.4727,0.4552,0.4389,0.4795,0.4721,0.4812,0.4505,0.4399,0.4339,0.4748,0.4703,0.4448,0.4541,0.4709,0.4677,0.4539,0.5091,0.4747
2,FOODS_1_003_CA_1_validation,0.7138,0.6914,0.754,0.7408,0.6963,0.7095,0.7929,0.7636,0.6713,0.7257,0.6875,0.7501,0.7512,0.7317,0.7747,0.7493,0.694,0.6575,0.787,0.7124,0.7801,0.7016,0.6889,0.6808,0.7733,0.725,0.8079,0.7145
3,FOODS_1_004_CA_1_validation,0.0017,0.0018,0.0018,0.0021,0.0018,0.002,0.0018,0.0017,0.0019,0.0021,0.0019,0.0015,0.002,0.0017,0.002,0.0018,0.0018,0.0018,0.0016,0.0018,0.0022,0.0019,0.0017,0.002,0.0017,0.0019,0.002,0.0019
4,FOODS_1_005_CA_1_validation,1.247,1.1211,1.1432,1.1612,1.2394,1.3689,1.3381,1.2767,1.2345,1.1829,1.146,1.2158,1.3546,1.4168,1.3223,1.1921,1.2345,1.1846,1.2538,1.3726,1.4373,1.2166,1.1577,1.1703,1.1173,1.2785,1.3436,1.3252
