In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs

warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed Python's `random` module and NumPy's global RNG with the same value.

    :param seed: integer seed applied to both generators (default 0)
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Apply `func` to every element of `t_split` in a process pool.

    The per-element results are concatenated column-wise (axis=1) into
    a single DataFrame.

    :param func: callable applied to each element of `t_split`; it is sent to
                 worker processes via `Pool.map`
    :param t_split: sequence of work items, one call to `func` per item
    :return: DataFrame built from `pd.concat(results, axis=1)`
    """
    # Never start more workers than there are work items.
    # Builtin min keeps the value a plain Python int.
    num_cores = min(N_CORES, len(t_split))
    pool = Pool(num_cores)
    try:
        df = pd.concat(pool.map(func, t_split), axis=1)
    finally:
        # Release the worker processes even when func or concat raises,
        # otherwise a failed run leaves idle workers behind in the kernel.
        pool.close()
        pool.join()
    return df

In [3]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    """Build the feature grid for one store.

    Reads the pickles at BASE, PRICE, CALENDAR, MEAN_ENC and LAGS, keeps only
    the rows whose 'store_id' equals `store`, and joins everything column-wise.

    :param store: value compared against the 'store_id' column
    :return: tuple (df, features) where `features` is every column not listed
             in `remove_features`, and `df` holds the columns
             ['id', 'd', TARGET] + features for rows with d >= START_TRAIN,
             re-indexed from 0
    """
    
    # Read and concat basic features
    # (.iloc[:,2:] drops the first two columns of the PRICE and CALENDAR grids;
    #  presumably key columns already present in BASE - TODO confirm)
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:,2:],
                    pd.read_pickle(CALENDAR).iloc[:,2:]],
                    axis=1)
    
    # Leave only relevant store
    df = df[df['store_id']==store]

    # With memory limits we have to read 
    # lags and mean encoding features
    # separately and drop items that we don't need.
    # As our Features Grids are aligned 
    # we can use index to keep only necessary rows
    # Alignment is good for us as concat uses less memory than merge.
    df2 = pd.read_pickle(MEAN_ENC)[mean_features]
    df2 = df2[df2.index.isin(df.index)]
    
    # First three columns of the lags grid are dropped
    # (presumably id/d/target keys - TODO confirm)
    df3 = pd.read_pickle(LAGS).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]
    
    df = pd.concat([df, df2], axis=1)
    del df2 # to not reach memory limit 
    
    df = pd.concat([df, df3], axis=1)
    del df3 # to not reach memory limit 
    
    # Create features list
    features = [col for col in list(df) if col not in remove_features]
    # Keep the key columns and the target in front of the model features
    df = df[['id','d',TARGET]+features]
    
    # Skipping first n rows
    df = df[df['d']>=START_TRAIN].reset_index(drop=True)
    
    return df, features

# Recombine per-store frames saved after training
def _load_store_frames(prefix):
    """Read `<prefix><store_id>.pkl` for every store and stack the rows.

    Each frame gets a 'store_id' column set to its store. The frames are
    concatenated once at the end, not one by one inside the loop, which
    avoids re-copying all previously loaded rows on every iteration.

    :param prefix: file name prefix, e.g. 'test_' or 'valid_'
    :return: stacked DataFrame with a fresh 0..n-1 index; an empty DataFrame
             when STORES_IDS is empty
    """
    frames = []
    for store_id in STORES_IDS:
        temp_df = pd.read_pickle(prefix+store_id+'.pkl')
        temp_df['store_id'] = store_id
        frames.append(temp_df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop=True)


def get_base_test():
    """Return the stacked 'test_<store_id>.pkl' frames of all stores."""
    return _load_store_frames('test_')
# -------------------------------------
def get_base_valid():
    """Return the stacked 'valid_<store_id>.pkl' frames of all stores.

    NOTE(review): files are read from the working directory, while the
    training cell writes them to MODEL_PATH+'valid_<store_id>.pkl' - confirm
    which location is intended.
    """
    return _load_store_frames('valid_')
# -------------------------------------

########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    """Compute one lag feature of TARGET from the global `base_test`.

    :param LAG_DAY: number of rows to shift TARGET by within each 'id' group
    :return: single-column DataFrame named 'sales_lag_<LAG_DAY>' (float16),
             indexed like `base_test`
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # Shift inside each id group directly on base_test: no intermediate frame
    # is created, so nothing is assigned into a slice of base_test.
    lagged = base_test.groupby('id')[TARGET].shift(LAG_DAY).astype(np.float16)
    return lagged.to_frame(col_name)


def make_lag_roll(LAG_DAY):
    """Compute one shifted rolling-mean feature of TARGET from `base_test`.

    :param LAG_DAY: pair [shift_day, roll_wind]; TARGET is shifted by
                    `shift_day` rows within each 'id' group, then averaged
                    over a rolling window of `roll_wind` rows
    :return: single-column DataFrame named
             'rolling_mean_tmp_<shift_day>_<roll_wind>', indexed like
             `base_test`
    """
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # Work on the grouped target directly: no intermediate frame is created,
    # so nothing is assigned into a slice of base_test.
    rolled = base_test.groupby('id')[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean())
    return rolled.to_frame(col_name)

In [4]:
########################### Model params
#################################################################################
import lightgbm as lgb
# LightGBM hyper-parameters shared by every model trained in this notebook
# (each one is discussed in the notes below)
lgb_params = {
    # boosting and loss
    'boosting_type': 'gbdt',
    'objective': 'tweedie',
    'tweedie_variance_power': 1.1,
    'metric': 'rmse',
    # row bagging
    'subsample': 0.5,
    'subsample_freq': 1,
    # tree shape and learning speed
    'learning_rate': 0.05,
    'num_leaves': 2**11-1,
    'min_data_in_leaf': 2**12-1,
    'feature_fraction': 0.5,
    'max_bin': 100,
    'n_estimators': 1400,
    'boost_from_average': False,
    # runtime
    'verbose': -1,
    'n_jobs':40,
}

# Let's look closer on params

## 'boosting_type': 'gbdt'
# we have 'goss' option for faster training
# but it normally leads to underfit.
# Also there is good 'dart' mode
# but it takes forever to train
# and model performance depends 
# a lot on random factor 
# https://www.kaggle.com/c/home-credit-default-risk/discussion/60921

## 'objective': 'tweedie'
# Tweedie Gradient Boosting for Extremely
# Unbalanced Zero-inflated Data
# https://arxiv.org/pdf/1811.10192.pdf
# and many more articles about Tweedie
#
# Strange (for me) but Tweedie is close in results
# to my own ugly loss.
# My advice here - make OWN LOSS function
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/140564
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/143070
# I think many of you already using it (after poisson kernel appeared) 
# (kagglers are very good with "params" testing and tuning).
# Try to figure out why Tweedie works.
# probably it will show you new features options
# or data transformation (Target transformation?).

## 'tweedie_variance_power': 1.1
# default = 1.5
# set this closer to 2 to shift towards a Gamma distribution
# set this closer to 1 to shift towards a Poisson distribution
# my CV shows 1.1 is optimal 
# but you can make your own choice

## 'metric': 'rmse'
# Doesn't mean anything to us
# as competition metric is different
# and we don't use early stoppings here.
# So rmse serves just for general 
# model performance overview.
# Also we use "fake" validation set
# (as it makes part of the training set)
# so even general rmse score doesn't mean anything))
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133834

## 'subsample': 0.5
# Serves to fight with overfit
# this will randomly select part of data without resampling
# Chosen by CV (my CV can be wrong!)
# Next kernel will be about CV

##'subsample_freq': 1
# frequency for bagging
# default value - seems ok

## 'learning_rate': 0.05
# Chosen by CV
# Smaller - longer training
# and the model may get stuck
# in a "local minimum"
# Bigger - faster training
# but there is a chance to
# not find the "global minimum"

## 'num_leaves': 2**11-1
## 'min_data_in_leaf': 2**12-1
# Force model to use more features
# We need it to reduce "recursive"
# error impact.
# Also it leads to overfit
# that's why we use small 
# 'max_bin': 100

## l1, l2 regularizations
# https://towardsdatascience.com/l1-and-l2-regularization-methods-ce25e7fc831c
# Good tiny explanation
# l2 can work with bigger num_leaves
# but my CV doesn't show boost
                    
## 'n_estimators': 1400
# CV shows that there should be
# different values for each state/store.
# Current value was chosen 
# for general purpose.
# As we don't use any early stopping,
# be careful not to overfit the Public LB.

##'feature_fraction': 0.5
# LightGBM will randomly select 
# part of features on each iteration (tree).
# We have maaaany features
# and many of them are "duplicates"
# and many just "noise"
# good values here - 0.5-0.7 (by CV)

## 'boost_from_average': False
# There is some "problem"
# to code boost_from_average for 
# custom loss
# 'True' makes training faster
# BUT use it carefully
# https://github.com/microsoft/LightGBM/issues/1514
# not our case but good to know cons

In [5]:
########################### Vars
#################################################################################
VER = 1                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic 
lgb_params['seed'] = SEED        # as possible
# psutil.cpu_count() returns None when the count cannot be determined;
# fall back to a single worker so df_parallelize_run still gets an int.
N_CORES = psutil.cpu_count() or 1     # Available CPU cores


# --- Limits and constants ---
TARGET = 'sales'        # name of the target column
START_TRAIN = 0         # rows with d below this value are skipped (Nans/faster training)
END_TRAIN = 1941        # last day index of the train set
START_VALID = 1913
P_HORIZON = 28          # number of days to predict
USE_AUX = False         # True -> use pretrained models

# --- Columns excluded from the model features ---
# They lead to overfit or hold values that are not present in the test set
remove_features = [
    'id', 'state_id', 'store_id',
    'date', 'wm_yr_wk', 'd', TARGET,
]

# --- Mean-encoding columns read from MEAN_ENC ---
mean_features = [
    'enc_cat_id_mean', 'enc_cat_id_std',
    'enc_dept_id_mean', 'enc_dept_id_std',
    'enc_item_id_mean', 'enc_item_id_std',
]

# --- Feature file locations ---
ORIGINAL = 'data/m5-forecasting-accuracy/'
BASE = 'data/m5-simple-fe/grid_part_1.pkl'
PRICE = 'data/m5-simple-fe/grid_part_2.pkl'
CALENDAR = 'data/m5-simple-fe/grid_part_3.pkl'
LAGS = 'data/m5-lags-features/lags_df_28.pkl'
MEAN_ENC = 'data/m5-custom-features/mean_encoding_df.pkl'


# --- Directory holding the AUX (pretrained) models ---
AUX_MODELS = 'data/m5-aux-models/'


#STORES ids
# Only the 'store_id' column is needed, so tell the parser to skip
# every other column of the sales file.
STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_evaluation.csv',
                         usecols=['store_id'])['store_id']
# Unique ids in order of first appearance
STORES_IDS = list(STORES_IDS.unique())


#SPLITS for lags creation
SHIFT_DAY = 28
N_LAGS = 15
# Lag offsets SHIFT_DAY .. SHIFT_DAY+N_LAGS-1
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY+N_LAGS))
# Every [shift, window] pair consumed by make_lag_roll
ROLS_SPLIT = [[shift, window]
              for shift in [1,7,14]
              for window in [7,14,30,60]]

In [6]:
########################### Aux Models
# If you don't want to wait hours and hours
# to have result you can train each store 
# in separate kernel and then just join result.

# If we want to use pretrained models we can 
## skip training 
## (in our case do dummy training
##  to show that we are good with memory
##  and you can safely use this (all kernel) code)
if USE_AUX:
    # Pretrained models are used for prediction, so only a dummy
    # two-round training is run
    lgb_params.update(n_estimators=2)
    
# Here is some 'logs' that can compare
#Train CA_1
#[100]	valid_0's rmse: 2.02289
#[200]	valid_0's rmse: 2.0017
#[300]	valid_0's rmse: 1.99239
#[400]	valid_0's rmse: 1.98471
#[500]	valid_0's rmse: 1.97923
#[600]	valid_0's rmse: 1.97284
#[700]	valid_0's rmse: 1.96763
#[800]	valid_0's rmse: 1.9624
#[900]	valid_0's rmse: 1.95673
#[1000]	valid_0's rmse: 1.95201
#[1100]	valid_0's rmse: 1.9476
#[1200]	valid_0's rmse: 1.9434
#[1300]	valid_0's rmse: 1.9392
#[1400]	valid_0's rmse: 1.93446

#Train CA_2
#[100]	valid_0's rmse: 1.88949
#[200]	valid_0's rmse: 1.84767
#[300]	valid_0's rmse: 1.83653
#[400]	valid_0's rmse: 1.82909
#[500]	valid_0's rmse: 1.82265
#[600]	valid_0's rmse: 1.81725
#[700]	valid_0's rmse: 1.81252
#[800]	valid_0's rmse: 1.80736
#[900]	valid_0's rmse: 1.80242
#[1000]	valid_0's rmse: 1.79821
#[1100]	valid_0's rmse: 1.794
#[1200]	valid_0's rmse: 1.78973
#[1300]	valid_0's rmse: 1.78552
#[1400]	valid_0's rmse: 1.78158

### Train and Valid

In [None]:
MODEL_PATH = 'models/1914_1941_valid_d2d/'
# Create the output directory up front so the first save
# does not fail after a long training run
os.makedirs(MODEL_PATH, exist_ok=True)

# One model per (horizon, store): `day` is the number of rows the target is
# shifted back, and the model is saved with the suffix '_d_<day+1>'.
# NOTE(review): range(1,28) yields suffixes 2..28; no '_d_1' model is
# produced here - confirm it is trained elsewhere.
for day in range(1,28):

    for store_id in STORES_IDS:
        print('Train', store_id)

        # Get grid for current store
        grid_df, features_columns = get_data_by_store(store_id)

        # Direct target: value of TARGET `day` rows ahead for the same item.
        # The last `day` rows of every item have no future value and are dropped.
        grid_df[TARGET] = grid_df.groupby('item_id')[TARGET].shift(-day)
        grid_df = grid_df[grid_df.groupby('item_id').cumcount(ascending=False) > day-1]

        # Target becomes shifted sales multiplied by the row's sell_price
        grid_df[TARGET] = grid_df[TARGET].values * grid_df['sell_price'].values

        # Masks for
        # Train (rows with d <= END_TRAIN-P_HORIZON)
        # Preds (rows with END_TRAIN-P_HORIZON-100 < d <= END_TRAIN,
        #        kept to rebuild recursive features later)
        # NOTE(review): the target of a train row comes from day d+`day`, so
        # rows with d > END_TRAIN-P_HORIZON-day carry targets from the held-out
        # last P_HORIZON days - confirm this overlap is intended.
        train_mask = grid_df['d']<=END_TRAIN-P_HORIZON
        preds_mask = (grid_df['d']<=END_TRAIN)&(grid_df['d']>(END_TRAIN-P_HORIZON-100))

        # Build the lgb dataset
        # https://github.com/Microsoft/LightGBM/issues/1032
        # "To avoid any conversions, you should always use np.float32"
        # or save to bin before start training
        # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
        train_data = lgb.Dataset(grid_df[train_mask][features_columns], 
                           label=grid_df[train_mask][TARGET])

        # Saving part of the dataset for later predictions
        # Removing features that we need to calculate recursively 
        grid_df = grid_df[preds_mask].reset_index(drop=True)
        keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
        grid_df = grid_df[keep_cols]
        # The prediction frame is written once per store, on the first horizon
        if day==1:
            grid_df.to_pickle(MODEL_PATH+'valid_'+store_id+'.pkl')

        del grid_df

        # Launch seeder again to make lgb training 100% deterministic
        # with each "code line" np.random "evolves" 
        # so we need (may want) to "reset" it
        seed_everything(SEED)
        # The training set doubles as the "validation" set, so the logged
        # rmse is a training score only
        estimator = lgb.train(lgb_params,
                              train_data,
                              valid_sets = [train_data],
                              verbose_eval = 100,
                              )

        # Save model - it's not real '.bin' but a pickle file
        # estimator = lgb.Booster(model_file='model.txt')
        # can only predict with the best iteration (or the saving iteration)
        # pickle.dump gives us more flexibility
        # like estimator.predict(TEST, num_iteration=100)
        # num_iteration - number of iteration want to predict with, 
        # NULL or <= 0 means use best iteration
        model_name = MODEL_PATH+'lgb_model_'+store_id+'_v'+str(VER)+ '_valid' +'_d_'+ str(day+1) +'.bin'
        # Context manager closes (and flushes) the file even if dump fails
        with open(model_name, 'wb') as model_file:
            pickle.dump(estimator, model_file)

        # Remove temporary objects to free some ram memory
        del train_data, estimator
        gc.collect()

        # "Keep" models features for predictions
        MODEL_FEATURES = features_columns

Train CA_1
[100]	training's rmse: 7.21419
[200]	training's rmse: 7.05813
[300]	training's rmse: 6.98088
[400]	training's rmse: 6.91319
[500]	training's rmse: 6.85592
[600]	training's rmse: 6.8068
[700]	training's rmse: 6.76109
[800]	training's rmse: 6.71733
[900]	training's rmse: 6.67597
[1000]	training's rmse: 6.63693
[1100]	training's rmse: 6.59963
[1200]	training's rmse: 6.56597
[1300]	training's rmse: 6.53222
[1400]	training's rmse: 6.50006
Train CA_2
[100]	training's rmse: 6.04484
[200]	training's rmse: 5.97659
[300]	training's rmse: 5.93461
[400]	training's rmse: 5.89547
[500]	training's rmse: 5.8614
[600]	training's rmse: 5.8298
[700]	training's rmse: 5.79968
[800]	training's rmse: 5.77202
[900]	training's rmse: 5.74528
[1000]	training's rmse: 5.72043
[1100]	training's rmse: 5.69537
[1200]	training's rmse: 5.67224
[1300]	training's rmse: 5.64945
[1400]	training's rmse: 5.62713
Train CA_3
[100]	training's rmse: 9.18141
[200]	training's rmse: 8.9869
[300]	training's rmse: 8.87428


In [10]:
# Join back the validation dataset with
# a small part of the training data
# to make recursive features
base_test = get_base_valid()
base_test = base_test[base_test['d']<=END_TRAIN]
# Blank out the target for the last P_HORIZON days: these are the days
# that get predicted
index = base_test[base_test['d']>END_TRAIN-P_HORIZON].index
# np.nan is used because the np.NaN alias was removed in NumPy 2.0
base_test.loc[index,'sales']=np.nan

In [18]:
# Hard-coded copy of the model feature list, so prediction cells can run
# without re-running the training cell
features_columns = [
    # item identifiers and release
    'item_id', 'dept_id', 'cat_id', 'release',
    # price features
    'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean',
    'price_norm', 'price_nunique', 'item_nunique',
    'price_momentum', 'price_momentum_m', 'price_momentum_y',
    # calendar events and SNAP flags
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    # date parts
    'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_wm', 'tm_dw', 'tm_w_end',
    # mean encodings
    'enc_cat_id_mean', 'enc_cat_id_std',
    'enc_dept_id_mean', 'enc_dept_id_std',
    'enc_item_id_mean', 'enc_item_id_std',
    # plain lags
    'sales_lag_28', 'sales_lag_29', 'sales_lag_30', 'sales_lag_31',
    'sales_lag_32', 'sales_lag_33', 'sales_lag_34', 'sales_lag_35',
    'sales_lag_36', 'sales_lag_37', 'sales_lag_38', 'sales_lag_39',
    'sales_lag_40', 'sales_lag_41', 'sales_lag_42', 'sales_lag_43',
    'sales_lag_44', 'sales_lag_45', 'sales_lag_46', 'sales_lag_47',
    'sales_lag_48', 'sales_lag_49', 'sales_lag_50', 'sales_lag_51',
    'sales_lag_52', 'sales_lag_53', 'sales_lag_54', 'sales_lag_55',
    # rolling means rebuilt at prediction time by make_lag_roll
    'rolling_mean_tmp_1_7', 'rolling_mean_tmp_1_14',
    'rolling_mean_tmp_1_30', 'rolling_mean_tmp_1_60',
    'rolling_mean_tmp_7_7', 'rolling_mean_tmp_7_14',
    'rolling_mean_tmp_7_30', 'rolling_mean_tmp_7_60',
    'rolling_mean_tmp_14_7', 'rolling_mean_tmp_14_14',
    'rolling_mean_tmp_14_30', 'rolling_mean_tmp_14_60',
]
MODEL_FEATURES = features_columns

In [32]:
########################### Validation
#################################################################################

# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the validation dataset with 
# a small part of the training data 
# to make recursive features
base_test = get_base_valid()
base_test = base_test[base_test['d']<=END_TRAIN]
# Blank out the target for the last P_HORIZON days: these are the days
# that get predicted below
index = base_test[base_test['d']>END_TRAIN-P_HORIZON].index
# np.nan is used because the np.NaN alias was removed in NumPy 2.0
base_test.loc[index,TARGET]=np.nan

# Timer to measure predictions time 
main_time = time.time()

# Loop over each prediction day
# As rolling lags are the most timeconsuming
# we will calculate it for whole day
for PREDICT_DAY in range(1,29):    
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Make temporary grid to calculate rolling lags
    # (make_lag_roll reads the global base_test, which already holds the
    #  predictions written for the previous days)
    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)

    # Rows of the day being predicted; the same for every store, and also
    # needed after the store loop, so it is computed once here
    day_mask = base_test['d']==(END_TRAIN-P_HORIZON+PREDICT_DAY)
        
    for store_id in STORES_IDS:
        
        # Read all our models and make predictions
        # for each day/store pairs
        # NOTE(review): the training cell saves models as
        # MODEL_PATH+'lgb_model_<store>_v<VER>_valid_d_<n>.bin'; this name has
        # no MODEL_PATH prefix and no '_d_<n>' suffix - confirm which files
        # are meant to be loaded here.
        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'_valid'+'.bin' 
        if USE_AUX:
            model_path = AUX_MODELS + model_path
        
        # pickle.load executes code embedded in the file: only load model
        # files you produced yourself. The context manager closes the handle.
        with open(model_path, 'rb') as model_file:
            estimator = pickle.load(model_file)
        
        store_mask = base_test['store_id']==store_id
        
        mask = (day_mask)&(store_mask)
        # Single .loc write on base_test itself. The chained form
        # base_test[TARGET][mask] = ... writes to an intermediate Series and
        # does not update base_test under pandas Copy-on-Write.
        base_test.loc[mask, TARGET] = estimator.predict(grid_df[mask][MODEL_FEATURES])
    
    # Make good column naming and add 
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df

all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
##########  0.52 min round |  0.52 min total |  119098.88 day sales |
Predict | Day: 2
##########  0.53 min round |  1.04 min total |  109788.18 day sales |
Predict | Day: 3
##########  0.51 min round |  1.56 min total |  107688.27 day sales |
Predict | Day: 4
##########  0.54 min round |  2.10 min total |  109148.42 day sales |
Predict | Day: 5
##########  0.51 min round |  2.62 min total |  128925.29 day sales |
Predict | Day: 6
##########  0.49 min round |  3.11 min total |  159062.11 day sales |
Predict | Day: 7
##########  0.55 min round |  3.66 min total |  169049.72 day sales |
Predict | Day: 8
##########  0.54 min round |  4.19 min total |  138665.35 day sales |
Predict | Day: 9
##########  0.52 min round |  4.71 min total |  138257.06 day sales |
Predict | Day: 10
##########  0.53 min round |  5.24 min total |  122230.63 day sales |
Predict | Day: 11
##########  0.54 min round |  5.79 min total |  126526.78 day sales |
Predict | Day: 12
##########  0.52 min ro

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,6.482358,6.236162,6.701166,6.718561,8.779279,9.039212,9.921158,7.263801,7.350158,...,5.814016,9.064327,8.393638,7.707997,6.336895,5.979583,6.636788,7.147444,9.403416,9.710597
1,HOBBIES_1_002_CA_1_evaluation,0.600097,0.562727,0.629739,0.646479,0.831223,1.051731,1.113159,0.999537,1.038880,...,0.903542,1.245095,1.183824,0.725068,0.773615,0.787263,0.949628,1.060926,1.486150,1.485043
2,HOBBIES_1_003_CA_1_evaluation,0.882194,1.040188,0.951171,1.279392,1.603858,2.031354,1.800425,1.095930,1.043535,...,1.623209,1.920087,1.886229,1.217070,1.059234,1.117274,1.028930,1.629121,2.115918,1.845632
3,HOBBIES_1_004_CA_1_evaluation,7.418280,6.072367,6.153833,7.622743,9.053872,13.759812,13.597225,8.913609,6.845602,...,8.397219,12.232644,17.213447,8.217616,6.611014,6.412223,5.932662,9.683865,16.611975,16.378078
4,HOBBIES_1_005_CA_1_evaluation,2.728031,2.209253,2.419763,2.311013,3.089582,4.016515,4.695631,3.034715,2.704604,...,2.717166,4.238723,4.031254,2.952758,2.420250,3.025157,2.897004,3.311736,4.581754,4.303051
5,HOBBIES_1_006_CA_1_evaluation,0.842724,0.705904,0.787184,0.744418,0.753112,1.172069,1.101901,0.886697,0.763938,...,0.677872,0.850651,0.820253,0.671503,0.729616,0.675620,0.697883,0.761035,0.865400,0.838201
6,HOBBIES_1_007_CA_1_evaluation,1.977278,2.243485,2.229883,2.113231,2.761290,2.673881,2.866680,2.985094,2.590095,...,2.326166,3.313283,3.230577,2.488817,2.327516,2.568211,3.350503,2.610588,2.930315,3.056946
7,HOBBIES_1_008_CA_1_evaluation,3.275450,3.767714,4.165330,3.834317,3.544737,3.958627,3.549673,3.246697,3.646549,...,3.306345,4.191860,3.696659,3.725436,3.110286,3.823483,3.972088,3.721662,3.742881,3.383746
8,HOBBIES_1_009_CA_1_evaluation,1.493750,1.763059,1.569225,1.678195,1.677069,2.307041,2.722686,1.569621,1.507794,...,1.483135,1.941993,2.128764,1.904336,1.767301,1.494685,1.767281,1.518336,2.034702,1.588794
9,HOBBIES_1_010_CA_1_evaluation,1.867338,1.366099,1.499768,1.237636,1.878575,2.799118,3.215014,2.041961,1.959713,...,2.611325,2.632894,2.861782,1.885222,1.652087,1.859420,1.842648,2.343339,2.833252,3.158103


In [34]:
# Persist the current all_preds frame (an 'id' column plus one F<day> column
# per predicted day) so it can be reloaded without re-running the loop.
# NOTE(review): all_preds is reassigned by the later Predict cell, so this
# must run right after the Validation cell - confirm cell order.
all_preds.to_pickle('revenue_1914_1941_valid.pkl')

In [21]:
# Scratch check: day index targeted by the validation loop for the current
# PREDICT_DAY. It relies on PREDICT_DAY left over from a previously run loop,
# so it fails on a fresh kernel - consider removing this cell.
END_TRAIN-P_HORIZON+PREDICT_DAY

1914

In [8]:
########################### Predict
#################################################################################

# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the Test dataset with 
# a small part of the training data 
# to make recursive features
base_test = get_base_test()

# Timer to measure predictions time 
main_time = time.time()

# Loop over each prediction day
# As rolling lags are the most timeconsuming
# we will calculate it for whole day
for PREDICT_DAY in range(1,29):    
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Make temporary grid to calculate rolling lags
    # (make_lag_roll reads the global base_test, which already holds the
    #  predictions written for the previous days)
    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)

    # Rows of the day being predicted; the same for every store, and also
    # needed after the store loop, so it is computed once here
    day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)
        
    for store_id in STORES_IDS:
        
        # Read all our models and make predictions
        # for each day/store pairs
        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin' 
        if USE_AUX:
            model_path = AUX_MODELS + model_path
        
        # pickle.load executes code embedded in the file: only load model
        # files you produced yourself. The context manager closes the handle.
        with open(model_path, 'rb') as model_file:
            estimator = pickle.load(model_file)
        
        store_mask = base_test['store_id']==store_id
        
        mask = (day_mask)&(store_mask)
        # Single .loc write on base_test itself. The chained form
        # base_test[TARGET][mask] = ... writes to an intermediate Series and
        # does not update base_test under pandas Copy-on-Write.
        base_test.loc[mask, TARGET] = estimator.predict(grid_df[mask][MODEL_FEATURES])
    
    # Make good column naming and add 
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df
    
all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
##########  0.46 min round |  0.46 min total |  66739.01 day sales |
Predict | Day: 2
##########  0.46 min round |  0.92 min total |  61946.21 day sales |
Predict | Day: 3
##########  0.48 min round |  1.40 min total |  60890.03 day sales |
Predict | Day: 4
##########  0.47 min round |  1.87 min total |  60401.87 day sales |
Predict | Day: 5
##########  0.49 min round |  2.36 min total |  67106.78 day sales |
Predict | Day: 6
##########  0.47 min round |  2.83 min total |  79471.40 day sales |
Predict | Day: 7
##########  0.46 min round |  3.29 min total |  79094.91 day sales |
Predict | Day: 8
##########  0.48 min round |  3.77 min total |  71534.90 day sales |
Predict | Day: 9
##########  0.47 min round |  4.24 min total |  57836.34 day sales |
Predict | Day: 10
##########  0.51 min round |  4.75 min total |  62016.09 day sales |
Predict | Day: 11
##########  0.48 min round |  5.22 min total |  62791.58 day sales |
Predict | Day: 12
##########  0.51 min round |  5.74

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,5.207190,5.138121,4.400951,4.392455,5.132807,5.950619,6.173403,6.544581,4.860880,...,3.669552,4.467991,4.291506,3.639636,3.857000,3.388294,3.653459,4.102362,5.009054,5.159707
1,HOBBIES_1_002_CA_1_evaluation,0.602291,0.494456,0.434571,0.429977,0.543416,0.710714,0.687504,0.486123,0.409288,...,0.655208,0.710007,0.807517,0.550647,0.558158,0.609211,0.698911,0.791228,0.951531,0.931072
2,HOBBIES_1_003_CA_1_evaluation,1.111036,0.991202,0.927428,0.913484,1.398491,1.712292,1.672513,1.216471,1.063799,...,1.488850,1.753310,1.851893,1.104525,1.069695,1.039736,1.173029,1.485221,1.708653,1.675721
3,HOBBIES_1_004_CA_1_evaluation,3.915078,3.069219,3.362771,2.871781,3.777679,4.888392,4.765363,4.520139,2.874497,...,2.912935,3.048991,3.171907,2.474697,1.907877,2.049314,1.902148,2.133284,3.187886,2.582829
4,HOBBIES_1_005_CA_1_evaluation,1.603688,1.442117,1.345851,1.432300,1.358102,1.801673,2.055293,2.037031,1.641743,...,1.543949,1.634892,1.597915,1.323068,1.101467,1.073975,1.175613,1.437309,1.952324,1.612276
5,HOBBIES_1_006_CA_1_evaluation,0.503552,0.445690,0.494055,0.462479,0.521225,0.684403,0.663151,0.565871,0.534596,...,0.583562,0.585030,0.638270,0.531151,0.521827,0.475570,0.525241,0.586263,0.651607,0.616558
6,HOBBIES_1_007_CA_1_evaluation,1.569720,1.512544,1.436524,1.555791,1.444319,2.076529,2.035488,1.817107,1.430348,...,1.743894,2.161983,1.861117,1.445364,1.493865,1.471651,1.316799,1.946341,2.297719,2.368111
7,HOBBIES_1_008_CA_1_evaluation,1.480632,1.433023,1.338429,1.594288,1.647739,1.904967,1.944466,1.910679,1.877995,...,1.493235,1.936831,1.719648,1.711917,1.549502,1.505949,1.737401,1.872393,2.077690,1.819815
8,HOBBIES_1_009_CA_1_evaluation,0.697321,0.923652,0.865182,0.730409,0.775734,0.914825,0.867264,0.740061,0.736995,...,0.848646,0.878559,0.714649,0.833536,0.646239,0.693914,0.688028,0.665993,0.783157,0.815451
9,HOBBIES_1_010_CA_1_evaluation,0.732143,0.823761,0.775209,0.726438,0.904026,1.098475,1.114005,0.948226,0.816818,...,0.841297,0.920543,1.068300,0.842206,0.775225,0.770984,0.814092,1.004487,1.055288,1.228273


In [9]:
########################### Export
#################################################################################
# Take the ids of the competition sample submission, attach our
# predictions to them and write the result as the submission file.
# Ids without a prediction get 0 in every F column via fillna(0)
# (the ids shown in all_preds above end in "_evaluation", so these are
#  presumably the "_validation" rows - TODO confirm).
submission = (
    pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
    .merge(all_preds, on=['id'], how='left')
    .fillna(0)
)
submission.to_csv('submission_v'+str(VER)+'.csv', index=False)

In [10]:
# Summary

# Of course here is no magic at all.
# No "Novel" features and no brilliant ideas.
# We just carefully joined all
# our previous fe work and created a model.

# Also!
# In my opinion this strategy is a "dead end".
# Overfits a lot LB and with 1 final submission 
# you have no option to risk.


# Improvement should come from:
# Loss function
# Data representation
# Stable CV
# Good features reduction strategy
# Predictions stabilization with NN
# Trend prediction
# Real zero sales detection/classification


# Good kernels references 
## (the order is random and the list is not complete):
# https://www.kaggle.com/ragnar123/simple-lgbm-groupkfold-cv
# https://www.kaggle.com/jpmiller/grouping-items-by-stockout-pattern
# https://www.kaggle.com/headsortails/back-to-predict-the-future-interactive-m5-eda
# https://www.kaggle.com/sibmike/m5-out-of-stock-feature
# https://www.kaggle.com/mayer79/m5-forecast-attack-of-the-data-table
# https://www.kaggle.com/yassinealouini/seq2seq
# https://www.kaggle.com/kailex/m5-forecaster-v2
# https://www.kaggle.com/aerdem4/m5-lofo-importance-on-gpu-via-rapids-xgboost


# Features were created in these kernels:
## 
# Mean encodings and PCA options
# https://www.kaggle.com/kyakovlev/m5-custom-features
##
# Lags and rolling lags
# https://www.kaggle.com/kyakovlev/m5-lags-features
##
# Base Grid and base features (calendar/price/etc)
# https://www.kaggle.com/kyakovlev/m5-simple-fe


# Personal request
# Please don't upvote any ensemble and copypaste kernels
## The worst case is ensemble without any analyse.
## The best choice - just ignore it.
## I would like to see more kernels with interesting and original approaches.
## Don't feed copypasters with upvotes.

## It doesn't mean that you should not fork and improve others kernels
## but I would like to see params and code tuning based on some CV and analyse
## and not only on LB probing.
## Small changes could be shared in comments and authors can improve their kernel.

## Feel free to criticize this kernel as my knowledge is very limited
## and I can be wrong in code and descriptions. 
## Thank you.