In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
from pathlib import Path
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
from fastai.vision import *

In [4]:
pd.options.display.max_columns = 50

## Set up the path

In [5]:
# Root folder that holds the competition CSVs and the saved model files.
path = Path('/kaggle/m5_forecasting/')
assert path.exists()

In [6]:
def ls(self):
    """Return the immediate children of this directory as a list of paths."""
    return [child for child in self.iterdir()]

# Expose the helper as a method on every Path instance.
Path.ls = ls

In [7]:
path.ls()

[PosixPath('/kaggle/m5_forecasting/sales_train_validation.csv'),
 PosixPath('/kaggle/m5_forecasting/m5_model_0.lgb'),
 PosixPath('/kaggle/m5_forecasting/m5_model_1.lgb'),
 PosixPath('/kaggle/m5_forecasting/m5_model_3.lgb'),
 PosixPath('/kaggle/m5_forecasting/walmartTrends0.csv'),
 PosixPath('/kaggle/m5_forecasting/m5_model_2.lgb'),
 PosixPath('/kaggle/m5_forecasting/m5_model.lgb'),
 PosixPath('/kaggle/m5_forecasting/calendar.csv'),
 PosixPath('/kaggle/m5_forecasting/sample_submission.csv'),
 PosixPath('/kaggle/m5_forecasting/m5_model_4.lgb'),
 PosixPath('/kaggle/m5_forecasting/sell_prices.csv')]

## Read Data

In [8]:
# Column dtypes passed to pd.read_csv for sell_prices.csv and calendar.csv.
# Columns typed "category" are later turned into integer codes by numericalize().
# event_name_1 is not listed in CAL_DTYPES; it is encoded separately through
# event_name_1_map.
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }
CAL_DTYPES = {"event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }

In [9]:
sales_train_validation = pd.read_csv(path/"sales_train_validation.csv", nrows=10)

In [10]:
def read_data():
    """Load the price, calendar and trends tables found under ``path``.

    Returns:
        tuple: ``(prices, cal, walmart_trends)`` DataFrames, read from
        ``sell_prices.csv`` (typed with ``PRICE_DTYPES``), ``calendar.csv``
        (typed with ``CAL_DTYPES``) and ``walmartTrends0.csv`` respectively.
    """
    return (
        pd.read_csv(path/"sell_prices.csv", dtype=PRICE_DTYPES),
        pd.read_csv(path/"calendar.csv", dtype=CAL_DTYPES),
        pd.read_csv(path/"walmartTrends0.csv"),
    )

In [11]:
prices, cal, walmart_trends = read_data()

In [12]:
cal.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0.0,0.0,0.0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0.0,0.0,0.0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0.0,0.0,0.0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1.0,1.0,0.0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1.0,0.0,1.0


#### Pre-process calendar

In [13]:
# The calendar stores ISO dates (e.g. 2011-01-29), so parse them with an explicit
# format rather than the `infer_datetime_format` flag, which newer pandas deprecates.
cal["date"] = pd.to_datetime(cal["date"], format="%Y-%m-%d")

In [14]:
cal['date'].min(), cal['date'].max()

(Timestamp('2011-01-29 00:00:00'), Timestamp('2016-06-19 00:00:00'))

In [15]:
cal.head(40)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0.0,0.0,0.0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0.0,0.0,0.0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0.0,0.0,0.0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1.0,1.0,0.0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1.0,0.0,1.0
5,2011-02-03,11101,Thursday,6,2,2011,d_6,,,,,1.0,1.0,1.0
6,2011-02-04,11101,Friday,7,2,2011,d_7,,,,,1.0,0.0,0.0
7,2011-02-05,11102,Saturday,1,2,2011,d_8,,,,,1.0,1.0,1.0
8,2011-02-06,11102,Sunday,2,2,2011,d_9,SuperBowl,Sporting,,,1.0,1.0,1.0
9,2011-02-07,11102,Monday,3,2,2011,d_10,,,,,1.0,1.0,0.0


In [16]:
def create_event_map(field):
    """Number the distinct values of ``cal[field]`` in order of first appearance.

    Reads the module-level ``cal`` frame.

    Args:
        field: name of the calendar column to enumerate.

    Returns:
        dict: value -> integer code (0, 1, 2, ...), following the order in
        which ``Series.unique()`` yields the values.
    """
    distinct_values = cal[field].unique()
    return dict(zip(distinct_values, range(len(distinct_values))))

In [17]:
# Build a value -> integer code table for event_name_1 and overwrite the column
# with those codes in place. NaN ("no event") appears first in the calendar, so
# it receives code 0.
event_name_1_map = create_event_map('event_name_1')
cal.replace({'event_name_1': event_name_1_map}, inplace=True)

In [18]:
event_name_1_map

{nan: 0,
 'SuperBowl': 1,
 'ValentinesDay': 2,
 'PresidentsDay': 3,
 'LentStart': 4,
 'LentWeek2': 5,
 'StPatricksDay': 6,
 'Purim End': 7,
 'OrthodoxEaster': 8,
 'Pesach End': 9,
 'Cinco De Mayo': 10,
 "Mother's day": 11,
 'MemorialDay': 12,
 'NBAFinalsStart': 13,
 'NBAFinalsEnd': 14,
 "Father's day": 15,
 'IndependenceDay': 16,
 'Ramadan starts': 17,
 'Eid al-Fitr': 18,
 'LaborDay': 19,
 'ColumbusDay': 20,
 'Halloween': 21,
 'EidAlAdha': 22,
 'VeteransDay': 23,
 'Thanksgiving': 24,
 'Christmas': 25,
 'Chanukah End': 26,
 'NewYear': 27,
 'OrthodoxChristmas': 28,
 'MartinLutherKingDay': 29,
 'Easter': 30}

In [19]:
def get_elapsed(dt, event_name='Christmas', col='event_name_1', event_map=event_name_1_map, before=False):
    """Add a column with the signed day distance to the nearest occurrence of an event.

    ``dt`` is sorted by ``date`` in place (ascending when ``before`` is False,
    descending otherwise) and then walked row by row, remembering the date of
    the most recently visited row whose ``col`` equals the event's code.

    Args:
        dt: frame with a ``date`` column and the encoded event column ``col``.
        event_name: key of ``event_map`` identifying the event.
        col: column holding the encoded event values.
        event_map: event name -> code mapping used to encode ``col``.
        before: if True, measure against the next occurrence (values <= 0)
            instead of the previous one (values >= 0).

    The result is stored in ``after_<event>`` / ``before_<event>`` (event name
    lower-cased, spaces replaced by underscores). Rows visited before the first
    occurrence are measured against NaT -- presumably yielding NaN; verify.
    """
    dt.sort_values(['date'], ascending=[(not before)], inplace=True)
    one_day = np.timedelta64(1, 'D')
    anchor = np.datetime64()  # NaT until the event is first encountered
    target_code = event_map[event_name]
    elapsed_days = []
    for code, current in zip(dt[col].values, dt.date.values):
        anchor = current if code == target_code else anchor
        elapsed_days.append((current - anchor).astype('timedelta64[D]') / one_day)
    prefix = 'before' if before else 'after'
    suffix = event_name.lower().replace(' ', '_')
    dt[f"{prefix}_{suffix}"] = elapsed_days

In [20]:
# get_elapsed(cal, 'Christmas', 'event_name_1', event_name_1_map, False)
# get_elapsed(cal, 'Easter', 'event_name_1', event_name_1_map, False)

In [21]:
def numericalize(df, type_map):
    """Replace categorical columns of ``df`` with non-negative int16 codes, in place.

    Args:
        df: frame to modify.
        type_map: column name -> dtype string (as passed to ``read_csv``); only
            entries equal to ``"category"`` are processed.

    Category codes are shifted so the smallest code is 0: missing values
    (code -1) become 0 and real categories start at 1; a column without
    missing values keeps codes starting at 0.

    Columns that are no longer of categorical dtype are skipped, so running the
    function (or its notebook cell) a second time is a no-op instead of raising
    ``AttributeError`` on the missing ``.cat`` accessor.
    """
    for col, col_dtype in type_map.items():
        if col_dtype != "category":
            continue
        if df[col].dtype.name != "category":
            # Already encoded by an earlier call.
            continue
        codes = df[col].cat.codes.astype('int16')
        df[col] = codes - codes.min()

In [22]:
numericalize(prices, PRICE_DTYPES)
numericalize(cal, CAL_DTYPES)

In [23]:
prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,0,0,11325,9.58
1,0,0,11326,9.58
2,0,0,11327,8.26
3,0,0,11328,8.26
4,0,0,11329,8.26


In [24]:
pred_days = 28
max_lags = pred_days * 2 + 1
print('max_lags', max_lags)
# The daily sales columns are the header names that begin with "d_".
header = pd.read_csv(path/"sales_train_validation.csv", nrows=2).columns
num_cols = [name for name in header if name.startswith('d_')]
# Number of daily columns, i.e. the index of the last day present in the file.
tr_last = len(num_cols)
catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
# For more training data use a lower value
FIRST_DAY=350

max_lags 57


In [25]:
def read_dt(is_train = True, nrows = None, first_day = 1200):
    """Load sales in long format, joined with the calendar and price tables.

    Args:
        is_train: when False, only the last ``max_lags`` days are kept (unless
            ``first_day`` is later) and ``pred_days`` empty future day columns
            are appended so forecasts can be written into them.
        nrows: optional row limit forwarded to ``pd.read_csv``.
        first_day: earliest ``d_<n>`` column to load.

    Returns:
        DataFrame with one row per (series, day): the ``catcols`` columns
        (all but ``id`` encoded as int16 codes), ``d``, ``sales``, and the
        columns of ``cal`` and ``prices``. Both joins are inner joins, so
        series/weeks without a price row are dropped.
    """
    earliest_allowed = 1 if is_train else tr_last - max_lags
    start_day = max(earliest_allowed, first_day)
    print('start_day', start_day)

    encoded_cols = [c for c in catcols if c != 'id']
    day_cols = [f"d_{day}" for day in range(start_day, tr_last + 1)]
    dtype = {
        **{num: 'float32' for num in num_cols},
        **{cat: 'category' for cat in encoded_cols},
    }
    dt = pd.read_csv(path/"sales_train_validation.csv", nrows=nrows, usecols = catcols + day_cols, dtype=dtype)

    # Swap each categorical column for zero-based int16 codes.
    for col in encoded_cols:
        codes = dt[col].cat.codes.astype('int16')
        dt[col] = codes - codes.min()

    if not is_train:
        # Placeholder columns for the days to be forecast.
        for day in range(tr_last + 1, tr_last + 1 + pred_days):
            dt[f'd_{day}'] = np.nan

    sales_cols = [col for col in dt.columns if col.startswith("d_")]
    long_sales = dt.melt(id_vars=catcols, value_vars=sales_cols, var_name='d', value_name='sales')
    with_calendar = long_sales.merge(cal, on='d', copy=False)
    return with_calendar.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], copy=False)

In [26]:
%%time

dt = read_dt(first_day=FIRST_DAY)

start_day 350
CPU times: user 13.3 s, sys: 2.4 s, total: 15.7 s
Wall time: 15.9 s


In [27]:
dt.shape

(40718219, 22)

In [28]:
dt

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.97
1,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.34
2,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.48
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.50
4,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40718214,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,3.98
40718215,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1912,1.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.28
40718216,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1913,3.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,1.28
40718217,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1912,0.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.00


In [29]:
dt.date.min(), dt.date.max()

(Timestamp('2012-01-13 00:00:00'), Timestamp('2016-04-24 00:00:00'))

## Create features

In [30]:
def create_features(dt, lags=(7, 28), windows=None):
    """Add per-series lag and rolling-mean sales features to ``dt`` in place.

    For every ``lag`` a ``sales_lag_{lag}`` column holds the ``sales`` value
    ``lag`` rows earlier within the same ``id``. For every window/lag pair a
    ``rmean_{lag}_{window}`` column holds the mean of the last ``window``
    values of ``sales_lag_{lag}`` within the same ``id``.

    Shifts and windows are counted in rows, not dates: the rows of each ``id``
    must already be in chronological order with one row per day.

    Args:
        dt: frame with at least ``id`` and ``sales`` columns.
        lags: row offsets to generate lag columns for (default ``(7, 28)``).
        windows: rolling window lengths; defaults to the same values as
            ``lags``, which reproduces the original fixed feature set.

    Returns:
        None. Rows without enough history get NaN in the new columns.
    """
    lags = list(lags)
    windows = lags if windows is None else list(windows)
    lag_cols = [f'sales_lag_{lag}' for lag in lags]

    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[['id', 'sales']].groupby('id')['sales'].shift(lag)

    # Window is the outer loop so the column order matches the original layout
    # (rmean_7_7, rmean_28_7, rmean_7_28, rmean_28_28 for the defaults).
    for win in windows:
        for lag, lag_col in zip(lags, lag_cols):
            grouped = dt[['id', lag_col]].groupby('id')[lag_col]
            dt[f'rmean_{lag}_{win}'] = grouped.transform(lambda x : x.rolling(win).mean())

In [31]:
%%time
create_features(dt)

CPU times: user 1min 28s, sys: 5.88 s, total: 1min 34s
Wall time: 1min 34s


In [32]:
attr = ['Dayofyear','Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
# attr = []

# Output column name -> attribute of the `.dt` accessor that supplies it.
date_features = dict(
    wday="weekday",
    week="weekofyear",
    month="month",
    quarter="quarter",
    year="year",
    mday="day",
)
# The extra attributes use their lower-cased name for both column and accessor.
date_features.update({name.lower(): name.lower() for name in attr})

date_features

{'wday': 'weekday',
 'week': 'weekofyear',
 'month': 'month',
 'quarter': 'quarter',
 'year': 'year',
 'mday': 'day',
 'dayofyear': 'dayofyear',
 'is_month_end': 'is_month_end',
 'is_month_start': 'is_month_start',
 'is_quarter_end': 'is_quarter_end',
 'is_quarter_start': 'is_quarter_start',
 'is_year_end': 'is_year_end',
 'is_year_start': 'is_year_start'}

In [33]:
def prepare_date_cols(dt):
    """Make sure every feature in ``date_features`` exists in ``dt`` as int16, in place.

    A feature whose column is already present is only cast to int16. A missing
    one is derived from ``dt['date']`` through the ``.dt`` attribute named in
    ``date_features``.

    The ISO week number is taken from ``.dt.isocalendar().week`` when that
    accessor exists, because ``.dt.week`` / ``.dt.weekofyear`` were deprecated
    and later removed from pandas; older pandas versions still go through the
    named attribute.

    Args:
        dt: frame with a datetime ``date`` column (only needed when at least
            one feature has to be derived).
    """
    for date_feature_name, date_feature_func in date_features.items():
        if date_feature_name in dt.columns:
            values = dt[date_feature_name]
        else:
            accessor = dt['date'].dt
            if date_feature_func in ('week', 'weekofyear') and hasattr(accessor, 'isocalendar'):
                values = accessor.isocalendar().week
            else:
                values = getattr(accessor, date_feature_func)
        dt[date_feature_name] = values.astype('int16')

In [34]:
%%time
prepare_date_cols(dt)

CPU times: user 22 s, sys: 300 ms, total: 22.3 s
Wall time: 22.3 s


In [35]:
# Drop every row that has a NaN in any column, in place. This removes the
# earliest rows of each series, whose lag / rolling features lack enough history.
dt.dropna(inplace = True)

In [36]:
dt

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,sales_lag_7,sales_lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday,dayofyear,is_month_end,is_month_start,is_quarter_end,is_quarter_start,is_year_end,is_year_start
869062,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_405,0.0,2012-03-08,11206,4,6,3,2012,7,3,0,0,1.0,0.0,1.0,3.97,0.0,0.0,0.000000,0.142857,0.214286,0.214286,10,1,8,68,0,0,0,0,0,0
869063,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_406,0.0,2012-03-09,11206,0,7,3,2012,0,0,0,0,1.0,1.0,1.0,3.97,0.0,1.0,0.000000,0.142857,0.178571,0.250000,10,1,9,69,0,0,0,0,0,0
869069,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_405,1.0,2012-03-08,11206,4,6,3,2012,7,3,0,0,1.0,0.0,1.0,4.34,3.0,2.0,1.857143,1.142857,1.392857,1.678571,10,1,8,68,0,0,0,0,0,0
869070,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_406,2.0,2012-03-09,11206,0,7,3,2012,0,0,0,0,1.0,1.0,1.0,4.34,1.0,0.0,2.000000,1.000000,1.392857,1.607143,10,1,9,69,0,0,0,0,0,0
869076,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_405,0.0,2012-03-08,11206,4,6,3,2012,7,3,0,0,1.0,0.0,1.0,2.98,0.0,0.0,0.000000,1.857143,0.750000,0.642857,10,1,8,68,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40718214,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,3.98,0.0,1.0,1.000000,0.714286,0.928571,1.250000,16,2,24,115,0,0,0,0,0,0
40718215,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1912,1.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.28,0.0,2.0,0.857143,1.142857,1.035714,1.107143,16,2,23,114,0,0,0,0,0,0
40718216,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1913,3.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,1.28,1.0,4.0,0.714286,1.571429,1.035714,1.250000,16,2,24,115,0,0,0,0,0,0
40718217,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1912,0.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.00,0.0,0.0,0.000000,2.285714,1.821429,1.785714,16,2,23,114,0,0,0,0,0,0


In [37]:
dt[dt.date == datetime(2014, 1, 1)].head(30)

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,sales_lag_7,sales_lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday,dayofyear,is_month_end,is_month_start,is_quarter_end,is_quarter_start,is_year_end,is_year_start
15789428,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1069,1.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,8.26,0.0,1.0,0.714286,1.0,0.857143,0.714286,1,1,1,1,0,1,0,1,0,1
15789435,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,3.97,0.0,0.0,0.285714,0.0,0.357143,0.071429,1,1,1,1,0,1,0,1,0,1
15789442,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_1069,2.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,4.64,0.0,1.0,1.857143,3.142857,2.464286,2.107143,1,1,1,1,0,1,0,1,0,1
15789449,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_1069,1.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,3.08,0.0,1.0,1.428571,1.0,1.071429,0.75,1,1,1,1,0,1,0,1,0,1
15789456,HOBBIES_1_006_CA_1_validation,5,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,1.0,0.0,1.0,0.285714,0.714286,0.607143,1.035714,1,1,1,1,0,1,0,1,0,1
15789463,HOBBIES_1_007_CA_1_validation,6,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,7.88,0.0,0.0,0.714286,0.142857,0.392857,0.392857,1,1,1,1,0,1,0,1,0,1
15789470,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_1069,3.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,0.46,0.0,11.0,5.428571,5.714286,6.214286,8.25,1,1,1,1,0,1,0,1,0,1
15789477,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,1.77,0.0,1.0,1.571429,1.285714,1.428571,1.464286,1,1,1,1,0,1,0,1,0,1
15789484,HOBBIES_1_010_CA_1_validation,9,0,0,0,0,d_1069,3.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,2.97,0.0,2.0,0.571429,1.142857,0.714286,0.785714,1,1,1,1,0,1,0,1,0,1
15789491,HOBBIES_1_011_CA_1_validation,10,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,27,2,0,0,1.0,1.0,0.0,3.48,0.0,0.0,0.0,0.0,0.107143,0.107143,1,1,1,1,0,1,0,1,0,1


## Training

In [38]:
dt.columns

Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'sales_lag_7',
       'sales_lag_28', 'rmean_7_7', 'rmean_28_7', 'rmean_7_28', 'rmean_28_28',
       'week', 'quarter', 'mday', 'dayofyear', 'is_month_end',
       'is_month_start', 'is_quarter_end', 'is_quarter_start', 'is_year_end',
       'is_year_start'],
      dtype='object')

In [39]:
# Columns handed to LightGBM as categorical features.
cat_feats = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'event_name_1', 
             'event_type_1', 'event_name_2', 'event_type_2']
# Columns that are not model inputs (the target `sales` is among them).
ignore_cols = ['id', 'date', 'sales', 'd', 'wm_yr_wk', 'weekday']
# Keep the frame's column order for everything that is not ignored.
train_cols = dt.columns[~dt.columns.isin(ignore_cols)].tolist()
X, Y = dt[train_cols], dt['sales']
num_splits = 5

In [40]:
del dt
gc.collect()

80

In [41]:
# LightGBM training parameters, passed unchanged to lgb.train().
# - objective "poisson": sales are non-negative counts.
# - "sub_row" with "bagging_freq" 1: presumably row subsampling (0.75) redone on
#   every iteration; "sub_row" looks like an alias of bagging_fraction -- confirm
#   against the LightGBM parameter reference.
# - metric "rmse" is the value printed in the training log.
lgb_params = {
        "objective" : "poisson",
        "learning_rate" : 0.08,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
        "nthread" : 12,
        "metric": ["rmse"],
        'verbosity': 1,
        'num_leaves': 128,
        "min_data_in_leaf": 100,
}

In [None]:
%%time

# Train one LightGBM model per fold and collect them in `models`.
# A fixed random_state keeps fold membership, and therefore every trained model,
# reproducible across kernel restarts.
# NOTE(review): shuffled K-fold places days from the same period in both the
# training and validation parts, so the validation RMSE is probably optimistic
# as an estimate of true forecasting error -- confirm this is intended.
kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)
models = []
for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f'Fold: {fold+1}')
    x_train, x_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[valid_idx]
    train_set = lgb.Dataset(x_train, y_train, categorical_feature=cat_feats)
    val_set = lgb.Dataset(x_val, y_val, categorical_feature=cat_feats)
    models.append(lgb.train(lgb_params, train_set, valid_sets=[train_set, val_set], verbose_eval=20, 
                             categorical_feature=cat_feats, num_boost_round=360))

Fold: 1
[20]	training's rmse: 2.94778	valid_1's rmse: 2.93606
[40]	training's rmse: 2.54996	valid_1's rmse: 2.56817
[60]	training's rmse: 2.45532	valid_1's rmse: 2.48175
[80]	training's rmse: 2.43033	valid_1's rmse: 2.45897
[100]	training's rmse: 2.41864	valid_1's rmse: 2.4483
[120]	training's rmse: 2.40495	valid_1's rmse: 2.43617
[140]	training's rmse: 2.39534	valid_1's rmse: 2.42727
[160]	training's rmse: 2.38476	valid_1's rmse: 2.41835
[180]	training's rmse: 2.3735	valid_1's rmse: 2.40929
[200]	training's rmse: 2.36389	valid_1's rmse: 2.40079
[220]	training's rmse: 2.35674	valid_1's rmse: 2.39568
[240]	training's rmse: 2.35002	valid_1's rmse: 2.39071
[260]	training's rmse: 2.34326	valid_1's rmse: 2.38568
[280]	training's rmse: 2.33592	valid_1's rmse: 2.37993
[300]	training's rmse: 2.32911	valid_1's rmse: 2.37498
[320]	training's rmse: 2.32213	valid_1's rmse: 2.37006
[340]	training's rmse: 2.31721	valid_1's rmse: 2.36663
[360]	training's rmse: 2.31113	valid_1's rmse: 2.36264
Fold: 2


In [None]:
# del X, Y, valid_idx, train_idx
# gc.collect()

In [None]:
# Persist each fold's booster under `path` as m5_model_<fold index>.lgb
for fold_no in range(len(models)):
    models[fold_no].save_model(str(path/f'm5_model_{fold_no}.lgb'))

## Prediction

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Multipliers applied to the raw predictions; one full forecast pass is made per
# alpha and the passes are blended with equal weights.
alphas = [1.028, 1.023, 1.018]
weights = [1 / len(alphas)] * len(alphas)
# Compare with a tolerance: an exact `== 1.0` check fails for some alpha counts
# purely through floating-point rounding (e.g. ten weights of 0.1).
assert np.isclose(sum(weights), 1.0)
# First day to forecast; it must be day 1914 counted from 2011-01-29 (day 1).
fday = datetime(2016, 4, 25)
assert datetime(2011, 1, 29) + timedelta(days=1914 - 1) == fday

In [None]:
def run_predictions(models, df):
    """Average the predictions of several models for the rows of ``df``.

    Args:
        models: iterable of objects exposing ``predict(df)`` that returns
            ``len(df)`` values.
        df: feature rows to score.

    Returns:
        numpy array of shape ``(len(df), 1)`` holding the arithmetic mean of
        the per-model predictions.
    """
    n_rows = len(df)
    total = np.zeros((n_rows, 1))
    print(total.shape)
    for model in models:
        total += model.predict(df).reshape([n_rows, 1])
    return total / len(models)

In [None]:
# Submission columns F1..F<pred_days>, one per forecast day.
cols = [f'F{i}' for i in range(1, pred_days + 1)]
sub = pd.DataFrame()
te = read_dt(False)

for icount, (alpha, weight) in tqdm(enumerate(zip(alphas, weights)), total=len(alphas)):
    # Forecast recursively, one day at a time: each day's prediction is written
    # back into `te` so later days can use it in their lag features. Every
    # forecast day is overwritten in order and the features only look back at
    # least 7 days, so each alpha pass only ever reads its own earlier output.
    for tdelta in tqdm(range(0, pred_days), total=pred_days):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # Only the trailing max_lags days are needed to build this day's features.
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_features(tst)
        prepare_date_cols(tst)
        tst = tst.loc[tst.date == day, train_cols]
        prediction = run_predictions(models, tst)
        print('prediction.shape', prediction.shape)
        # Flatten (n, 1) -> (n,) so the assignment targets the single column unambiguously.
        te.loc[te.date == day, 'sales'] = alpha * prediction.ravel() # magic multiplier by kyakovlev
    
    # Pivot the forecast days to one row per id with columns F1..F<pred_days>.
    te_sub = te.loc[te.date >= fday, ['id', 'sales']].copy()
    te_sub['F'] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()['sales'][cols]
    te_sub.fillna(0., inplace=True)
    te_sub.sort_values(["id"], inplace=True)
    te_sub.reset_index(drop=False, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    
    # Accumulate the weighted blend across alpha passes.
    if icount == 0:
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols] * weight

# Duplicate the forecast under the "_evaluation" ids. The pattern is a regular
# expression (`$` anchors the suffix), so regex=True must be explicit: newer
# pandas treats the pattern as a literal string by default, which would leave
# the ids unchanged.
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)

In [None]:
!wc -l submission.csv

In [None]:
pd.read_csv('submission.csv')