In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
from pathlib import Path
from tqdm.notebook import tqdm

In [3]:
from fastai.vision import *

In [4]:
pd.options.display.max_columns = 50

## Setup the path

In [5]:
path = Path('/kaggle/m5_forecasting/')
assert(path.exists())

In [6]:
def ls(self):
    return list(self.iterdir())
setattr(Path, 'ls', ls)

In [7]:
path.ls()

[PosixPath('/kaggle/m5_forecasting/sales_train_validation.csv'),
 PosixPath('/kaggle/m5_forecasting/calendar.csv'),
 PosixPath('/kaggle/m5_forecasting/sample_submission.csv'),
 PosixPath('/kaggle/m5_forecasting/sell_prices.csv')]

## Read Data

In [8]:
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }
CAL_DTYPES = {"weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }

In [9]:
def read_data():
    prices = pd.read_csv(path/"sell_prices.csv", dtype = PRICE_DTYPES)
    cal = pd.read_csv(path/"calendar.csv", dtype = CAL_DTYPES)
    return prices, cal

In [10]:
prices, cal = read_data()

### Pre-process calendar

In [11]:
cal.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0.0,0.0,0.0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0.0,0.0,0.0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0.0,0.0,0.0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1.0,1.0,0.0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1.0,0.0,1.0


In [12]:
def create_event_map(field):
    return {v: k for k, v in enumerate(cal[field].unique())}

In [13]:
event_name_1_map = create_event_map('event_name_1')
event_name_2_map = create_event_map('event_name_2')
event_type_1_map = create_event_map('event_type_1')
event_type_2_map = create_event_map('event_type_2')

In [14]:
cal.replace({'event_name_1': event_name_1_map}, inplace=True)
cal.replace({'event_name_2': event_name_2_map}, inplace=True)
cal.replace({'event_type_1': event_type_1_map}, inplace=True)
cal.replace({'event_type_2': event_type_2_map}, inplace=True)

In [15]:
cal["date"] = pd.to_datetime(cal["date"], infer_datetime_format=True)

In [16]:
def numericalize(df, type_map):
    for col, col_dtype in type_map.items():
        if col_dtype == "category":
            df[col] = df[col].cat.codes.astype('int16')
            df[col] -= df[col].min()

In [17]:
numericalize(prices, PRICE_DTYPES)
numericalize(cal, CAL_DTYPES)

In [18]:
cal

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,2,1,1,2011,d_1,0,0,0,0,0.0,0.0,0.0
1,2011-01-30,11101,3,2,1,2011,d_2,0,0,0,0,0.0,0.0,0.0
2,2011-01-31,11101,1,3,1,2011,d_3,0,0,0,0,0.0,0.0,0.0
3,2011-02-01,11101,5,4,2,2011,d_4,0,0,0,0,1.0,1.0,0.0
4,2011-02-02,11101,6,5,2,2011,d_5,0,0,0,0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,6,5,6,2016,d_1965,0,0,0,0,0.0,1.0,1.0
1965,2016-06-16,11620,4,6,6,2016,d_1966,0,0,0,0,0.0,0.0,0.0
1966,2016-06-17,11620,0,7,6,2016,d_1967,0,0,0,0,0.0,0.0,0.0
1967,2016-06-18,11621,2,1,6,2016,d_1968,0,0,0,0,0.0,0.0,0.0


In [19]:
prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,0,0,11325,9.58
1,0,0,11326,9.58
2,0,0,11327,8.26
3,0,0,11328,8.26
4,0,0,11329,8.26


In [20]:
pred_days = 28
max_lags = pred_days * 2 + 1
print('max_lags', max_lags)
num_cols = [c for c in pd.read_csv(path/"sales_train_validation.csv", nrows=2).columns if c.find('d_') == 0]
tr_last = len(num_cols)
catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
# For more training data use a lower value
FIRST_DAY=900

max_lags 57


In [21]:
def read_dt(is_train = True, first_day = 1200):
    start_day = max(1 if is_train else tr_last - max_lags - 365, first_day)
    print('start_day', start_day)
    dtype = {num: 'float32' for num in num_cols}
    dtype.update({cat: 'category' for cat in catcols if cat != 'id'})
    print('tr_last', tr_last + 1)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    dt = pd.read_csv(path/"sales_train_validation.csv", usecols = catcols + numcols, dtype=dtype)
    for col in catcols:
        if col != 'id':
            dt[col] = dt[col].cat.codes.astype('int16')
            dt[col] -= dt[col].min()
    if not is_train:
        for day in range(tr_last + 1, tr_last + 1 + pred_days):
            dt[f'd_{day}'] = np.nan
            
    dt = dt.melt(id_vars=catcols, value_vars=[col for col in dt.columns if col.startswith("d_")], var_name='d', value_name='sales')
    dt = dt.merge(cal, on='d', copy=False)
    print('dt.date.max()', dt.date.max())
    dt = dt.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], copy=False)
    return dt

In [22]:
%%time

dt = read_dt(first_day=FIRST_DAY)

start_day 900
tr_last 1914
dt.date.max() 2016-04-24 00:00:00
CPU times: user 9.09 s, sys: 1.56 s, total: 10.7 s
Wall time: 10.8 s


In [23]:
dt

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_900,0.0,2013-07-16,11325,5,4,7,2013,0,0,0,0,0.0,0.0,0.0,9.58
1,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_901,0.0,2013-07-17,11325,6,5,7,2013,0,0,0,0,0.0,0.0,0.0,9.58
2,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_902,1.0,2013-07-18,11325,4,6,7,2013,0,0,0,0,0.0,0.0,0.0,9.58
3,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_903,0.0,2013-07-19,11325,0,7,7,2013,0,0,0,0,0.0,0.0,0.0,9.58
4,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_900,0.0,2013-07-16,11325,5,4,7,2013,0,0,0,0,0.0,0.0,0.0,3.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29115881,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,3.98
29115882,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1912,1.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.28
29115883,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1913,3.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,1.28
29115884,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1912,0.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.00


In [24]:
attr = ['Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'days_in_month']
date_features = {
    "wday": "weekday",
    "week": "weekofyear",
    "month": "month",
    "quarter": "quarter",
    "year": "year",
    "mday": "day"
}

for f in attr:
    date_features[f.lower()] = f.lower()
    
date_features

{'wday': 'weekday',
 'week': 'weekofyear',
 'month': 'month',
 'quarter': 'quarter',
 'year': 'year',
 'mday': 'day',
 'dayofyear': 'dayofyear',
 'is_month_end': 'is_month_end',
 'is_month_start': 'is_month_start',
 'is_quarter_end': 'is_quarter_end',
 'is_quarter_start': 'is_quarter_start',
 'is_year_end': 'is_year_end',
 'is_year_start': 'is_year_start',
 'days_in_month': 'days_in_month'}

In [25]:
def prepare_date_cols(dt):
    for date_feature_name, date_feature_func in date_features.items():
        if date_feature_name in dt.columns:
            dt[date_feature_name] = dt[date_feature_name].astype('int16')
        else:
            dt[date_feature_name] = getattr(dt['date'].dt, date_feature_func).astype('int16')

In [26]:
%%time
prepare_date_cols(dt)

CPU times: user 17.2 s, sys: 246 ms, total: 17.4 s
Wall time: 17.4 s


In [27]:
dt.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,week,quarter,mday,dayofyear,is_month_end,is_month_start,is_quarter_end,is_quarter_start,is_year_end,is_year_start,days_in_month
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_900,0.0,2013-07-16,11325,5,4,7,2013,0,0,0,0,0.0,0.0,0.0,9.58,29,3,16,197,0,0,0,0,0,0,31
1,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_901,0.0,2013-07-17,11325,6,5,7,2013,0,0,0,0,0.0,0.0,0.0,9.58,29,3,17,198,0,0,0,0,0,0,31
2,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_902,1.0,2013-07-18,11325,4,6,7,2013,0,0,0,0,0.0,0.0,0.0,9.58,29,3,18,199,0,0,0,0,0,0,31
3,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_903,0.0,2013-07-19,11325,0,7,7,2013,0,0,0,0,0.0,0.0,0.0,9.58,29,3,19,200,0,0,0,0,0,0,31
4,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_900,0.0,2013-07-16,11325,5,4,7,2013,0,0,0,0,0.0,0.0,0.0,3.97,29,3,16,197,0,0,0,0,0,0,31


In [28]:
def get_elapsed(dt, event_name='Christmas', col='event_name_1', event_map=event_name_1_map, before=False):
    first_order = (not before)
    dt.sort_values(['store_id', 'date'], ascending=[False, first_order], inplace=True)
    day1 = np.timedelta64(1, 'D')
    last_date = np.datetime64()
    last_store = 0
    res = []
    event = event_map[event_name]
    for s,v,d in zip(dt.store_id.values, dt[col].values, dt.date.values):
        if s != last_store:
            last_date = np.datetime64()
            last_store = s
        if v == event:
            last_date = d
        elapsed = ((d-last_date).astype('timedelta64[D]') / day1)
        res.append(elapsed)
    field_name = f"{'before' if before else 'after'}_{event_name.lower().replace(' ', '_')}"
    dt[field_name] = res

In [29]:
event_name_1_map

{nan: 0,
 'SuperBowl': 1,
 'ValentinesDay': 2,
 'PresidentsDay': 3,
 'LentStart': 4,
 'LentWeek2': 5,
 'StPatricksDay': 6,
 'Purim End': 7,
 'OrthodoxEaster': 8,
 'Pesach End': 9,
 'Cinco De Mayo': 10,
 "Mother's day": 11,
 'MemorialDay': 12,
 'NBAFinalsStart': 13,
 'NBAFinalsEnd': 14,
 "Father's day": 15,
 'IndependenceDay': 16,
 'Ramadan starts': 17,
 'Eid al-Fitr': 18,
 'LaborDay': 19,
 'ColumbusDay': 20,
 'Halloween': 21,
 'EidAlAdha': 22,
 'VeteransDay': 23,
 'Thanksgiving': 24,
 'Christmas': 25,
 'Chanukah End': 26,
 'NewYear': 27,
 'OrthodoxChristmas': 28,
 'MartinLutherKingDay': 29,
 'Easter': 30}

In [30]:
event_name_1_events = ['OrthodoxEaster', 'Easter', "Mother's day", 'Christmas', 'Ramadan starts', 'StPatricksDay']
def process_elapsed(dt):
    for event_name in tqdm(event_name_1_events, total=len(event_name_1_events)):
        print('event name', event_name)
        get_elapsed(dt, event_name, 'event_name_1', event_name_1_map, False)
#         get_elapsed(dt, event_name, 'event_name_1', event_name_1_map, True)

In [31]:
%%time
process_elapsed(dt)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

event name OrthodoxEaster
event name Easter
event name Mother's day
event name Christmas
event name Ramadan starts
event name StPatricksDay

CPU times: user 14min 1s, sys: 9.73 s, total: 14min 11s
Wall time: 14min 11s


In [32]:
# check
dt[(dt.id == 'HOBBIES_1_001_WI_3_validation') & ((dt.date == '2014-12-23') | (dt.date == '2014-12-24') | (dt.date == '2014-12-25') | (dt.date == '2014-12-26'))][['date', 'after_christmas', 'after_easter', 'after_ramadan_starts']]

Unnamed: 0,date,after_christmas,after_easter,after_ramadan_starts
14411385,2014-12-23,363.0,247.0,177.0
14411386,2014-12-24,364.0,248.0,178.0
14411387,2014-12-25,0.0,249.0,179.0
14411388,2014-12-26,1.0,250.0,180.0


## Create features

In [33]:
def create_features(dt):
    lags = [7, 28]
    lag_cols = [f'sales_lag_{lag}' for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[['id', 'sales']].groupby('id')['sales'].shift(lag)
        
    for win in lags:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f'rmean_{lag}_{win}'] = dt[['id', lag_col]].groupby('id')[lag_col].transform(lambda x : x.rolling(win).mean())

In [34]:
%%time
create_features(dt)

CPU times: user 1min 17s, sys: 3.9 s, total: 1min 21s
Wall time: 1min 21s


In [35]:
dt[dt.event_name_1 == event_name_1_map['Christmas']]

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,week,quarter,mday,dayofyear,is_month_end,is_month_start,is_quarter_end,is_quarter_start,is_year_end,is_year_start,days_in_month,after_orthodoxeaster,after_easter,after_mother's_day,after_christmas,after_ramadan_starts,after_stpatricksday,sales_lag_7,sales_lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28
4168734,HOBBIES_1_001_WI_3_validation,0,0,9,0,2,d_1062,0.0,2013-12-25,11348,6,5,12,2013,25,3,0,0,0.0,0.0,0.0,8.26,52,4,25,359,0,0,0,0,0,0,31,,,,0.0,,,0.0,0.0,0.142857,0.142857,0.250000,0.214286
4168741,HOBBIES_1_002_WI_3_validation,1,0,9,0,2,d_1062,0.0,2013-12-25,11348,6,5,12,2013,25,3,0,0,0.0,0.0,0.0,3.97,52,4,25,359,0,0,0,0,0,0,31,,,,0.0,,,0.0,0.0,0.285714,0.285714,0.321429,0.178571
4168748,HOBBIES_1_004_WI_3_validation,3,0,9,0,2,d_1062,0.0,2013-12-25,11348,6,5,12,2013,25,3,0,0,0.0,0.0,0.0,4.64,52,4,25,359,0,0,0,0,0,0,31,,,,0.0,,,2.0,2.0,4.571429,2.857143,3.750000,3.500000
4168755,HOBBIES_1_005_WI_3_validation,4,0,9,0,2,d_1062,0.0,2013-12-25,11348,6,5,12,2013,25,3,0,0,0.0,0.0,0.0,3.08,52,4,25,359,0,0,0,0,0,0,31,,,,0.0,,,0.0,2.0,0.714286,0.714286,0.392857,0.607143
4168762,HOBBIES_1_006_WI_3_validation,5,0,9,0,2,d_1062,0.0,2013-12-25,11348,6,5,12,2013,25,3,0,0,0.0,0.0,0.0,1.00,52,4,25,359,0,0,0,0,0,0,31,,,,0.0,,,0.0,1.0,0.428571,1.000000,0.571429,0.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25235229,FOODS_3_823_CA_1_validation,3044,6,0,2,0,d_1792,0.0,2015-12-25,11547,0,7,12,2015,25,3,0,0,0.0,0.0,0.0,2.50,52,4,25,359,0,0,0,0,0,0,31,257.0,264.0,229.0,0.0,190.0,283.0,6.0,0.0,4.285714,1.142857,2.750000,1.214286
25235236,FOODS_3_824_CA_1_validation,3045,6,0,2,0,d_1792,0.0,2015-12-25,11547,0,7,12,2015,25,3,0,0,0.0,0.0,0.0,2.68,52,4,25,359,0,0,0,0,0,0,31,257.0,264.0,229.0,0.0,190.0,283.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
25235243,FOODS_3_825_CA_1_validation,3046,6,0,2,0,d_1792,0.0,2015-12-25,11547,0,7,12,2015,25,3,0,0,0.0,0.0,0.0,3.98,52,4,25,359,0,0,0,0,0,0,31,257.0,264.0,229.0,0.0,190.0,283.0,0.0,0.0,0.857143,0.714286,1.000000,0.607143
25235250,FOODS_3_826_CA_1_validation,3047,6,0,2,0,d_1792,0.0,2015-12-25,11547,0,7,12,2015,25,3,0,0,0.0,0.0,0.0,1.28,52,4,25,359,0,0,0,0,0,0,31,257.0,264.0,229.0,0.0,190.0,283.0,2.0,0.0,0.285714,0.142857,0.535714,0.607143


In [36]:
dt.dropna(inplace = True)

## Training Preparation

In [38]:
dt.columns

Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'week', 'quarter',
       'mday', 'dayofyear', 'is_month_end', 'is_month_start', 'is_quarter_end',
       'is_quarter_start', 'is_year_end', 'is_year_start', 'days_in_month',
       'after_orthodoxeaster', 'after_easter', 'after_mother's_day',
       'after_christmas', 'after_ramadan_starts', 'after_stpatricksday',
       'sales_lag_7', 'sales_lag_28', 'rmean_7_7', 'rmean_28_7', 'rmean_7_28',
       'rmean_28_28'],
      dtype='object')

In [39]:
cat_feats = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'event_name_1', 
             'event_type_1', 'event_name_2', 'event_type_2']
ignore_cols = ['id', 'date', 'sales', 'd', 'wm_yr_wk', 'weekday']
train_cols = [c for c in dt.columns if c not in ignore_cols]
X = dt[train_cols]
Y = dt['sales']

In [40]:
valid_size = int(X.shape[0] * 0.1)
np.random.seed(777)

valid_idx = np.random.choice(X.index.values, valid_size, replace=False)
train_idx = np.setdiff1d(X.index.values, valid_idx)
assert valid_idx.size + train_idx.size == X.shape[0]

In [41]:
train_data = lgb.Dataset(X.loc[train_idx], Y.loc[train_idx], categorical_feature=cat_feats, free_raw_data=False)
valid_data = lgb.Dataset(X.loc[valid_idx], Y.loc[valid_idx], categorical_feature=cat_feats, free_raw_data=False)

In [42]:
del dt, X, Y, valid_idx, train_idx
gc.collect()

40

## Training

In [43]:
lgb_params = {
        "objective" : "poisson",
        "force_row_wise" : True,
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        "metric": ["rmse"],
        'verbosity': 1,
        'num_leaves': 128,
        "min_data_in_leaf": 100,
}

In [44]:
%%time
m_lgb = lgb.train(lgb_params, train_data, valid_sets=[valid_data], verbose_eval=20, categorical_feature=cat_feats, num_boost_round=800)

[20]	valid_0's rmse: 2.51046
[40]	valid_0's rmse: 2.22514
[60]	valid_0's rmse: 2.15019
[80]	valid_0's rmse: 2.12566
[100]	valid_0's rmse: 2.11343
[120]	valid_0's rmse: 2.10676
[140]	valid_0's rmse: 2.10001
[160]	valid_0's rmse: 2.09266
[180]	valid_0's rmse: 2.08685
[200]	valid_0's rmse: 2.08176
[220]	valid_0's rmse: 2.07637
[240]	valid_0's rmse: 2.07286
[260]	valid_0's rmse: 2.06991
[280]	valid_0's rmse: 2.068
[300]	valid_0's rmse: 2.06554
[320]	valid_0's rmse: 2.06276
[340]	valid_0's rmse: 2.06083
[360]	valid_0's rmse: 2.05888
[380]	valid_0's rmse: 2.05788
[400]	valid_0's rmse: 2.05561
[420]	valid_0's rmse: 2.05369
[440]	valid_0's rmse: 2.05179
[460]	valid_0's rmse: 2.04972
[480]	valid_0's rmse: 2.04816
[500]	valid_0's rmse: 2.04634
[520]	valid_0's rmse: 2.04511
[540]	valid_0's rmse: 2.04385
[560]	valid_0's rmse: 2.0419
[580]	valid_0's rmse: 2.04066
[600]	valid_0's rmse: 2.0386
[620]	valid_0's rmse: 2.03675
[640]	valid_0's rmse: 2.03527
[660]	valid_0's rmse: 2.03437
[680]	valid_0's rm

In [45]:
m_lgb.save_model('m5_model.lgb')

<lightgbm.basic.Booster at 0x7fe33c41d790>

## Prediction

In [46]:
alphas = [1.028, 1.023, 1.018]
weights = [1 / len(alphas)] * len(alphas)
assert sum(weights) == 1.0
fday = datetime(2016, 4, 25) 
assert datetime(2011, 1, 29) + timedelta(days=1914 - 1) == fday

In [47]:
cols = [f'F{i}' for i in range(1, pred_days + 1)]
sub = pd.DataFrame()
te = read_dt(False)
prepare_date_cols(te)
process_elapsed(te)

start_day 1491
tr_last 1914
dt.date.max() 2016-05-22 00:00:00


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

event name OrthodoxEaster
event name Easter
event name Mother's day
event name Christmas
event name Ramadan starts
event name StPatricksDay



In [52]:
# check
te[(te.id == 'HOBBIES_1_001_WI_3_validation') & ((te.date == '2016-03-23') | (te.date == '2016-03-24') | (te.date == '2016-03-25') | (te.date == '2016-03-26'))][
    ['date', *[c for c in te.columns if c.find('after') > -1]]]

Unnamed: 0,date,after_orthodoxeaster,after_easter,after_mother's_day,after_christmas,after_ramadan_starts,after_stpatricksday
11900421,2016-03-23,346.0,353.0,318.0,89.0,279.0,6.0
11900422,2016-03-24,347.0,354.0,319.0,90.0,280.0,7.0
11900423,2016-03-25,348.0,355.0,320.0,91.0,281.0,8.0
12113847,2016-03-26,349.0,356.0,321.0,92.0,282.0,9.0


In [53]:
for icount, (alpha, weight) in tqdm(enumerate(zip(alphas, weights)), total=len(alphas)):
    for tdelta in tqdm(range(0, pred_days), total=pred_days):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_features(tst)
        tst = tst.loc[tst.date == day, train_cols]
        te.loc[te.date == day, 'sales'] = alpha * m_lgb.predict(tst) # magic multiplier by kyakovlev
    
    te_sub = te.loc[te.date >= fday, ['id', 'sales']].copy()
    te_sub['F'] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()['sales'][cols]
    te_sub.fillna(0., inplace=True)
    te_sub.sort_values(["id"], inplace=True)
    te_sub.reset_index(drop=False, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    
    if icount == 0:
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols] * weight

sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
# For now evaluation is just a copy.
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00



HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00



HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00




In [54]:
# check
te[(te.id == 'HOBBIES_1_001_WI_3_validation') & ((te.date == '2016-04-23') | (te.date == '2016-04-24') | (te.date == '2016-04-25') | (te.date == '2016-04-26'))][
    ['date', *[c for c in te.columns if c.find('after') > -1]]]

Unnamed: 0,date,after_orthodoxeaster,after_easter,after_mother's_day,after_christmas,after_ramadan_starts,after_stpatricksday
12967567,2016-04-23,377.0,27.0,349.0,120.0,310.0,37.0
12967568,2016-04-24,378.0,28.0,350.0,121.0,311.0,38.0
12967569,2016-04-25,379.0,29.0,351.0,122.0,312.0,39.0
12967570,2016-04-26,380.0,30.0,352.0,123.0,313.0,40.0


In [55]:
!wc -l submission.csv

60981 submission.csv


In [56]:
pd.read_csv('submission.csv')

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.775202,0.848794,0.821733,0.821344,1.090897,1.324935,0.999181,0.802272,0.806371,0.760746,0.807450,1.071683,1.387498,1.050068,0.928031,0.829340,0.902753,0.891602,1.077443,1.373861,1.232467,0.962229,0.853324,0.846029,0.856369,1.108729,1.314884,1.238077
1,FOODS_1_001_CA_2_validation,0.853842,0.787821,0.756162,0.764045,1.042874,1.300756,0.720310,0.728634,0.805825,0.800217,0.943854,1.200724,1.470794,1.253515,1.037489,0.978464,1.028597,1.025973,1.373515,1.688156,1.114765,0.720307,0.650085,0.673670,0.667583,0.846487,1.060070,1.371352
2,FOODS_1_001_CA_3_validation,0.886629,0.788446,0.675055,0.727228,0.949402,1.365992,1.096846,0.853654,0.952291,0.811122,0.955537,1.127401,1.740919,1.293123,0.990683,0.866365,0.885589,0.907072,1.171310,1.712114,1.208442,0.771421,0.683687,0.684595,0.716272,0.966643,1.379446,1.105581
3,FOODS_1_001_CA_4_validation,0.468857,0.393133,0.392706,0.398710,0.409569,0.450557,0.378635,0.320641,0.349186,0.335023,0.388424,0.355183,0.409825,0.386341,0.338822,0.321455,0.362593,0.367042,0.378491,0.477793,0.461959,0.355555,0.343355,0.339745,0.347317,0.397867,0.453962,0.411749
4,FOODS_1_001_TX_1_validation,0.226179,0.224970,0.207032,0.210898,0.205555,0.216792,0.150654,0.413263,0.407514,0.325608,0.371212,0.378443,0.371111,0.324840,0.368400,0.349298,0.369299,0.368226,0.323784,0.347383,0.342996,0.275538,0.251540,0.257273,0.256858,0.278107,0.300794,0.298798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,HOUSEHOLD_2_516_TX_2_evaluation,0.270415,0.248563,0.263781,0.258258,0.297192,0.411834,0.413057,0.273893,0.262345,0.255026,0.264549,0.312838,0.375828,0.440756,0.275911,0.267873,0.263807,0.269734,0.314828,0.403410,0.396112,0.279224,0.262234,0.268852,0.268526,0.322555,0.408116,0.375001
60976,HOUSEHOLD_2_516_TX_3_evaluation,0.143618,0.134992,0.142240,0.136873,0.152033,0.165185,0.169118,0.136166,0.121155,0.109327,0.120010,0.135094,0.176927,0.190635,0.129794,0.120818,0.118888,0.124898,0.143171,0.178432,0.169383,0.132075,0.130713,0.131308,0.140294,0.159964,0.194897,0.187715
60977,HOUSEHOLD_2_516_WI_1_evaluation,0.066522,0.064754,0.064147,0.070807,0.092724,0.106821,0.115619,0.086268,0.078205,0.074771,0.079861,0.110362,0.127231,0.128386,0.084407,0.077681,0.076780,0.081093,0.105695,0.124411,0.118370,0.082884,0.077382,0.080376,0.087695,0.118240,0.130889,0.124225
60978,HOUSEHOLD_2_516_WI_2_evaluation,0.056646,0.058090,0.057545,0.071970,0.077147,0.087268,0.102801,0.080907,0.077922,0.072480,0.078837,0.092366,0.098367,0.111058,0.081696,0.077847,0.075848,0.080245,0.096994,0.103116,0.097564,0.084067,0.081929,0.082037,0.079837,0.102320,0.108928,0.103309
