In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from scripts import m5_common
from tqdm.notebook import tqdm

pd.options.display.max_columns = 50

## Main Variables

In [3]:
path = Path('/kaggle/m5_forecasting/')
assert(path.exists())

In [4]:
h = 28 
max_lags = h * 2 + 1
tr_last = 1913
fday = datetime(2016, 4, 25)

## Load data

In [5]:
%%time

prices, cal = m5_common.prepare_tables(path)

CPU times: user 1.59 s, sys: 197 ms, total: 1.78 s
Wall time: 2.01 s


In [6]:
event_name_1_map, event_type_1_map = m5_common.replace_cal_cols(cal)

In [7]:
cal[(cal.date > '2012-01-01') & (cal.date < '2012-01-05')]

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
338,2012-01-02,11149,1,3,1,2012,d_339,0,0,0,0,1.0,0.0,1.0
339,2012-01-03,11149,5,4,1,2012,d_340,0,0,0,0,1.0,1.0,1.0
340,2012-01-04,11149,6,5,1,2012,d_341,0,0,0,0,1.0,0.0,0.0


In [8]:
uint8_types= ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'month', 'wday', 'weekday', 
              'snap_CA', 'snap_TX', 'snap_WI']
m5_common.convert_uint8(cal, uint8_types)

In [9]:
m5_common.add_days_before(cal)

In [10]:
FIRST_DAY = 1

In [11]:
%%time

df = m5_common.create_dt(cal, prices, is_train=True, first_day=FIRST_DAY, tr_last=tr_last, path=path)

CPU times: user 41.6 s, sys: 5.54 s, total: 47.1 s
Wall time: 47.3 s


In [12]:
def create_fea(dt):
    
    wins = [7, 28]
    lags = [7, 28]
    
    grouped_sales = dt[["id","sales"]].groupby("id")["sales"]
    
    for win in tqdm(wins, total=len(wins)):
        mean_col = f'mean_{win}'
        emean_col = f'e{mean_col}' # exponential mean average
        esmean_col = f'es{mean_col}'
        dt[emean_col] = grouped_sales.transform(lambda x : x.ewm(span=win, adjust=False).mean())
        dt[esmean_col] = grouped_sales.transform(lambda x : x.ewm(alpha=1/win, adjust=False).mean())
        for lag in lags:
            dt[f'emean_{win}_{lag}'] = dt[["id", emean_col]].groupby("id").shift(lag)
            dt[f'esmean_{win}_{lag}'] = dt[["id", esmean_col]].groupby("id").shift(lag)
        del dt[emean_col]
        del dt[esmean_col]
            
    ra = [1, 2]
    for simple_lag in ra:
        dt[f'lag_{simple_lag}'] = dt[["id","sales"]].groupby("id")["sales"].shift(simple_lag)

In [13]:
%%time

create_fea(df)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


CPU times: user 1min 26s, sys: 6.32 s, total: 1min 33s
Wall time: 1min 33s


In [14]:
df.dropna(inplace = True)

In [15]:
%%time

df['sales_positive'] = (df['sales'] > 0).astype('uint8')

CPU times: user 208 ms, sys: 19.7 ms, total: 228 ms
Wall time: 228 ms


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45174237 entries, 342559 to 46025082
Data columns (total 38 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                object        
 1   item_id           int16         
 2   dept_id           int16         
 3   store_id          int16         
 4   cat_id            int16         
 5   state_id          int16         
 6   d                 object        
 7   sales             float16       
 8   date              datetime64[ns]
 9   wm_yr_wk          int16         
 10  weekday           uint8         
 11  wday              uint8         
 12  month             uint8         
 13  year              int16         
 14  event_name_1      uint8         
 15  event_type_1      uint8         
 16  event_name_2      uint8         
 17  event_type_2      uint8         
 18  snap_CA           uint8         
 19  snap_TX           uint8         
 20  snap_WI           uint8         
 21  b

In [17]:
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id', "event_name_1", "event_name_2", "event_type_1",  "event_type_2", 'snap_CA', 'snap_TX', 'snap_WI']
useless_cols = ["id", "date", "sales", "sales_positive", "d", "wm_yr_wk", "weekday", "revenue"]

train_cols = df.columns[~df.columns.isin(useless_cols)]

In [18]:
del df
gc.collect()

114

### Load Models

In [19]:
m_lgb_bin = lgb.Booster(model_file=str(path/"m5_model_binary.lgb"))

In [20]:
m_lgb = lgb.Booster(model_file=str(path/"m5_model.lgb"))

### Prediction

In [44]:
%%time

max_lags = h * 2 + 1
sub = 0.
cols = [f"F{i}" for i in range(1,29)]
te = m5_common.create_dt(cal, prices, False, first_day=FIRST_DAY, path=path)
zero_threshold = 0.1

for tdelta in tqdm(range(0, h), total=h):
    day = fday + timedelta(days=tdelta)
    print(tdelta, day)
    tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
    create_fea(tst)
    tst = tst.loc[tst.date == day, train_cols]
    te.loc[te.date == day, "sales_bin"] = m_lgb_bin.predict(tst)
    te.loc[te.date == day, "sales"] = m_lgb.predict(tst)
    if tdelta == 3:
        break

te["sales_bin_filter"] = (te["sales_bin"] >= zero_threshold).astype('float16')
te["sales_normal"] = te["sales"].copy()
te["sales_trim"] = np.where(te["sales"] > 0.01, te["sales"], 0.0).astype('float16')
te["sales"] = te["sales_bin_filter"] * te["sales"].astype('float16')
        
te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
te_sub.loc[te.date >= fday+ timedelta(days=h), "id"] = te_sub.loc[te.date >= fday+timedelta(days=h), 
                                                                      "id"].str.replace("validation$", "evaluation")
te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
te_sub.fillna(0., inplace = True)
te_sub.sort_values("id", inplace = True)
te_sub.reset_index(drop=True, inplace = True)
sub = te_sub
    

HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

0 2016-04-25 00:00:00


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


1 2016-04-26 00:00:00


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


2 2016-04-27 00:00:00


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


3 2016-04-28 00:00:00


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


CPU times: user 11min 23s, sys: 8.31 s, total: 11min 32s
Wall time: 4min 35s


In [47]:
check_sample = (te[te.date == day][["sales_bin", "sales_bin_filter", 'sales_normal', 'sales_trim', 'sales']])[0:100]

In [48]:
check_sample.head(50)

Unnamed: 0,sales_bin,sales_bin_filter,sales_normal,sales_trim,sales
45978266,0.427366,1.0,0.664062,0.664062,0.664062
45999609,0.57356,1.0,0.991211,0.991211,0.991211
46020952,0.338521,1.0,0.857422,0.857422,0.857422
46042295,0.185979,1.0,0.292969,0.292969,0.292969
46063638,0.257941,1.0,0.273193,0.273193,0.273193
46084981,0.279399,1.0,0.434082,0.434082,0.434082
46106324,0.244696,1.0,0.384277,0.384277,0.384277
46127667,0.254187,1.0,0.399414,0.399414,0.399414
46149010,0.228136,1.0,0.399658,0.399658,0.399658
46170353,0.299074,1.0,0.341309,0.341309,0.341309


In [None]:
plt.plot(np.arange(len(check_sample)), check_sample['sales_bin'])
plt.plot(np.arange(len(check_sample)), check_sample['sales_normal'])

In [None]:
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)

In [None]:
from collections import Counter
c = Counter(sub['F1'])

In [None]:
c[0] / len(sub['F1'])