In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import gc
from collections import Counter
from datetime import datetime, timedelta
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np, pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

from scripts import m5_common

# Show up to 50 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 50

## Main Variables

In [3]:
# Root directory of the M5 data; the trained model file is also written here.
path = Path('/kaggle/m5_forecasting/')
# Fail fast, with a readable message, when the data directory is missing.
assert path.is_dir(), f"M5 data directory not found: {path}"

In [4]:
# Forecast-window settings shared by the cells below.
# h: presumably the forecast horizon in days (only used to derive max_lags here).
# tr_last: forwarded to m5_common.create_dt when building the training frame.
h, tr_last = 28, 1913
# Days of history kept before the scored day when building the prediction slice.
max_lags = 2 * h + 1
# Base date of the prediction check.
fday = datetime(year=2016, month=4, day=25)

## Load data

In [5]:
%%time

# Load the price and calendar tables through the project helper.
# NOTE(review): presumably read from files under `path` — confirm in scripts/m5_common.
prices, cal = m5_common.prepare_tables(path)

CPU times: user 1.41 s, sys: 156 ms, total: 1.56 s
Wall time: 1.56 s


In [6]:
# Let the project helper rewrite calendar columns and keep the two mappings it returns.
# NOTE(review): presumably the event columns are recoded to integers (they display as 0
# in the next cell) and the maps translate original values to codes — confirm in m5_common.
event_name_1_map, event_type_1_map = m5_common.replace_cal_cols(cal)

In [7]:
# Peek at the calendar rows dated strictly between 2012-01-01 and 2012-01-05.
cal[(cal.date > '2012-01-01') & (cal.date < '2012-01-05')]

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
338,2012-01-02,11149,1,3,1,2012,d_339,0,0,0,0,1.0,0.0,1.0
339,2012-01-03,11149,5,4,1,2012,d_340,0,0,0,0,1.0,1.0,1.0
340,2012-01-04,11149,6,5,1,2012,d_341,0,0,0,0,1.0,0.0,0.0


In [8]:
# Calendar columns handed to the project's uint8 conversion helper.
uint8_types = [
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'month', 'wday', 'weekday',
    'snap_CA', 'snap_TX', 'snap_WI',
]
m5_common.convert_uint8(cal, uint8_types)

In [9]:
# Run the project's "days before" helper on the calendar.
# NOTE(review): the return value is unused, so the helper presumably mutates `cal`
# in place; the columns it adds are defined in m5_common — not visible here.
m5_common.add_days_before(cal)

In [10]:
# Value passed as `first_day` to m5_common.create_dt for both the training
# frame and the prediction frame.
FIRST_DAY = 1

In [11]:
%%time

# Build the training frame from the calendar and price tables (is_train=True).
# NOTE(review): presumably one row per (id, day) from FIRST_DAY up to tr_last — confirm in m5_common.create_dt.
df = m5_common.create_dt(cal, prices, is_train=True, first_day=FIRST_DAY, tr_last=tr_last, path=path)

CPU times: user 41.9 s, sys: 5.7 s, total: 47.6 s
Wall time: 47.6 s


In [12]:
def create_fea(dt, wins=(7, 28), lags=(7, 28), simple_lags=(1, 2)):
    """Add lagged sales features to ``dt`` in place.

    For every window ``win`` in ``wins`` two exponentially weighted means of
    ``sales`` are computed per ``id`` (both with ``adjust=False``): one using
    ``span=win`` and one using ``alpha=1/win``.  Each of them is then shifted
    by every ``lag`` in ``lags`` within the same ``id`` and stored as
    ``emean_{win}_{lag}`` and ``esmean_{win}_{lag}`` respectively.  Finally
    plain per-``id`` lags of ``sales`` are stored as ``lag_{n}`` for every
    ``n`` in ``simple_lags``.

    Rows are processed in their existing order, so ``dt`` is expected to be
    sorted chronologically within each ``id``; this is not checked here.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with at least the columns ``id`` and ``sales``.  New feature
        columns are appended to it; nothing is returned.
    wins : iterable of int, optional
        Window sizes for the exponentially weighted means.
    lags : iterable of int, optional
        Shifts applied to every exponentially weighted mean.
    simple_lags : iterable of int, optional
        Shifts applied directly to ``sales``.

    The defaults reproduce the previously hard-coded configuration.
    """
    wins = list(wins)
    lags = list(lags)

    # Group by the id column directly: avoids copying ["id", "sales"] and lets
    # the same key be reused for every shift below.
    ids = dt["id"]
    grouped_sales = dt["sales"].groupby(ids)

    for win in tqdm(wins, total=len(wins)):
        # Keep the unshifted means as standalone Series instead of writing
        # temporary columns into (and deleting them from) the large frame.
        span_mean = grouped_sales.transform(lambda x: x.ewm(span=win, adjust=False).mean())
        alpha_mean = grouped_sales.transform(lambda x: x.ewm(alpha=1 / win, adjust=False).mean())

        # Build each groupby once per window rather than once per lag.
        span_by_id = span_mean.groupby(ids)
        alpha_by_id = alpha_mean.groupby(ids)
        for lag in lags:
            # Column insertion order matches the original implementation, so
            # downstream column selections keep the same order.
            dt[f'emean_{win}_{lag}'] = span_by_id.shift(lag)
            dt[f'esmean_{win}_{lag}'] = alpha_by_id.shift(lag)

    for simple_lag in simple_lags:
        dt[f'lag_{simple_lag}'] = grouped_sales.shift(simple_lag)

In [13]:
%%time

# Add the EWM and lag feature columns to the training frame (create_fea writes into df).
create_fea(df)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


CPU times: user 1min 30s, sys: 6.78 s, total: 1min 37s
Wall time: 1min 37s


In [14]:
# Drop every row with a NaN in any column. This removes the leading rows of each
# id whose shifted features are undefined, and any row with a NaN elsewhere.
df.dropna(inplace = True)

In [15]:
%%time

# Binary target: 1 when the row has any sales, 0 otherwise.
df['sales_positive'] = df['sales'].gt(0).astype('uint8')

CPU times: user 225 ms, sys: 8.12 ms, total: 233 ms
Wall time: 233 ms


In [16]:
# Inspect the size and column dtypes of the final training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45174237 entries, 342559 to 46025082
Data columns (total 38 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                object        
 1   item_id           int16         
 2   dept_id           int16         
 3   store_id          int16         
 4   cat_id            int16         
 5   state_id          int16         
 6   d                 object        
 7   sales             float16       
 8   date              datetime64[ns]
 9   wm_yr_wk          int16         
 10  weekday           uint8         
 11  wday              uint8         
 12  month             uint8         
 13  year              int16         
 14  event_name_1      uint8         
 15  event_type_1      uint8         
 16  event_name_2      uint8         
 17  event_type_2      uint8         
 18  snap_CA           uint8         
 19  snap_TX           uint8         
 20  snap_WI           uint8         
 21  b

In [17]:
# Fix NumPy's global RNG so the validation sample below is reproducible.
np.random.seed(777)

# Columns LightGBM is told to treat as categorical.
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
    'snap_CA', 'snap_TX', 'snap_WI',
]
# Columns excluded from the feature matrix (identifiers, dates, targets, ...).
useless_cols = ["id", "date", "sales", "sales_positive", "d", "wm_yr_wk", "weekday", "revenue"]

# Features are every remaining column; the target is the binary sales flag.
is_feature = ~df.columns.isin(useless_cols)
train_cols = df.columns[is_feature]
X_train = df[train_cols]
y_train = df["sales_positive"]

# Hold out a random sample of row labels for validation; all other labels are
# used for training.
size_valid_set = 2_000_000
all_inds = X_train.index.values
fake_valid_inds = np.random.choice(all_inds, size_valid_set, replace = False)
train_inds = np.setdiff1d(all_inds, fake_valid_inds)

In [18]:
# Release the full frame now that X_train / y_train hold what training needs,
# then force a garbage-collection pass.
del df
gc.collect()

26

In [19]:
# Exponent from which both tree-size parameters below are derived.
leave_size = 11

# LightGBM settings for the binary "any sales?" classifier.
params = {
    'objective': 'binary',
    'metric': ['binary_logloss', 'binary_error'],
    # Passed to the model and also reused as the `verbose` argument of fit()
    # in the training cell (metrics are logged every 20 rounds there).
    'verbosity': 20,
    "n_estimators": 1000,
    'learning_rate': 0.035,
    'num_leaves': 2**leave_size-1,  # 2047
    "min_data_in_leaf": 2**(leave_size + 1)-1  # 4095
}

In [20]:
# Instantiate the scikit-learn style LightGBM classifier from `params`.
m_lgb = lgb.LGBMClassifier(**params)

In [21]:
%%time

# Select the training rows by index label.
X = X_train.loc[train_inds]
y = y_train.loc[train_inds]

# Fit while reporting both metrics on the training rows and on the held-out
# random sample; early stopping is set to 100 rounds.
# NOTE(review): `verbose` and `early_stopping_rounds` as fit() arguments belong
# to the LightGBM 3.x API; newer releases expect callbacks instead — confirm
# the pinned LightGBM version before re-running.
m_lgb.fit(X=X, y=y, 
          eval_set=[(X, y), (X_train.loc[fake_valid_inds], y_train.loc[fake_valid_inds])],
          eval_names=['train sales', 'valid sales'], 
          eval_metric=params['metric'],
          verbose=params['verbosity'],
          early_stopping_rounds=100,
          categorical_feature=cat_feats)

New categorical_feature is ['cat_id', 'dept_id', 'event_name_1', 'event_name_2', 'event_type_1', 'event_type_2', 'item_id', 'snap_CA', 'snap_TX', 'snap_WI', 'state_id', 'store_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds
[20]	train sales's binary_logloss: 0.518209	train sales's binary_error: 0.225768	valid sales's binary_logloss: 0.518904	valid sales's binary_error: 0.226559
[40]	train sales's binary_logloss: 0.469378	train sales's binary_error: 0.217699	valid sales's binary_logloss: 0.470835	valid sales's binary_error: 0.218516
[60]	train sales's binary_logloss: 0.45058	train sales's binary_error: 0.215895	valid sales's binary_logloss: 0.45293	valid sales's binary_error: 0.217164
[80]	train sales's binary_logloss: 0.442315	train sales's binary_error: 0.214769	valid sales's binary_logloss: 0.445659	valid sales's binary_error: 0.216463
[100]	train sales's binary_logloss: 0.438115	train sales's binary_error: 0.213892	valid sales's binary_logloss: 0.442454	valid sales's binary_error: 0.215969
[120]	train sales's binary_logloss: 0.435474	train sales's binary_error: 0.213038	valid sales's binary_logloss: 0.440837	valid sales's binary_error: 0.215

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.035, max_depth=-1,
               metric=['binary_logloss', 'binary_error'], min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=4095,
               min_split_gain=0.0, n_estimators=1000, n_jobs=-1,
               num_leaves=2047, objective='binary', random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0, verbosity=20)

In [22]:
# Persist the fitted booster inside the data directory.
m_lgb.booster_.save_model(str(path / "m5_model_binary.lgb"))

<lightgbm.basic.Booster at 0x7fa8f8ab5990>

In [23]:
# Reload the saved model file as a raw Booster; its predictions are thresholded
# at 0.5 in the prediction check below.
m_lgb_bin = lgb.Booster(model_file=str(path/"m5_model_binary.lgb"), params=params)

### Check prediction

In [24]:
# Build the frame used for prediction (is_train=False).
te = m5_common.create_dt(cal, prices, False, first_day=FIRST_DAY, path=path)

# Day to score: fday shifted by tdelta days.
tdelta = 0
day = fday + timedelta(days=tdelta)
print(day)

# Keep max_lags days of history up to and including `day`, so that create_fea
# has earlier rows to build the shifted features from.
# NOTE(review): the EWM features are started from this short window here,
# whereas during training they ran over the whole history of each id, so their
# values may differ from the training-time ones — confirm this is acceptable.
history_start = day - timedelta(days=max_lags)
in_window = (te.date >= history_start) & (te.date <= day)
tst = te[in_window].copy()
create_fea(tst)

# Score only the rows of `day`, using the training columns, then turn the
# model output into a 0/1 label with a 0.5 threshold.
on_day = tst.date == day
tst = tst.loc[on_day, train_cols]
pred = (m_lgb_bin.predict(tst) >= 0.5).astype(int)

2016-04-25 00:00:00


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [25]:
# Both classes should appear among the predictions.
pred.min(), pred.max()

(0, 1)

In [26]:
# Tally how many rows were predicted as each class.
check_counter = Counter(pred)

In [27]:
# Show the per-class prediction counts.
check_counter

Counter({0: 19263, 1: 11227})

In [28]:
# Share of scored rows predicted as class 0 (no sales).
check_counter[0] / (check_counter[1] + check_counter[0])

0.6317809117743522