In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from scripts import m5_common
from tqdm.notebook import tqdm
from typing import Union

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [4]:
from tqdm.notebook import tqdm
import copy

In [5]:
import mlflow

### Path

In [6]:
path = Path('/kaggle/m5_forecasting/')
assert(path.exists())

In [7]:
h = 28 
tr_last = 1913
fday = datetime(2016, 4, 25) 
fday

datetime.datetime(2016, 4, 25, 0, 0)

### Connect to MLflow server

In [8]:
# mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host 0.0.0.0 --port 5000
# server in /opt/mlflow_server/start.sh
remote_server_uri = "http://localhost:5000" # set to your server URI
mlflow.set_tracking_uri(remote_server_uri)  # or set the MLFLOW_TRACKING_URI in the env

In [9]:
mlflow.set_experiment('M5_Pytorch')

### Prepare Data

In [10]:
%%time

prices, cal = m5_common.prepare_tables(path)

CPU times: user 1.42 s, sys: 160 ms, total: 1.58 s
Wall time: 1.58 s


In [11]:
event_name_1_map, event_type_1_map = m5_common.replace_cal_cols(cal)

In [12]:
uint8_types= ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'month', 'wday', 'weekday', 
              'snap_CA', 'snap_TX', 'snap_WI']
m5_common.convert_uint8(cal, uint8_types)

In [13]:
m5_common.add_days_before(cal)

In [14]:
FIRST_DAY = 1 # If you want to load all the data set it to '1' -->  Great  memory overflow  risk !

In [15]:
%%time

df = m5_common.create_dt(cal, prices, is_train=True, first_day= FIRST_DAY, tr_last=tr_last, path=path)

CPU times: user 42 s, sys: 5.8 s, total: 47.8 s
Wall time: 47.8 s


In [16]:
def replace_cats(dt):
    m5_common.replace_cat(dt, 'wday')
    m5_common.replace_cat(dt, 'month')
    m5_common.replace_cat(dt, 'year')
    m5_common.replace_cat(dt, 'mday')

In [17]:
replace_cats(df)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}
{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11}
{2011: 0, 2012: 1, 2013: 2, 2014: 3, 2015: 4, 2016: 5}
{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30}


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46027957 entries, 4081 to 46025082
Data columns (total 27 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                object        
 1   item_id           int16         
 2   dept_id           int16         
 3   store_id          int16         
 4   cat_id            int16         
 5   state_id          int16         
 6   d                 object        
 7   sales             float16       
 8   date              datetime64[ns]
 9   wm_yr_wk          int16         
 10  weekday           uint8         
 11  wday              uint8         
 12  month             uint8         
 13  year              uint8         
 14  event_name_1      uint8         
 15  event_type_1      uint8         
 16  event_name_2      uint8         
 17  event_type_2      uint8         
 18  snap_CA           uint8         
 19  snap_TX           uint8         
 20  snap_WI           uint8         
 21  bef

### Feature Engineering

In [19]:
def mean_cols(dt, cols):
    dt[f"mean_{'_'.join(cols)}"] = dt[cols].mean(axis=1)

def create_fea(dt, dropna=True):
    
    wins = [7, 28]
    lags = [7, 28]
    
    grouped_sales = dt[["id","sales"]].groupby("id")["sales"]
    
    for win in wins:
        mean_col = f'mean_{win}'
        emean_col = f'e{mean_col}' # exponential mean average
        esmean_col = f'es{mean_col}'
        dt[emean_col] = grouped_sales.transform(lambda x : x.ewm(span=win, adjust=False).mean())
        dt[esmean_col] = grouped_sales.transform(lambda x : x.ewm(alpha=1/win, adjust=False).mean())
        for lag in lags:
            dt[f'emean_{win}_{lag}'] = dt[["id", emean_col]].groupby("id").shift(lag)
            dt[f'esmean_{win}_{lag}'] = dt[["id", esmean_col]].groupby("id").shift(lag)
        del dt[emean_col]
        del dt[esmean_col]
            
    ra = [1, 2, 3, 4, 5, 6, 7]
    for simple_lag in ra:
        dt[f'lag_{simple_lag}'] = dt[["id","sales"]].groupby("id")["sales"].shift(simple_lag)
        
#     mean_cols(dt, ['lag_3', 'lag_4', 'lag_5', 'lag_6', 'lag_7'])
    
    if dropna == True:
        dt.dropna(inplace = True)

In [20]:
%%time

create_fea(df)

CPU times: user 1min 52s, sys: 10.4 s, total: 2min 2s
Wall time: 2min 2s


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45174237 entries, 342559 to 46025082
Data columns (total 42 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                object        
 1   item_id           int16         
 2   dept_id           int16         
 3   store_id          int16         
 4   cat_id            int16         
 5   state_id          int16         
 6   d                 object        
 7   sales             float16       
 8   date              datetime64[ns]
 9   wm_yr_wk          int16         
 10  weekday           uint8         
 11  wday              uint8         
 12  month             uint8         
 13  year              uint8         
 14  event_name_1      uint8         
 15  event_type_1      uint8         
 16  event_name_2      uint8         
 17  event_type_2      uint8         
 18  snap_CA           uint8         
 19  snap_TX           uint8         
 20  snap_WI           uint8         
 21  b

In [22]:
df.dropna(inplace = True)
df.shape

(45174237, 42)

In [23]:
df

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,esmean_28_7,emean_28_28,esmean_28_28,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7
342559,FOODS_1_001_CA_1_evaluation,1536,4,0,2,0,d_29,2.0,2011-02-26,11105,...,2.021484,3.000000,3.000000,4.0,2.0,2.0,2.0,0.0,2.0,1.0
342560,FOODS_1_001_CA_1_evaluation,1536,4,0,2,0,d_30,2.0,2011-02-27,11105,...,2.021484,2.792969,2.892578,2.0,4.0,2.0,2.0,2.0,0.0,2.0
342561,FOODS_1_001_CA_1_evaluation,1536,4,0,2,0,d_31,0.0,2011-02-28,11105,...,1.949219,2.599609,2.789062,2.0,2.0,4.0,2.0,2.0,2.0,0.0
342562,FOODS_1_001_CA_1_evaluation,1536,4,0,2,0,d_32,2.0,2011-03-01,11105,...,1.951172,2.490234,2.726562,0.0,2.0,2.0,4.0,2.0,2.0,2.0
342563,FOODS_1_001_CA_1_evaluation,1536,4,0,2,0,d_33,1.0,2011-03-02,11105,...,1.953125,2.593750,2.771484,2.0,0.0,2.0,2.0,4.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45956915,HOUSEHOLD_2_516_WI_3_evaluation,2047,3,9,1,2,d_1909,0.0,2016-04-20,11612,...,0.038788,0.079956,0.083252,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45956916,HOUSEHOLD_2_516_WI_3_evaluation,2047,3,9,1,2,d_1910,0.0,2016-04-21,11612,...,0.037384,0.074463,0.080261,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45956917,HOUSEHOLD_2_516_WI_3_evaluation,2047,3,9,1,2,d_1911,0.0,2016-04-22,11612,...,0.036041,0.069336,0.077393,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46025081,HOUSEHOLD_2_516_WI_3_evaluation,2047,3,9,1,2,d_1912,0.0,2016-04-23,11613,...,0.034760,0.064514,0.074646,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Training Data Preparation

In [24]:
if not 'selected_features' in locals():
    selected_features = ['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd', 'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 
                         'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'before_christmas', 'sell_price', 'Dayofyear', 'week', 'mday', 'lag_price_1', 
                         'emean_7_7', 'esmean_7_7', 'emean_7_28', 'esmean_7_28', 'emean_28_7', 'esmean_28_7', 'emean_28_28', 'esmean_28_28', 'lag_1', 'lag_2', 'mean_lag_3_lag_4_lag_5_lag_6_lag_7']

In [25]:
def remove_features(dt, selected_features):
    for col in dt.columns:
        if col not in selected_features:
            del dt[col]

In [26]:
remove_features(df, selected_features)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45174237 entries, 342559 to 46025082
Data columns (total 37 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                object        
 1   item_id           int16         
 2   dept_id           int16         
 3   store_id          int16         
 4   cat_id            int16         
 5   state_id          int16         
 6   d                 object        
 7   sales             float16       
 8   date              datetime64[ns]
 9   wm_yr_wk          int16         
 10  weekday           uint8         
 11  wday              uint8         
 12  month             uint8         
 13  year              uint8         
 14  event_name_1      uint8         
 15  event_type_1      uint8         
 16  event_name_2      uint8         
 17  event_type_2      uint8         
 18  snap_CA           uint8         
 19  snap_TX           uint8         
 20  snap_WI           uint8         
 21  b

In [28]:
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]
if not 'cat_feats' in locals():
    cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id', 
                 "event_name_1", "event_name_2", "event_type_1", "event_type_2", 'snap_CA', 'snap_TX', 'snap_WI', 'year', 'month', 'wday', 'mday']
train_cols = df.columns[~df.columns.isin(useless_cols)]

In [29]:
remove_features(df, list(train_cols) + ['sales',  'id'])

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45174237 entries, 342559 to 46025082
Data columns (total 33 columns):
 #   Column            Dtype  
---  ------            -----  
 0   id                object 
 1   item_id           int16  
 2   dept_id           int16  
 3   store_id          int16  
 4   cat_id            int16  
 5   state_id          int16  
 6   sales             float16
 7   wday              uint8  
 8   month             uint8  
 9   year              uint8  
 10  event_name_1      uint8  
 11  event_type_1      uint8  
 12  event_name_2      uint8  
 13  event_type_2      uint8  
 14  snap_CA           uint8  
 15  snap_TX           uint8  
 16  snap_WI           uint8  
 17  before_christmas  uint16 
 18  sell_price        float16
 19  Dayofyear         uint16 
 20  week              uint8  
 21  mday              uint8  
 22  lag_price_1       float16
 23  emean_7_7         float16
 24  esmean_7_7        float16
 25  emean_7_28        float16
 26  esmean_

In [31]:
np.random.seed(777)

valid_size = int(df.shape[0] * 0.1)
np.random.seed(777)

valid_idx = np.random.choice(df.index.values, valid_size, replace=False)
train_idx = np.setdiff1d(df.index.values, valid_idx)

In [32]:
train_df = df.loc[train_idx]
valid_df = df.loc[valid_idx]

In [33]:
%%time

scale_df = m5_common.rmsse_scales(path)

CPU times: user 1min 30s, sys: 394 ms, total: 1min 31s
Wall time: 1min 31s


In [34]:
del df
gc.collect()

31

In [35]:
class TabularDataset(Dataset):
    def __init__(self, data, cat_cols=None, output_col=None):
        self.n = len(data)
        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            self.y = np.zeros((self.n, 1))
        self.cat_cols = cat_cols if cat_cols else []
        
        self.cont_cols = [
            col for col in data.columns if col not in self.cat_cols + [output_col, 'id']
        ]
        
        self.cont_X = data[self.cont_cols].astype(np.float32).values
        self.cat_X = data[cat_cols].astype(np.int16).values
        
        self.ids = data['id'].values
        
    def __len__(self):
        """
        Denotes the total number of samples.
        """
        return self.n
    
    def __getitem__(self, idx):
        """
        Generates one sample of data.
        """
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx], self.ids[idx]]

In [36]:
train_ds = TabularDataset(train_df, cat_cols=cat_feats, output_col='sales')
valid_ds = TabularDataset(valid_df, cat_cols=cat_feats, output_col='sales')

In [37]:
batchsize = 4096 * 4
num_workers = 11

In [38]:
train_dl = DataLoader(train_ds, batchsize, shuffle=True, num_workers=num_workers)
valid_dl = DataLoader(valid_ds, batchsize, shuffle=True, num_workers=num_workers)

In [39]:
scale_df.shape

(30490, 2)

In [40]:
scale_map = {}
for _, row in scale_df.iterrows():
    scale_map[row['id']] = row['scale']

In [41]:
one_batch = next(iter(train_dl))
# for b in one_batch:
print(one_batch[1].shape)
print(len(one_batch[3]))
torch.tensor([scale_map[id] for id in one_batch[3]])

torch.Size([16384, 15])
16384


tensor([ 3.4215, 23.3217,  2.3044,  ...,  4.4874,  0.8044,  0.8828])

In [42]:
scale_df

Unnamed: 0,id,scale
0,HOBBIES_1_001_CA_1_evaluation,0.722280
1,HOBBIES_1_002_CA_1_evaluation,0.566946
2,HOBBIES_1_003_CA_1_evaluation,0.336297
3,HOBBIES_1_004_CA_1_evaluation,7.056485
4,HOBBIES_1_005_CA_1_evaluation,2.547071
...,...,...
30485,FOODS_3_823_WI_3_evaluation,1.342573
30486,FOODS_3_824_WI_3_evaluation,1.049163
30487,FOODS_3_825_WI_3_evaluation,2.316946
30488,FOODS_3_826_WI_3_evaluation,1.108264


### Model Preparation

In [43]:
cat_dims = [int(train_df[col].nunique()) for col in cat_feats]
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
emb_dims

[(3049, 500),
 (7, 4),
 (10, 5),
 (3, 2),
 (3, 2),
 (31, 16),
 (5, 3),
 (5, 3),
 (3, 2),
 (2, 1),
 (2, 1),
 (2, 1),
 (6, 3),
 (12, 6),
 (7, 4),
 (31, 16)]

In [44]:
max_clamp = train_df['sales'].max() * 1.3

def clamp_pred(pred):
    return torch.clamp(pred, 0.0, max_clamp)

In [45]:
del train_df
del valid_df
gc.collect()

88

In [46]:
class FeedForwardNN(nn.Module):
    def __init__(self, emb_dims, no_of_cont, lin_layer_sizes, output_size, emb_dropout, lin_layer_dropouts):
        super().__init__()
        
        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
        self.no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_cont = no_of_cont
        
        # Linear Layers
        first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont, lin_layer_sizes[0])
        self.lin_layers = nn.ModuleList(
            [first_lin_layer] + [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1]) for i in range(len(lin_layer_sizes) - 1)]
        )
        
        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)
            
        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)
            
        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes])
        
        # Dropout Layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.droput_layers = nn.ModuleList(
            [nn.Dropout(size) for size in lin_layer_dropouts]
        )
        
    def forward(self, cont_data, cat_data):
        
        x = [emb_layer(cat_data[:, i]) for i, emb_layer in enumerate(self.emb_layers)]
        x = torch.cat(x, 1)
        x = self.emb_dropout_layer(x)
        
        normalized_cont_data = self.first_bn_layer(cont_data)
        x = torch.cat([x, normalized_cont_data], 1)
        
        for lin_layer, dropout_layer, bn_layer in zip(
            self.lin_layers, self.droput_layers, self.bn_layers
        ):
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)

        return self.output_layer(x)

In [47]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
no_of_cont = len([c for c in list(train_cols) if c not in cat_feats])
if not 'lin_layer_sizes' in locals():
    lin_layer_sizes = [200, 400]
model = FeedForwardNN(emb_dims, no_of_cont=no_of_cont, lin_layer_sizes=lin_layer_sizes, output_size=1, emb_dropout=0.04, lin_layer_dropouts=[0.001,0.01]).to(device)
model = model.to(device)

In [48]:
from torch import tensor

tweedie_variance_power = 1.5
rho = tensor(tweedie_variance_power).cuda()
eps = tensor(1e-10).cuda()

def tweedie_loss(pred, targ):
    pred, targ = pred.contiguous().view(-1), targ.contiguous().view(-1)
    pred = torch.where(pred < eps, eps, pred)
    a = targ * torch.exp((1 - rho) * torch.log(pred)) / (1 - rho)
    b = torch.exp((2 - rho) * torch.log(pred)) / (2 - rho)
    return torch.mean(-a + b)

In [49]:
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self,yhat,y):
        return torch.sqrt(self.mse(yhat,y))
    
class YearMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self, yhat, y, year_factor):
        mse = torch.mean((yhat - y)**2 * year_factor)
        return mse
    
class RMSSELoss(nn.Module):
    def __init__(self, scale_df):
        super().__init__()
        self.mse = nn.MSELoss()
        self.scale_df = scale_df
        
    def forward(self, yhat, y, id):
        score = torch.mean((yhat - y)**2)
        scale = torch.mean(torch.tensor([scale_map[i] for i in id]).cuda())
        return torch.sqrt(score / scale)

In [50]:
if not 'lr' in locals():
    lr = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# criterion = nn.MSELoss()
criterion = RMSSELoss(scale_df)
rmse = RMSELoss()
# criterion = tweedie_loss
if not 'epochs' in locals():
    epochs = 30

In [51]:
def move_to_dev(cat_x, cont_x, y):
    cat_x = cat_x.long().to(device)
    cont_x = cont_x.to(device)
    y  = y.to(device)
    return cat_x, cont_x, y

def mov_avg(x, x_prev, a=0.9):
    return x_prev * a + x * (1 - a)

In [52]:

def fit(mlflow):
    best_rmse_metric = 10000.0
    best_model = None
    steps = len(train_dl) * 3
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, steps)
    for epoch in tqdm(range(epochs), total=epochs):
        model.train()
        prev_loss_avg = 0.0
        for i, (y, cont_x, cat_x, id) in tqdm(enumerate(train_dl), total=len(train_dl)):

            cat_x, cont_x, y = move_to_dev(cat_x, cont_x, y)

            # Forward Pass
            preds = model(cont_x, cat_x)
            loss = criterion.forward(preds, y, id)
            prev_loss_avg = mov_avg(loss, prev_loss_avg)
            if i % 100 == 0:
                print(f'{i}/{len(train_dl)} lr: {scheduler.get_last_lr()} Loss avg: {prev_loss_avg}\r', end='')

            # Backward Pass and Optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
        if epoch > 0 and epoch % 3 == 0:
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, steps)

        ## Validation
        model.eval()
        nv = len(valid_dl)
        with torch.no_grad():
            tot_loss = 0.
            for y, cont_x, cat_x, id in valid_dl:
                cat_x, cont_x, y = move_to_dev(cat_x, cont_x, y)
                pred = model(cont_x, cat_x)
                pred = clamp_pred(pred)
                tot_loss += rmse(pred, y)

            rmse_metric = tot_loss/nv
            if rmse_metric < best_rmse_metric:
                best_rmse_metric = rmse_metric
                best_model = copy.deepcopy(model)
                print('Replaced best model')
                mlflow.log_metric(key="Best Validation RMSE", value=round(best_rmse_metric.item(), 5))
            mlflow.log_metric(key="Validation RMSE", value=round(rmse_metric.item(), 5))
            print(f'epoch: {epoch + 1} loss: {rmse_metric.item()}')
            

    return best_model

In [53]:
def run_pred(model_pred):
    max_lags = h * 2 + 1
    sub = 0.
    cols = [f"F{i}" for i in range(1,29)]
    te = m5_common.create_dt(cal, prices, False, first_day=FIRST_DAY, path=path)
    replace_cats(te)
    model_pred.eval()

    for tdelta in tqdm(range(0, h), total=h):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst, False)
        remove_features(tst, selected_features + ['id'])
        tst = tst.loc[tst.date == day, list(train_cols) + ['id']]
        # Prepare data loader and predict
        test_ds = TabularDataset(tst, cat_cols=cat_feats, output_col=None)
        test_dl = DataLoader(test_ds, len(tst), shuffle=False, num_workers=1)
        y, cont_x, cat_x, id = next(iter(test_dl))
        cat_x, cont_x, y = move_to_dev(cat_x, cont_x, y)
        preds = model_pred(cont_x, cat_x)
        preds = clamp_pred(preds)
        te.loc[te.date == day, "sales"] = preds.squeeze().detach().cpu().numpy()

    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["id"] = te_sub["id"].str.replace("evaluation$", "validation")
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    sub = te_sub

    sub2 = sub.copy()
    sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
    sub = pd.concat([sub, sub2], axis=0, sort=False)
    sub.to_csv("submission.csv",index=False)

In [54]:
## evaluation metric
## from https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133834 and edited to get scores at all levels
class WRMSSEEvaluator(object):

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'cat_id',
            'state_id',
            'dept_id',
            'store_id',
            'item_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        group_ids = []
        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            group_ids.append(group_id)
            all_scores.append(lv_scores.sum())

        return group_ids, all_scores

## public LB rank
def get_lb_rank(df_lb, score):
    """
    Get rank on public LB as of 2020-05-31 23:59:59
    """
    return (df_lb.Score <= score).sum() + 1

def validation_ranking(mlflow):
    ## new train data
    df_train_full = pd.read_csv(path/"sales_train_evaluation.csv")
    df_train_full.iloc[:, -31:].head()
    
    df_lb = pd.read_csv(path/"m5-forecasting-accuracy-publicleaderboard-rank.csv")
    
    ## reading data
    df_calendar = pd.read_csv(path/"calendar.csv")
    df_prices = pd.read_csv(path/"sell_prices.csv")
    df_sample_submission = pd.read_csv(path/"sample_submission.csv")
    df_sample_submission["order"] = range(df_sample_submission.shape[0])

    df_train = df_train_full.iloc[:, :-28]
    df_valid = df_train_full.iloc[:, -28:]

    evaluator = WRMSSEEvaluator(df_train, df_valid, df_calendar, df_prices)
    
    ## structure of validation data
    preds_valid = df_valid.copy() + np.random.randint(100, size = df_valid.shape)

    ## evaluating submission from public kernel M5 - Three shades of Dark: Darker magic
    preds_valid = pd.read_csv("submission.csv")
    preds_valid = preds_valid[preds_valid.id.str.contains("validation")]
    preds_valid = preds_valid.merge(df_sample_submission[["id", "order"]], on = "id").sort_values("order").drop(["id", "order"], axis = 1).reset_index(drop = True)
    preds_valid.rename(columns = {
        "F1": "d_1914", "F2": "d_1915", "F3": "d_1916", "F4": "d_1917", "F5": "d_1918", "F6": "d_1919", "F7": "d_1920",
        "F8": "d_1921", "F9": "d_1922", "F10": "d_1923", "F11": "d_1924", "F12": "d_1925", "F13": "d_1926", "F14": "d_1927",
        "F15": "d_1928", "F16": "d_1929", "F17": "d_1930", "F18": "d_1931", "F19": "d_1932", "F20": "d_1933", "F21": "d_1934",
        "F22": "d_1935", "F23": "d_1936", "F24": "d_1937", "F25": "d_1938", "F26": "d_1939", "F27": "d_1940", "F28": "d_1941"
    }, inplace = True)

    groups, scores = evaluator.score(preds_valid)

    score_public_lb = np.mean(scores)
    score_public_rank = get_lb_rank(df_lb, score_public_lb)

    for i in range(len(groups)):
        print(f"Score for group {groups[i]}: {round(scores[i], 5)}")

    print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
    if mlflow is not None:
        mlflow.log_metric(key="Public LB Score", value=round(score_public_lb, 5))
        mlflow.set_tag(key="Public LB Rank", value=str(score_public_rank))

In [55]:
with mlflow.start_run():
    mlflow.log_params({'epochs': epochs, 'lr': lr, 'lin_layer_sizes': str(lin_layer_sizes)})
    if not 'num_cycles' in locals():
        num_cycles = 1
    for i in range(num_cycles):
        model_pred = fit(mlflow)
    run_pred(model_pred)
    validation_ranking(mlflow)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.79602682590484627
Replaced best model
epoch: 1 loss: 2.231393575668335


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.79075533151626599
Replaced best model
epoch: 2 loss: 2.1929075717926025


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [2.9921135786037704e-07] Loss avg: 0.7621117830276489
Replaced best model
epoch: 3 loss: 2.1749000549316406


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002351715760081836] Loss avg: 0.74683398008346563
epoch: 4 loss: 2.1803605556488037


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.77162778377532965
epoch: 5 loss: 2.183570384979248


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.74884724617004419
Replaced best model
epoch: 6 loss: 2.1671011447906494


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [2.9921135786037704e-07] Loss avg: 0.7488617897033691
Replaced best model
epoch: 7 loss: 2.1566545963287354


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.82818824052810679
epoch: 8 loss: 2.188462018966675


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.79201644659042366
Replaced best model
epoch: 9 loss: 2.155851125717163


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [2.9921135786037704e-07] Loss avg: 0.7615416646003723
Replaced best model
epoch: 10 loss: 2.1463732719421387


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.77099353075027474
epoch: 11 loss: 2.1654090881347656


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.76600557565689096
Replaced best model
epoch: 12 loss: 2.1427719593048096


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [2.9921135786037704e-07] Loss avg: 0.7330094575881958
epoch: 13 loss: 2.1436216831207275


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.78790503740310677
epoch: 14 loss: 2.1563589572906494


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.74789118766784674
Replaced best model
epoch: 15 loss: 2.1395976543426514


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [2.9921135786037704e-07] Loss avg: 0.7361591458320618
Replaced best model
epoch: 16 loss: 2.13457989692688


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.76432979106903088
epoch: 17 loss: 2.152217388153076


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.80180776119232184
Replaced best model
epoch: 18 loss: 2.1301565170288086


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [2.9921135786037704e-07] Loss avg: 0.7382914423942566
epoch: 19 loss: 2.134352684020996


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.77669674158096318
epoch: 20 loss: 2.140892267227173


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.74280524253845216
epoch: 21 loss: 2.1304547786712646


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [2.9921135786037704e-07] Loss avg: 0.7357199192047119
Replaced best model
epoch: 22 loss: 2.126316785812378


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.81928914785385132
epoch: 23 loss: 2.1489808559417725


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.74130159616470349
epoch: 24 loss: 2.146413564682007


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [2.9921135786037704e-07] Loss avg: 0.7436394095420837
epoch: 25 loss: 2.1301565170288086


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.75420033931732188
epoch: 26 loss: 2.1532814502716064


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.73096036911010747
epoch: 27 loss: 2.133375406265259


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [2.9921135786037704e-07] Loss avg: 0.7311385869979858
epoch: 28 loss: 2.1386799812316895


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.000764828423991816] Loss avg: 0.74473333358764659
epoch: 29 loss: 2.1331446170806885


HBox(children=(FloatProgress(value=0.0, max=2482.0), HTML(value='')))

2400/2482 lr: [0.0002651276353496774] Loss avg: 0.72960746288299566
epoch: 30 loss: 2.1314404010772705

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}
{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11}
{2011: 0, 2012: 1, 2013: 2, 2014: 3, 2015: 4, 2016: 5}
{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30}


HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00



HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


Score for group all_id: 0.36679
Score for group cat_id: 0.38926
Score for group state_id: 0.43148
Score for group dept_id: 0.47518
Score for group store_id: 0.51739
Score for group item_id: 0.82317
Score for group ['state_id', 'cat_id']: 0.47402
Score for group ['state_id', 'dept_id']: 0.55145
Score for group ['store_id', 'cat_id']: 0.56319
Score for group ['store_id', 'dept_id']: 0.64447
Score for group ['item_id', 'state_id']: 0.83077
Score for group ['item_id', 'store_id']: 0.83058

Public LB Score: 0.57481
