In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [2]:
device = 'cuda:0'

In [3]:
path = "m5-forecasting-accuracy/"
timesteps = 14
startDay = 350

In [4]:
df = pd.read_csv(path + "sales_train_validation.csv")
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int64(1913), object(6)
memory usage: 446.4+ MB


In [6]:
def reduction_mem(df):
    float_cols = [c for c in df if df[c].dtype == 'float64']
    int_cols = [c for c in df if df[c].dtype in ['int64', 'int32']]
    df[float_cols] = df[ float_cols].astype(np.float16)
    df[int_cols] = df[int_cols].astype(np.int16)
    return df

In [7]:
df = reduction_mem(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int16(1913), object(6)
memory usage: 112.6+ MB


In [8]:
# transpose so items are columns and rows are days
df = df.T
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
id,HOBBIES_1_001_CA_1_validation,HOBBIES_1_002_CA_1_validation,HOBBIES_1_003_CA_1_validation,HOBBIES_1_004_CA_1_validation,HOBBIES_1_005_CA_1_validation,HOBBIES_1_006_CA_1_validation,HOBBIES_1_007_CA_1_validation,HOBBIES_1_008_CA_1_validation,HOBBIES_1_009_CA_1_validation,HOBBIES_1_010_CA_1_validation,...,FOODS_3_818_WI_3_validation,FOODS_3_819_WI_3_validation,FOODS_3_820_WI_3_validation,FOODS_3_821_WI_3_validation,FOODS_3_822_WI_3_validation,FOODS_3_823_WI_3_validation,FOODS_3_824_WI_3_validation,FOODS_3_825_WI_3_validation,FOODS_3_826_WI_3_validation,FOODS_3_827_WI_3_validation
item_id,HOBBIES_1_001,HOBBIES_1_002,HOBBIES_1_003,HOBBIES_1_004,HOBBIES_1_005,HOBBIES_1_006,HOBBIES_1_007,HOBBIES_1_008,HOBBIES_1_009,HOBBIES_1_010,...,FOODS_3_818,FOODS_3_819,FOODS_3_820,FOODS_3_821,FOODS_3_822,FOODS_3_823,FOODS_3_824,FOODS_3_825,FOODS_3_826,FOODS_3_827
dept_id,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,HOBBIES_1,...,FOODS_3,FOODS_3,FOODS_3,FOODS_3,FOODS_3,FOODS_3,FOODS_3,FOODS_3,FOODS_3,FOODS_3
cat_id,HOBBIES,HOBBIES,HOBBIES,HOBBIES,HOBBIES,HOBBIES,HOBBIES,HOBBIES,HOBBIES,HOBBIES,...,FOODS,FOODS,FOODS,FOODS,FOODS,FOODS,FOODS,FOODS,FOODS,FOODS
store_id,CA_1,CA_1,CA_1,CA_1,CA_1,CA_1,CA_1,CA_1,CA_1,CA_1,...,WI_3,WI_3,WI_3,WI_3,WI_3,WI_3,WI_3,WI_3,WI_3,WI_3
state_id,CA,CA,CA,CA,CA,CA,CA,CA,CA,CA,...,WI,WI,WI,WI,WI,WI,WI,WI,WI,WI
d_1,0,0,0,0,0,0,0,12,2,0,...,0,14,1,0,4,0,0,0,0,0
d_2,0,0,0,0,0,0,0,15,0,0,...,0,11,1,0,4,0,0,6,0,0
d_3,0,0,0,0,0,0,0,0,7,1,...,0,5,1,0,2,2,0,0,0,0
d_4,0,0,0,0,0,0,0,0,3,0,...,0,6,1,0,5,2,0,2,0,0


In [9]:
# preprocessing:  remove the info 
df = df[6 + startDay:]
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
d_351,0,0,0,2,0,0,0,24,3,2,...,0,9,1,0,11,0,0,1,0,0
d_352,0,0,0,0,0,0,0,9,0,2,...,0,5,4,0,8,0,1,2,0,0
d_353,0,0,0,4,2,0,0,2,1,1,...,0,15,2,0,3,0,1,2,0,0
d_354,0,1,0,2,0,0,0,7,1,0,...,0,5,1,0,3,0,0,0,0,0
d_355,0,0,0,1,2,0,0,0,0,0,...,0,7,1,0,1,0,1,1,0,0


In [10]:
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
d_1909,1,1,1,0,1,0,1,4,0,0,...,4,1,1,0,0,0,0,1,1,0
d_1910,3,0,0,1,2,0,0,6,0,0,...,2,3,3,0,2,1,0,0,0,0
d_1911,0,0,1,3,2,2,0,3,0,2,...,0,1,6,0,3,0,0,0,3,0
d_1912,1,0,1,7,2,0,1,2,0,0,...,3,0,0,4,2,0,1,1,1,0
d_1913,1,0,1,2,4,0,1,1,0,2,...,1,2,1,0,1,1,0,0,3,0


In [11]:
calendar_df = pd.read_csv(path + 'calendar.csv')
daysBeforeEvent = pd.DataFrame(np.zeros((1969,1)))

In [12]:
calendar_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [13]:
for x,y in calendar_df.iterrows():
    if(not pd.isnull(calendar_df["event_name_1"][x])):
           daysBeforeEvent[0][x-1] = 1 

In [14]:
daysBeforeEvent

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
1964,0.0
1965,0.0
1966,0.0
1967,1.0


In [15]:
df.shape

(1563, 30490)

In [16]:
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
d_1909,1,1,1,0,1,0,1,4,0,0,...,4,1,1,0,0,0,0,1,1,0
d_1910,3,0,0,1,2,0,0,6,0,0,...,2,3,3,0,2,1,0,0,0,0
d_1911,0,0,1,3,2,2,0,3,0,2,...,0,1,6,0,3,0,0,0,3,0
d_1912,1,0,1,7,2,0,1,2,0,0,...,3,0,0,4,2,0,1,1,1,0
d_1913,1,0,1,2,4,0,1,1,0,2,...,1,2,1,0,1,1,0,0,3,0


In [17]:
daysBeforeEventTest = daysBeforeEvent[1913:1941]
daysBeforeEvent = daysBeforeEvent[startDay:1913]

In [18]:
daysBeforeEvent.columns = ["oneDayBeforeEvent"]
daysBeforeEvent.index = df.index
daysBeforeEvent

Unnamed: 0,oneDayBeforeEvent
d_351,0.0
d_352,1.0
d_353,0.0
d_354,0.0
d_355,0.0
...,...
d_1909,0.0
d_1910,0.0
d_1911,0.0
d_1912,0.0


In [19]:
df = pd.concat([df, daysBeforeEvent], axis = 1)

In [20]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30481,30482,30483,30484,30485,30486,30487,30488,30489,oneDayBeforeEvent
d_351,0,0,0,2,0,0,0,24,3,2,...,9,1,0,11,0,0,1,0,0,0.0
d_352,0,0,0,0,0,0,0,9,0,2,...,5,4,0,8,0,1,2,0,0,1.0
d_353,0,0,0,4,2,0,0,2,1,1,...,15,2,0,3,0,1,2,0,0,0.0
d_354,0,1,0,2,0,0,0,7,1,0,...,5,1,0,3,0,0,0,0,0,0.0
d_355,0,0,0,1,2,0,0,0,0,0,...,7,1,0,1,0,1,1,0,0,0.0


In [21]:
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30481,30482,30483,30484,30485,30486,30487,30488,30489,oneDayBeforeEvent
d_1909,1,1,1,0,1,0,1,4,0,0,...,1,1,0,0,0,0,1,1,0,0.0
d_1910,3,0,0,1,2,0,0,6,0,0,...,3,3,0,2,1,0,0,0,0,0.0
d_1911,0,0,1,3,2,2,0,3,0,2,...,1,6,0,3,0,0,0,3,0,0.0
d_1912,1,0,1,7,2,0,1,2,0,0,...,0,0,4,2,0,1,1,1,0,0.0
d_1913,1,0,1,2,4,0,1,1,0,2,...,2,1,0,1,1,0,0,3,0,0.0


In [22]:
sc = MinMaxScaler(feature_range = (0, 1))
df_scaled = sc.fit_transform(df)

In [23]:
pd.DataFrame(df_scaled, columns=df.columns)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30481,30482,30483,30484,30485,30486,30487,30488,30489,oneDayBeforeEvent
0,0.0,0.0,0.000000,0.133333,0.000000,0.0,0.000000,0.263736,0.15,0.333333,...,0.321429,0.111111,0.000000,0.458333,0.000000,0.000000,0.142857,0.000000,0.0,0.0
1,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.098901,0.00,0.333333,...,0.178571,0.444444,0.000000,0.333333,0.000000,0.083333,0.285714,0.000000,0.0,1.0
2,0.0,0.0,0.000000,0.266667,0.222222,0.0,0.000000,0.021978,0.05,0.166667,...,0.535714,0.222222,0.000000,0.125000,0.000000,0.083333,0.285714,0.000000,0.0,0.0
3,0.0,0.2,0.000000,0.133333,0.000000,0.0,0.000000,0.076923,0.05,0.000000,...,0.178571,0.111111,0.000000,0.125000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,0.0,0.0,0.000000,0.066667,0.222222,0.0,0.000000,0.000000,0.00,0.000000,...,0.250000,0.111111,0.000000,0.041667,0.000000,0.083333,0.142857,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1558,0.2,0.2,0.166667,0.000000,0.111111,0.0,0.333333,0.043956,0.00,0.000000,...,0.035714,0.111111,0.000000,0.000000,0.000000,0.000000,0.142857,0.083333,0.0,0.0
1559,0.6,0.0,0.000000,0.066667,0.222222,0.0,0.000000,0.065934,0.00,0.000000,...,0.107143,0.333333,0.000000,0.083333,0.166667,0.000000,0.000000,0.000000,0.0,0.0
1560,0.0,0.0,0.166667,0.200000,0.222222,0.2,0.000000,0.032967,0.00,0.333333,...,0.035714,0.666667,0.000000,0.125000,0.000000,0.000000,0.000000,0.250000,0.0,0.0
1561,0.2,0.0,0.166667,0.466667,0.222222,0.0,0.333333,0.021978,0.00,0.000000,...,0.000000,0.000000,0.571429,0.083333,0.000000,0.083333,0.142857,0.083333,0.0,0.0


### Creating Training Data

In [24]:
# indices to grab training data using timesteps lag to make predictions
timesteps, 1913 - startDay

(14, 1563)

In [25]:
# corresponding to the actual days:
timesteps + startDay, 1913 - startDay + startDay

(364, 1913)

In [26]:
X_train = []
y_train = []
for i in range(timesteps, 1913 - startDay):
    X_train.append(df_scaled[i-timesteps:i])
    y_train.append(df_scaled[i][0:30490]) 

In [27]:
X_train = np.array(X_train, dtype = 'float16')
y_train = np.array(y_train, dtype = 'float16')
print(X_train.shape)
print(y_train.shape)

(1549, 14, 30491)
(1549, 30490)


In [28]:
X_train[0].shape

(14, 30491)

In [29]:
X_train[0]

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 1. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0.2, 0. , ..., 0. , 0. , 0. ],
       [0. , 0.2, 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]], dtype=float16)

In [30]:
y_train[0].shape

(30490,)

In [31]:
y_train[0]

array([0.    , 0.    , 0.    , ..., 0.1428, 0.    , 0.    ], dtype=float16)

### Pytorch

In [32]:
class SalesDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        return x, y

In [33]:
batch_size = 2
train_ds = SalesDataset(X_train, y_train)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [34]:
next(iter(train_dl))

[tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
          [0.0000, 0.2000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.2000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
 
         [[0.0000, 0.2000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]],
        dtype=torch.float16), tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float16)]

In [35]:
x, y = next(iter(train_dl))

In [36]:
tuple(x[0].size())[0]

14

In [37]:
y[0].shape[0]

30490

## Model

In [38]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=2, batch_first=True, bidirectional=True, dropout=0.25)
        self.fc = nn.Linear(hidden_size*2, output_size)  # 2 for bidirection

    
    def forward(self, x):
        
        # Forward propagate LSTM
        out, (ht, ct)  = self.lstm(x)  # out: tensor of shape (batch_size, seq_length, hidden_size*2)
        h = torch.cat((ht[-2,:,:],ht[-1,:,:]), dim=1)
        # Decode the hidden state of the last time step
        h = self.fc(h)
        return h

In [39]:
batch_size = 30
train_ds = SalesDataset(X_train, y_train)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [40]:
input_size = x[0].shape[1]
output_size = y[0].shape[0]
hidden_size = 100
num_layers = 2

In [41]:
def train_epocs(model, optimizer, train_dl, valid_dl=None, epochs=10):
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_dl:
            x = x.float().cuda()
            y = y.float().cuda()
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.mse_loss(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
#         val_loss = val_metrics(model, valid_dl)
        print(f"epoch {i} : train loss (MSE) {sum_loss/total:.6f}")

In [42]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y in valid_dl:
        x = x.float().cuda()
        y = y.float().cuda()
        y_hat = model(x)
        loss = F.mse_loss(y_hat, y)
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total

In [43]:
model = BiLSTM(input_size, hidden_size, num_layers, output_size).cuda()
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)
train_epocs(model, optimizer, train_dl, valid_dl=None, epochs=30)

epoch 0 : train loss (MSE) 0.016785
epoch 1 : train loss (MSE) 0.015779
epoch 2 : train loss (MSE) 0.015687
epoch 3 : train loss (MSE) 0.015707
epoch 4 : train loss (MSE) 0.015638
epoch 5 : train loss (MSE) 0.015630
epoch 6 : train loss (MSE) 0.015601
epoch 7 : train loss (MSE) 0.015578
epoch 8 : train loss (MSE) 0.015636
epoch 9 : train loss (MSE) 0.015557
epoch 10 : train loss (MSE) 0.015527
epoch 11 : train loss (MSE) 0.015527
epoch 12 : train loss (MSE) 0.015521
epoch 13 : train loss (MSE) 0.015495
epoch 14 : train loss (MSE) 0.015496
epoch 15 : train loss (MSE) 0.015458
epoch 16 : train loss (MSE) 0.015432
epoch 17 : train loss (MSE) 0.015428
epoch 18 : train loss (MSE) 0.015412
epoch 19 : train loss (MSE) 0.015403
epoch 20 : train loss (MSE) 0.015387
epoch 21 : train loss (MSE) 0.015377
epoch 22 : train loss (MSE) 0.015358
epoch 23 : train loss (MSE) 0.015348
epoch 24 : train loss (MSE) 0.015354
epoch 25 : train loss (MSE) 0.015318
epoch 26 : train loss (MSE) 0.015313
epoch 27 : 

In [44]:
train_epocs(model, optimizer, train_dl, valid_dl=None, epochs=30)

epoch 0 : train loss (MSE) 0.015287
epoch 1 : train loss (MSE) 0.015277
epoch 2 : train loss (MSE) 0.015262
epoch 3 : train loss (MSE) 0.015251
epoch 4 : train loss (MSE) 0.015251
epoch 5 : train loss (MSE) 0.015231
epoch 6 : train loss (MSE) 0.015246
epoch 7 : train loss (MSE) 0.015238
epoch 8 : train loss (MSE) 0.015236
epoch 9 : train loss (MSE) 0.015214
epoch 10 : train loss (MSE) 0.015201
epoch 11 : train loss (MSE) 0.015201
epoch 12 : train loss (MSE) 0.015192
epoch 13 : train loss (MSE) 0.015175
epoch 14 : train loss (MSE) 0.015169
epoch 15 : train loss (MSE) 0.015184
epoch 16 : train loss (MSE) 0.015186
epoch 17 : train loss (MSE) 0.015170
epoch 18 : train loss (MSE) 0.015154
epoch 19 : train loss (MSE) 0.015136
epoch 20 : train loss (MSE) 0.015123
epoch 21 : train loss (MSE) 0.015099
epoch 22 : train loss (MSE) 0.015100
epoch 23 : train loss (MSE) 0.015079
epoch 24 : train loss (MSE) 0.015090
epoch 25 : train loss (MSE) 0.015096
epoch 26 : train loss (MSE) 0.015057
epoch 27 : 

In [45]:
from typing import Union

import numpy as np
import pandas as pd
from tqdm.auto import tqdm as tqdm

class WRMSSEEvaluator(object):
    
    group_ids = ( 'all_id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
        ['state_id', 'cat_id'],  ['state_id', 'dept_id'], ['store_id', 'cat_id'],
        ['store_id', 'dept_id'], ['item_id', 'state_id'], ['item_id', 'store_id'])

    def __init__(self, 
                 train_df: pd.DataFrame, 
                 valid_df: pd.DataFrame, 
                 calendar: pd.DataFrame, 
                 prices: pd.DataFrame):
        '''
        intialize and calculate weights
        '''
        self.calendar = calendar
        self.prices = prices
        self.train_df = train_df
        self.valid_df = valid_df
        self.train_target_columns = [i for i in self.train_df.columns if i.startswith('d_')]
        self.weight_columns = self.train_df.iloc[:, -28:].columns.tolist()

        self.train_df['all_id'] = "all"

        self.id_columns = [i for i in self.train_df.columns if not i.startswith('d_')]
        self.valid_target_columns = [i for i in self.valid_df.columns if i.startswith('d_')]

        if not all([c in self.valid_df.columns for c in self.id_columns]):
            self.valid_df = pd.concat([self.train_df[self.id_columns], self.valid_df],
                                      axis=1, 
                                      sort=False)
        self.train_series = self.trans_30490_to_42840(self.train_df, 
                                                      self.train_target_columns, 
                                                      self.group_ids)
        self.valid_series = self.trans_30490_to_42840(self.valid_df, 
                                                      self.valid_target_columns, 
                                                      self.group_ids)
        self.weights = self.get_weight_df()
        self.scale = self.get_scale()
        self.train_series = None
        self.train_df = None
        self.prices = None
        self.calendar = None

    def get_scale(self):
        '''
        scaling factor for each series ignoring starting zeros
        '''
        scales = []
        for i in tqdm(range(len(self.train_series))):
            series = self.train_series.iloc[i].values
            series = series[np.argmax(series!=0):]
            scale = ((series[1:] - series[:-1]) ** 2).mean()
            scales.append(scale)
        return np.array(scales)
    
    def get_name(self, i):
        '''
        convert a str or list of strings to unique string 
        used for naming each of 42840 series
        '''
        if type(i) == str or type(i) == int:
            return str(i)
        else:
            return "--".join(i)
    
    def get_weight_df(self) -> pd.DataFrame:
        """
        returns weights for each of 42840 series in a dataFrame
        """
        day_to_week = self.calendar.set_index("d")["wm_yr_wk"].to_dict()
        weight_df = self.train_df[["item_id", "store_id"] + self.weight_columns].set_index(
            ["item_id", "store_id"]
        )
        weight_df = (
            weight_df.stack().reset_index().rename(columns={"level_2": "d", 0: "value"})
        )
        weight_df["wm_yr_wk"] = weight_df["d"].map(day_to_week)
        weight_df = weight_df.merge(
            self.prices, how="left", on=["item_id", "store_id", "wm_yr_wk"]
        )
        weight_df["value"] = weight_df["value"] * weight_df["sell_price"]
        weight_df = weight_df.set_index(["item_id", "store_id", "d"]).unstack(level=2)[
            "value"
        ]
        weight_df = weight_df.loc[
            zip(self.train_df.item_id, self.train_df.store_id), :
        ].reset_index(drop=True)
        weight_df = pd.concat(
            [self.train_df[self.id_columns], weight_df], axis=1, sort=False
        )
        weights_map = {}
        for i, group_id in enumerate(tqdm(self.group_ids, leave=False)):
            lv_weight = weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            lv_weight = lv_weight / lv_weight.sum()
            for i in range(len(lv_weight)):
                weights_map[self.get_name(lv_weight.index[i])] = np.array(
                    [lv_weight.iloc[i]]
                )
        weights = pd.DataFrame(weights_map).T / len(self.group_ids)

        return weights

    def trans_30490_to_42840(self, df, cols, group_ids, dis=False):
        '''
        transform 30490 sries to all 42840 series
        '''
        series_map = {}
        for i, group_id in enumerate(tqdm(self.group_ids, leave=False, disable=dis)):
            tr = df.groupby(group_id)[cols].sum()
            for i in range(len(tr)):
                series_map[self.get_name(tr.index[i])] = tr.iloc[i].values
        return pd.DataFrame(series_map).T
    
    def get_rmsse(self, valid_preds) -> pd.Series:
        '''
        returns rmsse scores for all 42840 series
        '''
        score = ((self.valid_series - valid_preds) ** 2).mean(axis=1)
        rmsse = (score / self.scale).map(np.sqrt)
        return rmsse

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds],
                                axis=1, 
                                sort=False)
        valid_preds = self.trans_30490_to_42840(valid_preds, 
                                                self.valid_target_columns, 
                                                self.group_ids, 
                                                True)
        self.rmsse = self.get_rmsse(valid_preds)
        self.contributors = pd.concat([self.weights, self.rmsse], 
                                      axis=1, 
                                      sort=False).prod(axis=1)
        return np.sum(self.contributors)

In [46]:
%%time

train_df = pd.read_csv(path + 'sales_train_evaluation.csv')
calendar = pd.read_csv(path + 'calendar.csv')
prices = pd.read_csv(path + 'sell_prices.csv')

train_fold_df = train_df.iloc[:, :-28]
valid_fold_df = train_df.iloc[:, -28:].copy()

e = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
del train_fold_df, train_df, calendar, prices

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=42840.0), HTML(value='')))


CPU times: user 37.4 s, sys: 2.08 s, total: 39.5 s
Wall time: 39.4 s


In [47]:
valid_fold_df

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,0,0,0,2,0,3,5,0,0,1,...,2,4,0,0,0,0,3,3,0,1
1,0,1,0,0,0,0,0,0,0,1,...,0,1,2,1,1,0,0,0,0,0
2,0,0,1,1,0,2,1,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,0,0,1,2,4,1,6,4,0,0,...,1,1,0,4,0,1,3,0,2,6
4,1,0,2,3,1,0,3,2,3,1,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,0,2,2,0,0,0,2,0,...,1,0,3,0,1,1,0,0,1,1
30486,0,1,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
30487,0,0,1,1,0,2,1,1,0,0,...,0,0,1,2,0,1,0,1,0,2
30488,1,3,0,1,2,1,0,2,1,1,...,1,1,1,4,6,0,1,1,1,0


In [48]:
inputs= df[-timesteps:]
inputs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30481,30482,30483,30484,30485,30486,30487,30488,30489,oneDayBeforeEvent
d_1900,0,0,0,2,2,1,0,5,0,1,...,4,2,0,0,2,1,4,0,0,0.0
d_1901,0,0,0,0,0,4,1,2,7,0,...,4,0,0,7,0,1,0,1,0,0.0
d_1902,0,0,1,3,1,0,0,8,0,0,...,3,2,0,0,0,0,0,0,0,0.0
d_1903,1,0,2,0,1,0,0,4,0,0,...,0,1,2,5,0,0,0,3,0,0.0
d_1904,1,0,2,1,2,0,0,0,0,1,...,5,1,3,0,2,0,2,0,0,0.0
d_1905,3,0,1,0,1,1,0,0,0,0,...,5,1,2,4,0,0,1,0,0,0.0
d_1906,0,0,2,5,1,0,0,1,1,0,...,0,0,1,2,0,0,0,1,0,0.0
d_1907,1,0,1,4,0,1,1,37,1,0,...,4,3,0,4,0,0,2,0,0,0.0
d_1908,1,0,1,1,1,0,0,3,6,0,...,1,0,2,1,0,0,0,0,0,0.0
d_1909,1,1,1,0,1,0,1,4,0,0,...,1,1,0,0,0,0,1,1,0,0.0


In [49]:
inputs = sc.transform(inputs)

In [52]:
X_test = []
X_test.append(inputs[0:timesteps])
X_test = np.array(X_test)
predictions = []
for j in range(timesteps,timesteps + 28):
    predicted_stock_price = model(torch.tensor(X_test).float().cuda())
    testInput = np.column_stack((predicted_stock_price.cpu().detach().numpy(), daysBeforeEventTest[0][1913 + j - timesteps]))
    X_test = np.append(X_test, testInput).reshape(1,j + 1,30491)
    predicted_stock_price = sc.inverse_transform(testInput)[:,0:30490]
    predictions.append(predicted_stock_price)

In [54]:
valid_pred = np.array(predictions).reshape(28,-1).T

In [55]:
e.score(valid_pred)

1.3434510819468948