In [97]:
import mlflow
import torch
from torch.utils.data import DataLoader, Dataset, random_split

import pandas as pd
from kaggle_pfs.models.lstm_v1 import SalesDataset, SalesDataloader, Network
from kaggle_pfs.data import readers

# Point the MLflow client at the local tracking server; the model loaded
# further down is resolved through this URI.
mlflow.set_tracking_uri("http://localhost:5000")

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [98]:
# Load the trained network logged under this MLflow run.
# Extra keyword arguments are forwarded to torch.load, so map_location puts
# the weights on `device` regardless of the device the checkpoint was saved
# from (e.g. a CUDA run reloaded on a CPU-only machine). This also keeps the
# model on the same device the input batches are moved to.
model = mlflow.pytorch.load_model(
    "runs:/bc7cad0540794fa49248cc6ac9618085/pytorch-model",
    map_location=device,
)
model

Network(
  (shop_embedding): Embedding(60, 2, max_norm=2)
  (item_embedding): Embedding(22170, 2, max_norm=2)
  (lstm): LSTM(1, 6)
  (linear_1): Linear(in_features=10, out_features=6, bias=True)
  (linear_2): Linear(in_features=6, out_features=1, bias=True)
)

In [99]:
# Raw competition test file: the (shop_id, item_id) pairs to predict, keyed by ID.
test_set = pd.read_csv(readers.data_path("raw", "test.csv"))

# Sales history from the project reader.
# NOTE(review): presumably one row per (shop_id, item_id) with one column per
# month -- confirm against readers.items_by_month.
train_set = readers.items_by_month()

In [100]:
# Attach the sales history to every test pair. Pairs that never appear in
# train_set keep NaN in all history columns after the left join.
df = pd.merge(test_set, train_set, how="left", on=["shop_id", "item_id"])

# A left join silently duplicates left rows when the right side holds a key
# more than once; the submission needs exactly one row per test ID.
assert len(df) == len(test_set), "merge changed the number of test rows"

df.head()

Unnamed: 0,ID,shop_id,item_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
0,0,5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,3,5,5232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,5268,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [101]:
# Missing values per column: shows how many test pairs have no sales history.
df.isna().sum(axis=0)

ID              0
shop_id         0
item_id         0
0          102796
1          102796
2          102796
3          102796
4          102796
5          102796
6          102796
7          102796
8          102796
9          102796
10         102796
11         102796
12         102796
13         102796
14         102796
15         102796
16         102796
17         102796
18         102796
19         102796
20         102796
21         102796
22         102796
23         102796
24         102796
25         102796
26         102796
27         102796
28         102796
29         102796
30         102796
31         102796
32         102796
33         102796
dtype: int64

In [102]:
# Test pairs without any history came out of the left join as NaN; feed them
# to the model as an all-zero sales series instead.
df = df.fillna(value=0.0)

In [103]:
# Wrap the zero-filled frame in the project's dataset class.
# NOTE(review): presumably end_month=33 makes months 0..33 the input series
# (the first sample carries 34 values) -- confirm against SalesDataset.
sales_ds = SalesDataset(df, end_month=33)

In [104]:
# Peek at the first sample to check its structure: a nested tuple of
# (inputs, target), where the inputs start with the shop and item ids.
sales_ds[0]

((tensor(5),
  tensor(5037),
  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 1., 2., 2., 0., 0., 0., 1., 1., 1., 3., 1., 0.])),
 tensor(0.))

In [105]:
# Batch the dataset in its original order (no shuffling) and hand the loader
# to the project wrapper together with the target device.
# NOTE(review): SalesDataloader presumably moves each batch to `device` -- confirm.
sales_dl = SalesDataloader(
    device,
    DataLoader(dataset=sales_ds, batch_size=256, pin_memory=True),
)

In [106]:
def make_predictions():
    """Yield ``(shop_ids, item_ids, y_hat)`` for every batch in ``sales_dl``.

    The first two elements of each input batch are passed through unchanged
    so the predictions can be keyed by shop and item; ``y_hat`` is the raw
    model output for that batch. Targets from the loader are ignored.
    """
    # Inference only: make sure no training-mode layer behaviour is active.
    model.eval()
    for xb, _ in sales_dl:
        # Disable autograd around the forward pass itself. Wrapping the
        # generator *definition* in no_grad has no effect on its own, because
        # the body only runs when the generator is consumed.
        with torch.no_grad():
            y_hat = model(*xb)
        yield xb[0], xb[1], y_hat


# One small frame per batch, built lazily while the generator is consumed.
dfs = (
    pd.DataFrame(
        {
            "shop_id": shop.cpu().numpy(),
            "item_id": item.cpu().numpy(),
            "item_cnt_month": y_hat.squeeze().cpu().numpy(),
        }
    )
    for shop, item, y_hat in make_predictions()
)

# Stitch the per-batch frames into a single table with a fresh 0..n-1 index.
predictions = pd.concat(dfs, ignore_index=True)

In [107]:
# Quick look at the first few predictions.
predictions.head(n=5)

Unnamed: 0,shop_id,item_id,item_cnt_month
0,5,5037,0.354022
1,5,5320,0.178185
2,5,5233,0.829488
3,5,5232,0.188032
4,5,5268,0.098386


In [108]:
# Map each prediction back to its submission ID through the (shop, item) key.
submissions = pd.merge(
    test_set, predictions, how="left", on=["shop_id", "item_id"]
).set_index("ID")

# The submission needs exactly one row per test ID; a duplicated key on either
# side of the join would silently add rows.
assert len(submissions) == len(test_set), "merge changed the number of test rows"

submissions.head()

Unnamed: 0_level_0,shop_id,item_id,item_cnt_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5,5037,0.354022
1,5,5320,0.178185
2,5,5233,0.829488
3,5,5232,0.188032
4,5,5268,0.098386


In [109]:
# Clamp the predictions to [0, 20] and write them, indexed by ID, to the
# processed-data folder as the submission file.
submissions["item_cnt_month"].clip(lower=0, upper=20).to_csv(
    readers.data_path("processed", "lstm_v1.csv")
)

Score for `lstm_v1.csv`: 1.21019

In [134]:
# Redo the join of test pairs with their history, this time keeping the
# un-filled NaNs and only column "33" next to the keys. NaN in "33" marks a
# test pair that has no row in train_set.
merged_history = pd.merge(test_set, train_set, how="left", on=["shop_id", "item_id"])
df = merged_history[["ID", "shop_id", "item_id", "33"]]
del merged_history  # temporary only; keep the notebook namespace unchanged
df.head()

Unnamed: 0,ID,shop_id,item_id,33
0,0,5,5037,0.0
1,1,5,5320,
2,2,5,5233,1.0
3,3,5,5232,0.0
4,4,5,5268,


In [135]:
# Attach the model predictions to the frame that still carries column "33",
# then index by submission ID.
submissions_nan = pd.merge(
    df, predictions, how="left", on=["shop_id", "item_id"]
).set_index("ID")

# The submission needs exactly one row per test ID; a duplicated key on either
# side of the join would silently add rows.
assert len(submissions_nan) == len(test_set), "merge changed the number of test rows"

In [136]:
# Pairs whose column "33" is NaN have no row in the training history at all:
# override the model output with 0 for them and keep it everywhere else.
submissions_nan["item_cnt_month"] = submissions_nan["item_cnt_month"].mask(
    submissions_nan["33"].isna(), 0
)
submissions_nan

Unnamed: 0_level_0,shop_id,item_id,33,item_cnt_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5,5037,0.0,0.354022
1,5,5320,,0.000000
2,5,5233,1.0,0.829488
3,5,5232,0.0,0.188032
4,5,5268,,0.000000
...,...,...,...,...
214195,45,18454,1.0,-0.011955
214196,45,16188,,0.000000
214197,45,15757,0.0,0.232907
214198,45,19648,,0.000000


In [137]:
# Clamp to [0, 20] (this also removes the slightly negative predictions) and
# write the zero-overridden variant, indexed by ID, as a second submission file.
submissions_nan["item_cnt_month"].clip(lower=0, upper=20).to_csv(
    readers.data_path("processed", "lstm_v1_nan_zero.csv")
)

Score for `lstm_v1_nan_zero.csv` (predictions set to 0 for test pairs with no training history): 1.11753