In [1]:
cd ..

/home/xavier/projects/godatathon_2020


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from src.model.trainer import RNNModel
from src.model.dataset import NovartisDataset

### Params

In [4]:
input_dim = 1
hidden_dim = 5
num_layers = 1

### Data Engineering

In [5]:
volume = pd.read_csv("data/raw/gx_volume.csv", index_col=0)
submissions = pd.read_csv("data/raw/submission_template.csv")


In [6]:
volume["country_brand"] = volume["country"] + "-" + volume["brand"]
submissions["country_brand"] = submissions["country"] + "-" + submissions["brand"]

In [7]:
# Filter out country/brand in submissions
volume = volume[~volume["country_brand"].isin(submissions["country_brand"])]

# Sort values
volume = volume.sort_values(["country", "brand", "month_num"])

In [8]:
# Note: In the future, we will compute the loss only on data that we have available for each country/mont
# i.e. If a country only has volume until mont 20, we will pad/ignore the loss of months 21-24
country_brand_post_count = volume[volume["month_num"] >= 0].groupby("country_brand").size()
idx_post_volume_full = country_brand_post_count[country_brand_post_count == 24].index
volume = volume[volume["country_brand"].isin(idx_post_volume_full)]

---

# Train/Val Split

### Train

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
country_brands = volume["country_brand"].drop_duplicates().values

In [11]:
# Train/Val split
country_brands_train, country_brands_val = train_test_split(country_brands,
                                                            test_size=0.20,
                                                            random_state=27)

In [12]:
volume_train = volume[volume["country_brand"].isin(country_brands_train)].copy()

#### Scaler

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [14]:
scaler = MinMaxScaler()
scaler.fit(volume_train[["volume"]])

MinMaxScaler()

In [15]:
volume_train[["volume"]] = scaler.transform(volume_train[["volume"]])

#### Dataset/DataLoader

In [16]:
ds_train = NovartisDataset(volume_train)
dl_train = DataLoader(ds_train, batch_size=1, num_workers=1)

### Validation

In [17]:
volume_val = volume[volume["country_brand"].isin(country_brands_val)].copy()

In [18]:
volume_val[["volume"]] = scaler.transform(volume_val[["volume"]])

In [19]:
ds_val = NovartisDataset(volume_val)
dl_val = DataLoader(ds_val, batch_size=1, num_workers=1)

# Lightning

In [20]:
trainer = pl.Trainer(max_epochs=20, gpus=1)
model = RNNModel(input_dim=input_dim, hidden_dim=hidden_dim, num_layers=num_layers, lr=5e-4)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [21]:
trainer.fit(model, train_dataloader=dl_train, val_dataloaders=dl_val)


  | Name    | Type    | Params
------------------------------------
0 | model   | Seq2Seq | 491   
1 | loss_fc | MSELoss | 0     


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

  return F.mse_loss(input, target, reduction=self.reduction)


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…




1

# Predict

In [22]:
volume = pd.read_csv("data/raw/gx_volume.csv", index_col=0)
submissions = pd.read_csv("data/raw/submission_template.csv")


In [23]:
volume["country_brand"] = volume["country"] + "-" + volume["brand"]
submissions["country_brand"] = submissions["country"] + "-" + submissions["brand"]

In [24]:
# Filter out country/brand in submissions
volume_test = volume[volume["country_brand"].isin(submissions["country_brand"])]

# Sort values
volume_test = volume_test.sort_values(["country", "brand", "month_num"])

### Test

In [25]:
volume_test[["volume"]] = scaler.transform(volume_test[["volume"]])

In [26]:
ds_test = NovartisDataset(volume_test)
dl_test = DataLoader(ds_test, batch_size=1, num_workers=1)

In [52]:
predictions = []
model.eval()
for n, (x, y) in enumerate(tqdm(dl_test)):
    y_hat = model(x, y)

    y_hat_numpy = y_hat.squeeze(dim=1).detach().numpy()

    # Inverse scaling
    y_hat_numpy = scaler.inverse_transform(y_hat_numpy)
    
    for month, vol_pred in enumerate(y_hat_numpy.flatten()):
        
        country, brand = ds_test.group_keys[n]
        
        prediction = {"country": country,
                      "brand": brand,
                      "month_num": month,
                      "pred_95_low": vol_pred,
                      "prediction": vol_pred,
                      "pred_95_high": vol_pred}
        predictions.append(prediction)

100%|██████████| 191/191 [00:04<00:00, 38.94it/s]


In [53]:
df_preds = pd.DataFrame(predictions)
df_preds

Unnamed: 0,country,brand,month_num,pred_95_low,prediction,pred_95_high
0,country_1,brand_121,0,129589032.0,129589032.0,129589032.0
1,country_1,brand_121,1,54638240.0,54638240.0,54638240.0
2,country_1,brand_121,2,83750312.0,83750312.0,83750312.0
3,country_1,brand_121,3,102805336.0,102805336.0,102805336.0
4,country_1,brand_121,4,111425664.0,111425664.0,111425664.0
...,...,...,...,...,...,...
4579,country_9,brand_187,19,110748264.0,110748264.0,110748264.0
4580,country_9,brand_187,20,110734312.0,110734312.0,110734312.0
4581,country_9,brand_187,21,110724856.0,110724856.0,110724856.0
4582,country_9,brand_187,22,110719912.0,110719912.0,110719912.0


In [54]:
merge_cols = ["country", "brand", "month_num"]
final_submissions = submissions[merge_cols].merge(df_preds, on=merge_cols, how="left")
final_submissions

Unnamed: 0,country,brand,month_num,pred_95_low,prediction,pred_95_high
0,country_1,brand_121,0,129589032.0,129589032.0,129589032.0
1,country_1,brand_121,1,54638240.0,54638240.0,54638240.0
2,country_1,brand_121,2,83750312.0,83750312.0,83750312.0
3,country_1,brand_121,3,102805336.0,102805336.0,102805336.0
4,country_1,brand_121,4,111425664.0,111425664.0,111425664.0
...,...,...,...,...,...,...
4579,country_9,brand_187,19,110748264.0,110748264.0,110748264.0
4580,country_9,brand_187,20,110734312.0,110734312.0,110734312.0
4581,country_9,brand_187,21,110724856.0,110724856.0,110724856.0
4582,country_9,brand_187,22,110719912.0,110719912.0,110719912.0


In [56]:
final_submissions.to_csv("data/submissions/sumbission_02.csv", index=False)

# TODO
- Predict
- Inverse Normalization
- Formatting