In [1]:
cd ..

/home/xavier/projects/godatathon_2020


In [2]:
# Enable autoreload (mode 2) so that edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from src.model.trainer import RNNModel
from src.model.dataset import NovartisDataset

### Params

In [4]:
# Model size parameters.
# NOTE(review): these are not referenced anywhere else in the visible notebook; presumably
# they were passed to RNNModel in the (now missing) training cells — confirm.
num_layers = 1
hidden_dim = 5
input_dim = 1

In [5]:
# Run-level constants: SEED feeds seed_everything and the train/val split;
# LR is not referenced elsewhere in the visible notebook.
LR = 1e-4
SEED = 27

In [6]:
# Seed the random number generators through Lightning's helper so runs are repeatable.
pl.seed_everything(seed=SEED)

27

### Data Engineering

In [7]:
# Load the submission template and the raw volume table (the first CSV column of the
# volume file is used as the index).
submissions = pd.read_csv("data/raw/submission_template.csv")
volume = pd.read_csv("data/raw/gx_volume.csv", index_col=0)


In [8]:
# Give both frames the same "<country>-<brand>" composite key so they can be matched on one column.
for frame in (volume, submissions):
    frame["country_brand"] = frame["country"] + "-" + frame["brand"]

### Average Last 12 Months
Necessary for computing the loss

In [9]:
# Mean volume per country/brand over month_num in [-12, 0), i.e. the 12 months before month 0.
in_last_12_months = (volume["month_num"] >= -12) & (volume["month_num"] < 0)

avg_12_volume = (
    volume.loc[in_last_12_months]
    .groupby("country_brand")["volume"]
    .mean()
    .rename("avg_12_volume")
    .reset_index()
)

In [10]:
# Add avg_12_volume to dataset.
# Any avg_12_volume column left over from a previous run of this cell is dropped first, so
# re-running the cell does not produce suffixed avg_12_volume_x / avg_12_volume_y columns.
volume = (
    volume
    .drop(columns=["avg_12_volume"], errors="ignore")
    .merge(avg_12_volume, on="country_brand", how="left")
)

### Max & Norm Volume

In [11]:
# Group the volume rows per country/brand; reused below to compute the per-group maximum.
g = volume.groupby(by=["country_brand"])

In [12]:
# Largest observed volume per country/brand, as a two-column frame (country_brand, max_volume).
max_volume = (
    g["volume"]
    .max()
    .rename("max_volume")
    .reset_index()
)

In [13]:
# Add max volume by country/brand.
# A max_volume column from an earlier run of this cell is dropped first so the merge stays
# idempotent instead of creating max_volume_x / max_volume_y (which would break the
# normalisation step that reads volume["max_volume"]).
volume = (
    volume
    .drop(columns=["max_volume"], errors="ignore")
    .merge(max_volume, on="country_brand")
)

In [14]:
# Scale each series by its own maximum volume.
volume["volume_norm"] = volume["volume"].div(volume["max_volume"])

### Filter out submissions
**Note:** We could actually use these series for training as well; we would only need to apply a mask to the computed loss, and perhaps use teacher forcing.

In [15]:
# Drop every country/brand that appears in the submission template, then order the
# remaining rows by country, brand and month.
is_submission = volume["country_brand"].isin(submissions["country_brand"])
volume_non_submission = (
    volume.loc[~is_submission]
    .sort_values(["country", "brand", "month_num"])
)

In [16]:
# Note: In the future, we will compute the loss only on the data available for each country/month,
# i.e. if a country only has volume until month 20, we will pad/ignore the loss of months 21-24.
# For now, keep only the country/brands that have exactly 24 rows with month_num >= 0.
country_brand_post_count = (
    volume_non_submission.loc[volume_non_submission["month_num"] >= 0]
    .groupby("country_brand")
    .size()
)
idx_post_volume_full = country_brand_post_count.loc[country_brand_post_count == 24].index
has_full_post_history = volume_non_submission["country_brand"].isin(idx_post_volume_full)
volume_non_submission = volume_non_submission.loc[has_full_post_history]

---

# Train/Val Split

### Train

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Distinct country/brand keys, in order of first appearance; these are the units that get split.
unique_country_brands = volume_non_submission["country_brand"].drop_duplicates()
country_brands = unique_country_brands.values

In [19]:
# Split at the country/brand level (80% train / 20% validation) so that all rows of a
# series land in the same split; seeded for repeatability.
country_brands_train, country_brands_val = train_test_split(
    country_brands,
    test_size=0.20,
    random_state=SEED,
)

In [20]:
# Rows belonging to the training country/brands (copied so later edits do not touch the source frame).
is_train = volume_non_submission["country_brand"].isin(country_brands_train)
volume_train = volume_non_submission.loc[is_train].copy()

#### Dataset/DataLoader

In [21]:
# Training dataset and loader: one item per batch, shuffled on every pass, loaded in the main process.
ds_train = NovartisDataset(volume_train)
dl_train = DataLoader(
    dataset=ds_train,
    batch_size=1,
    shuffle=True,
    num_workers=0,
)

### Validation

In [22]:
# Build the validation frame from volume_non_submission — the same filtered and sorted
# source that volume_train and the split keys come from — rather than from the raw
# `volume` frame, so both splits share identical preprocessing and row order.
volume_val = volume_non_submission[volume_non_submission["country_brand"].isin(country_brands_val)].copy()

In [23]:
# Validation dataset and loader. Shuffling is turned off: validation does not benefit from
# a random order, and a fixed order keeps per-batch validation results comparable between runs.
ds_val = NovartisDataset(volume_val)
dl_val = DataLoader(ds_val, batch_size=1, num_workers=0, shuffle=False)

# Lightning

# Predict

In [40]:
# Restore the trained model from a saved Lightning checkpoint.
# NOTE(review): the cells that produced this checkpoint are not in the notebook, so the
# file must already exist from an earlier training run.
CHECKPOINT_PATH = "lightning_logs/version_5/checkpoints/epoch=49.ckpt"
model = RNNModel.load_from_checkpoint(CHECKPOINT_PATH)

In [41]:
# Keep only the country/brand pairs that appear in the submission template,
# ordered by country, brand and month.
in_submission = volume["country_brand"].isin(submissions["country_brand"])
volume_test = volume.loc[in_submission].sort_values(["country", "brand", "month_num"])

In [42]:
# Preview the first rows of the test frame.
volume_test.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,volume,month_name,country_brand,avg_12_volume,max_volume,volume_norm
country,brand,month_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
country_1,brand_121,-101,109954.88,Jan,country_1-brand_121,35999789.0,38294953.2,0.002871
country_1,brand_121,-100,860929.44,Feb,country_1-brand_121,35999789.0,38294953.2,0.022482
country_1,brand_121,-99,1455175.12,Mar,country_1-brand_121,35999789.0,38294953.2,0.037999
country_1,brand_121,-98,1883624.96,Apr,country_1-brand_121,35999789.0,38294953.2,0.049187
country_1,brand_121,-97,2451245.44,May,country_1-brand_121,35999789.0,38294953.2,0.06401


### Test

In [30]:
# Test dataset and loader: one item per batch, no shuffling requested, one worker process.
ds_test = NovartisDataset(volume_test)
dl_test = DataLoader(
    dataset=ds_test,
    batch_size=1,
    num_workers=1,
)

In [31]:
# Lookup table from country_brand to max_volume (despite the name, a one-column DataFrame).
max_volume_series = max_volume.set_index(keys="country_brand")

In [33]:
predictions = []

model.eval()
# Inference only: disable autograd so no computation graph is built for the forward passes.
with torch.no_grad():
    for n, batch in enumerate(tqdm(dl_test)):
        # Relies on dl_test yielding one group per batch in dataset order, so that batch n
        # belongs to ds_test.group_keys[n]. Looked up once per batch, not once per month.
        country, brand = ds_test.group_keys[n]

        # Per-series max volume, used to scale the model output back to volume units.
        volume_scaling = max_volume_series.loc[country + "-" + brand].item()

        # Unpack batch
        x = batch["x_norm"]
        y = batch["y_norm"]

        # NOTE(review): the model also receives y_norm at prediction time — confirm how
        # RNNModel uses it for series whose future volume is unknown.
        y_hat = model(x, y)

        y_hat_numpy = y_hat.squeeze(dim=1).detach().numpy()

        for month, vol_pred in enumerate(y_hat_numpy.flatten()):
            scaled_prediction = vol_pred * volume_scaling

            # No interval is estimated: all three columns carry the same point prediction.
            predictions.append({"country": country,
                                "brand": brand,
                                "month_num": month,
                                "pred_95_low": scaled_prediction,
                                "prediction": scaled_prediction,
                                "pred_95_high": scaled_prediction})

100%|██████████| 191/191 [00:06<00:00, 31.06it/s]


In [35]:
# Turn the list of per-month prediction records into a frame and preview it.
df_preds = pd.DataFrame(data=predictions)
df_preds.head(n=5)

Unnamed: 0,country,brand,month_num,pred_95_low,prediction,pred_95_high
0,country_1,brand_121,0,26335880.0,26335880.0,26335880.0
1,country_1,brand_121,1,21186120.0,21186120.0,21186120.0
2,country_1,brand_121,2,18404560.0,18404560.0,18404560.0
3,country_1,brand_121,3,16853540.0,16853540.0,16853540.0
4,country_1,brand_121,4,15938670.0,15938670.0,15938670.0


# Submission

In [36]:
# Attach the predictions to the submission template rows; the left join keeps every
# template row even if no prediction exists for it.
merge_cols = ["country", "brand", "month_num"]
submission_keys = submissions[merge_cols]
final_submissions = submission_keys.merge(df_preds, how="left", on=merge_cols)
final_submissions.head()

Unnamed: 0,country,brand,month_num,pred_95_low,prediction,pred_95_high
0,country_1,brand_121,0,26335880.0,26335880.0,26335880.0
1,country_1,brand_121,1,21186120.0,21186120.0,21186120.0
2,country_1,brand_121,2,18404560.0,18404560.0,18404560.0
3,country_1,brand_121,3,16853540.0,16853540.0,16853540.0
4,country_1,brand_121,4,15938670.0,15938670.0,15938670.0


In [37]:
# Overwrite predictions with the already-known volumes wherever a submission row
# (country, brand, month_num) also exists in the volume table.
# The volume lookup is built under its own name so the global `volume` frame is NOT
# re-indexed; rebinding it would break re-running earlier cells that read
# volume["country"] / volume["brand"] as columns.
index_cols = ["country", "brand", "month_num"]
final_submissions = final_submissions.set_index(index_cols)
known_volume = volume.set_index(index_cols)["volume"]

# Vectorised replacement for the row-by-row loop: flag the submission rows that have a
# known volume, then write that volume into every prediction column at once.
is_known = final_submissions.index.isin(known_volume.index)
known_values = known_volume.reindex(final_submissions.index[is_known]).to_numpy()
for col in final_submissions.columns:
    final_submissions.loc[is_known, col] = known_values

In [38]:
# Turn the (country, brand, month_num) index back into ordinary columns for export.
final_submissions = final_submissions.reset_index(drop=False)

In [39]:
# Write the submission file. The output directory is created first so the export does not
# fail on a fresh checkout where data/submissions/ does not exist yet.
# NOTE(review): the file name is spelled "sumbission"; kept unchanged in case other
# tooling expects this exact path.
submission_path = Path("data/submissions/sumbission_04.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
final_submissions.to_csv(submission_path, index=False)