In [None]:
cd ..

In [None]:
# Reload imported modules automatically before each cell executes, so edits
# to the local `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from src.model.trainer import RNNModel
from src.model.dataset import NovartisDataset

### Params

In [None]:
# Network dimensions. They are not referenced again in the visible part of
# this notebook (the model is restored from a checkpoint further down).
input_dim, hidden_dim, num_layers = 3, 30, 1

In [None]:
# Global seed: used for Lightning seeding and for the train/val split.
SEED = 28
LR = 1e-3
# Worker processes used by the train/val DataLoaders.
NUM_WORKERS = 8

# Fraction of country/brand series held out for validation.
TEST_SIZE = 0.20

In [None]:
# Seed the random number generators through Lightning so runs are repeatable.
pl.seed_everything(SEED)

# Data

In [None]:
# Load the engineered feature table (path is relative to the working directory).
df = pd.read_csv("data/features/final_features.csv")

In [None]:
# Order rows by series (country, brand) and then chronologically by month.
df = df.sort_values(by=["country", "brand", "month_num"])

In [None]:
# The table contains duplicated (country, brand, month_num) keys of unknown
# origin; keep a single row per key.
df = df.drop_duplicates(subset=["country", "brand", "month_num"])

In [None]:
# Quick look at the first two rows.
df.head(2)

### Preprocessing

#### Select only cases with 24 months after generic (To remove later)

In [None]:
# Note: in the future the loss will be computed only on the data available
# for each country/month, i.e. if a country only has volume until month 20,
# the loss for months 21-24 will be padded/ignored.
#
# Count, per (country, brand), how many rows have month_num >= 0.
country_brand_post_count = (
    df.loc[df["month_num"] >= 0]
    .groupby(["country", "brand"])
    .size()
    .rename("post_months_count")
    .reset_index()
)

In [None]:
# Attach the post-generic month count to every row. how="right" keeps only the
# country/brand pairs present in the count table, i.e. those with at least one
# row at month_num >= 0.
df = df.merge(country_brand_post_count, on=["country", "brand"], how="right")

In [None]:
# Keep only the series that have exactly 24 rows at month_num >= 0.
df = df.loc[df["post_months_count"].eq(24)]

In [None]:
# The helper count column has served its purpose; drop it again.
df = df.drop(columns=["post_months_count"])

#### Add country-brand column

In [None]:
# Single string key "<country>-<brand>" identifying each series.
df["country_brand"] = df["country"] + "-" + df["brand"]

---

# Train/Val Split

### Train

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Distinct series identifiers, in order of first appearance.
country_brands = df["country_brand"].unique()

In [None]:
# Split at the series level (not the row level) so that every country/brand
# ends up entirely in either the training or the validation set.
country_brands_train, country_brands_val = train_test_split(
    country_brands,
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [None]:
# Rows belonging to the training series (copied so later edits do not touch df).
volume_train = df.loc[df["country_brand"].isin(country_brands_train)].copy()

#### Dataset/DataLoader

In [None]:
# Training dataset and loader: one series per batch, reshuffled every epoch.
ds_train = NovartisDataset(volume_train)
dl_train = DataLoader(
    dataset=ds_train,
    batch_size=1,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

### Validation

In [None]:
# Rows belonging to the validation series (copied so later edits do not touch df).
volume_val = df.loc[df["country_brand"].isin(country_brands_val)].copy()

In [None]:
# Validation dataset and loader: one series per batch, fixed order.
ds_val = NovartisDataset(volume_val)
dl_val = DataLoader(
    dataset=ds_val,
    batch_size=1,
    num_workers=NUM_WORKERS,
)

# Lightning

# Predict

In [None]:
# Checkpoint used for prediction. Earlier experiments (versions 5, 8, 12, 14
# and 15) were listed here as well but were immediately overwritten, so only
# the checkpoint that is actually loaded is kept.
model_path = "lightning_logs/version_19/checkpoints/epoch=12.ckpt"

In [None]:
# Restore the trained RNNModel from the selected Lightning checkpoint.
model = RNNModel.load_from_checkpoint(model_path)

In [None]:
# Reload the full feature table: the `df` used for training was restricted to
# series with 24 post-generic months, while prediction needs every series.
# Duplicated (country, brand, month_num) keys of unknown origin are removed.
df = (
    pd.read_csv("data/features/final_features.csv")
    .assign(country_brand=lambda frame: frame["country"] + "-" + frame["brand"])
    .drop_duplicates(subset=["country", "brand", "month_num"])
)

In [None]:
# Submission template, with the same "<country>-<brand>" key as the features.
submissions = pd.read_csv("data/raw/submission_template.csv").assign(
    country_brand=lambda frame: frame["country"] + "-" + frame["brand"]
)

In [None]:
# Keep only the country/brand series requested in the submission template,
# ordered by series and then chronologically by month.
df_test = (
    df.loc[df["country_brand"].isin(submissions["country_brand"])]
    .sort_values(by=["country", "brand", "month_num"])
)

In [None]:
# Quick look at the first rows of the test subset.
df_test.head()

### Test

In [None]:
ds_test = NovartisDataset(df_test)
dl_test = DataLoader(ds_test, batch_size=1, num_workers=0)

In [None]:
max_volume_series = df.groupby("country_brand")["max_volume"].unique().apply(lambda x: x.item())

In [None]:
ds_test = NovartisDataset(df_test)

In [None]:
# Run the restored model over every test series and collect one record per
# (country, brand, month) holding the rescaled point forecast and its bounds.
predictions = []

# dl_test uses batch_size=1 without shuffling, so batch n corresponds to the
# n-th group key of the dataset the loader iterates over. Reading the keys
# from the loader's own dataset guarantees the two stay aligned.
test_group_keys = dl_test.dataset.group_keys

model.eval()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    for n, batch in enumerate(tqdm(dl_test)):
        # Swap the first two axes of the temporal inputs and the target
        # (presumably batch-first -> sequence-first; TODO confirm against RNNModel).
        encoder_temp_features = batch["encoder_temp_features"].permute(1, 0, 2)
        y = batch["y_norm"].permute(1, 0, 2)

        # Numerical features are passed as-is; categorical ones are transposed.
        encoder_num_features = batch["encoder_num_features"]
        encoder_cat_features = batch["encoder_cat_features"].permute(1, 0)

        # NOTE(review): the normalised target is fed to the model at prediction
        # time too - confirm the model does not use it for its outputs.
        y_hat = model(encoder_temp_features,
                      encoder_num_features,
                      encoder_cat_features,
                      y)

        volume_preds = y_hat["prediction"].detach().numpy().flatten()
        upper_bounds = y_hat["upper_bound"].detach().numpy().flatten()
        lower_bounds = y_hat["lower_bound"].detach().numpy().flatten()

        # Series identity and scaling are constant across the 24 months, so
        # they are looked up once per batch.
        country, brand = test_group_keys[n]
        volume_scaling = max_volume_series.loc[country + "-" + brand].item()

        for month in range(24):
            # Undo the normalisation.
            vol_pred = volume_preds[month] * volume_scaling
            upper_pred = upper_bounds[month] * volume_scaling
            lower_pred = lower_bounds[month] * volume_scaling

            # Enforce 0 <= lower <= prediction <= upper.
            vol_pred = max(vol_pred, 0)
            upper_pred = max(upper_pred, vol_pred)
            lower_pred = min(max(lower_pred, 0), vol_pred)

            predictions.append({"country": country,
                                "brand": brand,
                                "month_num": month,
                                "pred_95_low": lower_pred,
                                "prediction": vol_pred,
                                "pred_95_high": upper_pred})

In [None]:
# One row per (country, brand, month_num) prediction record.
df_preds = pd.DataFrame(predictions)
df_preds.head()

# Submission

In [None]:
# Attach the predictions to the template rows; the left join keeps every
# template row even when no prediction was produced for it.
merge_cols = ["country", "brand", "month_num"]
final_submissions = pd.merge(
    submissions[merge_cols],
    df_preds,
    how="left",
    on=merge_cols,
)
final_submissions.head()

In [None]:
# Key columns identifying a submission row (same columns as merge_cols).
id_cols = ["country", "brand", "month_num"]

In [None]:
# Index both frames by the submission key so rows can be matched by
# (country, brand, month_num). The guards make this cell safe to re-run:
# a second set_index call would raise KeyError because the key columns have
# already been moved into the index.
if list(final_submissions.index.names) != id_cols:
    final_submissions = final_submissions.set_index(id_cols)
if list(df.index.names) != id_cols:
    df = df.set_index(id_cols)

In [None]:
# Where the true volume of a submission row is already present in the feature
# table, replace the model output with it: every column of that row (point
# prediction and both bounds) is set to the known volume.
# Both frames must already be indexed by (country, brand, month_num).
i = 0
for key in tqdm(final_submissions.index):
    if key not in df.index:
        continue

    # df was de-duplicated on these keys, so this lookup returns a scalar.
    known_volume = df.loc[key, "volume"]

    # A row can exist in the feature table without a recorded volume; keep the
    # model prediction in that case instead of overwriting it with NaN.
    if pd.isna(known_volume):
        continue

    final_submissions.loc[key, :] = known_volume
    i += 1

In [None]:
# Number of submission rows whose prediction was replaced by a known volume.
print("Total overwrites:", i)

In [None]:
# Write the submission file; the (country, brand, month_num) index is written
# as the leading columns.
# NOTE(review): the file name is spelled "sumbission" - left untouched in case
# other tooling expects this exact path.
final_submissions.to_csv("data/submissions/sumbission_09.csv")