In [1]:
cd ..

/home/xavier/projects/godatathon_2020


In [2]:
# Reload imported modules automatically before each cell runs, so edits to the
# project code under src/ are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from src.model.trainer import RNNModel
from src.model.dataset import NovartisDataset

### Params

In [4]:
# Network size settings, kept together as a single assignment.
# None of these names is referenced again in the visible cells (the model is
# restored from a checkpoint further down).
input_dim, hidden_dim, num_layers = 3, 20, 1

In [5]:
# Run-level configuration.
SEED = 28          # passed to pl.seed_everything and to the train/val split
LR = 1e-3          # not referenced in the visible cells
NUM_WORKERS = 8    # worker processes for the train/val DataLoaders

TEST_SIZE = 0.20   # share of country-brand series held out for validation

In [6]:
# Seed the random number generators through PyTorch Lightning so runs are
# repeatable; the call returns the seed it applied.
pl.seed_everything(SEED)

28

# Data

In [7]:
# Load the engineered feature table from the project's data directory.
df = pd.read_csv("data/features/final_features.csv")

In [8]:
# Group rows by series (country, brand) and order each series by month_num.
df = df.sort_values(["country", "brand", "month_num"])

In [9]:
# Preview the sorted feature table (pandas truncates the display).
df

Unnamed: 0,country,brand,volume,month_num,country_id,brand_id,num_generics,package_id,channel_rate_A,channel_rate_B,channel_rate_C,therapeutic_id,avg_12_volume,max_volume,month_sin,month_cos,volume_norm
224,country_1,brand_10,6088874.84,-47,0,1,0.12,6,0.0,0.010157,0.000000,6,7.325746e+06,8296237.56,-0.500000,-8.660254e-01,0.733932
225,country_1,brand_10,6658654.12,-46,0,1,0.12,6,0.0,0.010157,0.000000,6,7.325746e+06,8296237.56,-0.866025,-5.000000e-01,0.802611
226,country_1,brand_10,6055711.56,-45,0,1,0.12,6,0.0,0.010157,0.000000,6,7.325746e+06,8296237.56,-1.000000,-1.836970e-16,0.729935
227,country_1,brand_10,7036485.24,-44,0,1,0.12,6,0.0,0.010157,0.000000,6,7.325746e+06,8296237.56,-0.866025,5.000000e-01,0.848154
228,country_1,brand_10,6786609.28,-43,0,1,0.12,6,0.0,0.010157,0.000000,6,7.325746e+06,8296237.56,-0.500000,8.660254e-01,0.818035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43242,country_9,brand_477,10752.88,19,15,419,0.02,3,0.0,0.068118,0.931882,6,2.925174e+04,39475.20,0.500000,8.660254e-01,0.272396
43243,country_9,brand_477,9868.80,20,15,419,0.02,3,0.0,0.068118,0.931882,6,2.925174e+04,39475.20,0.866025,5.000000e-01,0.250000
43244,country_9,brand_477,6908.16,21,15,419,0.02,3,0.0,0.068118,0.931882,6,2.925174e+04,39475.20,1.000000,6.123234e-17,0.175000
43245,country_9,brand_477,3947.52,22,15,419,0.02,3,0.0,0.068118,0.931882,6,2.925174e+04,39475.20,0.866025,-5.000000e-01,0.100000


### Preprocessing

#### Select only cases with 24 months after generic (To remove later)

In [9]:
# Note: in the future the loss will be computed only on the months that are
# actually available for each country/brand, i.e. if a series only has volume
# up to month 20, the loss for months 21-24 will be padded/ignored.
#
# For now, count how many rows each country/brand has at month_num >= 0.
country_brand_post_count = (
    df.loc[df["month_num"] >= 0]
    .groupby(["country", "brand"])
    .size()
    .reset_index(name="post_months_count")
)

In [10]:
# Attach the post-generic month count to every row. The right join keeps only
# the country/brand pairs present in the count table, i.e. those with at least
# one row at month_num >= 0.
df = df.merge(country_brand_post_count, on=["country", "brand"], how="right")

In [11]:
# Keep only the series that have exactly 24 rows at month_num >= 0.
df = df.loc[df["post_months_count"].eq(24)]

In [12]:
# The helper count column was only needed for the 24-month filter; drop it.
df = df.drop(columns="post_months_count")

#### Add country-brand column

In [13]:
# Single identifier per series, formatted as "<country>-<brand>".
df["country_brand"] = df["country"].str.cat(df["brand"], sep="-")

---

# Train/Val Split

### Train

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Distinct series identifiers, in order of first appearance.
country_brands = df["country_brand"].unique()

In [16]:
# Train/Val split at the series level: the identifiers (not individual rows)
# are divided, so every country-brand lands entirely in one of the two sets.
# random_state=SEED keeps the split repeatable.
country_brands_train, country_brands_val = train_test_split(country_brands,
                                                            test_size=TEST_SIZE,
                                                            random_state=SEED)

In [17]:
# Rows of the series assigned to the training set, copied so later changes do
# not touch `df`.
volume_train = df.loc[
    df["country_brand"].isin(country_brands_train)
].copy()

#### Dataset/DataLoader

In [18]:
# Training dataset and loader: one dataset item per batch, reshuffled on every
# pass.
ds_train = NovartisDataset(volume_train)
dl_train = DataLoader(
    dataset=ds_train,
    batch_size=1,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

### Validation

In [19]:
# Rows of the series assigned to the validation set, copied so later changes
# do not touch `df`.
volume_val = df.loc[
    df["country_brand"].isin(country_brands_val)
].copy()

In [20]:
# Validation dataset and loader: one dataset item per batch, in dataset order.
ds_val = NovartisDataset(volume_val)
dl_val = DataLoader(
    dataset=ds_val,
    batch_size=1,
    num_workers=NUM_WORKERS,
)

# Lightning

# Predict

In [21]:
# Checkpoint restored for inference. Earlier candidates were assigned and
# immediately overwritten in this cell, so only the effective path is kept.
model_path = "lightning_logs/version_14/checkpoints/epoch=17.ckpt"

In [22]:
# Restore the trained RNNModel from the selected checkpoint file.
model = RNNModel.load_from_checkpoint(model_path)

In [23]:
# Reload the complete feature table and tag each row with its
# "<country>-<brand>" identifier. This replaces the filtered frame that `df`
# held during the train/val preparation.
df = pd.read_csv("data/features/final_features.csv").assign(
    country_brand=lambda frame: frame["country"] + "-" + frame["brand"]
)

In [24]:
# Submission template, tagged with the same "<country>-<brand>" identifier
# used on the feature table.
submissions = pd.read_csv("data/raw/submission_template.csv")
submissions = submissions.assign(
    country_brand=submissions["country"] + "-" + submissions["brand"]
)

In [25]:
# Keep only the series that appear in the submission template, ordered by
# country, brand and month_num.
df_test = (
    df.loc[df["country_brand"].isin(submissions["country_brand"])]
    .sort_values(["country", "brand", "month_num"])
)

In [26]:
# Quick look at the first rows of the test frame.
df_test.head()

Unnamed: 0,country,brand,volume,month_num,country_id,brand_id,num_generics,package_id,channel_rate_A,channel_rate_B,channel_rate_C,therapeutic_id,avg_12_volume,max_volume,month_sin,month_cos,volume_norm,country_brand
76478,country_1,brand_121,109954.88,-101,0,25,0.08,6,0.0,0.017237,0.0,7,35999789.0,38294953.2,0.5,0.8660254,0.002871,country_1-brand_121
76479,country_1,brand_121,860929.44,-100,0,25,0.08,6,0.0,0.017237,0.0,7,35999789.0,38294953.2,0.866025,0.5,0.022482,country_1-brand_121
76480,country_1,brand_121,1455175.12,-99,0,25,0.08,6,0.0,0.017237,0.0,7,35999789.0,38294953.2,1.0,6.123234000000001e-17,0.037999,country_1-brand_121
76481,country_1,brand_121,1883624.96,-98,0,25,0.08,6,0.0,0.017237,0.0,7,35999789.0,38294953.2,0.866025,-0.5,0.049187,country_1-brand_121
76482,country_1,brand_121,2451245.44,-97,0,25,0.08,6,0.0,0.017237,0.0,7,35999789.0,38294953.2,0.5,-0.8660254,0.06401,country_1-brand_121


### Test

In [27]:
# Test dataset and loader. batch_size=1 with the default (unshuffled) order
# means the n-th batch is the n-th dataset item; the prediction loop depends
# on this when it looks up ds_test.group_keys[n].
ds_test = NovartisDataset(df_test)
dl_test = DataLoader(ds_test, batch_size=1, num_workers=0)

In [28]:
# Scaling factor per series: the single max_volume value of each country_brand.
# .item() raises when a group holds more than one distinct value, so this also
# verifies that max_volume is constant within a series.
max_volume_series = df.groupby("country_brand")["max_volume"].unique().apply(lambda x: x.item())

In [29]:
# NOTE(review): this rebuilds the dataset that was already created for dl_test.
# The loader keeps the earlier instance, while the prediction loop reads
# group_keys from this one. Both are built from the same df_test, so the
# ordering is presumably identical; confirm, or drop this cell.
ds_test = NovartisDataset(df_test)

In [44]:
# Run the restored model over every test series and collect one record per
# (country, brand, month) for the submission file.
predictions = []

model.eval()
# Inference only: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    for n, batch in enumerate(tqdm(dl_test)):
        # Reorder the batch tensors exactly as the model call expects them here:
        # the temporal features and targets swap their first two axes, and the
        # categorical features are transposed.
        encoder_temp_features = batch["encoder_temp_features"].permute(1, 0, 2)
        encoder_cat_features = batch["encoder_cat_features"].permute(1, 0)
        y = batch["y_norm"].permute(1, 0, 2)
        # The numeric features are passed through without permuting.
        encoder_num_features = batch["encoder_num_features"]

        # Predict
        y_hat = model(encoder_temp_features,
                      encoder_num_features,
                      encoder_cat_features,
                      y)

        volume_preds = y_hat["prediction"].detach().numpy().flatten()
        upper_bounds = y_hat["upper_bound"].detach().numpy().flatten()
        lower_bounds = y_hat["lower_bound"].detach().numpy().flatten()

        # The series identity and its scaling factor are the same for all 24
        # months, so look them up once per batch. Batch n maps to
        # group_keys[n] because dl_test yields single items in dataset order.
        country, brand = ds_test.group_keys[n]
        volume_scaling = max_volume_series.loc[country + "-" + brand].item()

        for month in range(24):
            # Undo the normalisation by multiplying with the series' max_volume.
            vol_pred = volume_preds[month] * volume_scaling
            upper_pred = upper_bounds[month] * volume_scaling
            lower_pred = lower_bounds[month] * volume_scaling

            # Volumes cannot be negative: clip the point prediction at zero.
            # NOTE(review): the interval bounds are not clipped, so
            # pred_95_low can still be negative.
            vol_pred = max(vol_pred, 0)

            predictions.append({"country": country,
                                "brand": brand,
                                "month_num": month,
                                "pred_95_low": lower_pred,
                                "prediction": vol_pred,
                                "pred_95_high": upper_pred})

100%|██████████| 191/191 [00:06<00:00, 31.72it/s]


In [45]:
# One row per predicted (country, brand, month_num), built in a single step
# from the collected records.
df_preds = pd.DataFrame(predictions)
df_preds.head()

Unnamed: 0,country,brand,month_num,pred_95_low,prediction,pred_95_high
0,country_1,brand_121,0,23339950.0,29481690.0,36941840.0
1,country_1,brand_121,1,17610500.0,25580730.0,33425740.0
2,country_1,brand_121,2,14623310.0,22868460.0,31016260.0
3,country_1,brand_121,3,12848960.0,21046000.0,29436730.0
4,country_1,brand_121,4,11719240.0,19812120.0,28375870.0


# Submission

In [46]:
# Add predictions to submissions. The left join keeps every template row in
# template order; rows without a matching prediction get NaN in the prediction
# columns.
merge_cols = ["country", "brand", "month_num"]
final_submissions = submissions[merge_cols].merge(df_preds, on=merge_cols, how="left")
final_submissions.head()

Unnamed: 0,country,brand,month_num,pred_95_low,prediction,pred_95_high
0,country_1,brand_121,0,23339950.0,29481690.0,36941840.0
1,country_1,brand_121,1,17610500.0,25580730.0,33425740.0
2,country_1,brand_121,2,14623310.0,22868460.0,31016260.0
3,country_1,brand_121,3,12848960.0,21046000.0,29436730.0
4,country_1,brand_121,4,11719240.0,19812120.0,28375870.0


In [47]:
# Overwrite already known volumes in the submissions: first index the feature
# table by (country, brand, month_num) so volumes can be looked up by key.
# NOTE: not re-runnable as-is; a second execution fails because the key columns
# have already been moved into the index.
df = df.set_index(["country", "brand", "month_num"])

In [48]:
# Where the true volume of a submission row is already present in `df`, use it
# in place of the model output.
#
# Rows are matched on the key columns, not on row labels: final_submissions
# carries a plain positional index while `df` is indexed by
# (country, brand, month_num), so the two indexes can never be compared
# directly. Only the prediction columns are written; the key columns stay intact.
key_cols = ["country", "brand", "month_num"]
pred_cols = ["pred_95_low", "prediction", "pred_95_high"]

# Lookup table of observed volumes: one row per key, missing volumes skipped so
# a prediction is never replaced by NaN.
known_volumes = (
    df["volume"]
    .dropna()
    .reset_index()
    .drop_duplicates(subset=key_cols)
)

# The left join returns exactly one row per submission row, in submission order,
# because the lookup keys are unique.
matched = final_submissions[key_cols].merge(known_volumes, on=key_cols, how="left")
is_known = matched["volume"].notna().to_numpy()

# The point estimate and both interval bounds collapse onto the observed value.
for col in pred_cols:
    final_submissions.loc[is_known, col] = matched.loc[is_known, "volume"].to_numpy()