In [None]:
cd ../..

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader

from src.model.nets import Seq2Seq
from src.model.dataset import NovartisDataset

In [None]:
class RNNModel(pl.LightningModule):

    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.model = Seq2Seq(input_dim, hidden_dim, num_layers)
        self.loss_fc = torch.nn.MSELoss()

    def forward(self, x, y=None):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        
        x = x.float()
        y = y.float()
        x = x.permute(1, 0, 2)

        y_hat = self(x)
        loss = self.loss_fc(y_hat, y)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        
        x = x.float()
        y = y.float()
        x = x.permute(1, 0, 2)

        y_hat = self(x)
        loss = self.loss_fc(y_hat, y)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)

### Params

In [None]:
input_dim = 1
hidden_dim = 5
num_layers = 1

### Data Engineering

In [None]:
volume = pd.read_csv("data/raw/gx_volume.csv", index_col=0)
submissions = pd.read_csv("data/raw/submission_template.csv")


In [None]:
volume["country_brand"] = volume["country"] + "-" + volume["brand"]
submissions["country_brand"] = submissions["country"] + "-" + submissions["brand"]

In [None]:
# Filter out country/brand in submissions
volume = volume[~volume["country_brand"].isin(submissions["country_brand"])]

# Sort values
volume = volume.sort_values(["country", "brand", "month_num"])

In [None]:
# Note: In the future, we will compute the loss only on data that we have available for each country/mont
# i.e. If a country only has volume until mont 20, we will pad/ignore the loss of months 21-24
country_brand_post_count = volume[volume["month_num"] >= 0].groupby("country_brand").size()
idx_post_volume_full = country_brand_post_count[country_brand_post_count == 24].index
volume = volume[volume["country_brand"].isin(idx_post_volume_full)]

---

### Train/Val Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
country_brands = volume["country_brand"].drop_duplicates().values

In [None]:
# Train/Val split
country_brands_train, country_brands_val = train_test_split(country_brands,
                                                            test_size=0.20,
                                                            random_state=27)

In [None]:
volume_train = volume[volume["country_brand"].isin(country_brands_train)]

ds_train = NovartisDataset(volume_train)
dl_train = DataLoader(ds_train, batch_size=1, num_workers=1)

In [None]:
volume_val = volume[volume["country_brand"].isin(country_brands_val)]

ds_val = NovartisDataset(volume_val)
dl_val = DataLoader(ds_val, batch_size=1, num_workers=1)

### Training

In [None]:
trainer = pl.Trainer()
model = RNNModel(input_dim=input_dim, hidden_dim=hidden_dim, num_layers=num_layers)

In [17]:
trainer.fit(model, train_dataloader=dl_train, val_dataloaders=dl_val)


  | Name    | Type    | Params
------------------------------------
0 | model   | Seq2Seq | 491   
1 | loss_fc | MSELoss | 0     


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu






1

Traceback (most recent call last):
  File "/home/xavier/miniconda3/envs/godathon/lib/python3.8/multiprocessing/queues.py", line 245, in _feed
    send_bytes(obj)
  File "/home/xavier/miniconda3/envs/godathon/lib/python3.8/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/xavier/miniconda3/envs/godathon/lib/python3.8/multiprocessing/connection.py", line 411, in _send_bytes
    self._send(header + buf)
  File "/home/xavier/miniconda3/envs/godathon/lib/python3.8/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
