In [1]:
cd ../..

/home/xavier/projects/godatathon_2020


In [11]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from src.model.nets import Encoder, Decoder, Seq2Seq

In [3]:
# Raw inputs, read relative to the current working directory.
# The submission template is read with pandas' default index.
submissions = pd.read_csv("data/raw/submission_template.csv")
# For the volume table, the first CSV column is used as the index.
volume = pd.read_csv("data/raw/gx_volume.csv", index_col=0)


In [4]:
# Build a single "<country>-<brand>" key column on both tables so they can be
# matched against each other with one column.
volume["country_brand"] = volume["country"].str.cat(volume["brand"], sep="-")
submissions["country_brand"] = submissions["country"].str.cat(submissions["brand"], sep="-")

In [5]:
# Keep only the rows whose country/brand key does NOT appear in the submission
# template (presumably those pairs are the ones to be predicted - confirm).
volume = volume.loc[
    lambda frame: ~frame["country_brand"].isin(submissions["country_brand"])
]

In [6]:
# Order rows by country, then brand, then month so that each country/brand
# series is laid out in ascending month_num order.
volume = volume.sort_values(by=["country", "brand", "month_num"])

### Post-processing

In [7]:
# Note: In the future, we will compute the loss only on the data that is available
# for each country/month, i.e. if a country only has volume until month 20, we will
# pad/ignore the loss of months 21-24.
# For now, keep only the country/brand pairs that have exactly 24 rows with
# month_num >= 0.
country_brand_post_count = (
    volume.loc[volume["month_num"] >= 0]
    .groupby("country_brand")
    .size()
)
idx_post_volume_full = country_brand_post_count.loc[
    country_brand_post_count.eq(24)
].index
volume = volume.loc[volume["country_brand"].isin(idx_post_volume_full)]

### Grouping

In [8]:
# One group per (country, brand) pair.
g = volume.groupby(by=["country", "brand"])

In [9]:
# Exploratory peek: only the first (country, brand) group is processed, because
# the loop exits through the `break` at the end of its first iteration.
for idx, df in g:
    # Split the group's rows at month_num == 0.
    df_pre = df.loc[df["month_num"].lt(0)]
    df_post = df.loc[df["month_num"].ge(0)]

    # TODO: Take scaling into consideration

    # TODO: Add shift to y (take up to month -1)
    X, y = df_pre["volume"], df_post["volume"]
    break

# Dataset

In [38]:
class NovartisDataset(Dataset):
    """Map-style dataset with one (X, y) pair per (country, brand) group.

    For every group of ``volume_df``:

    * ``X`` holds the ``volume`` values of the rows with ``month_num < 0``
    * ``y`` holds the ``volume`` values of the rows with ``month_num >= 0``

    Both are 1-D numpy arrays ordered by ascending ``month_num``. The ordering
    is enforced here, so it no longer depends on the caller having sorted the
    frame beforehand.

    Args:
        volume_df: DataFrame with at least the columns ``country``, ``brand``,
            ``month_num`` and ``volume``.
    """

    def __init__(self, volume_df):
        self.data = volume_df
        self.Xs = list()
        self.ys = list()

        volume_grouped = self.data.groupby(["country", "brand"])
        for _, df in volume_grouped:
            # Guarantee chronological order inside the group. Mergesort is stable,
            # so a frame that is already sorted yields exactly the same arrays as
            # before this sort was added.
            df = df.sort_values("month_num", kind="mergesort")
            self.Xs.append(df.loc[df["month_num"] < 0, "volume"].values)
            self.ys.append(df.loc[df["month_num"] >= 0, "volume"].values)

    def __len__(self):
        """Return the number of (country, brand) groups."""
        return len(self.Xs)

    def __getitem__(self, index):
        """Return the ``(X, y)`` numpy-array pair of the group at ``index``."""
        return self.Xs[index], self.ys[index]

In [56]:
# Rebuild the (country, brand) grouping of the filtered volume table.
g = volume.groupby(by=["country", "brand"])

In [39]:
# Build the dataset from the filtered volume table.
ds = NovartisDataset(volume_df=volume)

In [46]:
# Sanity check: every target sequence must contain exactly 24 values.
for _, target in ds:
    assert len(target) == 24

In [53]:
# One sample per batch, loaded by two worker processes.
# NOTE(review): batch_size=1 presumably avoids collating X sequences of
# different lengths - confirm before increasing it.
dl = DataLoader(dataset=ds, batch_size=1, num_workers=2)

In [54]:
# Smoke test: iterate over the whole loader once. After the loop, `x` holds the
# last batch that was produced.
for x in dl:
    continue

In [55]:
# Display the last batch left over from the loop above: a list of two tensors
# (inputs, targets), each with a leading batch dimension of size 1.
x

[tensor([[11308.0000, 15420.0000, 10280.0000, 16448.0000, 13364.0000, 12336.0000,
          20560.0000, 16448.0000, 15420.0000, 14392.0000, 20560.0000, 17476.0000,
          15420.0000, 15420.0000, 20560.0000, 19737.6000, 19737.6000, 15420.0000,
          25905.6000, 18915.2000, 19737.6000, 18709.6000, 25905.6000, 25905.6000,
          18709.6000, 30223.2000, 23438.4000, 21773.0400, 25905.6000, 25288.8000,
          23849.6000, 32484.8000, 21588.0000, 34335.2000, 25905.6000, 24507.5200,
          31703.5200, 31456.8000, 32443.6800, 27632.6400, 28619.5200, 25761.6800,
          30593.2800, 28619.5200, 20724.4800, 27632.6400, 35527.6800, 29935.3600,
          32567.0400, 35774.4000, 17763.8400, 26645.7600, 39475.2000]],
        dtype=torch.float64),
 tensor([[27632.6400, 25658.8800, 26645.7600, 26645.7600, 25843.9200, 20724.4800,
          16776.9600, 29606.4000, 18750.7200,  3947.5200, 13816.3200, 11102.4000,
           9868.8000,  7154.8800, 12829.4400,  6908.1600,  9868.8000,  6908.16