In [1]:
# Move two directories up so that the relative paths used below ("data/raw/...") and the
# `src` package resolve (presumably this lands in the project root — confirm).
# NOTE(review): the path is relative, so re-running this cell moves up two more levels.
cd ../..

/home/xavier/projects/godatathon_2020


In [2]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader

from src.model.nets import Encoder

In [3]:
# Load the raw volume table, using the first CSV column as the DataFrame index,
# and the submission template (read with the default integer index).
volume = pd.read_csv("data/raw/gx_volume.csv", index_col=0)
submissions = pd.read_csv("data/raw/submission_template.csv")


In [4]:
# Build a single "<country>-<brand>" key column on both tables so that country/brand
# pairs can be matched with a one-column lookup.
volume["country_brand"] = volume["country"] + "-" + volume["brand"]
submissions["country_brand"] = submissions["country"] + "-" + submissions["brand"]

In [5]:
# Keep only the country/brand pairs that do NOT appear in the submission template;
# every pair listed there is dropped from `volume`.
volume = volume[~volume["country_brand"].isin(submissions["country_brand"])]

In [6]:
# Order rows by country, then brand, then month number (ascending, the pandas default).
volume = volume.sort_values(["country", "brand", "month_num"])

### Post-processing

In [7]:
# Note: In the future, we will compute the loss only on the data available for each country/month,
# i.e. if a country only has volume until month 20, we will pad/ignore the loss of months 21-24.
# For now, keep only the country/brand pairs that have exactly 24 rows with month_num >= 0.
# Number of rows with month_num >= 0 per country/brand pair.
country_brand_post_count = volume[volume["month_num"] >= 0].groupby("country_brand").size()
# Keys of the pairs whose count is exactly 24.
idx_post_volume_full = country_brand_post_count[country_brand_post_count == 24].index
volume = volume[volume["country_brand"].isin(idx_post_volume_full)]

### Grouping

In [8]:
# One group per (country, brand) pair.
g = volume.groupby(["country", "brand"])

In [9]:
# Take the first (country, brand) group only — the loop breaks after one iteration —
# and split its rows at month_num == 0.
# X: volumes for month_num < 0; y: volumes for month_num >= 0
# (presumably the model input and the target — confirm).
for idx, df in g:
    df_pre = df[df["month_num"] < 0]
    df_post = df[df["month_num"] >= 0]
    
    # TODO: Take into consideration scaling
    
    X = df_pre["volume"]
    y = df_post["volume"]   
    break

# RNN

In [10]:
# Convert the pandas Series to 1-D tensors via their underlying NumPy arrays.
# NOTE(review): torch.from_numpy keeps the NumPy dtype, so float64 volumes become
# double tensors, while nn modules default to float32 — confirm before feeding a model.
X = torch.from_numpy(X.values)
y = torch.from_numpy(y.values)

In [11]:
# Sizes shared by the encoder and decoder cells below.
input_dim = 1  # features per time step (passed as the recurrent layer's input size)
hidden_dim = 5  # hidden state size
num_layers = 1  # number of stacked recurrent layers

batch_size = 16  # sequences per batch in the dummy tensors built below

#### Encoder

In [12]:
# Project-defined encoder from src.model.nets. The shapes printed further down
# (output feature size 10 = 2 * hidden_dim, hidden state with leading dimension 2)
# suggest it is bidirectional — confirm in the module source.
encoder = Encoder(input_dim, hidden_dim, num_layers)

In [13]:
# NOTE(review): `input_legth` is a misspelling of "input_length"; the name is left
# unchanged because other cells may reference it.
input_legth = 123 # Number of months (depends on case)

# Random dummy batch laid out as (sequence length, batch, features).
encoder_input = torch.randn(input_legth, batch_size, input_dim)

# Predict: the encoder returns a pair, unpacked here as (output, hidden state).
encoder_out, encoder_hidden_out = encoder(encoder_input)

In [15]:
# Report the shape of each tensor entering / leaving the encoder, one per line.
for _label, _tensor in (
    ("encoder_input:\t\t", encoder_input),
    ("encoder_out:\t\t", encoder_out),
    ("encoder_hidden_out:\t", encoder_hidden_out),
):
    print(_label, _tensor.shape)

encoder_input:		 torch.Size([123, 16, 1])
encoder_out:		 torch.Size([123, 16, 10])
encoder_hidden_out:	 torch.Size([2, 16, 5])


#### Decoder

In [15]:
# Stand-in decoder: a plain bidirectional GRU. Being bidirectional with num_layers = 1,
# it takes an initial hidden state whose first dimension is 2, which matches the
# encoder hidden state printed above ([2, 16, 5]).
decoder = nn.GRU(input_dim, hidden_dim, num_layers, bidirectional=True)

In [37]:
# Throw away encoder output: only the encoder's hidden state is passed on to the
# decoder below; the per-step output is bound to `_` as a discard.
_ = encoder_out

# Dummy representing month -1: a random input of sequence length 1.
decoder_input = torch.randn(1, batch_size, input_dim)
# Alternative 23-step input (the saved outputs of the following cells still show this shape):
# decoder_input = torch.randn(23, batch_size, input_dim)

# Using hidden_out from Encoder as hidden_in
decoder_hidden_0 = encoder_hidden_out

In [26]:
# NOTE(review): the saved output below ([23, 16, 1]) is stale — the cell above now
# builds a sequence-length-1 input, which would display torch.Size([1, 16, 1]).
decoder_input.shape

torch.Size([23, 16, 1])

In [35]:
# Last time step of the decoder output, with a leading dimension of size 1 restored.
# NOTE(review): `decoder_out` is only assigned in the next cell, so on a fresh kernel
# run top-to-bottom this cell raises NameError — it belongs below the decoder call.
decoder_out[-1].unsqueeze(dim=0).shape

torch.Size([1, 16, 10])

In [27]:
# Run the decoder on the dummy input, starting from the encoder's hidden state.
decoder_out, decoder_hidden_out = decoder(decoder_input, decoder_hidden_0)

In [28]:
# Report the shape of each tensor entering / leaving the decoder, one per line.
for _label, _tensor in (
    ("decoder_input:\t\t", decoder_input),
    ("decoder_hidden_0:\t", decoder_hidden_0),
    ("decoder_out:\t\t", decoder_out),
    ("decoder_hidden_out:\t", decoder_hidden_out),
):
    print(_label, _tensor.shape)

decoder_input:		 torch.Size([23, 16, 1])
decoder_hidden_0:	 torch.Size([2, 16, 5])
decoder_out:		 torch.Size([23, 16, 10])
decoder_hidden_out:	 torch.Size([2, 16, 5])


In [None]:
class Decoder(nn.Module):
    """Unidirectional GRU decoder.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_dim, hidden_dim, num_layers):
        # Zero-argument super() avoids naming the class (the original named Encoder by mistake).
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.rnn = nn.GRU(input_dim, hidden_dim, num_layers)

    def forward(self, x, h0=None):
        """Run the GRU over ``x``.

        Args:
            x: Input of shape ``(seq_len, batch, input_dim)`` (the GRU is built with the
                default ``batch_first=False``), or ``(seq_len, input_dim)`` when unbatched.
            h0: Optional initial hidden state of shape ``(num_layers, batch, hidden_dim)``
                (``(num_layers, hidden_dim)`` when unbatched), e.g. an encoder's final
                hidden state. Defaults to zeros.

        Returns:
            Tuple ``(output, hn)``: the per-step outputs and the final hidden state.
        """
        if h0 is None:
            # The hidden state is sized by the number of layers and the batch size of
            # `x`, not by input_dim; new_zeros keeps it on x's dtype and device.
            if x.dim() == 2:
                h0 = x.new_zeros(self.num_layers, self.hidden_dim)
            else:
                h0 = x.new_zeros(self.num_layers, x.size(1), self.hidden_dim)
        output, hn = self.rnn(x, h0)
        return output, hn