In [1]:
import torch

In [2]:
import torch.nn as nn

In [3]:
#basic imports for data preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# Load the training table; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/train.csv")
# Preview the first rows (the columns shown include date, country, store, product and num_sold).
data.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240


In [4]:
# Country whose sales rows and public holidays are modelled; it is also embedded
# in the file name used when the trained weights are saved.
country = 'Belgium'
# Holiday calendar; the preview shows one (Country, Holiday) pair per row.
public_holidays = pd.read_csv("../data/EUpublicholidays.csv")
public_holidays.head()

Unnamed: 0,Country,Holiday
0,Belgium,01/01/21
1,Belgium,04/04/21
2,Belgium,04/05/21
3,Belgium,05/01/21
4,Belgium,05/13/21


In [5]:
# Keep only the holidays of the selected country.
# A boolean mask selects the rows directly. Masking with `.where(...)` and then
# calling `.drop(np.where(...)[0])` would pass row *positions* to a label-based
# `drop`, which only lines up when the frame has a default RangeIndex.
# `.copy()` makes the result independent of `public_holidays`, so assigning
# columns on it later neither touches the source frame nor triggers a warning.
country_holidays = public_holidays[public_holidays['Country'] == country].copy()

In [6]:
# Parse the holiday strings into timestamps so they can be matched against the sales dates.
country_holidays = country_holidays.assign(
    Holiday=lambda frame: pd.to_datetime(frame['Holiday'])
)

In [7]:
# Turn the date strings into timestamps, then flag every row whose date appears
# in the selected country's holiday list.
data = data.assign(date=lambda frame: pd.to_datetime(frame['date']))
data = data.assign(
    holiday=lambda frame: frame['date'].isin(country_holidays['Holiday'])
)



In [8]:
# Derive calendar features from the parsed date column. The vectorised `.dt`
# accessors produce the same values as a per-row lambda without a Python-level loop.
data['weekday'] = data['date'].dt.dayofweek.astype('category')
data['month'] = data['date'].dt.month.astype('category')
data['monthday'] = data['date'].dt.day.astype('category')

# Year is the only continuous variable, so it is rescaled:
# (year - 2016 - 1) / (4 - 1), which maps 2017 to 0.0 and 2020 to 1.0.
data['year'] = (data['date'].dt.year - 2016).astype(float)
data['year'] = (data['year'] - 1) / (4 - 1)

data['product'] = data['product'].astype('category')
data['store'] = data['store'].astype('category')

# The row identifier and the raw date are not model inputs.
dataf = data.drop(columns=['row_id', 'date'])

# Keep only the rows of the selected country. A boolean mask avoids feeding
# positional indices (np.where) to the label-based `DataFrame.drop`, which is
# only correct on a default RangeIndex, and it leaves column dtypes untouched
# instead of introducing NaN placeholders first.
dataf = dataf[dataf['country'] == country]

# Regression target as a float32 column vector of shape (n_rows, 1).
y = dataf['num_sold'].to_numpy().astype('float32').reshape(-1,1)

In [9]:
# One encoder per categorical feature. Keeping them separate makes it easy to
# drop one dummy column per feature (avoiding multicollinearity) and leaves each
# fitted encoder tied to exactly one column.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4
# removed the old spelling — switch the keyword when the environment is upgraded.
WeekdayTransformer = OneHotEncoder(sparse=False)
MonthTransformer = OneHotEncoder(sparse=False)
MonthDayTransformer = OneHotEncoder(sparse=False)
ProductTransformer = OneHotEncoder(sparse=False)
HolidayTransformer = OneHotEncoder(sparse=False)
StoreTransformer = OneHotEncoder(sparse=False)

# `[:, :-1]` discards the last dummy column of every encoding.
weekday = WeekdayTransformer.fit_transform(dataf[['weekday']])[:,:-1]
month = MonthTransformer.fit_transform(dataf[['month']])[:,:-1]
# Day-of-month must use its own encoder: fitting MonthTransformer here would
# overwrite the month categories it just learned and leave MonthDayTransformer
# unfitted.
monthday = MonthDayTransformer.fit_transform(dataf[['monthday']])[:,:-1]
product = ProductTransformer.fit_transform(dataf[['product']])[:,:-1]
holiday = HolidayTransformer.fit_transform(dataf[['holiday']])[:,:-1]
store = StoreTransformer.fit_transform(dataf[['store']])[:,:-1]

# Year is already numeric and scaled; it only needs to become a column vector.
year = dataf['year'].to_numpy().reshape(-1,1)

In [10]:
# Stack every feature block column-wise, with the target as the final column.
feature_blocks = (weekday, month, monthday, product, holiday, store, year, y)
full_data = np.concatenate(feature_blocks, axis=1)

# Hold out the trailing 2000 rows for testing; everything before them is training data.
split_at = len(full_data) - 2000
training_data = full_data[:split_at]
test_data = full_data[split_at:]

In [11]:
# The target sits in the final column; every column before it is a feature.
# `-1:` (rather than `-1`) keeps the targets two-dimensional.
x_train, y_train = training_data[:, :-1], training_data[:, -1:]
x_test, y_test = test_data[:, :-1], test_data[:, -1:]

print(x_train.shape)

(9688, 53)


In [12]:
from torch.utils.data import DataLoader, Dataset

In [13]:
# Convert each split to a float32 tensor (all right-hand values are built
# before any of the four names is rebound).
x_train, y_train, x_test, y_test = (
    torch.tensor(split, dtype=torch.float32)
    for split in (x_train, y_train, x_test, y_test)
)

In [14]:
# Sanity check: the training targets should be a single-column 2-D tensor.
y_train.shape

torch.Size([9688, 1])

In [15]:
class DataSetMaker(Dataset):
    """Map-style dataset that pairs each feature row with its target row.

    Parameters
    ----------
    X : sized, indexable
        Feature rows; `X[idx]` is the input for sample `idx`.
    y : sized, indexable
        Targets; `y[idx]` is the target for sample `idx`. Must be as long as `X`.

    Raises
    ------
    ValueError
        If `X` and `y` do not contain the same number of samples.
    """

    def __init__(self,X,y):
        # Fail early: a length mismatch would otherwise silently drop targets
        # or surface later as an IndexError in the middle of an epoch.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self,idx):
        """Return `[features, target]` for sample `idx`."""
        # A two-element list, so a DataLoader batch unpacks as `X, y`.
        return [self.X[idx], self.y[idx]]
# Wrap the tensor splits so a DataLoader can index them sample by sample.
training_data = DataSetMaker(X=x_train, y=y_train)
test_data = DataSetMaker(X=x_test, y=y_test)

# Mini-batches of 8 samples: training batches are reshuffled on every pass,
# test batches keep their original order.
training_data_loader = DataLoader(dataset=training_data, shuffle=True, batch_size=8)
test_data_loader = DataLoader(dataset=test_data, shuffle=False, batch_size=8)

In [16]:
# Default width of the hidden layers and default dropout probability.
neurons = 256
dropout = 0.2


class NeuralNetwork(nn.Module):
    """Fully connected regressor: two hidden ReLU layers with dropout, one linear output.

    Parameters
    ----------
    in_features : int, default 53
        Width of the input feature vector. The default matches the 53-column
        feature matrix printed earlier in this notebook; pass another value
        when the encoded feature count differs.
    hidden_units : int, optional
        Width of both hidden layers. Defaults to the module-level `neurons`
        as it stands when the network is constructed.
    dropout_p : float, optional
        Dropout probability after each hidden activation. Defaults to the
        module-level `dropout` as it stands when the network is constructed.
    """

    def __init__(self, in_features=53, hidden_units=None, dropout_p=None):
        super().__init__()
        # Resolve defaults here, not in the signature, so the module-level
        # settings are read at construction time.
        if hidden_units is None:
            hidden_units = neurons
        if dropout_p is None:
            dropout_p = dropout
        # The attribute name is part of the saved state_dict keys; keep it stable.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_units),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(hidden_units, 1),
        )

    def forward(self, x):
        """Return one prediction per input row, shape (batch, 1)."""
        return self.linear_relu_stack(x)


model = NeuralNetwork()


In [17]:
# Step size handed to the Adam optimizer.
learning_rate = 1e-3
# NOTE(review): not read by any visible cell — the DataLoaders were already
# built with a literal batch_size=8 before this assignment runs.
batch_size = 8

In [18]:
# Mean-squared-error criterion, passed to both the training and the evaluation loop.
loss = nn.MSELoss()

In [19]:
# Adam over every model parameter, using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [20]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over `dataloader`.

    Parameters
    ----------
    dataloader : iterable with a `.dataset` attribute
        Yields `(X, y)` batches; `len(dataloader.dataset)` is used only for
        the progress message.
    model : torch.nn.Module
        Network being trained; it is switched to training mode.
    loss_fn : callable
        Called as `loss_fn(pred, y)`; must return a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer that owns the model's parameters.

    Returns
    -------
    float
        Mean of the per-batch losses, or NaN if the loader yielded no batch.
    """
    # Ensure train-only layers such as dropout are active even if the model
    # was left in evaluation mode by an earlier step.
    model.train()

    size = len(dataloader.dataset)
    running_loss = 0.0
    batches_seen = 0
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        batch_loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        loss_value = batch_loss.item()
        running_loss += loss_value
        batches_seen += 1

        # Progress line every 100 batches.
        if batch % 100 == 0:
            current = batch * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

    return running_loss / batches_seen if batches_seen else float("nan")


def test_loop(dataloader, model, loss):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss(pred, y).item()


    test_loss /= num_batches

    print(f"Test Error: Avg loss: {test_loss:>8f} \n")

In [None]:
# Cast the model's floating-point parameters to float32, matching the float32 input tensors.
model.float()
# Number of full passes over the training loader.
epochs = 100
for t in range(epochs):

    # Each epoch: one training pass, then an evaluation on the held-out loader.
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(training_data_loader, model, loss, optimizer)
    test_loop(test_data_loader, model, loss)
print("Done!")

Epoch 1
-------------------------------
loss: 85053.054688  [    0/ 9688]
loss: 12210.537109  [  800/ 9688]
loss: 607.246155  [ 1600/ 9688]
loss: 2388.253906  [ 2400/ 9688]
loss: 2757.569092  [ 3200/ 9688]
loss: 1750.170288  [ 4000/ 9688]
loss: 1192.749023  [ 4800/ 9688]
loss: 881.704346  [ 5600/ 9688]
loss: 1435.794678  [ 6400/ 9688]
loss: 11295.060547  [ 7200/ 9688]
loss: 303.168457  [ 8000/ 9688]
loss: 498.025604  [ 8800/ 9688]
loss: 412.625000  [ 9600/ 9688]
Test Error: Avg loss: 1454.994372 

Epoch 2
-------------------------------
loss: 1048.535889  [    0/ 9688]
loss: 908.437256  [  800/ 9688]
loss: 992.209106  [ 1600/ 9688]
loss: 470.613220  [ 2400/ 9688]
loss: 1151.277344  [ 3200/ 9688]
loss: 1533.388916  [ 4000/ 9688]
loss: 338.242767  [ 4800/ 9688]
loss: 587.309387  [ 5600/ 9688]
loss: 542.048279  [ 6400/ 9688]
loss: 719.423767  [ 7200/ 9688]
loss: 2875.412354  [ 8000/ 9688]
loss: 4509.156738  [ 8800/ 9688]
loss: 201.530334  [ 9600/ 9688]
Test Error: Avg loss: 1348.203009 



In [None]:
# Save only the learned weights (the state_dict), in a file named after the selected country.
PATH = f"SEPT_TPS_{country}.pt"
torch.save(model.state_dict(), PATH)