In [1]:
import pickle
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch import optim
from sklearn.model_selection import KFold
import sklearn
from numpy import random
from model_architecture import MatchPredictor
from copy import deepcopy
pd.options.display.max_rows = 50

In [2]:
# Prefer the GPU when one is usable, otherwise fall back to the CPU.
# `torch.cuda.is_available` has to be *called*: the bare function object is
# always truthy, which would select 'cuda' even on a machine without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# Load the pre-built dataset from disk. Later cells use `data` as a mapping
# (`data.values()`, `data.keys()`, `data.items()`).
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open('time_series_dataset.pkl', 'rb') as file:
    data = pickle.load(file)

In [4]:
# Strip the identifier/date columns from the frames stored at positions 1 and 2
# of every entry (the parts StatsDataset later unpacks as `home` and `away`).
# The drop is still done in place because each entry holds references to its
# frames, but `errors='ignore'` makes the cell safe to re-run: a second
# execution finds the columns already gone and does nothing instead of raising
# KeyError.
NON_FEATURE_COLUMNS = ['away_name', 'home_name', 'match_date']

for values in data.values():
    for frame in (values[1], values[2]):
        frame.drop(columns=NON_FEATURE_COLUMNS, inplace=True, errors='ignore')

In [5]:
class StatsDataset(Dataset):
    """Dataset of ``(static, home, away)`` samples with optional K-fold splits.

    Each element of ``x`` is unpacked into three parts, and every part must
    expose ``.values`` (as pandas Series/DataFrames do). ``y`` holds the label
    of the sample stored at the same index.

    Args:
        x: Indexable collection of ``(static, home, away)`` triples.
        y: Indexable collection of labels aligned with ``x``.
        kfold: Truthy to enable K-fold splitting through ``get_splits``;
            ``None`` or any other falsy value disables it.
        curr_fold: Index of the fold that ``get_splits`` returns.
        n_splits: Number of folds handed to ``KFold``; only used when
            ``kfold`` is enabled.
    """

    def __init__(self, x, y, kfold=None, curr_fold=None, n_splits=None):
        self.x = x
        self.y = y
        self.curr_fold = curr_fold
        # Test truthiness rather than `is not None`, so that `kfold=False`
        # really disables splitting instead of building a KFold anyway
        # (which then fails on the default `n_splits=None`).
        # shuffle=False: KFold splits the indices into consecutive folds.
        self.kfold = KFold(n_splits=n_splits, shuffle=False) if kfold else None

    def __getitem__(self, index):
        """Return ``(static, home, away, target)`` tensors for one sample.

        The three feature parts are converted to float32 tensors and the label
        to an int64 (``torch.long``) tensor.
        """
        static, home, away = self.x[index]
        y = self.y[index]

        static = torch.as_tensor(static.values, dtype=torch.float32)
        home = torch.as_tensor(home.values, dtype=torch.float32)
        away = torch.as_tensor(away.values, dtype=torch.float32)

        target = torch.as_tensor(y, dtype=torch.long)
        return static, home, away, target

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def get_splits(self):
        """Return ``(train_subset, val_subset)`` for fold ``curr_fold``.

        Returns ``None`` when K-fold splitting is disabled.
        """
        if self.kfold is None:
            return None

        folds = list(self.kfold.split(self.x))
        train_indices, val_indices = folds[self.curr_fold]

        train_data = self._get_subset(train_indices)
        val_data = self._get_subset(val_indices)

        return train_data, val_data

    def _get_subset(self, indices):
        """Wrap the given sample indices in a ``Subset`` view of this dataset."""
        return Subset(self, indices)

In [6]:
# Season-level train/test split: whole seasons are held out for testing.
SPLIT_SEED = 42        # fixes which seasons are held out so reruns give the same split
N_TEST_SEASONS = 10    # number of seasons drawn (without replacement) for the test set

seasons = [key[2] for key in data.keys()]
max_season = max(seasons)

# NOTE(review): np.arange excludes its stop value, so season `max_season`
# ends up in neither split — confirm this is intended.
candidate_seasons = np.arange(1, max_season, 1)

# A dedicated seeded Generator keeps the split reproducible without touching
# NumPy's global random state.
split_rng = np.random.default_rng(SPLIT_SEED)
test_seasons = split_rng.choice(candidate_seasons, size=N_TEST_SEASONS, replace=False)
train_seasons = np.setdiff1d(candidate_seasons, test_seasons)

train_data = [value for key, value in data.items() if key[2] in train_seasons]
x_test = [value for key, value in data.items() if key[2] in test_seasons]

# `pop` removes 'result' from the first element of every entry (the part later
# used as the `static` features) and returns it as the label. This mutates the
# objects held in `data`, so `data` has to be reloaded before this cell can be
# run a second time.
y_train = np.array([x[0].pop('result') for x in train_data])
y_test = np.array([x[0].pop('result') for x in x_test])


In [7]:
# Build the network, the loss and the optimiser.
# NOTE(review): the meaning of the positional arguments (43, 16, 2, 8) is
# defined by MatchPredictor in model_architecture — confirm there and consider
# naming them.
# NOTE(review): the model is not moved to `device` in this cell — confirm
# whether that happens elsewhere.
model = MatchPredictor(43, 16, 2, 8)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [8]:
def _pad_and_pack(sequences):
    """Zero-pad variable-length sequences, pack them, return ``(packed, lengths)``."""
    lengths = torch.tensor([seq.size(0) for seq in sequences])
    padded = pad_sequence(sequences, batch_first=True, padding_value=0.0)
    # enforce_sorted=False lets the batch arrive in any length order.
    packed = pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)
    return packed, lengths


def collate_fn(batch):
    """Collate ``(static, home, away, target)`` samples into one batch dict.

    Returns a dict with:
        'statics': the static tensors stacked along a new first dimension,
        'homes':   ``(PackedSequence, lengths)`` for the home sequences,
        'aways':   ``(PackedSequence, lengths)`` for the away sequences,
        'targets': the target tensors stacked along a new first dimension.
    """
    static_parts, home_seqs, away_seqs, labels = zip(*batch)

    homes = _pad_and_pack(home_seqs)
    aways = _pad_and_pack(away_seqs)

    return {
        'statics' : torch.stack(static_parts),
        'homes' : homes,
        'aways' : aways,
        'targets' : torch.stack(labels),
    }


In [9]:
n_splits = 5
batch_size = 64
num_epochs = 1

test_dataset = StatsDataset(x_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)

# K-fold cross-validation over the training samples.
#
# * The split results get their own names (`train_subset`, `val_subset`), so
#   `train_data` stays the original list of samples. Rebinding it to a Subset
#   made this cell fail when run a second time.
# * No deepcopy per fold: StatsDataset only reads its inputs, so every fold can
#   share the same `train_data`.
# * Datasets and loaders depend only on the fold, so they are built once per
#   fold and the epoch loop runs inside it.
#
# NOTE(review): `model` and `optimizer` are created once in an earlier cell; for
# independent folds they should presumably be re-created per fold — confirm
# once the training step is implemented.
for fold in range(n_splits):
    train_dataset = StatsDataset(train_data, y_train, kfold=True, curr_fold=fold, n_splits=n_splits)
    train_subset, val_subset = train_dataset.get_splits()

    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn)
    val_loader = DataLoader(val_subset, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)

    for epoch in range(num_epochs):
        for step, batch in enumerate(train_loader):
            pass  # TODO: training step not implemented yet