In [1]:
import pickle
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.functional import F
from torch.utils.data import Dataset, DataLoader, Subset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch import optim
from sklearn.model_selection import GroupKFold
from sklearn.metrics import classification_report
import sklearn
from sklearn.preprocessing import MinMaxScaler
from numpy import random
from model_architecture import MatchPredictor
import itertools
# Cap the number of DataFrame rows rendered in cell outputs.
pd.set_option('display.max_rows', 50)

In [2]:
# Pick the GPU when one is usable, otherwise fall back to the CPU.
# `is_available` must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# Read the pickled dataset into `data`.
# NOTE(review): pickle can execute arbitrary code while loading -- only open
# files that come from a trusted source.
with open('time_series_dataset.pkl', mode='rb') as file:
    data = pickle.load(file)

In [15]:
class StatsDataset(Dataset):
    """Dataset of match samples with optional season-based cross-validation folds.

    Each element of ``x`` is unpacked as ``(static, home_ts, away_ts)``: three
    pandas DataFrames. The two time-series frames must carry a ``season``
    column; it is used for fold assignment and dropped before the frames are
    turned into tensors.

    Args:
        x: Sequence of ``[static, home_ts, away_ts]`` samples.
        y: Sequence of targets aligned with ``x``.
        cv: When True, samples are grouped into ``n_splits`` folds by season.
        val_idx: Index of the fold that ``get_subsets`` uses for validation.
        n_splits: Number of folds; required when ``cv`` is True.

    Raises:
        ValueError: If ``cv`` is True and ``n_splits`` is missing or < 1.
    """

    def __init__(self, x, y, cv=False, val_idx=None, n_splits=None):
        self.x = x
        self.y = y
        self.val_idx = val_idx
        self.cv = cv
        self.n_splits = n_splits
        if self.cv:
            if n_splits is None or n_splits < 1:
                raise ValueError('n_splits must be a positive integer when cv=True.')
            self.folds = self._get_folds()

    def __getitem__(self, index):
        """Return ``(static, home, away, target)`` tensors for one sample.

        The feature tensors are float32 and the target is int64; ``season`` is
        removed from both time-series frames before conversion.
        """
        static, home_ts, away_ts = self.x[index]
        y = self.y[index]
        home = home_ts.drop('season', axis=1)
        away = away_ts.drop('season', axis=1)

        static = torch.as_tensor(static.values, dtype=torch.float32)
        home = torch.as_tensor(home.values, dtype=torch.float32)
        away = torch.as_tensor(away.values, dtype=torch.float32)

        target = torch.as_tensor(y, dtype=torch.long)
        return static, home, away, target

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.x)

    def _require_cv(self):
        """Raise if this dataset was not built for cross-validation."""
        if not self.cv:
            raise ValueError(
                'This is a testing dataset. You cannot split it into training and validation ones.'
            )

    def _get_folds(self):
        """Group sample indices into ``n_splits`` folds keyed by season.

        A sample goes to fold ``int(season) % n_splits``, where ``season`` is
        the first value of the home time series' ``season`` column. Because the
        value is truncated to an integer, it has to be an integer-like season
        label at this point; values squeezed into [0, 1] would put almost
        every sample into fold 0.
        """
        self._require_cv()

        folds = [[] for _ in range(self.n_splits)]
        for i, sample in enumerate(self.x):
            season = int(sample[1]['season'].iloc[0])
            folds[season % self.n_splits].append(i)
        return folds

    def get_subsets(self):
        """Split into ``(train, validation)`` ``Subset`` objects.

        Fold ``val_idx`` becomes the validation subset; all remaining folds are
        concatenated, in fold order, into the training subset.

        Raises:
            ValueError: If the dataset was built with ``cv=False`` or
                ``val_idx`` is not in ``range(n_splits)``.
        """
        self._require_cv()
        # A negative index would slice "successfully" and leak the validation
        # fold into the training indices, so it is rejected explicitly.
        if self.val_idx is None or not 0 <= self.val_idx < self.n_splits:
            raise ValueError(
                f'val_idx must be in range({self.n_splits}), got {self.val_idx!r}.'
            )

        val_indices = self.folds[self.val_idx]
        train_indices = list(itertools.chain.from_iterable(
            fold for i, fold in enumerate(self.folds) if i != self.val_idx
        ))
        train_data = Subset(self, train_indices)
        val_data = Subset(self, val_indices)
        return train_data, val_data

In [5]:
# Hold out a random set of whole seasons for testing; the rest is used for training.
# The draw is seeded so that Restart & Run All reproduces the same split.
SPLIT_SEED = 42
N_TEST_SEASONS = 15

# The third element of every key in `data` is used as the season of that sample.
seasons = [key[2] for key in data.keys()]
max_season = max(seasons)

# NOTE(review): np.arange excludes its stop value, so `max_season` itself ends up in
# neither split -- confirm that leaving the latest season out is intended.
train_seasons = np.arange(1, max_season, 1)
if N_TEST_SEASONS > len(train_seasons):
    raise ValueError(
        f'Cannot hold out {N_TEST_SEASONS} seasons: only {len(train_seasons)} are available.'
    )

split_rng = np.random.default_rng(SPLIT_SEED)
test_seasons = split_rng.choice(train_seasons, size=N_TEST_SEASONS, replace=False)
train_seasons = np.setdiff1d(train_seasons, test_seasons)

# Sets give constant-time membership checks instead of scanning an array per key.
train_season_set = set(train_seasons.tolist())
test_season_set = set(test_seasons.tolist())

train_data = [value for key, value in data.items() if key[2] in train_season_set]
test_data = [value for key, value in data.items() if key[2] in test_season_set]

# `pop` removes the 'result' column from the first frame of every sample in place,
# so the target is no longer part of the features. Because of that, this cell cannot
# be re-run without reloading `data` first (the column is gone after the first run).
y_train = np.array([x[0].pop('result') for x in train_data])
y_test = np.array([x[0].pop('result') for x in test_data])


In [6]:
# Strip the team-name and match-date columns from both time-series frames
# (positions 1 and 2 of every sample). The drop happens in place, so every
# other reference to these frames sees the change as well.
for values in data.values():
    for ts_frame in (values[1], values[2]):
        ts_frame.drop(['away_name', 'home_name', 'match_date'], inplace=True, axis=1)

In [7]:
# Columns of the time-series frames that must keep their raw values.
# `season` is truncated to an integer and taken modulo n_splits when
# StatsDataset assigns cross-validation folds; min-max scaling it into [0, 1]
# would send almost every sample to fold 0. It is dropped before the frames
# reach the model, so leaving it unscaled does not affect the model inputs.
TS_UNSCALED_COLS = ['season']


def feature_columns(frame, unscaled_cols=()):
    """Return the columns of `frame` that should go through a scaler."""
    return [col for col in frame.columns if col not in unscaled_cols]


def scale_frame(frame, scaler, unscaled_cols=()):
    """Return a copy of `frame` with its feature columns transformed by `scaler`.

    Columns listed in `unscaled_cols` are carried over untouched. Index and
    column order of the input frame are preserved.
    """
    cols = feature_columns(frame, unscaled_cols)
    scaled = pd.DataFrame(scaler.transform(frame[cols].values), index=frame.index, columns=cols)
    for col in unscaled_cols:
        if col in frame.columns:
            # .values avoids index alignment, which fails on duplicated index labels.
            scaled[col] = frame[col].values
    return scaled[list(frame.columns)]


statics_lst, ts_homes_lst, ts_aways_lst = zip(*train_data)

home_dfs = pd.concat(ts_homes_lst)
away_dfs = pd.concat(ts_aways_lst)
static_dfs = pd.concat(statics_lst)

home_scaler = MinMaxScaler()
away_scaler  = MinMaxScaler()
static_scaler = MinMaxScaler()

# The scalers are fit on the training samples only and then applied to both splits.
home_scaler.fit(home_dfs[feature_columns(home_dfs, TS_UNSCALED_COLS)].values)
away_scaler.fit(away_dfs[feature_columns(away_dfs, TS_UNSCALED_COLS)].values)
static_scaler.fit(static_dfs.values)

for split in (train_data, test_data):
    for sample in split:
        sample[0] = scale_frame(sample[0], static_scaler)
        sample[1] = scale_frame(sample[1], home_scaler, TS_UNSCALED_COLS)
        sample[2] = scale_frame(sample[2], away_scaler, TS_UNSCALED_COLS)

In [8]:
# MatchPredictor is defined in model_architecture. Judging by the printed repr, the
# positional arguments presumably are: 40 LSTM input features, 32 hidden units,
# 2 LSTM layers and 8 static features (72 = 2 * 32 + 8) -- TODO confirm against
# the class definition.
model = MatchPredictor(40, 32, 2, 8, device)
# Move the parameters to the target device before handing them to the optimizer.
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Last expression of the cell: show the module structure.
model

MatchPredictor(
  (home_lstm): LSTM(40, 32, num_layers=2, batch_first=True, dropout=0.2)
  (away_lstm): LSTM(40, 32, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Sequential(
    (0): Linear(in_features=72, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Linear(in_features=64, out_features=3, bias=True)
  )
)

In [9]:
def collate_fn(batch):
    """Collate ``(static, home, away, target)`` samples into one batch dict.

    The variable-length home and away sequences are zero-padded to the longest
    sequence in the batch and then packed, so the padded steps are not seen by
    a recurrent layer. Static tensors are stacked and their dimension 1 is
    squeezed; targets are stacked as they are.

    Returns:
        dict with keys ``'statics'`` and ``'targets'`` (tensors) and
        ``'homes'`` and ``'aways'`` (``PackedSequence`` objects).
    """
    static_items, home_seqs, away_seqs, labels = zip(*batch)

    def _pad_and_pack(sequences):
        # Lengths stay on the CPU, as pack_padded_sequence requires.
        lengths = torch.tensor([seq.size(0) for seq in sequences])
        padded = pad_sequence(sequences, batch_first=True, padding_value=0.0)
        return pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)

    return {
        'statics' : torch.stack(static_items).squeeze(1),
        'homes' : _pad_and_pack(home_seqs),
        'aways' : _pad_and_pack(away_seqs),
        'targets' : torch.stack(labels)
    }


In [None]:
n_splits = 5
batch_size = 64
num_epochs = 1

test_dataset = StatsDataset(test_data, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)


def run_epoch(loader, train):
    """Run one pass over `loader` with the notebook-level model, criterion and optimizer.

    Args:
        loader: DataLoader yielding the dicts produced by `collate_fn`.
        train: When True the model is put in training mode and the optimizer
            is stepped on every batch; when False the pass is evaluation only
            (eval mode, gradients disabled).

    Returns:
        ``(mean_loss, y_true, y_pred)`` with the labels and predicted classes
        as 1-D CPU tensors, or ``None`` if the loader produced no batches.
    """
    model.train(train)
    total_loss, n_seen = 0.0, 0
    y_true, y_pred = [], []

    with torch.set_grad_enabled(train):
        for batch in loader:
            statics = batch['statics'].to(device)
            homes = batch['homes'].to(device)
            aways = batch['aways'].to(device)
            # CrossEntropyLoss needs class indices of shape (batch,); flatten in
            # case the stacked targets carry a trailing singleton dimension.
            targets = batch['targets'].reshape(-1).to(device)

            logits = model(statics, homes, aways)
            loss = criterion(logits, targets)

            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            total_loss += loss.item() * targets.size(0)
            n_seen += targets.size(0)
            y_true.append(targets.cpu())
            # arg-max of the logits equals arg-max of their softmax.
            y_pred.append(logits.argmax(dim=1).cpu())

    if n_seen == 0:
        return None
    return total_loss / n_seen, torch.cat(y_true), torch.cat(y_pred)


# The folds depend only on the data, so the CV dataset is built once and only the
# validation fold index changes between iterations.
train_dataset = StatsDataset(train_data, y_train, cv=True, val_idx=0, n_splits=n_splits)

# NOTE(review): the same model and optimizer are carried across folds, so from the
# second fold on the validation samples have already been trained on. Re-create the
# model per fold if unbiased cross-validation scores are required.
for epoch in range(num_epochs):
    for fold in range(n_splits):
        train_dataset.val_idx = fold
        train_split, val_split = train_dataset.get_subsets()
        if len(train_split) == 0 or len(val_split) == 0:
            print(f'epoch {epoch} fold {fold}: skipped (empty training or validation split)')
            continue

        train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn)
        val_loader = DataLoader(val_split, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)

        train_loss, _, _ = run_epoch(train_loader, train=True)
        val_loss, val_true, val_pred = run_epoch(val_loader, train=False)

        # One report per fold on the held-out data instead of one per training batch.
        print(f'epoch {epoch} fold {fold}: train loss {train_loss:.4f} | val loss {val_loss:.4f}')
        print(classification_report(val_true, val_pred, zero_division=0))