In [1]:
import pickle
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.functional import F
from torch.utils.data import Dataset, DataLoader, Subset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch import optim
from sklearn.model_selection import GroupKFold
from sklearn.metrics import classification_report
import sklearn
from sklearn.preprocessing import MinMaxScaler
from numpy import random
from model_architecture import MatchPredictor
from gru_model_architecture import GRUMatchPredictor
import itertools
from sklearn.metrics import accuracy_score
pd.options.display.max_rows = 50

In [2]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
device

device(type='cuda')

In [3]:
with open('time_series_dataset.pkl', 'rb') as file:
    data = pickle.load(file)

In [4]:
class StatsDataset(Dataset):

    def __init__(self, x, y, cv=False, n_splits=None):
        self.x = x
        self.y = y
        self._cv = cv
        self._n_splits = n_splits
        if self._cv:
            self.folds = self._get_folds()

    def __getitem__(self, index):
        static_df, home_ts, away_ts = self.x[index]
        y = self.y[index]

        home = home_ts.drop(['season'], axis=1)
        away = away_ts.drop(['season'], axis=1)
        cols = ['goals_scored_ratio', 'goals_conceded_ratio', 'raiting_ratio', 'xG_ratio']
        static = static_df[static_df.columns.difference(cols)]
        
        static = torch.as_tensor(static.values, dtype=torch.float32)
        home = torch.as_tensor(home.values, dtype=torch.float32)
        away = torch.as_tensor(away.values, dtype=torch.float32)

        target = torch.as_tensor(y, dtype=torch.long)
        return static, home, away, target
    
    def __len__(self):
        return len(self.x)

    def _get_folds(self):
        folds = [[] for _ in range(self._n_splits)]
        for i, data in enumerate(self.x):
            folds[data[1]['season'].head(1).values[0].astype('int') % self._n_splits].append(i)
        return folds

    def group_matches_by_seasons(self):
        seasons_data_indices = dict()
        for i, data in enumerate(self.x):
            match_season = data[1]['season'].head(1).values[0].astype('int')
            if match_season in seasons_data_indices.keys():
                seasons_data_indices[match_season].append(i)
            else:
                seasons_data_indices[match_season] = [i]
        return seasons_data_indices

    def get_season_subset(self, indices):
        return Subset(self, indices)

    def get_subsets(self, val_idx):
        assert self._cv, (
        'Cross validation is not possible when _cv attribute is set to False (that means it is a testing dataset).')

        train_indices = list(itertools.chain.from_iterable(self.folds[:val_idx] + self.folds[val_idx+1:]))
        val_indices = self.folds[val_idx]
        train_data = Subset(self, train_indices)
        val_data = Subset(self, val_indices)
        return train_data, val_data

In [5]:
seasons_data_indices = [key[2] for key in data.keys()]
max_season = sorted(seasons_data_indices, reverse=True)[0]

train_seasons = np.arange(1, max_season, 1)

#train_seasons = np.arange(1, 64, 1)

test_seasons = np.random.choice(train_seasons, size=10, replace=False)
train_seasons = np.setdiff1d(train_seasons, test_seasons)

train_data = [value for key, value in data.items() if key[2] in train_seasons]
test_data = [value for key, value in data.items() if key[2] in test_seasons]

y_train = np.array([x[0].pop('result') for x in train_data])
y_test = np.array([x[0].pop('result') for x in test_data])


In [6]:
for values in data.values():
    values[1].drop(['away_name', 'home_name', 'match_date'], inplace=True, axis=1)
    values[2].drop(['away_name', 'home_name', 'match_date'], inplace=True, axis=1)

    values[1].fillna({'round' : 55, 'coach_matches' : 0}, inplace=True)
    values[2].fillna({'round' : 55, 'coach_matches' : 0}, inplace=True)

    '''values[1] = values[1][['result_from_team_perspective', 'season']]
    values[2] = values[2][['result_from_team_perspective', 'season']]'''

In [8]:
#model = MatchPredictor(40, 8, 1, 4, device)
model = GRUMatchPredictor(40, 8, 1, 4, device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.to(device)



GRUMatchPredictor(
  (home_gru): GRU(40, 8, batch_first=True, dropout=0.2)
  (away_gru): GRU(40, 8, batch_first=True, dropout=0.2)
  (fc): Sequential(
    (0): Linear(in_features=20, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=32, out_features=16, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=16, out_features=3, bias=True)
  )
)

In [9]:
def collate_fn(batch):
    statics, homes, aways, targets = zip(*batch)

    away_lens = torch.as_tensor([a.size(0) for a in aways])
    home_lens = torch.as_tensor([h.size(0) for h in homes])

    home_padded = pad_sequence(homes, batch_first=True, padding_value=0.0)
    away_padded = pad_sequence(aways, batch_first=True, padding_value=0.0)

    home_packed = pack_padded_sequence(home_padded, home_lens, batch_first=True, enforce_sorted=False)
    away_packed = pack_padded_sequence(away_padded, away_lens, batch_first=True, enforce_sorted=False)

    statics = torch.stack(statics)
    statics = statics.squeeze(1)
    targets = torch.stack(targets)

    return {
        'statics' : statics,
        'homes' : home_packed,
        'aways' : away_packed,
        'targets' : targets
    }


In [10]:
n_splits = 5
batch_size = 64
num_epochs = 8

train_dataset = StatsDataset(train_data, y_train, cv=True, n_splits=n_splits)
test_dataset = StatsDataset(test_data, y_test)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)

train_losses = []
val_losses = []

for epoch in range(num_epochs):
    for val_idx in range(n_splits):
        train_fold, val_fold = train_dataset.get_subsets(val_idx)

        train_loader = DataLoader(
            train_fold, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn)
        
        val_loader = DataLoader(
            val_fold, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)

        model.train()
        running_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            statics, homes, aways = batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device)
            targets = batch['targets'].squeeze(1).to(device)

            logits = model(statics, homes, aways)
            loss = criterion(logits, targets)
            running_loss += loss.item() * homes.data.size(0)

            loss.backward()
            optimizer.step()

        total_train_loss = running_loss / len(train_loader.dataset)
        train_losses.append(total_train_loss)

        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                statics, homes, aways = batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device)
                targets = batch['targets'].squeeze(1).to(device)

                logits = model(statics, homes, aways)
                loss = criterion(logits, targets)
                running_loss += loss.item() * homes.data.size(0)

            total_val_loss = running_loss / len(val_loader.dataset)
            val_losses.append(total_val_loss)

            pred = F.softmax(logits, dim=1).argmax(dim=1)

    print(f'Epoch: {epoch+1}/{num_epochs} - train loss: {train_losses[-1]} - validation loss: {val_losses[-1]}')


Epoch: 1/8 - train loss: 21.470626441599823 - validation loss: 21.172434831319272
Epoch: 2/8 - train loss: 21.31921265707425 - validation loss: 20.946672870408534
Epoch: 3/8 - train loss: 21.22005295468082 - validation loss: 20.83317345314009
Epoch: 4/8 - train loss: 21.17953526447259 - validation loss: 20.79622736321938
Epoch: 5/8 - train loss: 21.173850065675445 - validation loss: 20.790429281633138
Epoch: 6/8 - train loss: 21.070231935548993 - validation loss: 20.79819008070132
Epoch: 7/8 - train loss: 21.066355263892 - validation loss: 20.66529414897679
Epoch: 8/8 - train loss: 20.985203298777677 - validation loss: 20.58397569669024


In [11]:
test_dataset = StatsDataset(test_data, y_test)
test_loader = DataLoader(
    test_dataset, batch_size=20000, num_workers=0, collate_fn=collate_fn)

model.eval()
with torch.no_grad():
    for batch in test_loader:
        statics, homes, aways = batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device)
        targets = batch['targets'].squeeze(1).to(device)

        logits = model(statics, homes, aways)
        pred = F.softmax(logits, dim=1).argmax(dim=1)
        print(classification_report(targets.cpu(), pred.cpu()))
        
        score = accuracy_score(targets.cpu(), pred.cpu())
        

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       854
           1       0.49      0.87      0.63      1472
           2       0.45      0.32      0.38       953

    accuracy                           0.48      3279
   macro avg       0.32      0.40      0.34      3279
weighted avg       0.35      0.48      0.39      3279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
test_dataset = StatsDataset(test_data, y_test)
data_grouped_by_seasons = test_dataset.group_matches_by_seasons()

model.eval()
with torch.no_grad():
    for season_idx, data in data_grouped_by_seasons.items():
        subset = test_dataset.get_season_subset(season_idx)
        test_loader = DataLoader(subset, 10000, num_workers=0, collate_fn=collate_fn)
        
        for batch in test_loader:
            statics, homes, aways = batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device)
            targets = batch['targets'].squeeze(1).to(device)

            logits = model(statics, homes, aways)
            pred = F.softmax(logits, dim=1).argmax(dim=1)
            print(f'\nClassification report for season: {season_idx}')
            print(classification_report(targets.cpu(), pred.cpu()))
            


TypeError: object of type 'numpy.int32' has no len()

In [12]:
train_seasons_check = StatsDataset(train_data, y_train)
train_seasons_check_loader = DataLoader(
    train_seasons_check, batch_size=20000, num_workers=0, collate_fn=collate_fn)

model.eval()
with torch.no_grad():
    for batch in train_seasons_check_loader:
        statics, homes, aways = batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device)
        targets = batch['targets'].squeeze(1).to(device)

        logits = model(statics, homes, aways)
        pred = F.softmax(logits, dim=1).argmax(dim=1)
        print(classification_report(y_train, pred.cpu()))
        
        

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      3941
         1.0       0.49      0.87      0.63      6676
         2.0       0.51      0.38      0.44      4742

    accuracy                           0.50     15359
   macro avg       0.33      0.42      0.35     15359
weighted avg       0.37      0.50      0.41     15359



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
train_dataset = StatsDataset(train_data, y_train)
data_grouped_by_seasons = train_dataset.group_matches_by_seasons()

model.eval()
with torch.no_grad():
    for season_idx, data in data_grouped_by_seasons.items():
        subset = test_dataset.get_season_subset(season_idx)
        test_loader = DataLoader(subset, 10000, num_workers=0, collate_fn=collate_fn)
        
        for batch in test_loader:
            statics, homes, aways = batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device)
            targets = batch['targets'].squeeze(1).to(device)

            logits = model(statics, homes, aways)
            pred = F.softmax(logits, dim=1).argmax(dim=1)
            print(f'\nClassification report for season: {season_idx}')
            print(classification_report(targets.cpu(), pred.cpu()))
            

<torch.utils.data.dataset.Subset object at 0x0000014FC9B98AA0>


TypeError: object of type 'numpy.int32' has no len()

In [None]:
# save GRU model
#torch.save(model.state_dict(), f'gru_model_{score}.pkl')