In [1]:
import pickle
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.functional import F
from torch.utils.data import Dataset, DataLoader, Subset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch import optim
from sklearn.metrics import classification_report
import sklearn
from model_architecture import MatchPredictor
from gru_model_architecture import GRUMatchPredictor
import itertools
from sklearn.metrics import accuracy_score
pd.options.display.max_rows = 50

In [2]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
device

device(type='cuda')

In [3]:
with open('time_series_dataset.pkl', 'rb') as file:
    data = pickle.load(file)

In [4]:
class StatsDataset(Dataset):

    def __init__(self, x, y, cv=False, n_splits=None):
        self.x = x
        self.y = y
        self._cv = cv
        self._n_splits = n_splits
        if self._cv:
            self.folds = self._get_folds()

    def __getitem__(self, index):
        static_df, home_ts, away_ts = self.x[index]
        y = self.y[index]

        home = home_ts.drop(['season'], axis=1)
        away = away_ts.drop(['season'], axis=1)
        #cols = ['goals_scored_ratio', 'goals_conceded_ratio', 'raiting_ratio', 'xG_ratio']
        #static = static_df[static_df.columns.difference(cols)]

        #static = torch.as_tensor(static.values, dtype=torch.float32)
        static = torch.as_tensor(static_df.values, dtype=torch.float32)
        home = torch.as_tensor(home.values, dtype=torch.float32)
        away = torch.as_tensor(away.values, dtype=torch.float32)

        target = torch.as_tensor(y, dtype=torch.long)
        return static, home, away, target
    
    def __len__(self):
        return len(self.x)

    def _get_folds(self):
        folds = [[] for _ in range(self._n_splits)]
        for i, data in enumerate(self.x):
            folds[data[1]['season'].head(1).values[0].astype('int') % self._n_splits].append(i)
        return folds

    def group_matches_by_seasons(self):
        seasons_data_indices = dict()
        for i, data in enumerate(self.x):
            match_season = data[1]['season'].head(1).values[0].astype('int')
            if match_season in seasons_data_indices.keys():
                seasons_data_indices[match_season].append(i)
            else:
                seasons_data_indices[match_season] = [i]
        return seasons_data_indices

    def get_season_subset(self, indices):
        return Subset(self, indices)

    def get_subsets_for_cv(self, val_idx):
        assert self._cv, (
        'Cross validation is not possible when _cv attribute is set to False (that means it is a testing dataset).')

        train_indices = list(itertools.chain.from_iterable(self.folds[:val_idx] + self.folds[val_idx+1:]))
        val_indices = self.folds[val_idx]
        train_data = Subset(self, train_indices)
        val_data = Subset(self, val_indices)
        return train_data, val_data


In [5]:
seasons_data_indices = [key[2] for key in data.keys()]
max_season = sorted(seasons_data_indices, reverse=True)[0]

train_seasons = np.arange(1, max_season, 1)

#train_seasons = np.arange(1, 64, 1)

test_seasons = np.random.choice(train_seasons, size=60, replace=False)
train_seasons = np.setdiff1d(train_seasons, test_seasons)

train_data = [value for key, value in data.items() if key[2] in train_seasons]
test_data = [value for key, value in data.items() if key[2] in test_seasons]

y_train = np.array([x[0].pop('result') for x in train_data])
y_test = np.array([x[0].pop('result') for x in test_data])


In [6]:
for values in data.values():
    values[1].drop(['away_name', 'home_name', 'match_date'], inplace=True, axis=1)
    values[2].drop(['away_name', 'home_name', 'match_date'], inplace=True, axis=1)

    values[1].fillna({'round' : 55, 'coach_matches' : 0}, inplace=True)
    values[2].fillna({'round' : 55, 'coach_matches' : 0}, inplace=True)

    '''values[1] = values[1][['result_from_team_perspective', 'season']]
    values[2] = values[2][['result_from_team_perspective', 'season']]'''

In [7]:
#model = MatchPredictor(40, 8, 1, 4, device)
model = GRUMatchPredictor(40, 8, 1, 4, device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.to(device)



GRUMatchPredictor(
  (home_gru): GRU(40, 8, batch_first=True, dropout=0.2)
  (away_gru): GRU(40, 8, batch_first=True, dropout=0.2)
  (fc): Sequential(
    (0): Linear(in_features=20, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=32, out_features=16, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=16, out_features=3, bias=True)
  )
)

In [7]:
len(data)

75526

In [9]:
for key, val in data.items():
    print(val[0].info())
    print(val[1].info())
    print(len(val[2].columns))
    break

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 0 to 0
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   home_points    1 non-null      float64
 1   home_position  1 non-null      float64
 2   away_points    1 non-null      float64
 3   away_position  1 non-null      float64
dtypes: float64(4)
memory usage: 40.0 bytes
None
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 110 to 91
Data columns (total 46 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   away_acc_passes               4 non-null      float64
 1   away_acc_shots                4 non-null      float64
 2   away_blocked_shots            4 non-null      float64
 3   away_corners                  4 non-null      float64
 4   away_excluded_count           4 non-null      float64
 5   away_fouls                    4 non-null      float64
 6   away_free_kicks 

In [7]:
model = MatchPredictor(45, 64, 1, 4, device)
#model = GRUMatchPredictor(45, 8, 1, 4, device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
model.to(device)



MatchPredictor(
  (home_lstm): LSTM(45, 64, batch_first=True, dropout=0.5)
  (away_lstm): LSTM(45, 64, batch_first=True, dropout=0.5)
  (fc): Sequential(
    (0): Linear(in_features=132, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=512, bias=True)
    (3): ReLU()
    (4): Dropout(p=0.3, inplace=False)
    (5): Linear(in_features=512, out_features=1024, bias=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=1024, out_features=2048, bias=True)
    (9): ReLU()
    (10): Dropout(p=0.3, inplace=False)
    (11): Linear(in_features=2048, out_features=1024, bias=True)
    (12): ReLU()
    (13): Dropout(p=0.3, inplace=False)
    (14): Linear(in_features=1024, out_features=512, bias=True)
    (15): ReLU()
    (16): Linear(in_features=512, out_features=256, bias=True)
    (17): ReLU()
    (18): Linear(in_features=256, out_features=128, bias=True)
    (19): Linear(in_features=128, out_features=64, bias=True)
    (

In [8]:
def collate_fn(batch):
    statics, homes, aways, targets = zip(*batch)

    away_lens = torch.as_tensor([a.size(0) for a in aways])
    home_lens = torch.as_tensor([h.size(0) for h in homes])

    home_padded = pad_sequence(homes, batch_first=True, padding_value=0.0)
    away_padded = pad_sequence(aways, batch_first=True, padding_value=0.0)

    home_packed = pack_padded_sequence(home_padded, home_lens, batch_first=True, enforce_sorted=False)
    away_packed = pack_padded_sequence(away_padded, away_lens, batch_first=True, enforce_sorted=False)

    statics = torch.stack(statics)
    statics = statics.squeeze(1)
    #statics = statics.flatten(start_dim=1)
    targets = torch.stack(targets)

    return {
        'statics' : statics,
        'homes' : home_packed,
        'aways' : away_packed,
        'targets' : targets
    }


In [9]:
batch_size = 64
num_epochs = 16

train_dataset = StatsDataset(train_data, y_train, cv=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)

test_dataset = StatsDataset(test_data, y_test)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, num_workers=0, collate_fn=collate_fn)

train_losses = []

for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()
            statics, homes, aways = batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device)
            targets = batch['targets'].squeeze(1).to(device)

            logits = model(statics, homes, aways)
            loss = criterion(logits, targets)
            running_loss += loss.item() * homes.data.size(0)

            loss.backward()
            optimizer.step()

        total_train_loss = running_loss / len(train_loader.dataset)
        train_losses.append(total_train_loss)

        '''model.eval()
        running_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                statics, homes, aways = (
                     batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device))
                targets = batch['targets'].squeeze(1).to(device)

                logits = model(statics, homes, aways)
                loss = criterion(logits, targets)
                running_loss += loss.item() * homes.data.size(0)

            total_val_loss = running_loss / len(val_loader.dataset)
            val_losses.append(total_val_loss)

            pred = F.softmax(logits, dim=1).argmax(dim=1)'''

        print(f'Epoch: {epoch+1}/{num_epochs} - train loss: {train_losses[-1]}')


Epoch: 1/16 - train loss: 22.55870849949587
Epoch: 2/16 - train loss: 20.665281972673053
Epoch: 3/16 - train loss: 20.663743266925916
Epoch: 4/16 - train loss: 20.658232658141177
Epoch: 5/16 - train loss: 20.654469244727693
Epoch: 6/16 - train loss: 20.658186029508187
Epoch: 7/16 - train loss: 20.660564913358034
Epoch: 8/16 - train loss: 20.655567268897578
Epoch: 9/16 - train loss: 20.657768057459364
Epoch: 10/16 - train loss: 20.654833396445536
Epoch: 11/16 - train loss: 20.656584797619022
Epoch: 12/16 - train loss: 20.65578469129728
Epoch: 13/16 - train loss: 40.277145173131565
Epoch: 14/16 - train loss: 20.7137404502654
Epoch: 15/16 - train loss: 20.677322747440467
Epoch: 16/16 - train loss: 20.66995717326623


In [10]:
test_dataset = StatsDataset(test_data, y_test)
test_loader = DataLoader(
    test_dataset, batch_size=20000, num_workers=0, collate_fn=collate_fn)

model.eval()
with torch.no_grad():
    for batch in test_loader:
        statics, homes, aways = batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device)
        targets = batch['targets'].squeeze(1).to(device)

        logits = model(statics, homes, aways)
        pred = F.softmax(logits, dim=1).argmax(dim=1)
        print(classification_report(targets.cpu(), pred.cpu()))
        
        score = accuracy_score(targets.cpu(), pred.cpu())

np.bincount(pred.cpu())
        

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4528
           1       0.45      1.00      0.62      7697
           2       0.00      0.00      0.00      4833

    accuracy                           0.45     17058
   macro avg       0.15      0.33      0.21     17058
weighted avg       0.20      0.45      0.28     17058



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


array([    0, 17058], dtype=int64)

In [11]:
train_seasons_check = StatsDataset(train_data, y_train)
train_seasons_check_loader = DataLoader(
    train_seasons_check, batch_size=80000, num_workers=0, collate_fn=collate_fn)

model.eval()
with torch.no_grad():
    for batch in train_seasons_check_loader:
        statics, homes, aways = batch['statics'].to(device), batch['homes'].to(device), batch['aways'].to(device)
        targets = batch['targets'].squeeze(1).to(device)

        logits = model(statics, homes, aways)
        pred = F.softmax(logits, dim=1).argmax(dim=1)
        print(classification_report(targets.cpu(), pred.cpu()))
        
        

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     15179
           1       0.44      1.00      0.62     25856
           2       0.00      0.00      0.00     17154

    accuracy                           0.44     58189
   macro avg       0.15      0.33      0.21     58189
weighted avg       0.20      0.44      0.27     58189



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# save GRU model
#torch.save(model.state_dict(), f'gru_model_{score}.pkl')