# Training

## Step 1: import

In [33]:
from random import randrange

import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping

from _MatchNotFoundException import MatchNotFoundException

In [34]:
match_cols = ['season', 'round'] + \
['date', 'time', 'referee', 'home_team', 'away_team', 'home_score', 'away_score'] + \
['home_coach'] + \
['home_player_' + str(i) for i in range(1, 12)] + \
['home_substitute_' + str(i) for i in range(1, 13)] + \
['away_coach'] + \
['away_player_' + str(i) for i in range(1, 12)] + \
['away_substitute_' + str(i) for i in range(1, 13)]

In [35]:
train = pd.read_csv('train.csv')
tot_num_of_feats = len(train.columns)
del train

## Step 2: dataset definition

In [36]:
# utility methods
def get_match_by_team_season_round(df: pd.DataFrame, team: str, season: int, round: int) -> pd.DataFrame:
    """Get the match played by the given team in the given season and round. If the team has not played any match in that round, an empty dataframe is returned."""
    return df[((df[f'home_team_{team}'] == 1) | (df[f'away_team_{team}'] == 1)) & (df['round'] == round) & (df['season'] == season)]


def get_last_n_matches_played_by_team_before_round_in_season(df: pd.DataFrame, team: str, season: int, round: int, n: int) -> pd.DataFrame:
    """Look in df for the last n matches played by the given team before the given round and season. A dataframe with exactly n element is returned.
    If n matches can't be found from the current season, the previous ones are iteratively considered, until n matches are found or the end
    of the dataframe is reached, in which case padding is applied to ensure a result size of n."""

    def exists_historical_matches_before_round_and_season(q_round: int, q_season: int) -> bool:
        if (q_season == 0) & (q_round <= 1):
            return False
        return True

    def decrement_round_in_season(c_round: int, c_season: int) -> (int, int):
        if c_round - 1 > 0:
            c_round -= 1
            return c_round, c_season
        c_season -= 1
        c_round = 38
        return c_round, c_season

    current_round, current_season = round, season
    result = pd.DataFrame()
    while True:
        if not exists_historical_matches_before_round_and_season(current_round, current_season):
            if result.empty:
                raise MatchNotFoundException
            return fill_with_padding(result)
        current_round, current_season = decrement_round_in_season(current_round, current_season)
        historical_match_at_current_round = get_match_by_team_season_round(df, team, current_season, current_round)
        if not historical_match_at_current_round.empty:
            result = pd.concat([result, historical_match_at_current_round])
            if len(result) == n:
                return result


def fill_with_padding(source: pd.DataFrame):
    if len(source) < 5:
        padding = source.tail(1)
        for i in range(5 - len(source)):
            source = pd.concat([source, padding], ignore_index=True)
    return source


def get_playing_home_team_name(row: pd.DataFrame) -> str:
    team_columns = row.loc[:, [col for col in row.columns if col.startswith('home_team_')]]
    team_name = team_columns.where(team_columns == 1).dropna(axis=1).columns[0].replace('home_team_', '')
    return team_name


def get_playing_away_team_name(row: pd.DataFrame) -> str:
    team_columns = row.loc[:, [col for col in row.columns if col.startswith('away_team_')]]
    team_name = team_columns.where(team_columns == 1).dropna(axis=1).columns[0].replace('away_team_', '')
    return team_name

In [37]:
class SerieAFootballMatchesDataset(Dataset):
    def __init__(self, csv_file, history_len = 5):
        self.dataframe = pd.read_csv(csv_file)
        self.history_len = history_len

    def __len__(self) -> int:
        return len(self.dataframe)

    def __getitem__(self, idx):
        def show_error(index, error_x, error_x_historical_home, error_x_historical_away, error_y):
            print(f'error at index (scaled): {index} (unscaled): {self.unscale_min_idx(index)}')
            print(f'x: {error_x}')
            print(f'x.shape: {error_x.shape}')
            print(f'x_historical_home: {error_x_historical_home}')
            print(f'x_historical_home.shape: {error_x_historical_home.shape}')
            print(f'x_historical_away: {error_x_historical_away}')
            print(f'x_historical_away.shape: {error_x_historical_away.shape}')
            print(f'y: {error_y}')
            print(f'y.shape: {error_y.shape}')

        idx = self.scale_min_idx(idx)
        x = self.dataframe.iloc[[idx]]  # df
        y = self.dataframe[['result_home', 'result_draw', 'result_away']].iloc[0].values
        try: # if we are not able to fetch at least one historical match, then we switch to another index
            last_n_games_home, last_n_games_away = self.retrieve_historical_data(x)
            x, x_historical_home, x_historical_away, y = self.to_tensor(x, last_n_games_home, last_n_games_away, y)
            exp_num_of_features = len(self.dataframe.columns)
            if x.shape[0] != exp_num_of_features:
                show_error(idx, x, x_historical_home, x_historical_away, y)
            if (x_historical_home.shape[0] != 5) | (x_historical_home.shape[1] != exp_num_of_features):
                show_error(idx, x, x_historical_home, x_historical_away, y)
            if (x_historical_away.shape[0] != 5) | (x_historical_away.shape[1] != exp_num_of_features):
                show_error(idx, x, x_historical_home, x_historical_away, y)
            if y.shape[0] != 3:
                show_error(idx, x, x_historical_home, x_historical_away, y)
            return x, x_historical_home, x_historical_away, y
        except MatchNotFoundException:
            new_idx = randrange(0, len(self.dataframe))
            print(f'MatchNotFoundException for idx={idx}, switching to idx={new_idx}')
            return self.__getitem__(new_idx)

    def scale_min_idx(self, idx: int) -> int:
        """Scale the given index to a range with a new minimum that allows for historical data retrieval"""
        old_min = 0
        old_max = len(self.dataframe)
        # idx = 10 corresponds to the first match of the second round.
        # This ensure the retrieval of at least 1 historical match.
        # In the worst case scenario, padding will fill the other 4 historical slots.
        new_min = 10
        new_max = old_max
        return self.scale_idx(idx, old_min, old_max, new_min, new_max)

    def unscale_min_idx(self, idx: int) -> int:
        """Apply the inverse transformation of scale_min_idx"""
        old_min = 10
        old_max = len(self.dataframe)
        new_min = 0
        new_max = old_max
        return self.scale_idx(idx, old_min, old_max, new_min, new_max)

    def scale_idx(self, idx, old_min, old_max, new_min, new_max):
        """Scale the given index to a new range"""
        old_range = old_max - old_min
        new_range = new_max - new_min
        normalized_idx = (idx - old_min) / old_range
        return int(round(normalized_idx * new_range + new_min))

    def retrieve_historical_data(self, source: pd.DataFrame):
        """Retrieve historical data for home and away teams from source"""
        last_n_games_home = get_last_n_matches_played_by_team_before_round_in_season(
            self.dataframe, get_playing_home_team_name(source), source['season'].values[0], source['round'].values[0], self.history_len)
        last_n_games_away = get_last_n_matches_played_by_team_before_round_in_season(
            self.dataframe, get_playing_away_team_name(source), source['season'].values[0], source['round'].values[0], self.history_len)
        return last_n_games_home, last_n_games_away

    def to_tensor(self, x: pd.DataFrame, x_historical_home: pd.DataFrame, x_historical_away: pd.DataFrame, y: list[int]):
        x_tensor = torch.flatten(torch.tensor(x.values))
        x_historical_home_tensor = torch.tensor(x_historical_home.values)
        x_historical_away_tensor = torch.tensor(x_historical_away.values)
        y_tensor = torch.tensor(y)
        return x_tensor, x_historical_home_tensor, x_historical_away_tensor, y_tensor

In [38]:
train_dataset = SerieAFootballMatchesDataset(csv_file='train.csv')
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = SerieAFootballMatchesDataset(csv_file='test.csv')
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Step 3: training

### Parameters

In [39]:
learning_rate = 0.00001
num_epochs = 1
hidden_size = 128

### Model definition

In [40]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.linear = nn.Linear(input_size + hidden_size, hidden_size)
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        combined = torch.cat([input, hidden], dim=0)
        pre_hidden = self.linear(combined)
        hidden = self.tanh(pre_hidden)
        return hidden

    def init_hidden(self, minibatch_size):
        return torch.zeros(minibatch_size, self.hidden_size)


class NeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super(NeuralNetwork, self).__init__()
        self.input_size = input_size
        self.flatten = nn.Flatten()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 3),
            # nn.Softmax(dim=1) softmax is applied implicitly by CrossEntropyLoss
        )

    def forward(self, x):
        # 'x' is the combination of: 'x', 'x_historical_home', 'x_historical_away'
        # they all have size: minibatch_size x num_of_feats
        x = self.flatten(x) # just in case x was not flattened
        output = self.layers(x)
        return output


class HybridNetwork(pl.LightningModule):
    def __init__(self, rnn_home_model: RNN, rnn_away_model: RNN, mlp_model: NeuralNetwork, learning_rate: float = 0.001):
        super(HybridNetwork, self).__init__()
        self.rnn_home = rnn_home_model
        self.rnn_away = rnn_away_model
        self.mlp = mlp_model
        self.learning_rate = learning_rate

    def forward(self, x, x_historical_home, x_historical_away):
        """Compute y_hat from dataloader input"""
        # 'x' comes in as:                minibatch_size x 1 x num_of_feats
        # 'x_historical_*' comes in as:   minibatch_size x 5 x num_of_feats
        # 'rnn_*_hidden' will be:         minibatch_size x num_of_feats
        batch_size = x.size(0)
        time_seq_len = x_historical_home.size(1)
        ''' === RNN HOME FORWARD === '''
        rnn_home_hidden = self.rnn_home.init_hidden(batch_size)
        # print(rnn_home_hidden)
        for batch_idx in range(batch_size):
            for history_idx in range(time_seq_len):
                rnn_home_hidden[batch_idx] = self.rnn_home(
                    torch.flatten(x_historical_home[batch_idx, history_idx]),
                    rnn_home_hidden[batch_idx])
        ''' === RNN AWAY FORWARD === '''
        rnn_away_hidden = self.rnn_away.init_hidden(batch_size)
        for batch_idx in range(batch_size):
            for history_idx in range(time_seq_len):
                rnn_away_hidden[batch_idx] = self.rnn_away(
                    torch.flatten(x_historical_away[batch_idx, history_idx]),
                    rnn_away_hidden[batch_idx])
        # print(f'rnn_home: {rnn_home_hidden}, rnn_away: {rnn_away_hidden}')
        ''' === MLP FORWARD === '''
        x_train = torch.cat([x, rnn_home_hidden, rnn_away_hidden], dim=1)
        y_hat = self.mlp(x_train)
        return y_hat

    def training_step(self, batch, batch_idx):
        x, x_historical_home, x_historical_away, y = batch
        y_hat = self(x, x_historical_home, x_historical_away)
        loss = F.cross_entropy(y_hat, y.to(dtype=torch.float))
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def test_step(self, batch, batch_idx):
        x, x_historical_home, x_historical_away, y = batch
        y_hat = self(x, x_historical_home, x_historical_away)
        test_loss = F.cross_entropy(y_hat, y.to(dtype=torch.float))
        return {"test_loss": test_loss}

    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x["test_loss"] for x in outputs]).mean()
        logs = {"test_loss": avg_loss}
        return {"test_loss": avg_loss, "log": logs, "progress_bar": logs}

    def configure_optimizers(self):
        return optim.SGD(model.parameters(), lr=self.learning_rate)

In [41]:
rnn_home = RNN(input_size=tot_num_of_feats, hidden_size=hidden_size)
rnn_away = RNN(input_size=tot_num_of_feats, hidden_size=hidden_size)
mlp = NeuralNetwork(hidden_size * 2 + tot_num_of_feats)
model = HybridNetwork(rnn_home_model=rnn_home, rnn_away_model=rnn_away, mlp_model=mlp, learning_rate=learning_rate)

### Model fitting

In [42]:
logger = TensorBoardLogger("tb_logs", name="football_results")
trainer = pl.Trainer(max_epochs=num_epochs, logger=logger)
trainer.fit(model, train_dataloader=train_loader)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name     | Type          | Params
-------------------------------------------
0 | rnn_home | RNN           | 1 M   
1 | rnn_away | RNN           | 1 M   
2 | mlp      | NeuralNetwork | 4 M   


Training: 0it [00:00, ?it/s]

MatchNotFoundException for idx=2433, switching to idx=167
MatchNotFoundException for idx=765, switching to idx=2620
MatchNotFoundException for idx=384, switching to idx=1236
MatchNotFoundException for idx=1528, switching to idx=3946
MatchNotFoundException for idx=385, switching to idx=3908
MatchNotFoundException for idx=1902, switching to idx=4463
MatchNotFoundException for idx=3804, switching to idx=2070
MatchNotFoundException for idx=3042, switching to idx=1355
MatchNotFoundException for idx=766, switching to idx=793
MatchNotFoundException for idx=1901, switching to idx=3762
MatchNotFoundException for idx=2662, switching to idx=1758
MatchNotFoundException for idx=386, switching to idx=321
MatchNotFoundException for idx=3039, switching to idx=4694
MatchNotFoundException for idx=3803, switching to idx=3608
MatchNotFoundException for idx=4563, switching to idx=2350
MatchNotFoundException for idx=4184, switching to idx=2315
MatchNotFoundException for idx=1149, switching to idx=2302
Match

1

In [43]:
trainer.test(model, test_dataloaders=test_loader)

Testing: 0it [00:00, ?it/s]

MatchNotFoundException for idx=38, switching to idx=389
MatchNotFoundException for idx=99, switching to idx=251
MatchNotFoundException for idx=384, switching to idx=755
MatchNotFoundException for idx=386, switching to idx=325
MatchNotFoundException for idx=389, switching to idx=912
MatchNotFoundException for idx=762, switching to idx=1041
MatchNotFoundException for idx=777, switching to idx=979
MatchNotFoundException for idx=779, switching to idx=48
--------------------------------------------------------------------------------
TEST RESULTS
{'test_loss': tensor(6.5140)}
--------------------------------------------------------------------------------


{'test_loss': 6.51402473449707}

# Limitations
- We don't have data about new players that come to play in _Serie A_ during the course of the seasons. The model has to learn from zero context how important their contribution is for the outcome of the matches. If we were to considered multiple leagues, we could keep track of player transfers and maintain the history.
- We don't have data about cup matches played during the course of the seasons, like _Champions League_, _Europa League_ and _Coppa Italia_. Since they are very prestigious competitions and matches are usually very competitive, teams put a lot of effort in them and therefore can then perform worse in the championship.
- We don't have any type of player performance metric like who scored a goal, who was the assist man, red or yellow cards, goalkeeper's saves etc. so the model could face some difficulties in learning which player is important for the team.