In [1920]:
import re

import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelBinarizer
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from train_utils import AverageMeter

In [1921]:
# Column layout of a raw (un-encoded) match record: match metadata first,
# then for each side its coach, the 11 lineup players and the 7 substitutes.
match_cols = [
    'season', 'round',
    'date', 'time', 'referee', 'home_team', 'away_team', 'home_team_score', 'away_team_score',
    *(
        column
        for side in ('home', 'away')
        for column in (
            f'{side}_team_coach',
            *(f'{side}_player_{i}' for i in range(1, 12)),
            *(f'{side}_substitute_{i}' for i in range(1, 8)),
        )
    ),
]

In [1922]:
# Load the training table from train.csv for inspection and feature counting.
train = pd.read_csv('train.csv')

In [1923]:
# Preview the first rows to sanity-check the loaded columns.
train.head()

Unnamed: 0,season,round,year,month,day,hour,home_team_score,away_team_score,referee_ANDREA DE,referee_ANDREA GERVASONI,...,away_substitute_7_Vitaliy Kutuzov,away_substitute_7_Vitorino Antunes,away_substitute_7_Vittorio Tosto,away_substitute_7_Walter Samuel,away_substitute_7_Willy Aubameyang,away_substitute_7_Wilson,away_substitute_7_Xhulian Rrudho,away_substitute_7_Yoann Gourcuff,away_substitute_7_Zdravko Kuzmanovic,away_substitute_7_Zlatan Muslimovic
0,0,1,2005,8,27,20,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,2005,8,27,18,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,2005,8,28,15,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,2005,8,28,15,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,2005,8,28,15,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1882]:
# utility methods
def get_column_names_containing_str(df: pd.DataFrame, substring: str) -> list[str]:
    """Return the names of the columns of `df` whose label contains `substring`.

    The match is delegated to `Index.str.contains`, so `substring` is
    interpreted with that method's default settings.
    """
    is_match = df.columns.str.contains(substring)
    selected = df.loc[:, is_match]
    return selected.columns.values.tolist()


def get_team_and_historical_index_from_match_team_id(match_team_id: str) -> tuple[str, str]:
    """Split a match-team identifier into its two regex-extracted parts.

    Returns a pair ``(name_part, index_part)`` where ``name_part`` is the first
    match of ``\\s+`` and ``index_part`` is the first run of digits in
    `match_team_id`.

    Raises:
        IndexError: if either pattern does not occur in `match_team_id`.
    """
    # Raw strings: "\s" and "\d" are invalid escape sequences in ordinary
    # string literals and trigger a warning on current Python versions.
    # NOTE(review): `\s+` captures a run of *whitespace*, which is unlikely to
    # be a team name; the intended pattern was probably `\S+` or `\D+`.
    # Behaviour is kept as-is until the identifier format is confirmed.
    match_team_name = re.findall(r"\s+", match_team_id)[0]
    match_team_index = re.findall(r"\d+", match_team_id)[0]
    return match_team_name, match_team_index


def get_match_by_team_season_round(df: pd.DataFrame, team: str, season: int, round: int) -> pd.DataFrame:
    """Select the rows of `df` in which `team` plays in the given season and round.

    A row is kept when either the `home_team_<team>` or the `away_team_<team>`
    indicator column equals 1 and its `round` / `season` values match.
    """
    plays_at_home = df[f'home_team_{team}'] == 1
    plays_away = df[f'away_team_{team}'] == 1
    in_round = df['round'] == round
    in_season = df['season'] == season
    return df[(plays_at_home | plays_away) & in_round & in_season]


def get_last_n_matches_played_by_team_before_round_in_season(df: pd.DataFrame, team: str, season: int, round: int, n: int) -> pd.DataFrame:
    """Collect the matches `team` played in the `n` rounds preceding `round`.

    Rounds are looked up in the same `season`, from the most recent
    (`round - 1`) backwards; rounds below 1 are skipped. The rows keep the
    index labels they have in `df`.

    Returns:
        The matching rows, most recent round first. An empty, column-less
        DataFrame when no earlier round exists (e.g. `round == 1`).
    """
    previous_matches = [
        get_match_by_team_season_round(df, team, season, round - i)
        for i in range(1, n + 1)
        if round - i > 0
    ]
    if not previous_matches:
        return pd.DataFrame()
    # Concatenate once: growing a frame inside the loop is quadratic, and
    # seeding it with an empty DataFrame relies on deprecated concat behaviour.
    return pd.concat(previous_matches)

### Data encoding
We need to encode the data before feeding it to the network. Here we define encoding methods that return PyTorch tensors.

#### Seasons and Rounds

In [1883]:
# class SeasonRoundEncoder(object):
#     """Encode the season and round columns of the given pandas DataFrame sample"""
#
#     def __init__(self, season_dict_map: dict):
#         self.mapping = season_dict_map
#
#     def __call__(self, sample: pd.DataFrame) -> torch.tensor:
#         season_encoding = torch.tensor([[el] for el in sample['season'].map(self.mapping).tolist()], dtype=torch.int32)
#         round_encoding = torch.tensor([[el] for el in sample['round'].tolist()], dtype=torch.int32)
#         return torch.cat([season_encoding, round_encoding], 1)

In [1884]:
# season2index = {'20' + f'{i + 5}'.zfill(2) + '-' + f'{i + 6}'.zfill(2): i for i in range(16)}
# season_round_encoder = SeasonRoundEncoder(season2index)

In [1885]:
# # TEST seasons and rounds encoding
# tensor = season_round_encoder(df.iloc[0:2])
# seasons_rounds_expected_num_of_feats = 2
# if tensor.shape[1] == seasons_rounds_expected_num_of_feats:
#     print('SEASONS and ROUNDS encoding OK')
# else:
#     print(f'num of features: {tensor.shape[1]}')
#     print(f'expected num of features: {seasons_rounds_expected_num_of_feats}')
#     raise Exception('SEASONS and ROUNDS encoding NOT OK! :(')

SEASONS and ROUNDS encoding OK


#### Datetime values

In [1886]:
# class DatetimeEncoder(object):
#     """Encode the year, month, day and hour columns of the given pandas DataFrame sample"""
#
#     def __init__(self):
#         pass
#
#     def __call__(self, sample: pd.DataFrame) -> torch.tensor:
#         year_encoding = torch.tensor([[el] for el in sample['year'].tolist()], dtype=torch.int32)
#         month_encoding = torch.tensor([[el] for el in sample['month'].tolist()], dtype=torch.int32)
#         day_encoding = torch.tensor([[el] for el in sample['day'].tolist()], dtype=torch.int32)
#         hour_encoding = torch.tensor([[el] for el in sample['hour'].tolist()], dtype=torch.int32)
#         return torch.cat([year_encoding, month_encoding, day_encoding, hour_encoding], 1)

In [1887]:
# datetime_encoder = DatetimeEncoder()

In [1888]:
# # TEST datetime values encoding
# tensor = datetime_encoder(df.iloc[0:2])
# datetime_expected_num_of_feats = 4
# if tensor.shape[1] == datetime_expected_num_of_feats:
#     print('DATETIME encoding OK')
# else:
#     print(f'num of features: {tensor.shape[1]}')
#     print(f'expected num of features: {datetime_expected_num_of_feats}')
#     raise Exception('DATETIME encoding NOT OK! :(')

DATETIME encoding OK


#### Results
One-hot encoding

In [1889]:
# class ResultEncoder(object):
#     """Encode the result column of the given pandas DataFrame sample"""
#
#     def __init__(self, dict_map: dict):
#         self.mapping = dict_map
#
#     def __call__(self, sample: pd.DataFrame) -> torch.LongTensor:
#         return torch.LongTensor(sample['result'].map(self.mapping).tolist())

In [1890]:
# result2onehot = {'home': [1, 0, 0], 'draw': [0, 1, 0], 'away': [0, 0, 1]}
# result_encoder = ResultEncoder(result2onehot)

In [1891]:
# # TEST results encoding
# tensor = result_encoder(df.iloc[0:2])
# print(tensor)
# print(type(tensor.to(torch.long)))
# results_expected_num_of_feats = len(df['result'].unique())
# if tensor.shape[1] == results_expected_num_of_feats:
#     print('RESULT encoding OK')
# else:
#     print(f'num of features: {tensor.shape[1]}')
#     print(f'expected num of features: {results_expected_num_of_feats}')
#     raise Exception('RESULT encoding NOT OK! :(')

tensor([[1, 0, 0],
        [1, 0, 0]])
<class 'torch.Tensor'>
RESULT encoding OK


#### Referees
One-hot encoding

In [1892]:
# class RefereeEncoder(object):
#     """Encode the referee column of the given pandas DataFrame sample"""
#
#     def __init__(self, lb: LabelBinarizer):
#         self.lb = lb
#
#     def __call__(self, sample: pd.DataFrame) -> torch.tensor:
#         return torch.tensor(self.lb.transform(sample['referee'].tolist()))

In [1893]:
# lb = LabelBinarizer()
# fitted_lb = lb.fit(df['referee'].tolist())
# referee_encoder = RefereeEncoder(fitted_lb)

In [1894]:
# # TEST referees encoding
# tensor = referee_encoder(df.iloc[0:2])
# referees_expected_num_of_feats = len(df['referee'].unique())
# if tensor.shape[1] == referees_expected_num_of_feats:
#     print('REFEREE encoding OK')
# else:
#     print(f'num of features: {tensor.shape[1]}')
#     print(f'expected num of features: {referees_expected_num_of_feats}')
#     raise Exception('REFEREE encoding NOT OK! :(')

REFEREE encoding OK


#### Teams
One-hot encoding

In [1895]:
# class TeamsEncoder(object):
#     """Encode the home_team and away_team columns of the given pandas DataFrame sample"""
#
#     def __init__(self, lb: LabelBinarizer):
#         self.lb = lb
#
#     def __call__(self, sample: pd.DataFrame) -> torch.tensor:
#         home_encoding = torch.tensor(self.lb.transform(sample['home_team'].tolist()))
#         away_encoding = torch.tensor(self.lb.transform(sample['away_team'].tolist()))
#         return torch.cat([home_encoding, away_encoding], 1)

In [1896]:
# lb = LabelBinarizer()
# # every team has played as home team at least once
# fitted_lb = lb.fit(df['home_team'].tolist())
# teams_encoder = TeamsEncoder(fitted_lb)

In [1897]:
# # TEST teams encoding
# tensor = teams_encoder(df.iloc[0:2])
# teams_expected_num_of_feats = len(df['home_team'].unique()) * 2
# if tensor.shape[1] == teams_expected_num_of_feats:
#     print('TEAMS encoding OK')
# else:
#     print(f'num of features: {tensor.shape[1]}')
#     print(f'expected num of features: {teams_expected_num_of_feats}')
#     raise Exception('TEAMS encoding NOT OK! :(')

TEAMS encoding OK


#### Coaches
One-hot encoding

In [1898]:
# class CoachesEncoder(object):
#     """Encode the home_team_coach and away_team_coach columns of the given pandas DataFrame sample"""
#
#     def __init__(self, lb: LabelBinarizer):
#         self.lb = lb
#
#     def __call__(self, sample: pd.DataFrame) -> torch.tensor:
#         home_coach_encoding = torch.tensor(self.lb.transform(sample['home_team_coach'].tolist()))
#         away_coach_encoding = torch.tensor(self.lb.transform(sample['away_team_coach'].tolist()))
#         return torch.cat([home_coach_encoding, away_coach_encoding], 1)

In [1899]:
# lb = LabelBinarizer()
# # every team has played as home team at least once, so home_team_coach already contains all the coaches
# fitted_lb = lb.fit(df['home_team_coach'].tolist())
# coaches_encoder = CoachesEncoder(fitted_lb)

In [1900]:
# # TEST coaches encoding
# tensor = coaches_encoder(df.iloc[0:2])
# coaches_expected_num_of_feats = len(df['home_team_coach'].unique()) * 2
# if tensor.shape[1] == coaches_expected_num_of_feats:
#     print('COACH encoding OK')
# else:
#     print(f'num of features: {tensor.shape[1]}')
#     print(f'expected num of features: {coaches_expected_num_of_feats}')
#     raise Exception('COACH encoding NOT OK! :(')

COACH encoding OK


#### Players
One-hot encoding. We treat all players equally, whether they are part of the starting lineup or substitutes.

In [1901]:
# class PlayersEncoder(object):
#     """Encode the home and away team lineup and substitute players of the given pandas DataFrame sample"""
#
#     def __init__(self, lb: LabelBinarizer):
#         self.lb = lb
#
#     def __call__(self, sample: pd.DataFrame) -> torch.tensor:
#         result = []
#         for i in range(1, 12):
#             result += [torch.tensor(self.lb.transform(sample[f'home_player_{i}'].tolist()))]
#         for i in range(1, 8):
#             result += [torch.tensor(self.lb.transform(sample[f'home_substitute_{i}'].tolist()))]
#         for i in range(1, 12):
#             result += [torch.tensor(self.lb.transform(sample[f'away_player_{i}'].tolist()))]
#         for i in range(1, 8):
#             result += [torch.tensor(self.lb.transform(sample[f'away_substitute_{i}'].tolist()))]
#         return torch.cat(result, 1)

In [1902]:
# def flatten_list(list_of_lists: list[list[str]]) -> list[str]:
#     return [item for sublist in list_of_lists for item in sublist]
#
#
# def encode_fit_players(source_df: pd.DataFrame) -> LabelBinarizer:
#     lb = LabelBinarizer()
#     player_cols = get_column_names_containing_str(source_df, 'home_player')
#     player_cols += get_column_names_containing_str(source_df, 'home_substitute')
#     all_players_unflattened = source_df.loc[:, player_cols].values.tolist()
#     all_players_flattened = flatten_list(all_players_unflattened)
#     lb.fit(all_players_flattened)
#     return lb

In [1903]:
# lb = LabelBinarizer()
# fitted_lb = encode_fit_players(df)
# players_encoder = PlayersEncoder(fitted_lb)

In [1904]:
# # TEST players encoding
# player_cols = get_column_names_containing_str(df, 'home_player')
# player_cols += get_column_names_containing_str(df, 'home_substitute')
# tensor = players_encoder(df.iloc[0:2])
# all_unique_player_names = pd.concat([df[player_cols[i]] for i in range(len(player_cols))], axis=0).unique()
# players_expected_num_of_feats = len(all_unique_player_names) * (11 + 7) * 2
# if tensor.shape[1] == players_expected_num_of_feats:
#     print('PLAYER encoding OK')
# else:
#     print(f'num of features: {tensor.shape[1]}')
#     print(f'expected num of features: {players_expected_num_of_feats}')
#     raise Exception('PLAYER encoding NOT OK! :(')

PLAYER encoding OK


#### Encoder

In [1905]:
# class Encode(object):
#     """Encode the given pandas DataFrame sample and return a pytorch Tensor"""
#
#     def __init__(self, season_round_enc: SeasonRoundEncoder, datetime_enc: DatetimeEncoder,
#                  result_enc: ResultEncoder, referee_enc: RefereeEncoder, teams_enc: TeamsEncoder,
#                  coaches_enc: CoachesEncoder, players_enc: PlayersEncoder,
#                  keep_scores: bool, keep_result: bool):
#         self.season_round_encoder = season_round_enc
#         self.datetime_encoder = datetime_enc
#         self.result_encoder = result_enc
#         self.referee_encoder = referee_enc
#         self.teams_encoder = teams_enc
#         self.coaches_encoder = coaches_enc
#         self.players_encoder = players_enc
#         self.keep_scores = keep_scores
#         self.keep_result = keep_result
#
#     def __call__(self, sample: pd.DataFrame) -> torch.tensor:
#         encoded = torch.cat((
#             self.season_round_encoder(sample),
#             self.datetime_encoder(sample),
#             self.referee_encoder(sample),
#             self.teams_encoder(sample),
#             self.coaches_encoder(sample),
#             self.players_encoder(sample)
#         ), dim=1)
#         # print(encoded)
#         if self.keep_scores:
#             encoded = self.add_encoded_scores(encoded, sample)
#         if self.keep_result:
#             return self.add_encoded_result(encoded, sample)
#         return encoded
#
#     def add_encoded_scores(self, target: torch.tensor, source: pd.DataFrame) -> torch.tensor:
#         return torch.cat([
#             target,
#             torch.tensor([[el] for el in source['home_team_score'].tolist()], dtype=torch.int32),
#             torch.tensor([[el] for el in source['away_team_score'].tolist()], dtype=torch.int32)
#         ], 1)
#
#     def add_encoded_result(self, target: torch.tensor, source: pd.DataFrame) -> torch.tensor:
#         return torch.cat([target, self.result_encoder(source)], 1)

In [1906]:
# full_encoder = Encode(season_round_encoder, datetime_encoder, result_encoder, referee_encoder, teams_encoder,
#                       coaches_encoder, players_encoder, True, True)
#
# no_result_encoder = Encode(season_round_encoder, datetime_encoder, result_encoder, referee_encoder, teams_encoder,
#                            coaches_encoder, players_encoder, True, False)
#
# basic_encoder = Encode(season_round_encoder, datetime_encoder, result_encoder, referee_encoder, teams_encoder,
#                        coaches_encoder, players_encoder, False, False)

In [1907]:
# scores_expected_num_of_feats = 2

In [1908]:
# # print train tensor example
# test_sample = df.iloc[0:2]
# test_encoded_sample = full_encoder(test_sample)
# total_num_of_features = seasons_rounds_expected_num_of_feats + \
# datetime_expected_num_of_feats + \
# results_expected_num_of_feats + \
# referees_expected_num_of_feats + \
# teams_expected_num_of_feats + \
# coaches_expected_num_of_feats + \
# players_expected_num_of_feats + \
# scores_expected_num_of_feats
# if test_encoded_sample.shape[1] == total_num_of_features:
#     print("encoding OK")
# else:
#     print(f'num of features: {test_encoded_sample.shape[1]}')
#     print(f'expected num of features: {total_num_of_features}')
#     raise Exception("encoding NOT OK")
#


encoding OK


In [1909]:
# Width of one encoded match row, i.e. the number of columns in `train`.
# NOTE(review): this counts every column of the CSV, including the
# 'Unnamed: 0' column visible in the preview — confirm that is intended.
tot_num_of_feats = train.shape[1]
print(f'Total number of encoded features: {tot_num_of_feats}')

Total number of encoded features: 39912


In [1910]:
# Drop the exploratory frame now that the feature count has been recorded.
del train
# del lb

### Data normalization

In [1911]:
# todo

### Dataset construction

We need to define a torch Dataset and torch Dataloader that will be used during training.

In [1912]:
def fill_with_padding(source: pd.DataFrame, target_len: int = 5) -> pd.DataFrame:
    """Pad `source` to `target_len` rows by repeating its last row.

    Args:
        source: frame to pad.
        target_len: minimum number of rows of the result (defaults to 5).

    Returns:
        `source` itself when it already has at least `target_len` rows or has
        no rows at all (there is nothing to repeat); otherwise a new frame of
        exactly `target_len` rows with a fresh 0..target_len-1 index.
    """
    missing_rows = target_len - len(source)
    if missing_rows <= 0 or len(source) == 0:
        return source
    padding = source.tail(1)
    # A single concat replaces the previous concat-per-missing-row loop.
    return pd.concat([source] + [padding] * missing_rows, ignore_index=True)


class SerieAFootballMatchesDataset(Dataset):
    """Torch dataset over a CSV of encoded matches.

    Each item is a 4-tuple of float32 tensors:

    * ``x``: the current match without its score/result columns, shape (1, F - 5)
    * ``x_historical_home`` / ``x_historical_away``: the previous matches of
      the home / away team in the same season, padded by `fill_with_padding`,
      shape (rows, F)
    * ``y``: the one-hot result of the current match, shape (1, 3)

    where F is the number of columns of the CSV.
    """

    # Number of previous rounds looked up for each team.
    # NOTE(review): `fill_with_padding` is called with its own default length;
    # keep the two in sync if this value is changed.
    history_len = 5
    # One-hot outcome columns that make up the target `y`.
    target_columns = ['result_home', 'result_draw', 'result_away']
    # Final-score columns; like the targets they describe the outcome of the
    # current match, so they are removed from `x` (history rows keep them).
    score_columns = ['home_team_score', 'away_team_score']

    def __init__(self, csv_file):
        # NOTE(review): the CSV preview shows an 'Unnamed: 0' column; it is kept
        # as a feature here so row width matches the notebook's feature count.
        self.dataframe = pd.read_csv(csv_file)

    def __len__(self) -> int:
        return len(self.dataframe)

    def __getitem__(self, idx):
        idx = self.scale_idx(idx)
        x_as_df = self.dataframe.iloc[[idx]]  # one-row frame
        row = x_as_df.iloc[0]  # same match as a Series, selected by position
        # Target of *this* match only. The previous code selected the whole
        # frame and listed 'result_home' twice instead of 'result_away'.
        y = x_as_df[self.target_columns]
        last_n_games_home, last_n_games_away = self.retrieve_historical_data(row)
        last_n_games_home = self._pad_history(last_n_games_home)
        last_n_games_away = self._pad_history(last_n_games_away)
        # The current match must not expose its own outcome to the model.
        features = x_as_df.drop(columns=self.target_columns + self.score_columns)
        x, x_historical_home, x_historical_away, y = self.to_tensor(
            features, last_n_games_home, last_n_games_away, y)
        return x, x_historical_home, x_historical_away, y

    def scale_idx(self, idx: int) -> int:
        """Scale the given index to a range that allows for historical data retrieval.

        Indices in [0, len) are mapped linearly onto [10, len) and clamped to
        the last valid position, so the result is always a valid row position.

        Raises:
            IndexError: if the dataset is empty.
        """
        old_min = 0
        old_max = len(self.dataframe)
        if old_max == 0:
            raise IndexError('cannot index an empty dataset')
        # idx = 10 corresponds to the first match of the second round.
        # This ensures the retrieval of at least 1 historical match.
        # In the worst case scenario, padding will fill the other 4 historical slots.
        new_min = 10
        new_max = old_max
        old_range = old_max - old_min
        new_range = new_max - new_min
        normalized_idx = (idx - old_min) / old_range
        scaled = int(round(normalized_idx * new_range + new_min))
        # Rounding can land on len(dataframe) for small frames; clamp it.
        return max(0, min(scaled, old_max - 1))

    def retrieve_historical_data(self, source: pd.Series):
        """Retrieve historical data for the home and away teams of the match `source`."""
        last_n_games_home = get_last_n_matches_played_by_team_before_round_in_season(
            self.dataframe, self._team_name(source, 'home'), source['season'], source['round'],
            self.history_len)
        last_n_games_away = get_last_n_matches_played_by_team_before_round_in_season(
            self.dataframe, self._team_name(source, 'away'), source['season'], source['round'],
            self.history_len)
        return last_n_games_home, last_n_games_away

    def to_tensor(self, x: pd.DataFrame, x_historical_home: pd.DataFrame, x_historical_away: pd.DataFrame,
               y: pd.DataFrame):
        """Convert the four frames to float32 tensors, preserving their 2-D shapes."""
        x_tensor = self._frame_to_tensor(x)
        x_historical_home_tensor = self._frame_to_tensor(x_historical_home)
        x_historical_away_tensor = self._frame_to_tensor(x_historical_away)
        y_tensor = self._frame_to_tensor(y)
        return x_tensor, x_historical_home_tensor, x_historical_away_tensor, y_tensor

    @staticmethod
    def _frame_to_tensor(frame: pd.DataFrame) -> torch.Tensor:
        """Turn a numeric frame into a float32 tensor of the same 2-D shape."""
        # torch.tensor cannot consume a DataFrame directly; go through numpy.
        return torch.tensor(frame.to_numpy(dtype='float32'))

    def _pad_history(self, history: pd.DataFrame) -> pd.DataFrame:
        """Pad a history frame; an empty history becomes `history_len` all-zero rows."""
        if len(history) == 0:
            # No earlier round in the season: there is no row to repeat, so
            # return zero rows with the dataset's column layout.
            return pd.DataFrame(0, index=range(self.history_len), columns=self.dataframe.columns)
        return fill_with_padding(history)

    def _team_columns(self, side: str) -> list:
        """Return (and cache) the one-hot team indicator columns for `side`."""
        cache = self.__dict__.setdefault('_team_columns_cache', {})
        if side not in cache:
            prefix = f'{side}_team_'
            # '<side>_team_score' and '<side>_team_coach*' share the prefix but
            # are not team indicators.
            not_a_team = (f'{side}_team_score', f'{side}_team_coach')
            cache[side] = [
                column for column in self.dataframe.columns
                if isinstance(column, str) and column.startswith(prefix)
                and not column.startswith(not_a_team)
            ]
        return cache[side]

    def _team_name(self, row: pd.Series, side: str) -> str:
        """Return the name of the `side` ('home' or 'away') team of `row`.

        Uses the plain '<side>_team' column when present; otherwise decodes the
        name from the one-hot '<side>_team_<name>' columns.

        Raises:
            KeyError: if the row does not flag exactly one team for `side`.
        """
        plain_column = f'{side}_team'
        if plain_column in row.index:
            return row[plain_column]
        candidates = self._team_columns(side)
        flagged = [column for column in candidates if row[column] == 1]
        if len(flagged) != 1:
            raise KeyError(f'expected exactly one {side} team indicator set, found {len(flagged)}')
        return flagged[0][len(plain_column) + 1:]


## Training

In [1913]:
class RNN(nn.Module):
    """Single recurrent cell: new_hidden = tanh(Linear([input, hidden])).

    Args:
        input_size: number of features of one time step.
        hidden_size: number of features of the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.linear = nn.Linear(input_size + hidden_size, hidden_size)
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        """Advance the hidden state by one time step.

        `input` and `hidden` must agree on every dimension except the last
        (feature) one, e.g. (input_size,) with (hidden_size,) or
        (batch, input_size) with (batch, hidden_size).
        """
        # Concatenate along the feature axis. For 1-D tensors this is the same
        # as the previous dim=0; unlike dim=0 it also works for batched input.
        combined = torch.cat([input, hidden], dim=-1)
        pre_hidden = self.linear(combined)
        hidden = self.tanh(pre_hidden)
        return hidden

    def init_hidden(self, minibatch_size):
        """Return an all-zero hidden state of shape (minibatch_size, hidden_size)."""
        # Match the layer's dtype/device so the state can be fed to `forward`
        # even after the module has been moved or cast.
        weight = self.linear.weight
        return torch.zeros(minibatch_size, self.hidden_size, dtype=weight.dtype, device=weight.device)


class NeuralNetwork(nn.Module):
    """Three-layer perceptron mapping `input_size` features to 3 output logits."""

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        self.flatten = nn.Flatten()
        hidden_width = 512
        num_outputs = 3
        # No softmax at the end: it is applied implicitly by CrossEntropyLoss.
        stack = [
            nn.Linear(input_size, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, num_outputs),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten everything after the batch dimension and run the layer stack."""
        # 'x' is the combination of: 'x', 'x_historical_home', 'x_historical_away'
        flat = self.flatten(x)  # just in case x was not flattened
        return self.layers(flat)


class HybridNetwork(nn.Module):
    """Two recurrent history encoders (home / away) feeding an MLP classifier.

    Each team's match history is folded into a hidden state by its own RNN;
    both final states are concatenated with the current-match features and
    passed to the MLP.
    """

    def __init__(self, rnn_home_model: RNN, rnn_away_model: RNN, mlp_model: NeuralNetwork):
        super(HybridNetwork, self).__init__()
        self.rnn_home = rnn_home_model
        self.rnn_away = rnn_away_model
        self.mlp = mlp_model

    def forward(self, x, x_historical_home, x_historical_away):
        """Return the MLP output for a minibatch.

        Args:
            x: current-match features, (minibatch_size, 1, num_of_feats) or
                already flattened to (minibatch_size, num_of_feats).
            x_historical_home: (minibatch_size, history_len, num_of_feats).
            x_historical_away: same layout as `x_historical_home`.
        """
        batch_size = x.size(0)
        rnn_home_hidden = self._encode_history(self.rnn_home, x_historical_home)
        rnn_away_hidden = self._encode_history(self.rnn_away, x_historical_away)
        # The hidden states are 2-D, so bring `x` to (minibatch_size, features)
        # before concatenating on the features dimension.
        x_flat = x.reshape(batch_size, -1)
        x_train = torch.cat([x_flat, rnn_home_hidden, rnn_away_hidden], dim=1)
        y_hat = self.mlp(x_train)
        return y_hat

    @staticmethod
    def _encode_history(rnn, history):
        """Run `rnn` over each sample's history; return (minibatch_size, hidden_size)."""
        batch_size = history.size(0)
        time_seq_len = history.size(1)
        initial_hidden = rnn.init_hidden(batch_size)
        if batch_size == 0:
            return initial_hidden
        final_states = []
        for batch_idx in range(batch_size):
            hidden = initial_hidden[batch_idx]
            for history_idx in range(time_seq_len):
                hidden = rnn(torch.flatten(history[batch_idx, history_idx]), hidden)
            final_states.append(hidden)
        # Stack the per-sample states instead of writing them in place into a
        # preallocated tensor, which keeps the autograd graph free of in-place ops.
        return torch.stack(final_states, dim=0)

In [1914]:
def train_epoch(model: HybridNetwork, dataloader: DataLoader, optimizer: optim.Optimizer, loss_fn, loss_meter):
    """Run one optimisation pass over `dataloader`.

    Args:
        model: network called as ``model(x, x_historical_home, x_historical_away)``.
        dataloader: yields ``(x, x_historical_home, x_historical_away, y)`` batches.
        optimizer: optimizer stepping the model's parameters.
        loss_fn: callable invoked as ``loss_fn(prediction, target)``.
        loss_meter: receives ``update(val=<batch loss>, n=<batch size>)`` per batch.
    """
    for x, x_historical_home, x_historical_away, y in dataloader:
        optimizer.zero_grad()
        y_hat = model(x, x_historical_home, x_historical_away)
        target = y.to(dtype=torch.float)
        # Targets may arrive with an extra singleton dimension (one row per
        # sample); align them with the predictions when the sizes allow it.
        if target.shape != y_hat.shape and target.numel() == y_hat.numel():
            target = target.reshape(y_hat.shape)
        # Loss modules such as CrossEntropyLoss take (input, target) in that
        # order; the arguments used to be swapped.
        loss = loss_fn(y_hat, target)
        loss.backward()
        optimizer.step()
        loss_meter.update(val=loss.item(), n=x.shape[0])


def train_model(model: HybridNetwork, dataloader: DataLoader, optimizer: optim.Optimizer, loss_fn, num_epochs: int):
    """Train `model` for `num_epochs` passes over `dataloader`.

    The model is switched to training mode once; every epoch gets a fresh
    AverageMeter whose average is printed when the epoch ends.
    """
    model.train()
    for epoch in range(num_epochs):
        epoch_meter = AverageMeter()
        train_epoch(model, dataloader, optimizer, loss_fn, epoch_meter)
        print(f"Epoch {epoch + 1} completed. Training loss: {epoch_meter.avg}")

In [1915]:
# Build the training dataset; the constructor reads train.csv into memory.
train_dataset = SerieAFootballMatchesDataset(csv_file='train.csv')

In [1916]:
# Shuffled minibatches of 32 samples drawn from the training dataset.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [1917]:
# Optimisation hyper-parameters.
learning_rate = 0.001
num_epochs = 3
# Width of the hidden state produced by each history RNN.
hidden_size = 128
# One recurrent encoder per side; both take rows of tot_num_of_feats features.
rnn_home = RNN(input_size=tot_num_of_feats, hidden_size=hidden_size)
rnn_away = RNN(input_size=tot_num_of_feats, hidden_size=hidden_size)
# we have two hidden states (for home and away team) plus all features except for 'home_score', 'away_score', 'result_home', 'result_draw' and 'result_away'
mlp = NeuralNetwork(hidden_size * 2 + tot_num_of_feats - 5)
model = HybridNetwork(rnn_home_model=rnn_home, rnn_away_model=rnn_away, mlp_model=mlp)
# The MLP emits raw scores; CrossEntropyLoss applies the softmax itself.
cross_entropy_loss_fn = nn.CrossEntropyLoss()
# Plain SGD over every parameter registered on the hybrid model.
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [1918]:
# Launch training with the objects configured in the previous cells.
train_model(model=model, dataloader=train_dataloader, optimizer=optimizer, loss_fn=cross_entropy_loss_fn, num_epochs=num_epochs)

KeyError: 'season'

# Missing data
- We don't have data about new players who join _Serie A_ over the course of the seasons. The model has to learn from zero context how important their contribution is to the outcome of the matches. If we were to consider multiple leagues, we could keep track of player transfers and maintain their history.
- We don't have data about cup matches played during the course of the seasons, such as the _Champions League_, _Europa League_ and _Coppa Italia_. Since these are very prestigious competitions and their matches are usually very competitive, teams put a lot of effort into them and may therefore perform worse in the championship.
- We don't have any kind of player performance metric, such as who scored a goal, who provided the assist, red or yellow cards, goalkeeper saves, etc., so the model could have difficulty learning which players are important for a team.