# Football matches result prediction

Let's try to predict *Serie A* matches result (i.e. home win, away win or draw) with a RNN.

## Introduction

- The dataset was created by scraping *Serie A* matches data starting from season 2005-06 to season 2020-21
- Cup matches (*Champions League*, *Europa League*, *Coppa Italia*) played over the course of each season were not taken into account

In [258]:
import pandas as pd
from _MatchNotFoundException import MatchNotFoundException
from HomeOrAway import HomeOrAway
from MatchResult import MatchResult

In [259]:
match_cols = ['season', 'round'] + \
             ['date', 'time', 'referee', 'home_team', 'away_team', 'home_team_score', 'away_team_score'] + \
             ['home_team_coach'] + \
             ['home_player_' + str(i) for i in range(1, 12)] + \
             ['home_substitute_' + str(i) for i in range(1, 8)] + \
             ['away_team_coach'] + \
             ['away_player_' + str(i) for i in range(1, 12)] + \
             ['away_substitute_' + str(i) for i in range(1, 8)]

In [260]:
raw_data = pd.read_csv('raw.csv')
raw_data.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,home_team_coach,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,28/08/2005,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,Massimo Silva,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
1,2005-06,1,27/08/2005,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,Cesare Prandelli,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
2,2005-06,1,28/08/2005,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,Mario Beretta,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
3,2005-06,1,28/08/2005,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,Roberto Mancini,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
4,2005-06,1,27/08/2005,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,Roberto Donadoni,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino


## Data visualization

Let's inspect our data a little bit more

In [261]:
# todo

## Dataset construction
Now let's clean our raw data and construct the dataset. The full process for preparing the data is:
- Convert date string values to pandas datetime values
- Convert round string values to integers
- Construct and add historical features
- Derive match results from scores
- Encode data

In [262]:
df = pd.DataFrame(raw_data)
df = df[:200]

In [263]:
# convert date str to datetime
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
# sort by date column
df = df.sort_values(by='date')
df = df.reset_index(drop=True)
# convert 'round' values to int
df['round'] = df['round'].astype(int)

### Historical features
In order to train an RNN model, we need to have series of football games, so the goal of this section is to add some historical features that will carry information about the last five games played by the home and away team of each match in the dataset.

In [264]:
def get_match_index_by_match(match: pd.DataFrame) -> int:
    return match.index.tolist()[0]


def is_team_home_or_away_in_match(team_name: str, match: pd.DataFrame):
    home_team = match.squeeze()['home_team']
    if home_team == team_name:
        return 'home'
    else:
        return 'away'


def get_last_match_played_by_team(df: pd.DataFrame, target_match_index: int, team_name: str) -> pd.DataFrame:
    """
    Find in df the last match played by team_name prior to the game identified by target_match_index
    :param df: where to search
    :param target_match_index: the index in df of the target match
    :param team_name: name of the team that has played the target match
    :return:
    """
    for i in reversed(range(target_match_index)):
        current_match = df.iloc[[i]]  # dataframe
        if current_match.at[i, 'home_team'] == team_name or current_match.at[i, 'away_team'] == team_name:
            return current_match
    raise MatchNotFoundException(f'Previous match for team {team_name} was not found')


def get_last_five_matches_played_by_team(df: pd.DataFrame, target_match_index: int, team_name: str) -> list[pd.Series]:
    """
    Find in df the last five matches played by team_name prior to the game identified by match_index
    :param df: where to search
    :param target_match_index: target match index in df
    :param team_name: the name of the team that has played all the last five matches
    :return:
    """
    match = df.iloc[[target_match_index]]  # dataframe
    last_match_found = match
    last_five_matches = []
    for i in range(1, 6):
        try:
            last_match_found = get_last_match_played_by_team(df, get_match_index_by_match(last_match_found), team_name)
            last_five_matches.append(last_match_found.squeeze())
        except MatchNotFoundException:
            pass
    return last_five_matches


def construct_historical_features_of_last_five_matches_for_target_match(target_match_index: int,
                                                                        target_home_or_away: HomeOrAway,
                                                                        last_five_matches: list[
                                                                            pd.Series]) -> pd.DataFrame:
    """
    Build a dataframe containing information about the last five matches played by home or away team of target match as historical features.
    :param target_match_index: the index of the target match in df
    :param target_home_or_away: tells whether the five matches has been played by the home or away team of target match
    :param last_five_matches: a list containing the last five matches
    :return:
    """
    # Init columns for 5 historical matches
    historical_cols = [f'{target_home_or_away.name}_team_history_{i}_{colName}' for i in range(1, 6) for colName in
                       match_cols]
    # Init empty DataFrame with those columns and specific index
    result = pd.DataFrame(columns=historical_cols, index=[target_match_index])
    # Copy values into DataFrame
    for i in range(len(last_five_matches)):
        source_match = last_five_matches[i]
        for colName, colValue in source_match.iteritems():
            result.at[target_match_index, f'{target_home_or_away.name}_team_history_{i + 1}_{colName}'] = colValue
    return result


def add_historical_features_of_last_five_matches_for_all_matches(df: pd.DataFrame) -> pd.DataFrame:
    """
    Construct a new dataframe adding information about the last five matches played by home and away team of all matches in df
    :param df: source of data
    :return: a new dataframe
    """
    new_df = pd.DataFrame()
    # for each row in dataframe
    for index, row in df.iterrows():
        team = row['home_team']
        home_team_historical_df = construct_historical_features_of_last_five_matches_for_target_match(
            index, HomeOrAway.home, get_last_five_matches_played_by_team(df, index, team)
        )
        team = row['away_team']
        away_team_historical_df = construct_historical_features_of_last_five_matches_for_target_match(
            index, HomeOrAway.away, get_last_five_matches_played_by_team(df, index, team)
        )
        new_row_as_df = pd.concat([df.iloc[[index]], home_team_historical_df, away_team_historical_df], axis=1)
        new_df = pd.concat([new_df, new_row_as_df], axis=0)
    return new_df

In [265]:
# Construct historical features
df1 = add_historical_features_of_last_five_matches_for_all_matches(df)

In [266]:
# Remove the matches that do not have values for all historical features
df1 = df1.dropna()
df1 = df1.reset_index(drop=True)

In [267]:
df1.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,home_team_coach,...,away_team_history_5_away_player_9,away_team_history_5_away_player_10,away_team_history_5_away_player_11,away_team_history_5_away_substitute_1,away_team_history_5_away_substitute_2,away_team_history_5_away_substitute_3,away_team_history_5_away_substitute_4,away_team_history_5_away_substitute_5,away_team_history_5_away_substitute_6,away_team_history_5_away_substitute_7
0,2005-06,6,2005-10-01,20:30,ROBERTO ROSETTI,UDINESE,LAZIO,3,0,Serse Cosmi,...,Gaetano DAgostino,Giuseppe Sculli,Riccardo Zampagna,Arturo Di Napoli,Zlatan Muslimovic,Ivica Iliev,Marco Storari,Filippo Cristante,Luca Fusco,Atsushi Yanagisawa
1,2005-06,6,2005-10-01,18:00,MARIO MAZZOLENI,CHIEVOVERONA,TREVISO,0,0,Giuseppe Pillon,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
2,2005-06,6,2005-10-02,15:00,MASSIMO DE,MESSINA,SAMPDORIA,1,4,Bortolo Mutti,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
3,2005-06,6,2005-10-02,15:00,LUCA BANTI,PALERMO,EMPOLI,2,2,Luigi Delneri,...,Matteo Serafini,Francesco Tavano,Ighli Vannucchi,Francesco Lodi,Nicola Pozzi,Daniele Balli,Davide Moro,Paolo Zanetti,Andrea Raggi,Francesco Pratali
4,2005-06,6,2005-10-02,15:00,OSCAR GIRARDI,LECCE,CAGLIARI,3,0,Silvio Baldini,...,Mauro Esposito,David Suazo,Andrea Cossu,Andrea Capone,Alessandro Budel,Claudio Ferrarese,Andrea Campagnolo,Fabio Vignati,Francesco Pisano,Claudio Pani


As expected, the first few retained matches comes from round 6, as we have considered 5 games for historical features.

In [268]:
# re-convert all date values to datetime
for colName, colValue in df1.iteritems():
    if colName.endswith('date'):
        df1[colName] = pd.to_datetime(df1[colName], infer_datetime_format=True)

In [269]:
df1.shape

(138, 517)

### Result column
We don't care so much about scores because our model will try to predict match results, i.e. home win, away win or draw. We need a result column to be used as our target column, so let's construct it from the scores.

In [270]:
def get_match_result_from_score(home_team_score: int, away_team_score: int) -> MatchResult:
    if home_team_score == away_team_score:
        return MatchResult.draw
    if home_team_score > away_team_score:
        return MatchResult.home
    return MatchResult.away


def add_target_column(df: pd.DataFrame) -> pd.DataFrame:
    results = {'result': []}
    for i in range(5):
        results[f'home_team_history_{i+1}_result'] = []
        results[f'away_team_history_{i+1}_result'] = []
    for index, row in df.iterrows():
        results['result'] += [get_match_result_from_score(row['home_team_score'], row['away_team_score']).name]
        for i in range(5):
            results[f'home_team_history_{i+1}_result'] += \
                [get_match_result_from_score(row[f'home_team_history_{i+1}_home_team_score'], row[f'home_team_history_{i+1}_away_team_score']).name]
            results[f'away_team_history_{i+1}_result'] += \
                [get_match_result_from_score(row[f'away_team_history_{i+1}_home_team_score'], row[f'away_team_history_{i+1}_away_team_score']).name]
    df.insert(loc=df.columns.get_loc('home_team_score'), column='result', value=results['result'])
    for i in range(5):
        df.insert(loc=df.columns.get_loc(f'home_team_history_{i+1}_home_team_score'), column=f'home_team_history_{i+1}_result', value=results[f'home_team_history_{i+1}_result'])
        df.insert(loc=df.columns.get_loc(f'away_team_history_{i+1}_home_team_score'), column=f'away_team_history_{i+1}_result', value=results[f'away_team_history_{i+1}_result'])
    return df

In [271]:
# add target column
add_target_column(df1)
df1.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,result,home_team_score,away_team_score,...,away_team_history_5_away_player_9,away_team_history_5_away_player_10,away_team_history_5_away_player_11,away_team_history_5_away_substitute_1,away_team_history_5_away_substitute_2,away_team_history_5_away_substitute_3,away_team_history_5_away_substitute_4,away_team_history_5_away_substitute_5,away_team_history_5_away_substitute_6,away_team_history_5_away_substitute_7
0,2005-06,6,2005-10-01,20:30,ROBERTO ROSETTI,UDINESE,LAZIO,home,3,0,...,Gaetano DAgostino,Giuseppe Sculli,Riccardo Zampagna,Arturo Di Napoli,Zlatan Muslimovic,Ivica Iliev,Marco Storari,Filippo Cristante,Luca Fusco,Atsushi Yanagisawa
1,2005-06,6,2005-10-01,18:00,MARIO MAZZOLENI,CHIEVOVERONA,TREVISO,draw,0,0,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
2,2005-06,6,2005-10-02,15:00,MASSIMO DE,MESSINA,SAMPDORIA,away,1,4,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
3,2005-06,6,2005-10-02,15:00,LUCA BANTI,PALERMO,EMPOLI,draw,2,2,...,Matteo Serafini,Francesco Tavano,Ighli Vannucchi,Francesco Lodi,Nicola Pozzi,Daniele Balli,Davide Moro,Paolo Zanetti,Andrea Raggi,Francesco Pratali
4,2005-06,6,2005-10-02,15:00,OSCAR GIRARDI,LECCE,CAGLIARI,home,3,0,...,Mauro Esposito,David Suazo,Andrea Cossu,Andrea Capone,Alessandro Budel,Claudio Ferrarese,Andrea Campagnolo,Fabio Vignati,Francesco Pisano,Claudio Pani


In [272]:
# drop score columns
df1 = df1.drop(columns=['home_team_score', 'away_team_score'])
df1.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,result,home_team_coach,home_player_1,...,away_team_history_5_away_player_9,away_team_history_5_away_player_10,away_team_history_5_away_player_11,away_team_history_5_away_substitute_1,away_team_history_5_away_substitute_2,away_team_history_5_away_substitute_3,away_team_history_5_away_substitute_4,away_team_history_5_away_substitute_5,away_team_history_5_away_substitute_6,away_team_history_5_away_substitute_7
0,2005-06,6,2005-10-01,20:30,ROBERTO ROSETTI,UDINESE,LAZIO,home,Serse Cosmi,Morgan De Sanctis,...,Gaetano DAgostino,Giuseppe Sculli,Riccardo Zampagna,Arturo Di Napoli,Zlatan Muslimovic,Ivica Iliev,Marco Storari,Filippo Cristante,Luca Fusco,Atsushi Yanagisawa
1,2005-06,6,2005-10-01,18:00,MARIO MAZZOLENI,CHIEVOVERONA,TREVISO,draw,Giuseppe Pillon,Alberto Fontana,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
2,2005-06,6,2005-10-02,15:00,MASSIMO DE,MESSINA,SAMPDORIA,away,Bortolo Mutti,Marco Storari,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
3,2005-06,6,2005-10-02,15:00,LUCA BANTI,PALERMO,EMPOLI,draw,Luigi Delneri,Nicola Santoni,...,Matteo Serafini,Francesco Tavano,Ighli Vannucchi,Francesco Lodi,Nicola Pozzi,Daniele Balli,Davide Moro,Paolo Zanetti,Andrea Raggi,Francesco Pratali
4,2005-06,6,2005-10-02,15:00,OSCAR GIRARDI,LECCE,CAGLIARI,home,Silvio Baldini,Vincenzo Sicignano,...,Mauro Esposito,David Suazo,Andrea Cossu,Andrea Capone,Alessandro Budel,Claudio Ferrarese,Andrea Campagnolo,Fabio Vignati,Francesco Pisano,Claudio Pani


### Result column balancing

### Data encoding
Convert values to integers

In [273]:
from collections import defaultdict

In [274]:
def get_column_names_containing_str(df: pd.DataFrame, substring: str) -> list[str]:
    return df.loc[:,[substring in i for i in df.columns]].columns.values.tolist()

#### Results encoding

In [275]:
# encode results
target2int = {'home': 0, 'draw': 1, 'away': 2}
result_cols = get_column_names_containing_str(df1, 'result')
for col in result_cols:
    df1[f'{col}'] = df1[f'{col}'].map(target2int)

#### Referees encoding

In [276]:
# reset
temp_dict = defaultdict(lambda: len(temp_dict))
# encode referees
referee_cols = get_column_names_containing_str(df1, 'referee')
for col in referee_cols:
    referee_ids = [temp_dict[ele] for ele in df1[f'{col}'].tolist()]

#### Teams encoding

In [277]:
# reset
temp_dict = defaultdict(lambda: len(temp_dict))
# encode teams
for home_or_away in HomeOrAway:
    teams_ids = [temp_dict[ele] for ele in df1[f'{home_or_away.name}_team'].tolist()]
    df1[f'{home_or_away.name}_team'] = teams_ids
print(df1['home_team'].max())
print(df1['away_team'].max())
for i in range(5):
    for home_or_away in HomeOrAway:
        teams_ids = [temp_dict[ele] for ele in df1[f'{home_or_away.name}_team_history_{i+1}_home_team'].tolist()]
        df1[f'{home_or_away.name}_team_history_{i+1}_home_team'] =  teams_ids
        teams_ids = [temp_dict[ele] for ele in df1[f'{home_or_away.name}_team_history_{i+1}_away_team'].tolist()]
        df1[f'{home_or_away.name}_team_history_{i+1}_away_team'] = teams_ids

19
19


#### Coaches encoding

In [278]:
# reset
temp_dict = defaultdict(lambda: len(temp_dict))
# encode team coaches
coach_cols = get_column_names_containing_str(df1, 'coach')
for col in coach_cols:
    team_coach_ids = [temp_dict[ele] for ele in df1[f'{col}'].tolist()]
    df1[f'{col}'] = team_coach_ids

#### Players encoding

In [279]:
# reset
temp_dict = defaultdict(lambda: len(temp_dict))
# encode players
players_cols = get_column_names_containing_str(df1, 'player')
players_cols += get_column_names_containing_str(df1, 'substitute')
for col in players_cols:
    df1[f'{col}'] = [temp_dict[ele] for ele in df1[f'{col}'].tolist()]

In [280]:
df1[:200]

Unnamed: 0,season,round,date,time,referee,home_team,away_team,result,home_team_coach,home_player_1,...,away_team_history_5_away_player_9,away_team_history_5_away_player_10,away_team_history_5_away_player_11,away_team_history_5_away_substitute_1,away_team_history_5_away_substitute_2,away_team_history_5_away_substitute_3,away_team_history_5_away_substitute_4,away_team_history_5_away_substitute_5,away_team_history_5_away_substitute_6,away_team_history_5_away_substitute_7
0,2005-06,6,2005-10-01,20:30,ROBERTO ROSETTI,0,15,0,0,0,...,280,215,312,349,353,293,2,78,414,432
1,2005-06,6,2005-10-01,18:00,MARIO MAZZOLENI,1,19,1,1,1,...,340,410,256,397,368,421,26,214,435,382
2,2005-06,6,2005-10-02,15:00,MASSIMO DE,2,17,2,2,2,...,365,324,328,156,309,370,467,476,110,490
3,2005-06,6,2005-10-02,15:00,LUCA BANTI,3,11,1,3,3,...,308,320,288,422,296,32,220,254,56,124
4,2005-06,6,2005-10-02,15:00,OSCAR GIRARDI,4,13,0,4,4,...,304,337,402,295,237,485,25,377,374,484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,2005-06,20,2006-01-18,20:30,DOMENICO MESSINA,17,6,0,17,19,...,164,397,368,443,410,421,423,508,256,406
134,2005-06,20,2006-01-18,20:30,PAOLO TAGLIAVENTO,5,14,0,5,16,...,266,279,290,478,134,403,444,507,408,390
135,2005-06,20,2006-01-18,20:30,GIANLUCA ROCCHI,7,9,0,7,7,...,266,279,290,478,134,403,444,507,408,390
136,2005-06,20,2006-01-18,20:30,PASQUALE RODOMONTI,4,16,1,4,29,...,291,369,322,385,342,307,22,92,195,257


## Training
Now that out dataset is ready, we can configure an RNN model and train it.

In [281]:
import torch
import torch.nn as nn
from torch import optim

In [282]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.linear = nn.Linear(input_size + hidden_size, hidden_size)
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        pre_hidden = self.linear(combined)
        hidden = self.tanh(pre_hidden)
        return hidden

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)

In [283]:
n_historical_features = 17
n_hidden = 128
encoder = EncoderRNN(input_size=n_historical_features, hidden_size=n_hidden)

In [284]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super(NeuralNetwork, self).__init__()
        self.input_size = input_size
        self.flatten = nn.Flatten()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 3),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        x = self.flatten(x)
        output = self.layers(x)
        return output

In [285]:
n_basic_features = len(match_cols)
mlp = NeuralNetwork(n_historical_features * 2 + n_basic_features)

In [286]:
def train(x, y, encoder: EncoderRNN, nn: NeuralNetwork,
          encoder_optimizer: optim.Optimizer, nn_optimizer: optim.Optimizer, loss_fn):
    # init
    encoder_optimizer.zero_grad()
    nn_optimizer.zero_grad()
    input_length = x.size(0)
    target_length = y.size(0)
    loss = 0
    # encoder forward
    encoder_hidden = encoder.init_hidden()
    for history_index in range(input_length):
        encoder_hidden = encoder(x[history_index], encoder_hidden)
    home_team_form = encoder_hidden
    encoder_hidden = encoder.init_hidden()
    for history_index in range(input_length):
        encoder_hidden = encoder(x[history_index], encoder_hidden)
    away_team_form = encoder_hidden
    # mlp forward
    match = torch.tensor([])  # todo
    x_train = torch.cat((match, home_team_form, away_team_form), 1)
    y_hat = nn(x_train)
    # backward
    loss = loss_fn(y, y_hat)
    loss.backward()


learning_rate = 0.01
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
mlp_optimizer = optim.SGD(mlp.parameters(), lr=learning_rate)
loss_fn = nn.NLLLoss()