# Football matches result prediction

Let's try to predict *Serie A* matches result (i.e. home win, away win or draw) with a RNN.

## Introduction

- The dataset was created by scraping *Serie A* matches data starting from season 2005-06 to season 2020-21
- Cup matches (*Champions League*, *Europa League*, *Coppa Italia*) played over the course of each season were not taken into account

In [1]:
import pandas as pd
import re
from _MatchNotFoundException import MatchNotFoundException
from HomeOrAway import HomeOrAway

## Dataset construction
Let's clean our raw data and construct the dataset

In [2]:
raw_data = pd.read_csv('raw.csv')
raw_data.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,home_team_coach,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,28/08/2005,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,Massimo Silva,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
1,2005-06,1,27/08/2005,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,Cesare Prandelli,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
2,2005-06,1,28/08/2005,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,Mario Beretta,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
3,2005-06,1,28/08/2005,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,Roberto Mancini,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
4,2005-06,1,27/08/2005,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,Roberto Donadoni,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino


In [3]:
# let's work just on the head for now
df = pd.DataFrame(raw_data)
df = df.iloc[0:50]

In [4]:
# add id str to each match
df['match_id'] = [re.sub(r'[^a-zA-Z\d ]', '', df.loc[i, 'date'] + df.loc[i, 'time'] + df.loc[i, 'home_team'][:3] + df.loc[i, 'away_team'][:3]) for i in range(len(df.index))]
# convert date str to datetime
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
# sort by date column
df = df.sort_values(by='date')
df = df.reset_index(drop=True)
# make 'match_id' to be the first column
cols = list(df.columns)
cols.remove('match_id')
df = df[['match_id'] + cols]
# convert 'round' values to int
df['round'] = df['round'].astype(int)

In [5]:
df

Unnamed: 0,match_id,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,270820052030FIOSAM,2005-06,1,2005-08-27,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
1,270820051800LIVLEC,2005-06,1,2005-08-27,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino
2,280820051500ASCMIL,2005-06,1,2005-08-28,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
3,280820051500PARPAL,2005-06,1,2005-08-28,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
4,280820051500INTTRE,2005-06,1,2005-08-28,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
5,280820052030JUVCHI,2005-06,1,2005-08-28,20:30,MATTEO SIMONE,JUVENTUS,CHIEVOVERONA,1,0,...,Daniele Franceschini,Federico Cossato,Sergio Pellissier,Lorenzo Squizzi,Luciano,Amauri,John Mensah,Filippo Antonelli,Victor Obinna,Giovanni Marchese
6,280820051500ROBCAG,2005-06,1,2005-08-28,15:00,CHRISTIAN BRIGHI,ROBUR SIENA,CAGLIARI,2,1,...,Mauro Esposito,David Suazo,Andrea Cossu,Andrea Capone,Alessandro Budel,Claudio Ferrarese,Andrea Campagnolo,Fabio Vignati,Francesco Pisano,Claudio Pani
7,280820051500UDIEMP,2005-06,1,2005-08-28,15:00,ANDREA DE,UDINESE,EMPOLI,1,0,...,Matteo Serafini,Francesco Tavano,Ighli Vannucchi,Francesco Lodi,Nicola Pozzi,Daniele Balli,Davide Moro,Paolo Zanetti,Andrea Raggi,Francesco Pratali
8,280820051500REGROM,2005-06,1,2005-08-28,15:00,ROBERTO ROSETTI,REGGINA,ROMA,0,3,...,Francesco Totti,Rodrigo Taddei,Vincenzo Montella,Alberto Aquilani,Aleandro Rosi,Shabani Nonda,Pietro Pipolo,Cesare Bovo,Houssine Kharja,Antonio Cassano
9,280820051500LAZMES,2005-06,1,2005-08-28,15:00,PAOLO DONDARINI,LAZIO,MESSINA,1,0,...,Gaetano DAgostino,Giuseppe Sculli,Riccardo Zampagna,Arturo Di Napoli,Zlatan Muslimovic,Ivica Iliev,Marco Storari,Filippo Cristante,Luca Fusco,Atsushi Yanagisawa


In [6]:
def get_match_by_id(df: pd.DataFrame, match_id: str) -> pd.DataFrame:
    return df[df['match_id'] == match_id]

def get_match_index_by_id(df: pd.DataFrame, match_id: str) -> int:
    return get_match_by_id(df, match_id).index.tolist()[0]

def get_match_index_by_match(match: pd.DataFrame) -> int:
    return match.index.tolist()[0]

def is_team_home_or_away_in_match(team_name: str, match: pd.DataFrame):
    home_team = match.squeeze()['home_team']
    if home_team == team_name:
        return 'home'
    else:
        return 'away'

def get_last_match_played_by_team(df: pd.DataFrame, target_match_index: int, team_name: str) -> pd.DataFrame:
    """
    Find in df the last match played by team_name prior to the game identified by target_match_index
    :param df: where to search
    :param target_match_index: the index in df of the target match
    :param team_name: name of the team that has played the target match
    :return:
    """
    for i in reversed(range(target_match_index)):
        current_match = df.iloc[[i]] # dataframe
        if current_match.at[i, 'home_team'] == team_name or current_match.at[i, 'away_team'] == team_name:
            return current_match
    raise MatchNotFoundException(f'Previous match for team {team_name} was not found')

def get_last_five_matches_played_by_team(df: pd.DataFrame, target_match_index: int, team_name: str) -> list[pd.Series]:
    """
    Find in df the last five matches played by team_name prior to the game identified by match_index
    :param df: where to search
    :param target_match_index: target match index in df
    :param team_name: the name of the team that has played all the last five matches
    :return:
    """
    match = df.iloc[[target_match_index]] # dataframe
    last_match_found = match
    last_five_matches = []
    for i in range(1, 6):
        try:
            last_match_found = get_last_match_played_by_team(df, get_match_index_by_match(last_match_found), team_name)
            print(f'Run {i}, found')
            last_five_matches.append(last_match_found.squeeze())
        except MatchNotFoundException:
            print(f'Run {i}, not found')
    return last_five_matches


def add_historic_features_of_source_match_to_target_match(df: pd.DataFrame, target_match_index: int, target_home_or_away: HomeOrAway, target_historic_index: int, source_match: pd.Series):
    """
    Add information about source_match as historic features of target_match, with an historic index given by target_history_index
    :param df: the DataFrame from which target_match and source_match are taken from
    :param target_match_index: the index of the target_match in the dataframe
    :param target_home_or_away: tells whether the historic features belong to the home or away team of target_match
    :param target_historic_index: describes the source_match as the target_historic_index-th last match of target_match
    :param source_match: the target_historic_index-th last match of target_match from where data is taken
    :return:
    """
    for colName, colValue in source_match.iteritems():
        df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
            source_match[colName]

def add_historic_features_of_last_five_matches_to_target_match(df: pd.DataFrame, target_match_index: int, target_home_or_away: HomeOrAway, last_five_matches: list[pd.Series]):
    """
    Add information about the last five matches of home or away team of target match as historic features
    :param df: the DataFrame from which target match and source match are taken from
    :param target_match_index: the index of the target match in df
    :param target_home_or_away: tells whether the last five matches (and so the historic features) belong to the home or away team of target match
    :param last_five_matches: a list containing the last five matches
    :return:
    """
    for i in range(1, len(last_five_matches)+1):
        add_historic_features_of_source_match_to_target_match(df, target_match_index, target_home_or_away, i, last_five_matches[i - 1])

def add_historic_features_of_last_five_matches_to_all_matches(df: pd.DataFrame):
    """
    Add information about the last five matches of home and away team for all games in df
    :param df: source of data
    :return:
    """
    df_copy = df.copy()
    #for index, row in df.iterrows():
    #    add_historic_features_of_last_five_matches_to_target_match(df_copy, index, HomeOrAway.home, get_last_five_matches_of_home_or_away_team(df_copy))


In [7]:
def test():
    match_index = 49
    selected_match = df.iloc[match_index]
    home_or_away = HomeOrAway.away
    team_name = selected_match[f'{home_or_away.name}_team']
    # df.at[0, 'home_team'] = 'PARMA'
    try:
        #print("Looking for a match played by {} prior to {}".format(selected_match['home_team'], selected_match['home_team'] + ' vs ' + selected_match['away_team']))
        #previous = get_last_match_by_match_id_for_team(df, selected_match['match_id'], selected_match['home_team'])
        #print("FOUND! {}".format(previous['match_id']))
        print(f'Looking for the last five matches played by {selected_match["{}_team".format(home_or_away.name)]} prior to {selected_match["home_team"] + " vs " + selected_match["away_team"]}')
        last_five = get_last_five_matches_played_by_team(df, match_index, team_name)
        print(f'FOUND {len(last_five)}!')
        print(last_five)
        #print("Adding history features")
        add_historic_features_of_last_five_matches_to_target_match(df, match_index, home_or_away, last_five)
    except MatchNotFoundException:
        pass
    # df.at[0, 'home_team'] = 'FIORENTINA'

test()

Looking for the last five matches played by ASCOLI prior to LIVORNO vs ASCOLI
Run 1, found
Run 2, found
Run 3, found
Run 4, found
Run 5, not found
FOUND 4!
[match_id              210920052030ASCROB
season                           2005-06
round                                  4
date                 2005-09-21 00:00:00
time                               20:30
referee                 NICOLA STEFANINI
home_team                         ASCOLI
away_team                    ROBUR SIENA
home_team_score                        1
away_team_score                        1
home_team_coach            Massimo Silva
home_player_1         Ferdinando Coppola
home_player_2           Gianluca Comotto
home_player_3               Mirko Cudini
home_player_4           Maurizio Domizzi
home_player_5             Vittorio Tosto
home_player_6            Pasquale Foggia
home_player_7              Roberto Guana
home_player_8         Domenico Cristiano
home_player_9               Michele Fini
home_player_10         

  df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
  df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
  df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
  df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
  df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
  df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
  df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
  df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
  df.at[target_match_index, f'{target_home_or_away.name}_team_history_{target_historic_index}_{colName}'] = \
  df.at[ta

In [8]:
df

Unnamed: 0,match_id,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,...,away_team_history_4_away_player_9,away_team_history_4_away_player_10,away_team_history_4_away_player_11,away_team_history_4_away_substitute_1,away_team_history_4_away_substitute_2,away_team_history_4_away_substitute_3,away_team_history_4_away_substitute_4,away_team_history_4_away_substitute_5,away_team_history_4_away_substitute_6,away_team_history_4_away_substitute_7
0,270820052030FIOSAM,2005-06,1,2005-08-27,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,...,,,,,,,,,,
1,270820051800LIVLEC,2005-06,1,2005-08-27,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,...,,,,,,,,,,
2,280820051500ASCMIL,2005-06,1,2005-08-28,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,...,,,,,,,,,,
3,280820051500PARPAL,2005-06,1,2005-08-28,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,...,,,,,,,,,,
4,280820051500INTTRE,2005-06,1,2005-08-28,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,...,,,,,,,,,,
5,280820052030JUVCHI,2005-06,1,2005-08-28,20:30,MATTEO SIMONE,JUVENTUS,CHIEVOVERONA,1,0,...,,,,,,,,,,
6,280820051500ROBCAG,2005-06,1,2005-08-28,15:00,CHRISTIAN BRIGHI,ROBUR SIENA,CAGLIARI,2,1,...,,,,,,,,,,
7,280820051500UDIEMP,2005-06,1,2005-08-28,15:00,ANDREA DE,UDINESE,EMPOLI,1,0,...,,,,,,,,,,
8,280820051500REGROM,2005-06,1,2005-08-28,15:00,ROBERTO ROSETTI,REGGINA,ROMA,0,3,...,,,,,,,,,,
9,280820051500LAZMES,2005-06,1,2005-08-28,15:00,PAOLO DONDARINI,LAZIO,MESSINA,1,0,...,,,,,,,,,,


In [9]:
# Drop id columns, unusefull for training
df.drop(columns=['match_id'])
df.drop(columns=[f'home_team_history_{i}_match_id' for i in range(1, 6)])

KeyError: "['home_team_history_1_match_id', 'home_team_history_2_match_id', 'home_team_history_3_match_id', 'home_team_history_4_match_id', 'home_team_history_5_match_id'] not found in axis"