# Football matches result prediction

Let's try to predict *Serie A* matches result (i.e. home win, away win or draw) with a RNN.

## Introduction

- The dataset was created by scraping *Serie A* matches data starting from season 2005-06 to season 2020-21
- Cup matches (*Champions League*, *Europa League*, *Coppa Italia*) played over the course of each season were not taken into account

In [55]:
import pandas as pd
import re
from _MatchNotFoundException import MatchNotFoundException
from HomeOrAway import HomeOrAway
match_cols = ['match_id', 'season', 'round'] + \
             ['date', 'time', 'referee', 'home_team', 'away_team', 'home_team_score', 'away_team_score'] + \
             ['home_team_coach'] + \
             ['home_player_' + str(i) for i in range(1, 12)] + \
             ['home_substitute_' + str(i) for i in range(1, 8)] + \
             ['away_team_coach'] + \
             ['away_player_' + str(i) for i in range(1, 12)] + \
             ['away_substitute_' + str(i) for i in range(1, 8)]

## Dataset construction
Let's clean our raw data and construct the dataset

In [56]:
raw_data = pd.read_csv('raw.csv')
raw_data.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,home_team_coach,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,28/08/2005,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,Massimo Silva,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
1,2005-06,1,27/08/2005,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,Cesare Prandelli,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
2,2005-06,1,28/08/2005,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,Mario Beretta,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
3,2005-06,1,28/08/2005,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,Roberto Mancini,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
4,2005-06,1,27/08/2005,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,Roberto Donadoni,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino


In [57]:
# let's work just on the head for now
df = pd.DataFrame(raw_data)
# df = df.iloc[0:200]

In [58]:
# add id str to each match
df['match_id'] = [re.sub(r'[^a-zA-Z\d ]', '', df.loc[i, 'date'] + df.loc[i, 'time'] + df.loc[i, 'home_team'][:3] + df.loc[i, 'away_team'][:3]) for i in range(len(df.index))]
# convert date str to datetime
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
# sort by date column
df = df.sort_values(by='date')
df = df.reset_index(drop=True)
# make 'match_id' to be the first column
cols = list(df.columns)
cols.remove('match_id')
df = df[['match_id'] + cols]
# convert 'round' values to int
df['round'] = df['round'].astype(int)

In [60]:
def get_match_by_id(df: pd.DataFrame, match_id: str) -> pd.DataFrame:
    return df[df['match_id'] == match_id]

def get_match_index_by_id(df: pd.DataFrame, match_id: str) -> int:
    return get_match_by_id(df, match_id).index.tolist()[0]

def get_match_index_by_match(match: pd.DataFrame) -> int:
    return match.index.tolist()[0]

def is_team_home_or_away_in_match(team_name: str, match: pd.DataFrame):
    home_team = match.squeeze()['home_team']
    if home_team == team_name:
        return 'home'
    else:
        return 'away'

def get_last_match_played_by_team(df: pd.DataFrame, target_match_index: int, team_name: str) -> pd.DataFrame:
    """
    Find in df the last match played by team_name prior to the game identified by target_match_index
    :param df: where to search
    :param target_match_index: the index in df of the target match
    :param team_name: name of the team that has played the target match
    :return:
    """
    for i in reversed(range(target_match_index)):
        current_match = df.iloc[[i]] # dataframe
        if current_match.at[i, 'home_team'] == team_name or current_match.at[i, 'away_team'] == team_name:
            return current_match
    raise MatchNotFoundException(f'Previous match for team {team_name} was not found')

def get_last_five_matches_played_by_team(df: pd.DataFrame, target_match_index: int, team_name: str) -> list[pd.Series]:
    """
    Find in df the last five matches played by team_name prior to the game identified by match_index
    :param df: where to search
    :param target_match_index: target match index in df
    :param team_name: the name of the team that has played all the last five matches
    :return:
    """
    match = df.iloc[[target_match_index]] # dataframe
    last_match_found = match
    last_five_matches = []
    for i in range(1, 6):
        try:
            last_match_found = get_last_match_played_by_team(df, get_match_index_by_match(last_match_found), team_name)
            last_five_matches.append(last_match_found.squeeze())
        except MatchNotFoundException:
            pass
    return last_five_matches

def construct_historic_features_of_last_five_matches_for_target_match(target_match_index: int, target_home_or_away: HomeOrAway, last_five_matches: list[pd.Series]) -> pd.DataFrame:
    """
    Build a dataframe containing information about the last five matches played by home or away team of target match as historic features.
    :param target_match_index: the index of the target match in df
    :param target_home_or_away: tells whether the five matches has been played by the home or away team of target match
    :param last_five_matches: a list containing the last five matches
    :return:
    """
    # Init columns for 5 historic matches
    historic_cols = [f'{target_home_or_away.name}_team_history_{i}_{colName}' for i in range(1, 6) for colName in match_cols]
    # Init empty DataFrame with those columns and specific index
    result = pd.DataFrame(columns=historic_cols, index=[target_match_index])
    # Copy values into DataFrame
    for i in range(len(last_five_matches)):
        source_match = last_five_matches[i]
        for colName, colValue in source_match.iteritems():
            result.at[target_match_index, f'{target_home_or_away.name}_team_history_{i+1}_{colName}'] = colValue
    return result

def add_historic_features_of_last_five_matches_for_all_matches(df: pd.DataFrame) -> pd.DataFrame:
    """
    Construct a new dataframe adding information about the last five matches played by home and away team of all matches in df
    :param df: source of data
    :return: a new dataframe
    """
    new_df = pd.DataFrame()
    # for each row in dataframe
    for index, row in df.iterrows():
        team = row['home_team']
        home_team_historic_df = construct_historic_features_of_last_five_matches_for_target_match(
            index, HomeOrAway.home, get_last_five_matches_played_by_team(df, index, team)
        )
        team = row['away_team']
        away_team_historic_df = construct_historic_features_of_last_five_matches_for_target_match(
            index, HomeOrAway.away, get_last_five_matches_played_by_team(df, index, team)
        )
        new_row_as_df = pd.concat([df.iloc[[index]], home_team_historic_df, away_team_historic_df], axis=1)
        new_df = pd.concat([new_df, new_row_as_df], axis=0)
    return new_df

df1 = add_historic_features_of_last_five_matches_for_all_matches(df)
df1.tail()

Unnamed: 0,match_id,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,...,away_team_historic_5_away_player_9,away_team_historic_5_away_player_10,away_team_historic_5_away_player_11,away_team_historic_5_away_substitute_1,away_team_historic_5_away_substitute_2,away_team_historic_5_away_substitute_3,away_team_historic_5_away_substitute_4,away_team_historic_5_away_substitute_5,away_team_historic_5_away_substitute_6,away_team_historic_5_away_substitute_7
0,270820052030FIOSAM,2005-06,1,2005-08-27,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,...,,,,,,,,,,
1,270820051800LIVLEC,2005-06,1,2005-08-27,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,...,,,,,,,,,,
2,280820051500ASCMIL,2005-06,1,2005-08-28,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,...,,,,,,,,,,
3,280820051500PARPAL,2005-06,1,2005-08-28,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,...,,,,,,,,,,
4,280820051500INTTRE,2005-06,1,2005-08-28,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,010220091500ROBLEC,2008-09,22,2009-02-01,15:00,CHRISTIAN BRIGHI,ROBUR SIENA,LECCE,1,2,...,Francesco Valiani,Marco Di Vaio,Marco Bernacci,Christian Amoroso,Adailton,Cesar,Roberto Colombo,Salvatore Lanna,Marcello Castellini,Nicola Mingazzini
1355,010220091500ATACAT,2008-09,22,2009-02-01,15:00,PAOLO DONDARINI,ATALANTA,CATANIA,1,0,...,Julio Baptista,Francesco Totti,Mirko Vucinic,Cicinho,Stefano Okaka,Jeremy Menez,Artur,Simone Loria,Leandro Greco,Valerio Virga
1356,010220091500CHISAM,2008-09,22,2009-02-01,15:00,GABRIELE GAVA,CHIEVOVERONA,SAMPDORIA,1,1,...,Pietro Accardi,Claudio Bellucci,Antonio Cassano,Reto Ziegler,Marius Stankevicius,Angelo Palombo,Antonio Mirante,Daniele Dessena,Bruno Fornaroli,Emiliano Bonazzoli
1357,010220091500GENPAL,2008-09,22,2009-02-01,15:00,MATTEO SIMONE,GENOA,PALERMO,1,0,...,Fabio Simplicio,Davide Succi,Fabrizio Miccoli,Roberto Guana,Levan Mchedlidze,Samir Ujkani,Hernan Paolo Dellafiore,Andrea Raggi,Giulio Migliaccio,Maurizio Ciaramitaro


In [61]:
# reset datetime values
for colName, colValue in df1.iteritems():
    if colName.endswith('date'):
        df1[colName] = pd.to_datetime(df1[colName], infer_datetime_format=True)

In [None]:
# Drop id columns, unusefull for training
df1.drop(columns=['match_id'])
df1.drop(columns=[f'home_team_history_{i}_match_id' for i in range(1, 6)])

In [66]:
df1.shape

(1359, 528)