# Football match result prediction

Let's try to predict the result of *Serie A* matches (i.e. home win, away win or draw) with an RNN.

## Introduction

- The dataset was created by scraping *Serie A* match data from the 2005-06 season through the 2020-21 season
- Cup matches (*Champions League*, *Europa League*, *Coppa Italia*) played over the course of each season were not taken into account

In [97]:
import pandas as pd
import re
from _MatchNotFoundException import MatchNotFoundException

## Dataset construction
Let's clean our raw data and construct the dataset

In [98]:
# load the scraped matches table from raw.csv
raw_data = pd.read_csv('raw.csv')
# preview the first rows of the raw table
raw_data.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,home_team_coach,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,28/08/2005,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,Massimo Silva,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
1,2005-06,1,27/08/2005,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,Cesare Prandelli,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
2,2005-06,1,28/08/2005,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,Mario Beretta,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
3,2005-06,1,28/08/2005,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,Roberto Mancini,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
4,2005-06,1,27/08/2005,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,Roberto Donadoni,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino


In [99]:
# let's work just on the head for now
# (raw_data.head() keeps only the first five rows of the raw table)
df = pd.DataFrame(raw_data.head())

In [100]:
# add id str to each match: date + time + first three letters of each team name,
# with every character that is not a letter, a digit or a space removed.
# Built column-wise so it does not depend on the index labels being 0..n-1.
df['match_id'] = (
    df['date'] + df['time'] + df['home_team'].str[:3] + df['away_team'].str[:3]
).str.replace(r'[^a-zA-Z\d ]', '', regex=True)
# convert date str to datetime; the raw dates are day-first (e.g. 28/08/2005),
# so the format is stated explicitly instead of being guessed (a guess can read
# ambiguous dates such as 01/02/2006 as month-first)
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')
# sort by date column
df = df.sort_values(by='date')
df = df.reset_index(drop=True)
# make 'match_id' to be the first column
cols = list(df.columns)
cols.remove('match_id')
df = df[['match_id'] + cols]
# convert 'round' values to int
df['round'] = df['round'].astype(int)

In [101]:
# preview the cleaned frame (now sorted by date, with match_id as the first column)
df.head()

Unnamed: 0,match_id,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,270820052030FIOSAM,2005-06,1,2005-08-27,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
1,270820051800LIVLEC,2005-06,1,2005-08-27,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino
2,280820051500ASCMIL,2005-06,1,2005-08-28,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
3,280820051500PARPAL,2005-06,1,2005-08-28,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
4,280820051500INTTRE,2005-06,1,2005-08-28,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto


In [102]:
def get_match_by_id(df: pd.DataFrame, match_id: str):
    """
    Select the rows of df whose 'match_id' value equals match_id
    :param df: where to search
    :param match_id: target match id
    :return: a DataFrame with the matching rows (empty when nothing matches)
    """
    id_matches = df['match_id'] == match_id
    return df[id_matches]

def get_match_index_by_id(df: pd.DataFrame, match_id: str):
    """
    Find the index label of the first row of df whose 'match_id' equals match_id
    :param df: where to search
    :param match_id: target match id
    :return: the first index label among the matching rows
    """
    matching_rows = get_match_by_id(df, match_id)
    row_labels = matching_rows.index.tolist()
    # taking element 0 of an empty list raises IndexError when no row matches
    return row_labels[0]

def get_match_index_by_match(match: pd.Series):
    """
    Return the first label found in the index of match
    :param match: object whose index is inspected; the lookups in this file pass the
        one-row DataFrame returned by get_match_by_id, so the label is that row's label
    :return: the first index label
    """
    labels = match.index.tolist()
    return labels[0]

def get_previous_match_by_match_id_for_team(df: pd.DataFrame, match_id: str, team_name: str):
    """
    Scan the rows of df that precede the game identified by match_id, nearest first,
    and return the first one in which team_name is the home or the away team
    :param df: where to search
    :param match_id: target match id
    :param team_name: name of the team to look for, as home or away team
    :return: the matching row of df
    :raises MatchNotFoundException: when none of the preceding rows involves team_name
    """
    target = get_match_by_id(df, match_id)
    target_position = get_match_index_by_match(target)
    # NOTE: the index label of the target row is used as a row position here,
    # which assumes df carries a default 0..n-1 index
    for position in range(target_position - 1, -1, -1):
        candidate = df.iloc[position]
        if team_name in (candidate['home_team'], candidate['away_team']):
            return candidate
    raise MatchNotFoundException("Previous match for team {} was not found".format(team_name))

def is_team_home_or_away_in_match(team_name: str, match: pd.Series):
    """
    Tell on which side team_name appears in match
    :param team_name: name of the team to look for
    :param match: a single match row with a 'home_team' entry
    :return: 'home' when match['home_team'] equals team_name, 'away' in every other
        case (the away side is not checked)
    """
    return 'home' if match['home_team'] == team_name else 'away'

def add_historic_features_to_match(df: pd.DataFrame, target_match_index: int, target_historic_home_or_away: str, target_historic_index: int, source_match: pd.Series):
    """
    Add information about source_match as historic features of target_match, with an historic index given by target_historic_index.
    Every entry of source_match is written into df, in place, at row target_match_index under the column
    "<target_historic_home_or_away>_team_history_<target_historic_index>_<source column name>".
    :param df: the DataFrame from which target_match and source_match are taken from
    :param target_match_index: the index of the target_match in the dataframe
    :param target_historic_home_or_away: tells whether the historic features belong to the home or away team of target_match
    :param target_historic_index: describes the source_match as the target_historic_index-th last match of target_match
    :param source_match: the target_historic_index-th last match of target_match from where data is taken
    :return:
    """
    print("target index: {}, source: {}".format(target_match_index, source_match))
    column_prefix = "{}_team_history_{}".format(target_historic_home_or_away, target_historic_index)
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0
    for col_name, col_value in source_match.items():
        df.at[target_match_index, "{}_{}".format(column_prefix, col_name)] = col_value

def get_last_five_matches_by_match_id_for_team(df: pd.DataFrame, match_id: str, team_name: str):
    """
    Find in df the last five matches played by team_name prior to the game identified by match_id
    :param df: where to search
    :param match_id: target match id
    :param team_name: name of the team whose previous matches (as home or away team) are collected
    :return: list of match rows, most recent first; it holds fewer than five rows
        (possibly none) when fewer previous matches of team_name are found
    """
    last_five_matches = []
    # id of the match to search backwards from; always a plain value, never a column
    current_match_id = match_id
    for _ in range(5):
        try:
            previous_match = get_previous_match_by_match_id_for_team(df, current_match_id, team_name)
        except MatchNotFoundException:
            # nothing earlier exists for this team, so further lookups cannot succeed
            break
        last_five_matches.append(previous_match)
        current_match_id = previous_match['match_id']
    return last_five_matches


In [None]:
def test():
    """
    Smoke-test the previous-match lookup on the global df.
    Row 0 is temporarily given 'PARMA' as home team so that the home team of row 3
    has an earlier match to be found; the original value is put back afterwards.
    :return:
    """
    selected_match = df.iloc[3]
    team = selected_match['home_team']
    fixture = team + ' vs ' + selected_match['away_team']
    # remember the real value so it is restored even if a lookup fails unexpectedly
    original_home_team = df.at[0, 'home_team']
    df.at[0, 'home_team'] = 'PARMA'
    try:
        print("Looking for a match played by {} prior to {}".format(team, fixture))
        previous = get_previous_match_by_match_id_for_team(df, selected_match['match_id'], team)
        print("FOUND! {}".format(previous))
        print("Looking for the last five matches played by {} prior to {}".format(team, fixture))
        # NOTE(review): the message mentions five matches but this repeats the single
        # previous-match lookup; confirm whether
        # get_last_five_matches_by_match_id_for_team was meant here
        last_five = get_previous_match_by_match_id_for_team(df, selected_match['match_id'], team)
        print("FOUND!")
        print(last_five)
    except MatchNotFoundException as not_found:
        # report the miss instead of hiding it
        print("NOT FOUND: {}".format(not_found))
    finally:
        df.at[0, 'home_team'] = original_home_team

test()