## Introduction

- The dataset was created by scraping *Serie A* match data from season 2005-06 through season 2020-21
- Cup matches (*Champions League*, *Europa League*, *Coppa Italia*) played over the course of each season were not taken into account

In [19]:
import re

import pandas as pd

from MatchResult import MatchResult

In [20]:
# Ordered column names of a match record: match metadata first, then for each
# side (home, then away) the coach, the 11 starting players and the 7 substitutes.
match_cols = [
    'season', 'round',
    'date', 'time', 'referee', 'home_team', 'away_team', 'home_team_score', 'away_team_score',
]
for side in ('home', 'away'):
    match_cols.append(f'{side}_team_coach')
    match_cols.extend(f'{side}_player_{slot}' for slot in range(1, 12))
    match_cols.extend(f'{side}_substitute_{slot}' for slot in range(1, 8))

In [21]:
# Load the raw scraped matches and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV was
# written together with its index - confirm whether that column should be kept.
train_raw = pd.read_csv(filepath_or_buffer='train-raw.csv')
train_raw.head(n=5)

Unnamed: 0,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,home_team_coach,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,28/08/2005,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,Massimo Silva,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
1,2005-06,1,27/08/2005,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,Cesare Prandelli,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
2,2005-06,1,28/08/2005,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,Mario Beretta,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
3,2005-06,1,28/08/2005,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,Roberto Mancini,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
4,2005-06,1,27/08/2005,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,Roberto Donadoni,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino


In [22]:
# test_raw = pd.read_csv('test-raw.csv')
# test_raw.head()

## Data visualization

Let's inspect our data a little bit more

In [23]:
# todo

## Data pre-processing
Now let's clean our raw data and add some additional features.

In [24]:
# Work on an independent copy of the raw data: wrapping an existing frame with
# pd.DataFrame(...) does not copy by default, so column assignments made during
# pre-processing could otherwise show up in train_raw as well.
df = train_raw.copy()
# df = df[:200]

In [25]:
# Convert the date strings to datetimes. The raw values are day-first
# (e.g. '28/08/2005'), so say so explicitly instead of relying on format
# inference from whichever row happens to come first; this also avoids the
# `infer_datetime_format` argument, which is deprecated in pandas 2.x.
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
# Order the matches chronologically and renumber the index to follow that order
df = df.sort_values(by='date')
df = df.reset_index(drop=True)
# Store round numbers as integers
df['round'] = df['round'].astype(int)

### Additional features

#### Result column
Our model will try to predict match results, i.e. **home win**, **away win** or **draw**, so we need a result column to be used as our target.

In [27]:
def get_match_result_from_score(home_team_score: int, away_team_score: int) -> MatchResult:
    """Map a final score to the match outcome.

    Returns ``MatchResult.home`` when the home side scored more,
    ``MatchResult.draw`` when both scores are equal, and
    ``MatchResult.away`` in every other case.
    """
    if home_team_score > away_team_score:
        return MatchResult.home
    if home_team_score == away_team_score:
        return MatchResult.draw
    return MatchResult.away


def add_target_column(df: pd.DataFrame) -> pd.DataFrame:
    """Insert a ``result`` column holding the outcome name of every match.

    The outcome is derived from ``home_team_score`` and ``away_team_score`` and
    the new column is placed immediately before ``home_team_score``. ``df`` is
    modified in place and also returned.
    """
    outcome_names = [
        get_match_result_from_score(home_goals, away_goals).name
        for home_goals, away_goals in zip(df['home_team_score'], df['away_team_score'])
    ]
    target_position = df.columns.get_loc('home_team_score')
    df.insert(loc=target_position, column='result', value=outcome_names)
    return df

In [28]:
# Add the target column: add_target_column inserts 'result' into df in place and
# returns that same frame, which is left as the cell's final expression.
df = add_target_column(df)
df

Unnamed: 0,season,round,date,time,referee,home_team,away_team,result,home_team_score,away_team_score,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,2005-08-27,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,home,2,1,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
1,2005-06,1,2005-08-27,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,home,2,1,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino
2,2005-06,1,2005-08-28,15:00,MASSIMO DE,ASCOLI,MILAN,draw,1,1,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
3,2005-06,1,2005-08-28,15:00,TIZIANO PIERI,PARMA,PALERMO,draw,1,1,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
4,2005-06,1,2005-08-28,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,home,3,0,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,2008-09,22,2009-02-01,15:00,CHRISTIAN BRIGHI,ROBUR SIENA,LECCE,away,1,2,...,Guillermo Giacomazzi,Jose Castillo,Simone Tiribocchi,Andrea Ardito,Ndiaye Papa Waigo,Antonio Rosati,Raffaele Schiavi,Angelo,Daniele Cacia,Edinho
1355,2008-09,22,2009-02-01,15:00,PAOLO DONDARINI,ATALANTA,CATANIA,home,1,0,...,Jorge Martinez,Giuseppe Mascara,Michele Paolucci,Mariano Izco,Gionatha Spinesi,Takayuki Morimoto,Paolo Acerbis,Christian Silvestri,Cristian Llama,Pablo Ledesma
1356,2008-09,22,2009-02-01,15:00,GABRIELE GAVA,CHIEVOVERONA,SAMPDORIA,draw,1,1,...,Mirko Pieri,Giampaolo Pazzini,Antonio Cassano,Mattia Mustacchio,Claudio Bellucci,Antonio Mirante,Guido Marilungo,Pedro Obiang,Michele Ferri,Manuel Da Costa
1357,2008-09,22,2009-02-01,15:00,MATTEO SIMONE,GENOA,PALERMO,home,1,0,...,Mark Bresciano,Levan Mchedlidze,Edinson Cavani,Mirko Savini,Giulio Migliaccio,Roberto Guana,Samir Ujkani,Alberto Cossentino,Giovanni Tedesco,Davide Succi


#### Rest days features
Rest days are very important for recovery.

In [29]:
def count_days_between_dates(date1: pd.Series, date2: pd.Series) -> pd.Series:
    """Return, element-wise, the number of whole days from ``date2`` to ``date1``.

    Both arguments are datetime Series: the difference is read through the
    ``.dt`` accessor, so the result is a Series of day counts rather than a
    single integer. Values are negative where ``date1`` precedes ``date2``.
    """
    return (date1 - date2).dt.days

In [30]:
# for i in range(5):
#     for home_or_away in HomeOrAway:
#         if i == 0:
#             df[f'{home_or_away.name}_team_rest_days'] = count_days_between_dates(df['date'], df[f'{home_or_away.name}_team_history_{i+1}_date'])
#         else:
#             df[f'{home_or_away.name}_team_history_{i}_rest_days'] = count_days_between_dates(df[f'{home_or_away.name}_team_history_{i}_date'], df[f'{home_or_away.name}_team_history_{i+1}_date'])

# todo: cannot count rest days for historical 5th games because we still miss the data about the 6th historical match

In [31]:
# delete columns referring to the historical 6th matches
# df = df.loc[:, ~df.columns.str.contains('history_6')]

#### Datetime features
Add **year**, **month**, **day** and **hour** features derived from each match's **date** and **time** values.

In [32]:
def get_exploded_datetime_values(df: pd.DataFrame) -> dict:
    """Split the ``date`` and ``time`` columns of ``df`` into calendar components.

    ``df['date']`` must already be a datetime column (it is read through the
    ``.dt`` accessor) and ``df['time']`` holds ``"HH:MM"`` strings. ``df`` is
    left untouched: the time strings are parsed into a local Series instead of
    being written back into the caller's frame.

    Returns:
        A dict with the keys ``'year'``, ``'month'``, ``'day'`` and ``'hour'``,
        each mapping to a list with one value per row of ``df``.
    """
    match_dates = df['date']
    # Only the hour is needed, so the parsed times never leave this function
    kickoff_times = pd.to_datetime(df['time'], format="%H:%M")
    return {
        'year': match_dates.dt.year.tolist(),
        'month': match_dates.dt.month.tolist(),
        'day': match_dates.dt.day.tolist(),
        'hour': kickoff_times.dt.hour.tolist(),
    }


def insert_exploded_datetime_values(df, exploded):
    """Insert the year, month, day and hour columns right before ``time``.

    ``exploded`` maps each of those four names to the values of the new column.
    Every column is inserted at the current position of ``time``, so they end
    up ordered year, month, day, hour, time. ``df`` is modified in place and
    returned.
    """
    for part in ('year', 'month', 'day', 'hour'):
        # 'time' moves one slot to the right after each insert, so look it up again
        anchor = df.columns.get_loc('time')
        df.insert(loc=anchor, column=part, value=exploded[part])
    return df


def explode_datetime_values(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the exploded date/time values of ``df`` and insert them as columns.

    Chains ``get_exploded_datetime_values`` and
    ``insert_exploded_datetime_values`` and returns the latter's result.
    """
    return insert_exploded_datetime_values(df, get_exploded_datetime_values(df))

In [33]:
# Add the year/month/day/hour columns derived from 'date' and 'time'
df = explode_datetime_values(df)
# The original 'date' and 'time' columns are no longer needed once exploded
df.drop(columns=['date', 'time'], inplace=True)

The result of the pre-processing looks like this:

In [34]:
# Preview the first five pre-processed rows
df.head(n=5)

Unnamed: 0,season,round,year,month,day,hour,referee,home_team,away_team,result,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,2005,8,27,20,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,home,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
1,2005-06,1,2005,8,27,18,GIANLUCA ROCCHI,LIVORNO,LECCE,home,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino
2,2005-06,1,2005,8,28,15,MASSIMO DE,ASCOLI,MILAN,draw,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
3,2005-06,1,2005,8,28,15,TIZIANO PIERI,PARMA,PALERMO,draw,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
4,2005-06,1,2005,8,28,15,PAOLO TAGLIAVENTO,INTER,TREVISO,home,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto


In [35]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how='any')

In [36]:
# Persist the pre-processed training set.
# NOTE(review): the index is written as well (pandas default), so the file gains an
# unnamed leading column when read back - confirm that downstream readers expect it.
df.to_csv(path_or_buf="train.csv")