# Football matches result prediction

Let's try to predict the result of *Serie A* matches (i.e. home win, away win or draw) with an RNN.

## Introduction

- The dataset was created by scraping *Serie A* matches data starting from season 2005-06 to season 2020-21
- Cup matches (*Champions League*, *Europa League*, *Coppa Italia*) played over the course of each season were not taken into account

In [642]:
import pandas as pd
from _MatchNotFoundException import MatchNotFoundException
from HomeOrAway import HomeOrAway
from MatchResult import MatchResult

In [643]:
# Names of the columns describing one match: season/round, basic match info,
# then coach + 11 starters + 7 substitutes for the home side and the away side.
match_cols = [
    'season', 'round',
    'date', 'time', 'referee', 'home_team', 'away_team', 'home_team_score', 'away_team_score',
    *(
        col
        for side in ('home', 'away')
        for col in (
            f'{side}_team_coach',
            *(f'{side}_player_{n}' for n in range(1, 12)),
            *(f'{side}_substitute_{n}' for n in range(1, 8)),
        )
    ),
]
# Switch for the (slow) construction of the historical "last matches" features.
historical_features_enabled = False
# historical_features_enabled = True

In [644]:
# Load the scraped matches; 'raw.csv' is resolved relative to the current working directory.
raw_data = pd.read_csv('raw.csv')
# Preview the first rows.
raw_data.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,home_team_coach,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,28/08/2005,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,Massimo Silva,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
1,2005-06,1,27/08/2005,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,Cesare Prandelli,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
2,2005-06,1,28/08/2005,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,Mario Beretta,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
3,2005-06,1,28/08/2005,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,Roberto Mancini,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
4,2005-06,1,27/08/2005,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,Roberto Donadoni,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino


## Data visualization

Let's inspect our data a little bit more

In [645]:
# todo

## Dataset construction
Now let's clean our raw data and construct the dataset. The full process for preparing the data is:
- Convert date string values to pandas datetime values and explode them
- Construct and add historical features
- Add number of rest days between matches
- Derive match results from scores
- Encode data

In [646]:
# Work on an independent copy of the first 200 matches, so that the column
# assignments below never write through a slice of raw_data
# (which would raise SettingWithCopyWarning).
df = raw_data.iloc[:200].copy()

In [647]:
# convert date str to datetime; the raw dates are day-first (e.g. 28/08/2005),
# so the format is stated explicitly instead of being guessed from the data
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')
# sort by date column and renumber the rows from 0
df = df.sort_values(by='date')
df = df.reset_index(drop=True)

### Historical features
In order to train an RNN model, we need to have series of football games, so the goal of this section is to add some historical features that will carry information about the last five games played by the home and away team of each match in the dataset.

In [648]:
def get_match_index_by_match(match: pd.DataFrame) -> int:
    """
    Return the index label of the first row of match.
    :param match: a dataframe holding (at least) one match
    :return: the index label of its first row
    """
    labels = match.index.tolist()
    return labels[0]


def is_team_home_or_away_in_match(team_name: str, match: pd.DataFrame):
    """
    Tell on which side team_name played in match.
    :param team_name: the name of the team
    :param match: a single-row dataframe describing the match
    :return: 'home' if team_name is the home team of the match, 'away' in every other case
    """
    played_at_home = match.squeeze()['home_team'] == team_name
    return 'home' if played_at_home else 'away'


def get_last_match_played_by_team(df: pd.DataFrame, target_match_index: int, team_name: str) -> pd.DataFrame:
    """
    Find in df the last match played by team_name prior to the game identified by target_match_index
    :param df: where to search; rows are expected to be in chronological order
    :param target_match_index: the position (0-based row number) in df of the target match
    :param team_name: name of the team that has played the target match
    :return: a single-row dataframe with the closest earlier match in which team_name was the home or the away team
    :raises MatchNotFoundException: if no row before target_match_index involves team_name
    """
    # Read the team columns positionally so the lookup does not depend on
    # df having the default 0..n-1 index labels.
    home_teams = df['home_team'].to_numpy()
    away_teams = df['away_team'].to_numpy()
    # Walk backwards from the row just before the target match.
    for position in range(target_match_index - 1, -1, -1):
        if home_teams[position] == team_name or away_teams[position] == team_name:
            return df.iloc[[position]]
    raise MatchNotFoundException(f'Previous match for team {team_name} was not found')


def get_last_n_matches_played_by_team(df: pd.DataFrame, target_match_index: int, team_name: str, n: int) -> list[pd.Series]:
    """
    Find in df the last n matches played by team_name prior to the game identified by target_match_index
    :param df: where to search
    :param target_match_index: target match index in df
    :param team_name: the name of the team that has played all the last n matches
    :param n: the number of matches to look for
    :return: the matches found, most recent first; fewer than n items if the team has not played n earlier matches
    """
    # NOTE: the index label of each match found is reused as the starting row
    # for the next search, so df is expected to carry the default 0..n-1 index.
    search_from = target_match_index
    last_n_matches = []
    for _ in range(n):
        try:
            previous_match = get_last_match_played_by_team(df, search_from, team_name)
        except MatchNotFoundException:
            # No earlier match exists, so searching further back cannot succeed either.
            break
        last_n_matches.append(previous_match.squeeze())
        search_from = get_match_index_by_match(previous_match)
    return last_n_matches


def construct_historical_features_of_last_n_matches_for_target_match(target_match_index: int,
                                                                     target_home_or_away: HomeOrAway,
                                                                     last_n_matches: list[
                                                                            pd.Series]) -> pd.DataFrame:
    """
    Build a dataframe containing information about the last n matches played by home or away team of target match as historical features.
    :param target_match_index: the index of the target match in df
    :param target_home_or_away: tells whether the n matches has been played by the home or away team of target match
    :param last_n_matches: a list containing the last n matches
    :return: a single-row dataframe, indexed by target_match_index, with one
             '<home|away>_team_history_<k>_<column>' column per historical match k and match column;
             slots for which no match was supplied stay empty (NaN)
    """
    column_prefix = f'{target_home_or_away.name}_team_history'
    # Init columns for 5 historical matches
    historical_cols = [f'{column_prefix}_{i}_{colName}' for i in range(1, 6) for colName in match_cols]
    # Init empty DataFrame with those columns and specific index
    result = pd.DataFrame(columns=historical_cols, index=[target_match_index])
    # Copy values into DataFrame; history slots are numbered from 1 in the order given.
    for history_number, source_match in enumerate(last_n_matches, start=1):
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        for col_name, col_value in source_match.items():
            result.at[target_match_index, f'{column_prefix}_{history_number}_{col_name}'] = col_value
    return result


def add_historical_features_of_last_n_matches_for_all_matches(df: pd.DataFrame, n: int) -> pd.DataFrame:
    """
    Construct a new dataframe adding information about the last n matches played by home and away team of all matches in df
    :param df: source of data
    :param n: how many previous matches to look up for each team
    :return: a new dataframe holding, for every match of df, its original columns followed by
             the historical columns of the home team and of the away team
    """
    rows_with_history = []
    # iterrows() yields index labels; the row number is tracked separately because
    # the history lookup and iloc both work on positions.
    for position, (label, row) in enumerate(df.iterrows()):
        home_team_historical_df = construct_historical_features_of_last_n_matches_for_target_match(
            label,
            HomeOrAway.home,
            get_last_n_matches_played_by_team(df, position, row['home_team'], n))
        away_team_historical_df = construct_historical_features_of_last_n_matches_for_target_match(
            label,
            HomeOrAway.away,
            get_last_n_matches_played_by_team(df, position, row['away_team'], n))
        rows_with_history.append(
            pd.concat([df.iloc[[position]], home_team_historical_df, away_team_historical_df], axis=1))
    if not rows_with_history:
        # pd.concat refuses an empty list; an empty input gives an empty result.
        return pd.DataFrame()
    # Concatenate once at the end: growing a dataframe row by row copies it on every iteration.
    return pd.concat(rows_with_history, axis=0)

In [649]:
# Construct historical features
# (for every match, append the columns of the last 5 matches of its home team and of its away team)
if historical_features_enabled:
    df = add_historical_features_of_last_n_matches_for_all_matches(df, 5)

In [650]:
if historical_features_enabled:
    # Remove the matches that do not have values for all historical features.
    # Only the 'home_team' column of each historical slot is checked: a plain dropna()
    # would also discard matches that merely have an empty cell elsewhere
    # (e.g. a bench with fewer than 7 substitutes).
    # NOTE(review): assumes 'home_team' is never missing in the raw data — confirm.
    history_team_cols = [col for col in df.columns
                         if '_team_history_' in col and col.endswith('_home_team')]
    df = df.dropna(subset=history_team_cols)
    df = df.reset_index(drop=True)

In [651]:
df[0:200]

Unnamed: 0,season,round,date,time,referee,home_team,away_team,home_team_score,away_team_score,home_team_coach,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,2005-08-27,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,2,1,Cesare Prandelli,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
1,2005-06,1,2005-08-27,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,2,1,Roberto Donadoni,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino
2,2005-06,1,2005-08-28,15:00,MASSIMO DE,ASCOLI,MILAN,1,1,Massimo Silva,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
3,2005-06,1,2005-08-28,15:00,TIZIANO PIERI,PARMA,PALERMO,1,1,Mario Beretta,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
4,2005-06,1,2005-08-28,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,3,0,Roberto Mancini,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2005-06,20,2006-01-18,20:30,PAOLO TAGLIAVENTO,ROMA,REGGINA,3,1,Luciano Spalletti,...,Francesco Modesto,Francesco Cozza,Luca Vigiani,Maurizio Lauro,Nicola Amoruso,Simone Missiroli,Ivan Pelizzoli,Davide Biondini,Filippo Carobbio,Simone Cavalli
196,2005-06,20,2006-01-18,20:30,GIANLUCA ROCCHI,MILAN,ASCOLI,1,0,Carlo Ancelotti,...,Cristiano Del Grosso,Sasa Bjelanovic,Fabio Quagliarella,Massimo Paci,Michele Fini,Pasquale Foggia,Carlo Zotti,Riccardo Corallo,Davide Oresti,Marco Ferrante
197,2005-06,20,2006-01-18,20:30,PASQUALE RODOMONTI,LECCE,LIVORNO,0,0,Silvio Baldini,...,Francesco Coco,Ibrahima Bakayoko,Cristiano Lucarelli,Marc Pfertzel,Cesar Prates,Raffaele Palladino,Paolo Acerbis,Stefano Fanucci,Giuseppe Colucci,Paulinho
198,2005-06,20,2006-01-18,20:30,ANDREA ROMEO,CAGLIARI,ROBUR SIENA,1,0,Nedo Sonetti,...,Cristian Molinaro,Erjon Bogdani,Enrico Chiesa,Rej Volpato,Paolo Negro,Nicola Legrottaglie,Marco Fortin,Francesco Colonnese,Roberto Nanni,


As expected, when historical features are enabled the first few retained matches come from round 6, since 5 previous games are required to build the historical features.

In [652]:
# re-convert all date values to datetime
# (the historical '*_date' columns were filled cell by cell; this gives them a datetime dtype again)
for col_name in df.columns:
    if col_name.endswith('date'):
        df[col_name] = pd.to_datetime(df[col_name])

In [653]:
df.shape

(200, 47)

### Rest days features
Rest days are very important for recovery.

In [654]:
def count_days_between_dates(date1: pd.Series, date2: pd.Series) -> pd.Series:
    """
    Count, element by element, the whole days elapsed from date2 to date1.
    :param date1: datetime series with the later dates
    :param date2: datetime series with the earlier dates
    :return: a series with the number of days of date1 - date2 (negative where date1 precedes date2)
    """
    return (date1 - date2).dt.days

In [655]:
# for i in range(5):
#     for home_or_away in HomeOrAway:
#         if i == 0:
#             df[f'{home_or_away.name}_team_rest_days'] = count_days_between_dates(df['date'], df[f'{home_or_away.name}_team_history_{i+1}_date'])
#         else:
#             df[f'{home_or_away.name}_team_history_{i}_rest_days'] = count_days_between_dates(df[f'{home_or_away.name}_team_history_{i}_date'], df[f'{home_or_away.name}_team_history_{i+1}_date'])

# todo: cannot count rest days for historical 5th games because we still miss the data about the 6th historical match

In [656]:
# delete columns referring to the historical 6th matches
# df = df.loc[:, ~df.columns.str.contains('history_6')]

### Additional features

#### Result column
We don't care so much about scores because our model will try to predict match results, i.e. **home win**, **away win** or **draw**. We need a result column to be used as our target column, so let's construct it from the scores.

In [657]:
def get_match_result_from_score(home_team_score: int, away_team_score: int) -> MatchResult:
    """
    Derive the outcome of a match from its final score.
    :param home_team_score: goals scored by the home team
    :param away_team_score: goals scored by the away team
    :return: MatchResult.draw on equal scores, MatchResult.home if the home team scored more, MatchResult.away otherwise
    """
    if home_team_score != away_team_score:
        return MatchResult.home if home_team_score > away_team_score else MatchResult.away
    return MatchResult.draw


def add_target_column_for_historical_matches(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a '<home|away>_team_history_<k>_result' column for each of the 5 historical matches of both teams.
    Every new column is inserted in df (in place) right before the matching
    '..._home_team_score' column and holds the name of the match result.
    :param df: dataframe containing the historical score columns
    :return: the same dataframe, with the result columns added
    """
    # First derive every result column from its pair of score columns...
    result_columns = {}
    for history_number in range(1, 6):
        for side in ('home', 'away'):
            prefix = f'{side}_team_history_{history_number}'
            score_pairs = zip(df[f'{prefix}_home_team_score'], df[f'{prefix}_away_team_score'])
            result_columns[f'{prefix}_result'] = [
                get_match_result_from_score(home_goals, away_goals).name
                for home_goals, away_goals in score_pairs
            ]
    # ...then place each one just before the home score column of its historical match.
    for result_col, values in result_columns.items():
        anchor_col = result_col[:-len('result')] + 'home_team_score'
        df.insert(loc=df.columns.get_loc(anchor_col), column=result_col, value=values)
    return df


def add_target_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the 'result' column, derived from the scores of each match.
    The column is inserted in df (in place) right before 'home_team_score'
    and holds the name of the match result.
    :param df: dataframe containing 'home_team_score' and 'away_team_score'
    :return: the same dataframe, with the 'result' column added
    """
    outcomes = [
        get_match_result_from_score(home_goals, away_goals).name
        for home_goals, away_goals in zip(df['home_team_score'], df['away_team_score'])
    ]
    df.insert(loc=df.columns.get_loc('home_team_score'), column='result', value=outcomes)
    return df

In [658]:
# add target column(s); both helpers insert into df in place and hand the same object back
df = add_target_column(df)
if historical_features_enabled:
    df = add_target_column_for_historical_matches(df)
df.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,result,home_team_score,away_team_score,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,2005-08-27,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,home,2,1,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
1,2005-06,1,2005-08-27,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,home,2,1,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino
2,2005-06,1,2005-08-28,15:00,MASSIMO DE,ASCOLI,MILAN,draw,1,1,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
3,2005-06,1,2005-08-28,15:00,TIZIANO PIERI,PARMA,PALERMO,draw,1,1,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
4,2005-06,1,2005-08-28,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,home,3,0,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto


In [659]:
# drop score columns: the outcome of the match is now carried by 'result'
score_cols = ['home_team_score', 'away_team_score']
df = df.drop(score_cols, axis=1)
df.head()

Unnamed: 0,season,round,date,time,referee,home_team,away_team,result,home_team_coach,home_player_1,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,2005-06,1,2005-08-27,20:30,GIANLUCA PAPARESTA,FIORENTINA,SAMPDORIA,home,Cesare Prandelli,Sebastien Frey,...,Lamberto Zauli,Francesco Flachi,Emiliano Bonazzoli,Marco Pisano,Vitaliy Kutuzov,Marco Borriello,Luca Castellazzi,Marco Zamboni,Simone Pavan,Gionata Mingozzi
1,2005-06,1,2005-08-27,18:00,GIANLUCA ROCCHI,LIVORNO,LECCE,home,Roberto Donadoni,Marco Amelia,...,Alex Pinardi,Aleksei Eremenko,Graziano Pelle,Alfonso Camorani,Jaime Valdes,Giuseppe Cozzolino,Francesco Benussi,Marco Pecorari,Giuseppe Abruzzese,Davide Giorgino
2,2005-06,1,2005-08-28,15:00,MASSIMO DE,ASCOLI,MILAN,draw,Massimo Silva,Ferdinando Coppola,...,Kaka,Andriy Shevchenko,Alberto Gilardino,Marek Jankulovski,Clarence Seedorf,Zeljko Kalac,Gennaro Gattuso,Manuel Rui Costa,Johann Vogel,Dario Simic
3,2005-06,1,2005-08-28,15:00,TIZIANO PIERI,PARMA,PALERMO,draw,Mario Beretta,Cristiano Lupatelli,...,Massimo Bonanni,Andrea Caracciolo,Stephen Makinwa,Nicola Santoni,Franco Brienza,Massimo Mutarelli,Giuseppe Biava,Michele Ferri,Mariano Gonzalez,Simone Pepe
4,2005-06,1,2005-08-28,15:00,PAOLO TAGLIAVENTO,INTER,TREVISO,home,Roberto Mancini,Julio Cesar,...,Reginaldo,Luigi Beghetto,Pinga,Roberto Chiappara,Dino Fava,Jehad Muntasser,Adriano Zancope,Francesco Parravicini,Anderson,Alberto Giuliatto


#### Exploded datetime features
Add **year**, **month** and **day** features for all **date** values

In [660]:
def get_exploded_datetime_values(df: pd.DataFrame) -> dict:
    """
    Split the 'date' column of df into its components.
    :param df: dataframe with a datetime 'date' column
    :return: a dict with the keys 'year', 'month' and 'day', each mapped to the list of that component for every row
    """
    dates = df['date']
    return {
        part: dates.map(lambda val, part=part: getattr(val, part)).tolist()
        for part in ('year', 'month', 'day')
    }

def get_exploded_datetime_values_for_historical_matches(df: pd.DataFrame) -> dict:
    """
    Split the date column of each of the 5 historical matches of both teams into its components.
    :param df: dataframe with datetime '<home|away>_team_history_<k>_date' columns
    :return: a dict mapping '<home|away>_team_history_<k>_<year|month|day>' to the list of that component for every row
    """
    data = {}
    for history_number in range(1, 6):
        for home_or_away in HomeOrAway:
            prefix = f'{home_or_away.name}_team_history_{history_number}'
            dates = df[f'{prefix}_date']
            for part in ('year', 'month', 'day'):
                data[f'{prefix}_{part}'] = dates.map(lambda val, part=part: getattr(val, part)).tolist()
    return data


def insert_exploded_datetime_values(df, exploded):
    """
    Insert the 'year', 'month' and 'day' columns into df (in place), right before the 'time' column.
    :param df: dataframe containing a 'time' column
    :param exploded: dict providing the values of the 'year', 'month' and 'day' columns
    :return: the same dataframe, with the three columns added in that order
    """
    for part in ('year', 'month', 'day'):
        # 'time' shifts right after every insertion, so each part lands after the previous one
        df.insert(loc=df.columns.get_loc('time'), column=part, value=exploded[part])
    return df


def insert_exploded_datetime_values_for_historical_matches(df, exploded):
    """
    Insert the year, month and day columns of each of the 5 historical matches of both teams into df (in place),
    right before the '..._time' column of the same historical match.
    :param df: dataframe containing the '<home|away>_team_history_<k>_time' columns
    :param exploded: dict providing the values of every '<home|away>_team_history_<k>_<year|month|day>' column
    :return: the same dataframe, with the columns added
    """
    for history_number in range(1, 6):
        for home_or_away in HomeOrAway:
            prefix = f'{home_or_away.name}_team_history_{history_number}'
            for part in ('year', 'month', 'day'):
                new_col = f'{prefix}_{part}'
                df.insert(loc=df.columns.get_loc(f'{prefix}_time'), column=new_col, value=exploded[new_col])
    return df


def explode_datetime_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add to df the year, month and day columns obtained from its 'date' column.
    :param df: dataframe with the 'date' and 'time' columns
    :return: the same dataframe, with the three columns added
    """
    return insert_exploded_datetime_values(df, get_exploded_datetime_values(df))


def explode_datetime_values_for_historical_matches(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add to df the year, month and day columns obtained from the date column of every historical match.
    :param df: dataframe with the historical date and time columns
    :return: the same dataframe, with the columns added
    """
    return insert_exploded_datetime_values_for_historical_matches(
        df, get_exploded_datetime_values_for_historical_matches(df))


def get_column_names_containing_str(df: pd.DataFrame, substring: str) -> list[str]:
    """
    List the columns of df whose name contains substring.
    :param df: dataframe whose column names are searched
    :param substring: the text to look for (interpreted by str.contains, i.e. as a regular expression)
    :return: the matching column names, in column order
    """
    is_match = df.columns.str.contains(substring)
    return df.columns[is_match].tolist()

In [661]:
# replace every date by its year / month / day components
df = explode_datetime_values(df)
if historical_features_enabled:
    df = explode_datetime_values_for_historical_matches(df)
# the original date columns are no longer needed
date_cols = get_column_names_containing_str(df, 'date')
df.drop(columns=date_cols, inplace=True)

### Result column balancing

### Data encoding
We need to encode the data before feeding it to the network.

In [662]:
from collections import defaultdict

#### Rounds

In [663]:
# encode rounds: store the round number as an integer
df = df.astype({'round': int})

#### Results
Integer (label) encoding: `home` → 0, `draw` → 1, `away` → 2

In [664]:
# encode results: replace the result names by integer class ids
target2int = {'home': 0, 'draw': 1, 'away': 2}
result_cols = get_column_names_containing_str(df, 'result')
for result_col in result_cols:
    df[result_col] = df[result_col].map(target2int)

#### Referees
Integer (label) encoding: each distinct referee is mapped to an integer id

In [665]:
# reset: a fresh mapping that hands out the next free id (0, 1, 2, ...) to every new key
temp_dict = defaultdict(lambda: len(temp_dict))
# encode referees: the same mapping is shared by all referee columns
referee_cols = get_column_names_containing_str(df, 'referee')
for referee_col in referee_cols:
    df[referee_col] = [temp_dict[referee] for referee in df[referee_col].tolist()]

#### Teams
Integer (label) encoding: each distinct team is mapped to an integer id

In [666]:
# reset: a fresh mapping that hands out the next free id (0, 1, 2, ...) to every new key
temp_dict = defaultdict(lambda: len(temp_dict))
# encode teams: home and away columns share the same mapping, so a team always gets the same id
for home_or_away in HomeOrAway:
    team_col = f'{home_or_away.name}_team'
    df[team_col] = [temp_dict[team] for team in df[team_col].tolist()]
# highest team id found on each side
print(df['home_team'].max())
print(df['away_team'].max())
if historical_features_enabled:
    for history_number in range(1, 6):
        for home_or_away in HomeOrAway:
            for played_as in ('home_team', 'away_team'):
                history_col = f'{home_or_away.name}_team_history_{history_number}_{played_as}'
                df[history_col] = [temp_dict[team] for team in df[history_col].tolist()]

19
19


#### Coaches
Integer (label) encoding: each distinct coach is mapped to an integer id

In [667]:
# reset: a fresh mapping that hands out the next free id (0, 1, 2, ...) to every new key
temp_dict = defaultdict(lambda: len(temp_dict))
# encode team coaches: the same mapping is shared by all coach columns
coach_cols = get_column_names_containing_str(df, 'coach')
for coach_col in coach_cols:
    df[coach_col] = [temp_dict[coach] for coach in df[coach_col].tolist()]

#### Players
Integer (label) encoding: each distinct player is mapped to an integer id

In [668]:
# reset: a fresh mapping that hands out the next free id (0, 1, 2, ...) to every new key
temp_dict = defaultdict(lambda: len(temp_dict))
# encode players: starters first, then substitutes, all sharing the same mapping
player_cols = (get_column_names_containing_str(df, 'player')
               + get_column_names_containing_str(df, 'substitute'))
for player_col in player_cols:
    df[player_col] = [temp_dict[player] for player in df[player_col].tolist()]

#### Seasons

In [669]:
# reset: a fresh mapping that hands out the next free id (0, 1, 2, ...) to every new key
temp_dict = defaultdict(lambda: len(temp_dict))
# encode seasons: the same mapping is shared by all season columns
season_cols = get_column_names_containing_str(df, 'season')
for season_col in season_cols:
    df[season_col] = [temp_dict[season] for season in df[season_col].tolist()]

#### Times

In [670]:
# convert time values to datetime and keep only the kick-off hour
kickoff_times = pd.to_datetime(df['time'], format="%H:%M")
# .dt.hour extracts the hour of the whole column at once (no per-row Python loop)
df.insert(loc=df.columns.get_loc('time'), column='hour', value=kickoff_times.dt.hour.tolist())
df = df.drop(columns='time')
# NOTE(review): the '*_history_<k>_time' columns created when historical features
# are enabled are not converted here — confirm whether they need the same treatment.

In [671]:
df[:200]

Unnamed: 0,season,round,year,month,day,hour,referee,home_team,away_team,result,...,away_player_9,away_player_10,away_player_11,away_substitute_1,away_substitute_2,away_substitute_3,away_substitute_4,away_substitute_5,away_substitute_6,away_substitute_7
0,0,1,2005,8,27,20,0,0,12,0,...,356,349,363,164,338,402,473,482,127,501
1,0,1,2005,8,27,18,1,1,19,0,...,247,417,354,213,308,304,31,117,174,459
2,0,1,2005,8,28,15,2,2,10,1,...,246,347,384,188,292,444,221,317,266,85
3,0,1,2005,8,28,15,3,3,11,1,...,318,367,348,11,423,272,112,408,219,427
4,0,1,2005,8,28,15,4,4,13,0,...,328,375,286,426,360,212,22,243,438,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,20,2006,1,18,20,4,14,8,0,...,297,314,178,135,399,421,8,224,262,393
196,0,20,2006,1,18,20,1,10,2,0,...,177,369,341,113,313,191,460,123,531,361
197,0,20,2006,1,18,20,14,19,1,1,...,323,387,362,155,181,340,457,476,283,437
198,0,20,2006,1,18,20,18,17,6,0,...,422,344,392,432,41,67,472,406,382,532


### Data normalization

In [672]:
# todo

## Training
Now that our dataset is ready, we can configure an RNN model and train it.

In [673]:
import torch
import torch.nn as nn
from torch import optim

In [674]:
class EncoderRNN(nn.Module):
    """
    A single recurrent cell: new_hidden = tanh(Linear([input, hidden])).

    Calling the module once performs one time step; the caller feeds the returned
    hidden state back in together with the next element of the sequence.
    """

    def __init__(self, input_size, hidden_size):
        """
        :param input_size: number of features of each element of the sequence
        :param hidden_size: number of features of the hidden state
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.linear = nn.Linear(input_size + hidden_size, hidden_size)
        self.tanh = nn.Tanh()

    def forward(self, input, hidden):
        """
        Perform one recurrent step.
        :param input: current element of the sequence, a 2-D tensor (batch, input_size)
        :param hidden: previous hidden state, a 2-D tensor (batch, hidden_size)
        :return: the new hidden state, a tensor (batch, hidden_size)
        """
        # input and hidden are joined along the feature dimension
        combined = torch.cat((input, hidden), 1)
        return self.tanh(self.linear(combined))

    def init_hidden(self, batch_size=1):
        """
        Build the all-zero hidden state used before the first step.
        :param batch_size: number of sequences processed together (defaults to 1)
        :return: a zero tensor (batch_size, hidden_size) with the dtype and device of the module's weights
        """
        # Follow the weights so the state stays usable after .to(device) / .double()
        weight = self.linear.weight
        return torch.zeros(batch_size, self.hidden_size, dtype=weight.dtype, device=weight.device)

In [675]:
# width of the encoder's hidden state
n_hidden = 128
# NOTE(review): presumably the number of features describing one historical match — confirm
n_historical_features = 17
encoder = EncoderRNN(n_historical_features, n_hidden)

In [676]:
class NeuralNetwork(nn.Module):
    """
    Multi-layer perceptron that classifies a match into 3 classes.

    The output holds log-probabilities (LogSoftmax), which is what nn.NLLLoss,
    the loss used for training in this notebook, expects as input.
    Use output.exp() to obtain probabilities.
    """

    def __init__(self, input_size):
        """
        :param input_size: number of features of each (flattened) sample
        """
        super().__init__()
        self.input_size = input_size
        self.flatten = nn.Flatten()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 3),
            # log-probabilities over the 3 classes, one row per sample
            nn.LogSoftmax(dim=1)
        )

    def forward(self, x):
        """
        :param x: batch of samples; every dimension after the first is flattened
        :return: a tensor (batch, 3) of log-probabilities
        """
        x = self.flatten(x)
        output = self.layers(x)
        return output

In [677]:
n_basic_features = len(match_cols)
# The MLP input is the match features concatenated with the two encoded team forms
# (see train()); each form is an encoder hidden state, i.e. n_hidden wide.
# NOTE(review): len(match_cols) counts the raw columns; confirm it matches the
# width of the match tensor once that is built.
mlp = NeuralNetwork(n_hidden * 2 + n_basic_features)

In [678]:
def train(x, y, encoder: EncoderRNN, nn: NeuralNetwork,
          encoder_optimizer: optim.Optimizer, nn_optimizer: optim.Optimizer, loss_fn):
    """
    Run one training step: encode the recent form of both teams, classify the match,
    back-propagate the loss and update the encoder and the classifier.
    :param x: sequence of historical matches, iterated along its first dimension
              (assumes every x[i] is a 2-D (batch, features) tensor — TODO confirm)
    :param y: target classes, in the form expected by loss_fn
    :param encoder: recurrent encoder turning a sequence of matches into a "form" vector
    :param nn: classifier fed with the match features and the two form vectors
               (the parameter name shadows torch.nn inside this function)
    :param encoder_optimizer: optimizer of the encoder parameters
    :param nn_optimizer: optimizer of the classifier parameters
    :param loss_fn: loss called as loss_fn(prediction, target)
    :return: the value of the loss for this step, as a float
    """
    # init
    encoder_optimizer.zero_grad()
    nn_optimizer.zero_grad()
    input_length = x.size(0)
    # encoder forward
    encoder_hidden = encoder.init_hidden()
    for history_index in range(input_length):
        encoder_hidden = encoder(x[history_index], encoder_hidden)
    home_team_form = encoder_hidden
    # NOTE(review): the away team is encoded from the very same x as the home team,
    # so both forms are identical until x carries a separate sequence per team.
    encoder_hidden = encoder.init_hidden()
    for history_index in range(input_length):
        encoder_hidden = encoder(x[history_index], encoder_hidden)
    away_team_form = encoder_hidden
    # mlp forward
    match = torch.tensor([])  # todo
    x_train = torch.cat((match, home_team_form, away_team_form), 1)
    y_hat = nn(x_train)
    # backward: PyTorch losses take the prediction first and the target second
    loss = loss_fn(y_hat, y)
    loss.backward()
    # apply the gradients, otherwise the models never change
    encoder_optimizer.step()
    nn_optimizer.step()
    return loss.item()


# plain SGD with a shared learning rate for both models
learning_rate = 0.01
# negative log-likelihood loss
loss_fn = nn.NLLLoss()
encoder_optimizer, mlp_optimizer = (
    optim.SGD(model.parameters(), lr=learning_rate) for model in (encoder, mlp)
)