# Football Match Predictor - Feature Engineering

## Library Imports

In [1]:
# Add all the library imports required
import pandas as pd
import numpy as np
import pickle
import data_cleaning
from tqdm import tqdm
import os

## Download the ELO Data

In [2]:
if not os.path.exists('./elo_dict.pkl'):
    !wget "https://aicore-files.s3.amazonaws.com/Data-Science/elo_dict.pkl"

In [3]:
elo_dict = pickle.load(open('elo_dict.pkl', 'rb'))
print(list(elo_dict.keys())[0])
print(elo_dict[list(elo_dict.keys())[0]])

https://www.besoccer.com/match/saarbrucken/stuttgarter-kickers/19903487
{'Elo_home': 56.0, 'Elo_away': 59.0}


In [4]:
elo_link_list = []
elo_home_list = []
elo_away_list = []
for key, value in elo_dict.items():
    elo_link_list.append(key)
    elo_home_list.append(value['Elo_home'])
    elo_away_list.append(value['Elo_away'])

elo_df = pd.DataFrame({'link': elo_link_list, 'elo_home': elo_home_list, 'elo_away': elo_away_list})
elo_df.head()

Unnamed: 0,link,elo_home,elo_away
0,https://www.besoccer.com/match/saarbrucken/stu...,56.0,59.0
1,https://www.besoccer.com/match/sc-freiburg/unt...,53.0,55.0
2,https://www.besoccer.com/match/vfl-osnabruck/m...,52.0,53.0
3,https://www.besoccer.com/match/rot-weiss-essen...,53.0,62.0
4,https://www.besoccer.com/match/alemannia-aache...,57.0,52.0


## Download the other data and join into one dataframe

In [5]:
scores_df = data_cleaning.import_leagues()
scores_df = data_cleaning.tweak_scores_df(scores_df)
scores_df = data_cleaning.create_match_id_col_from_link(scores_df)
scores_df.head()

Unnamed: 0,home_team,away_team,score,link,season_year,match_round,league,home_goals,away_goals,result,home_points,away_points,match_id
0,Charlton Athletic,Derby County,0-0,https://www.besoccer.com/match/charlton-athlet...,1990,1,premier_league,0,0,draw,1,1,charlton-athletic-fc/derby-county-fc/1990
1,Tottenham Hotspur,Luton Town,2-1,https://www.besoccer.com/match/tottenham-hotsp...,1990,1,premier_league,2,1,home_win,3,0,tottenham-hotspur-fc/luton-town-fc/1990
2,Southampton,Millwall,1-2,https://www.besoccer.com/match/southampton-fc/...,1990,1,premier_league,1,2,away_win,0,3,southampton-fc/millwall-fc/1990
3,Sheffield Wednesday,Norwich City,0-2,https://www.besoccer.com/match/sheffield-wedne...,1990,1,premier_league,0,2,away_win,0,3,sheffield-wednesday-fc/norwich-city-fc/1990
4,Queens Park Rangers,Crystal Palace,2-0,https://www.besoccer.com/match/queens-park-ran...,1990,1,premier_league,2,0,home_win,3,0,queens-park-rangers-fc/crystal-palace-fc/1990


In [6]:
scores_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122619 entries, 0 to 129062
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   home_team    122619 non-null  object  
 1   away_team    122619 non-null  object  
 2   score        122619 non-null  object  
 3   link         122619 non-null  object  
 4   season_year  122619 non-null  int64   
 5   match_round  122619 non-null  int64   
 6   league       122619 non-null  object  
 7   home_goals   122619 non-null  int64   
 8   away_goals   122619 non-null  int64   
 9   result       122619 non-null  category
 10  home_points  122619 non-null  int64   
 11  away_points  122619 non-null  int64   
 12  match_id     122619 non-null  object  
dtypes: category(1), int64(6), object(6)
memory usage: 12.3+ MB


In [7]:
scores_df.match_id.nunique()

122619

In [8]:
match_info_df = data_cleaning.import_match_info_data()
match_info_df = data_cleaning.create_match_id_col(match_info_df)
match_info_df.head()

Unnamed: 0,date,referee,home_yellow,home_red,away_yellow,away_red,match_id
0,1989-07-29 15:00:00,Hans-Jürgen Weber,0.0,0.0,3.0,0.0,saarbrucken/stuttgarter-kickers/1990
1,1989-07-29 15:00:00,Kurt Wittke,1.0,0.0,0.0,0.0,sc-freiburg/unterhaching/1990
2,1989-07-29 15:00:00,Werner Föckler,3.0,0.0,2.0,0.0,vfl-osnabruck/meppen/1990
3,1989-07-29 15:00:00,Heinz Werner,2.0,0.0,2.0,0.0,rot-weiss-essen/schalke-04/1990
4,1989-07-29 15:00:00,Hans-Peter Dellwing,1.0,0.0,1.0,0.0,alemannia-aachen/msv-duisburg/1990


In [9]:
match_info_df.dtypes

date           datetime64[ns]
referee                object
home_yellow           float64
home_red              float64
away_yellow           float64
away_red              float64
match_id               object
dtype: object

In [10]:
match_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143348 entries, 0 to 143347
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   date         143348 non-null  datetime64[ns]
 1   referee      113721 non-null  object        
 2   home_yellow  122798 non-null  float64       
 3   home_red     122798 non-null  float64       
 4   away_yellow  122798 non-null  float64       
 5   away_red     122798 non-null  float64       
 6   match_id     143348 non-null  object        
dtypes: datetime64[ns](1), float64(4), object(2)
memory usage: 7.7+ MB


In [11]:
match_info_df.match_id.nunique()

143348

In [12]:
team_info_df = data_cleaning.import_team_info_data()
team_info_df.head()

Unnamed: 0,team,city,country,stadium,capacity,pitch
0,Wattenscheid 09,Bochum-Wattenscheid,Germany,Lohrheidestadion,16233,Natural
1,Hertha BSC,Berlín,Germany,Olympiastadion Berlin,76065,Natural
2,Unterhaching,Unterhaching,Germany,Sportpark Unterhaching,15053,Natural
3,Fortuna Köln,Cologne,Germany,Südstadion,14944,Natural
4,MSV Duisburg,Duisburgo,Germany,Schauinsland-Reisen-Arena,31514,Natural


In [13]:
scores_match_info_df = pd.merge(scores_df, match_info_df, how='left', on="match_id")
scores_match_info_df.head()

Unnamed: 0,home_team,away_team,score,link,season_year,match_round,league,home_goals,away_goals,result,home_points,away_points,match_id,date,referee,home_yellow,home_red,away_yellow,away_red
0,Charlton Athletic,Derby County,0-0,https://www.besoccer.com/match/charlton-athlet...,1990,1,premier_league,0,0,draw,1,1,charlton-athletic-fc/derby-county-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0
1,Tottenham Hotspur,Luton Town,2-1,https://www.besoccer.com/match/tottenham-hotsp...,1990,1,premier_league,2,1,home_win,3,0,tottenham-hotspur-fc/luton-town-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0
2,Southampton,Millwall,1-2,https://www.besoccer.com/match/southampton-fc/...,1990,1,premier_league,1,2,away_win,0,3,southampton-fc/millwall-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0
3,Sheffield Wednesday,Norwich City,0-2,https://www.besoccer.com/match/sheffield-wedne...,1990,1,premier_league,0,2,away_win,0,3,sheffield-wednesday-fc/norwich-city-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0
4,Queens Park Rangers,Crystal Palace,2-0,https://www.besoccer.com/match/queens-park-ran...,1990,1,premier_league,2,0,home_win,3,0,queens-park-rangers-fc/crystal-palace-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0


In [14]:
scores_match_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122619 entries, 0 to 122618
Data columns (total 19 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   home_team    122619 non-null  object        
 1   away_team    122619 non-null  object        
 2   score        122619 non-null  object        
 3   link         122619 non-null  object        
 4   season_year  122619 non-null  int64         
 5   match_round  122619 non-null  int64         
 6   league       122619 non-null  object        
 7   home_goals   122619 non-null  int64         
 8   away_goals   122619 non-null  int64         
 9   result       122619 non-null  category      
 10  home_points  122619 non-null  int64         
 11  away_points  122619 non-null  int64         
 12  match_id     122619 non-null  object        
 13  date         119163 non-null  datetime64[ns]
 14  referee      94378 non-null   object        
 15  home_yellow  102511 non-null  floa

In [15]:
scores_match_info_df[scores_match_info_df.date.isna()]

Unnamed: 0,home_team,away_team,score,link,season_year,match_round,league,home_goals,away_goals,result,home_points,away_points,match_id,date,referee,home_yellow,home_red,away_yellow,away_red
12287,Aston Villa,Tottenham Hotspur,0-2,https://www.besoccer.com/match/aston-villa-fc/...,2021,18,premier_league,0,2,away_win,0,3,aston-villa-fc/tottenham-hotspur-fc/2021,NaT,,,,,
12293,Aston Villa,Everton,0-0,https://www.besoccer.com/match/aston-villa-fc/...,2021,19,premier_league,0,0,draw,1,1,aston-villa-fc/everton-fc/2021,NaT,,,,,
12367,Everton,Southampton,1-0,https://www.besoccer.com/match/everton-fc/sout...,2021,26,premier_league,1,0,home_win,3,0,everton-fc/southampton-fc/2021,NaT,,,,,
12368,Burnley,Arsenal,1-1,https://www.besoccer.com/match/burnley-fc/arse...,2021,27,premier_league,1,1,draw,1,1,burnley-fc/arsenal/2021,NaT,,,,,
12369,Sheffield United,Southampton,0-2,https://www.besoccer.com/match/sheffield-unite...,2021,27,premier_league,0,2,away_win,0,3,sheffield-united/southampton-fc/2021,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122428,PEC Zwolle,Twente 1965,1-0,https://www.besoccer.com/match/fc-zwolle/fc-tw...,2021,29,eredivisie,1,0,home_win,3,0,fc-zwolle/fc-twente-1965/2021,NaT,,,,,
122429,Utrecht,Feyenoord,1-2,https://www.besoccer.com/match/fc-utrecht/feye...,2021,29,eredivisie,1,2,away_win,0,3,fc-utrecht/feyenoord/2021,NaT,,,,,
122430,Groningen,Heerenveen,0-2,https://www.besoccer.com/match/fc-groningen/he...,2021,29,eredivisie,0,2,away_win,0,3,fc-groningen/heerenveen/2021,NaT,,,,,
122431,VVV Venlo,PSV,0-2,https://www.besoccer.com/match/vvv/psv/202113372,2021,29,eredivisie,0,2,away_win,0,3,vvv/psv/2021,NaT,,,,,


In [16]:
team_info_reduced_df = team_info_df.copy()
team_info_reduced_df = team_info_reduced_df[['team', 'capacity']]
# team_info_reduced_df.rename(columns={'team': 'home_team'}, inplace=True)
# team_info_reduced_df.head()
scores_match_team_info_df = (pd.merge(scores_match_info_df, team_info_reduced_df, how='left', left_on='home_team', right_on='team')
                                .drop(columns = 'team'))
# scores_match_team_info_df.dropna(subset=['score'], inplace=True) 
scores_match_team_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122619 entries, 0 to 122618
Data columns (total 20 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   home_team    122619 non-null  object        
 1   away_team    122619 non-null  object        
 2   score        122619 non-null  object        
 3   link         122619 non-null  object        
 4   season_year  122619 non-null  int64         
 5   match_round  122619 non-null  int64         
 6   league       122619 non-null  object        
 7   home_goals   122619 non-null  int64         
 8   away_goals   122619 non-null  int64         
 9   result       122619 non-null  category      
 10  home_points  122619 non-null  int64         
 11  away_points  122619 non-null  int64         
 12  match_id     122619 non-null  object        
 13  date         119163 non-null  datetime64[ns]
 14  referee      94378 non-null   object        
 15  home_yellow  102511 non-null  floa

In [17]:
capcity_na_idxs = scores_match_team_info_df['capacity'].isna().to_list()
scores_match_team_info_df.loc[capcity_na_idxs, 'home_team'].unique()

array(['Sheffield Wednesday', 'Queens Park Rangers',
       'Oldham Athletic AFC', 'West Bromwich Albion',
       'Brighton & Hove Albion', 'Peterborough United',
       'Gimnàstic Tarragona', 'Real Unión de Irún', 'Fortuna Düsseldorf',
       'Eintracht Frankfurt', 'B. Mönchengladbach', 'Stuttgarter Kickers',
       '1. FC Lokomotive Leipzig', 'Blau-Weiß 1890 Berlin',
       'FC Carl Zeiss Jena', 'Rot-Weiß Oberhausen',
       'Kickers Offenbach FC', 'SV Eintracht Trier',
       'SV Wacker Burghausen', 'Siegen Sportfreunde ',
       'SV Wehen Burghausen', 'Würzburger Kickers', 'Barletta', 'Licata',
       'AS Lucchese Libertas 1905', 'Taranto', 'Casertana',
       'Calcio Portogruaro-Summaga', 'Sporting Toulon Var',
       'Olympique Marseille', 'Evian Thonon Gaillard',
       'CS Louhans Cuiseaux', 'FC Libourne Saint Seurin'], dtype=object)

In [18]:
scores_match_team_info_elo_df = pd.merge(scores_match_team_info_df, elo_df, how='left', on="link").set_index('match_id')
# scores_match_team_info_elo_df.dropna(subset=['score'], inplace=True) 
scores_match_team_info_elo_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 122619 entries, charlton-athletic-fc/derby-county-fc/1990 to graafschap/fc-den-bosch/2021
Data columns (total 21 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   home_team    122619 non-null  object        
 1   away_team    122619 non-null  object        
 2   score        122619 non-null  object        
 3   link         122619 non-null  object        
 4   season_year  122619 non-null  int64         
 5   match_round  122619 non-null  int64         
 6   league       122619 non-null  object        
 7   home_goals   122619 non-null  int64         
 8   away_goals   122619 non-null  int64         
 9   result       122619 non-null  category      
 10  home_points  122619 non-null  int64         
 11  away_points  122619 non-null  int64         
 12  date         119163 non-null  datetime64[ns]
 13  referee      94378 non-null   object        
 14  home_yellow  102511 non-nul

In [19]:
elo_na_idxs = scores_match_team_info_elo_df['elo_home'].isna().to_list()
scores_match_team_info_elo_df.loc[elo_na_idxs]
# scores_match_team_info_elo_df.loc[129543,'link']

Unnamed: 0_level_0,home_team,away_team,score,link,season_year,match_round,league,home_goals,away_goals,result,...,away_points,date,referee,home_yellow,home_red,away_yellow,away_red,capacity,elo_home,elo_away
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sheffield-wednesday-fc/norwich-city-fc/1993,Sheffield Wednesday,Norwich City,1-0,https://www.besoccer.com/match/sheffield-wedne...,1993,23,premier_league,1,0,home_win,...,0,1993-01-10 00:00:00,David Allison,0.0,0.0,0.0,0.0,,,
aston-villa-fc/arsenal/2006,Aston Villa,Arsenal,0-0,https://www.besoccer.com/match/aston-villa-fc/...,2006,20,premier_league,0,0,draw,...,1,2005-12-31 14:45:00,Uriah Rennie,1.0,0.0,1.0,0.0,42788.0,,
charlton-athletic-fc/west-ham-united/2006,Charlton Athletic,West Ham,2-0,https://www.besoccer.com/match/charlton-athlet...,2006,20,premier_league,2,0,home_win,...,0,2005-12-31 15:00:00,Graham Poll,1.0,0.0,0.0,0.0,27111.0,,
chelsea-fc/birmingham-city-fc/2006,Chelsea,Birmingham City,2-0,https://www.besoccer.com/match/chelsea-fc/birm...,2006,20,premier_league,2,0,home_win,...,0,2005-12-31 15:00:00,Mike Dean,1.0,0.0,0.0,0.0,41841.0,,
tottenham-hotspur-fc/newcastle-united-fc/2006,Tottenham Hotspur,Newcastle,2-0,https://www.besoccer.com/match/tottenham-hotsp...,2006,20,premier_league,2,0,home_win,...,0,2005-12-31 15:00:00,Howard Webb,0.0,0.0,1.0,0.0,62062.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mvv/agovv-apeldoorn/2006,MVV Maastricht,AGOVV Apeldoorn,2-2,https://www.besoccer.com/match/mvv/agovv-apeld...,2006,1,eerste_divisie,2,2,draw,...,1,2005-08-12 20:00:00,Bas de Groot,0.0,0.0,0.0,0.0,10000.0,,
emmen/fc-den-bosch/2006,Emmen,Den Bosch,3-1,https://www.besoccer.com/match/emmen/fc-den-bo...,2006,1,eerste_divisie,3,1,home_win,...,0,2005-08-12 20:00:00,Ed Janssen,1.0,1.0,2.0,0.0,8600.0,,
vvv/stormvogels-telstar/2006,VVV Venlo,SC Telstar,1-0,https://www.besoccer.com/match/vvv/stormvogels...,2006,1,eerste_divisie,1,0,home_win,...,0,2005-08-12 20:00:00,Jack van Hulten,1.0,0.0,4.0,0.0,8000.0,,
haarlem/graafschap/2006,HFC Haarlem,De Graafschap,3-0,https://www.besoccer.com/match/haarlem/graafsc...,2006,1,eerste_divisie,3,0,home_win,...,0,2005-08-15 20:00:00,Richard Liesveld,3.0,0.0,4.0,0.0,3442.0,,


In [20]:
# Check for duplicates
scores_match_team_info_elo_df.duplicated(subset=['home_team', 'away_team', 'season_year'], keep=False).sum()

0

In [21]:
# scores_match_team_info_elo_df = scores_match_team_info_elo_df.drop_duplicates(subset=['home_team', 'away_team', 'season_year'], keep='first')
scores_match_team_info_elo_df.shape

(122619, 21)

## Create new features

 - **Form**: points gained from the last 5 games, start from zero at the start of every season `home_form`, `away_form`
 
 - **Home/Away Form**: points gained from the last 3 (home matches for home team and away matches for away team), start from zero at the start of every season `home_team_home_form`, `away_team_away_form`
 
 - **Season Goals**: Cumulative sum of all season goals scored `home_total_goals`, `away_total_goals`
 
 - **Home/Away Season Goals**: Cumulative sum of all season goals scored in (home matches for home team and away matches for away team), `home_team_home_total_goals`, `away_team_away_total_goals`
 
 - **Discipline**: A made-up construction for yellow and red cards $(\text{Discipline for each game} = \text{Number of reds}+0.2(\text{Number of yellows}))$\
Then aggregate the discipline record as sum of last 5 games, start from zero at the start of every season `home_discipline`, `away_discipline`

> **Note**: For all these new features, the result has to be shifted by one so that only past information is included, i.e. no data leakage 

In [22]:
# Form, home and away, points gained from last 5 games, start from zero at start of every season
# Home/away form, points gained from last 3 home/away games, start from zero at start of every season
# Season goals, home and away, goals scored from the start of the season
# discipline, discipline record for last 5 games, n_reds + 0.2*n_yellows

In [23]:
new_features = ['home_form','away_form', 'home_total_goals', 'away_total_goals', 'home_discipline', 'away_discipline',
                    'home_team_home_form', 'home_team_home_total_goals', 'away_team_away_form', 'away_team_away_total_goals']

In [24]:
home_away_features_df = (scores_match_team_info_elo_df
    .assign(home_team_home_form = scores_match_team_info_elo_df.groupby(['home_team', 'season_year']).home_points.transform(lambda df: df.rolling(3, min_periods=1).sum().shift(1).fillna(0)),
        away_team_away_form = scores_match_team_info_elo_df.groupby(['away_team', 'season_year']).away_points.transform(lambda df: df.rolling(3, min_periods=1).sum().shift(1).fillna(0)),
        home_team_home_total_goals = scores_match_team_info_elo_df.groupby(['home_team', 'season_year']).home_goals.transform(lambda df: df.cumsum().shift(1).fillna(0)),
        away_team_away_total_goals = scores_match_team_info_elo_df.groupby(['home_team', 'season_year']).away_goals.transform(lambda df: df.cumsum().shift(1).fillna(0)))
)

(home_away_features_df.query("(home_team == 'Arsenal') and (season_year == 2020)"))

Unnamed: 0_level_0,home_team,away_team,score,link,season_year,match_round,league,home_goals,away_goals,result,...,home_red,away_yellow,away_red,capacity,elo_home,elo_away,home_team_home_form,away_team_away_form,home_team_home_total_goals,away_team_away_total_goals
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
arsenal/burnley-fc/2020,Arsenal,Burnley,2-1,https://www.besoccer.com/match/arsenal/burnley...,2020,2,premier_league,2,1,home_win,...,0.0,1.0,0.0,60355.0,91.0,72.0,0.0,0.0,0.0,0.0
arsenal/tottenham-hotspur-fc/2020,Arsenal,Tottenham Hotspur,2-2,https://www.besoccer.com/match/arsenal/tottenh...,2020,4,premier_league,2,2,draw,...,0.0,5.0,0.0,60355.0,91.0,92.0,3.0,1.0,2.0,1.0
arsenal/aston-villa-fc/2020,Arsenal,Aston Villa,3-2,https://www.besoccer.com/match/arsenal/aston-v...,2020,6,premier_league,3,2,home_win,...,1.0,1.0,0.0,60355.0,91.0,75.0,4.0,0.0,4.0,3.0
arsenal/afc-bournemouth/2020,Arsenal,AFC Bournemouth,1-0,https://www.besoccer.com/match/arsenal/afc-bou...,2020,8,premier_league,1,0,home_win,...,0.0,2.0,0.0,60355.0,91.0,73.0,7.0,6.0,7.0,5.0
arsenal/crystal-palace-fc/2020,Arsenal,Crystal Palace,2-2,https://www.besoccer.com/match/arsenal/crystal...,2020,10,premier_league,2,2,draw,...,0.0,0.0,0.0,60355.0,91.0,73.0,7.0,6.0,8.0,5.0
arsenal/wolverhampton/2020,Arsenal,Wolves,1-1,https://www.besoccer.com/match/arsenal/wolverh...,2020,11,premier_league,1,1,draw,...,0.0,2.0,0.0,60355.0,91.0,72.0,7.0,5.0,10.0,7.0
arsenal/southampton-fc/2020,Arsenal,Southampton,2-2,https://www.besoccer.com/match/arsenal/southam...,2020,13,premier_league,2,2,draw,...,0.0,2.0,0.0,60355.0,91.0,78.0,5.0,1.0,11.0,8.0
arsenal/brighton-amp-hov/2020,Arsenal,Brighton & Hove Albion,1-2,https://www.besoccer.com/match/arsenal/brighto...,2020,15,premier_league,1,2,away_win,...,0.0,1.0,0.0,60355.0,91.0,69.0,3.0,0.0,13.0,10.0
arsenal/manchester-city-fc/2020,Arsenal,Man. City,0-3,https://www.besoccer.com/match/arsenal/manches...,2020,17,premier_league,0,3,away_win,...,0.0,4.0,0.0,60355.0,90.0,96.0,2.0,4.0,14.0,12.0
arsenal/chelsea-fc/2020,Arsenal,Chelsea,1-2,https://www.besoccer.com/match/arsenal/chelsea...,2020,20,premier_league,1,2,away_win,...,0.0,4.0,0.0,60355.0,90.0,91.0,1.0,3.0,14.0,15.0


In [25]:
home_data = (scores_match_team_info_elo_df[['home_team', 'home_goals', 'home_yellow', 'home_red', 'home_points', 'match_round', 'season_year', 'league']]
                .rename(columns=lambda col_name: col_name[5:] if col_name[:4] == 'home' else col_name))
away_data = (scores_match_team_info_elo_df[['away_team', 'away_goals', 'away_yellow', 'away_red', 'away_points', 'match_round', 'season_year', 'league']]
                .rename(columns=lambda col_name: col_name[5:] if col_name[:4] == 'away' else col_name))

scores_data_long_format = (pd.concat([home_data, away_data])
                            .reset_index()
                            .assign(idx = lambda df_: np.arange(df_.shape[0]))
                            .set_index('idx'))

scores_data_long_new_features = (scores_data_long_format
    .sort_values(['league', 'season_year', 'team', 'match_round']).reset_index()
    .assign(form = lambda df_: df_.groupby(['team', 'season_year']).points.transform(lambda df: df.rolling(5, min_periods=1).sum().shift(1).fillna(0)),
        total_goals = lambda df_: df_.groupby(['team', 'season_year']).goals.transform(lambda df: df.cumsum().shift(1).fillna(0)),
        yellow = lambda df_: df_.groupby(['team', 'season_year']).yellow.transform('ffill'),
        red = lambda df_: df_.groupby(['team', 'season_year']).red.transform('ffill'))
    .assign(cards_temp = lambda df_: df_.red.add(df_.yellow.mul(0.2)))
    .assign(discipline = lambda df_: df_.groupby(['team', 'season_year']).cards_temp.transform(lambda df: df.rolling(5, min_periods=1).sum().shift(1).fillna(0)))
    .set_index('idx').sort_index()
)

(scores_data_long_new_features
    .query("(team == 'Arsenal') and (season_year == 2020)")
    .sort_values('match_round')
)

Unnamed: 0_level_0,match_id,team,goals,yellow,red,points,match_round,season_year,league,form,total_goals,cards_temp,discipline
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
134355,newcastle-united-fc/arsenal/2020,Arsenal,1,3.0,0.0,3,1,2020,premier_league,0.0,0.0,0.6,0.0
11738,arsenal/burnley-fc/2020,Arsenal,2,2.0,0.0,3,2,2020,premier_league,3.0,1.0,0.4,0.6
134373,liverpool/arsenal/2020,Arsenal,1,1.0,0.0,0,3,2020,premier_league,6.0,3.0,0.2,1.0
11767,arsenal/tottenham-hotspur-fc/2020,Arsenal,2,3.0,0.0,1,4,2020,premier_league,6.0,4.0,0.6,1.2
134395,watford-fc/arsenal/2020,Arsenal,2,3.0,0.0,1,5,2020,premier_league,7.0,6.0,0.6,1.8
11786,arsenal/aston-villa-fc/2020,Arsenal,3,7.0,1.0,3,6,2020,premier_league,8.0,8.0,2.4,2.4
134416,manchester-united-fc/arsenal/2020,Arsenal,1,2.0,0.0,1,7,2020,premier_league,8.0,11.0,0.4,4.2
11804,arsenal/afc-bournemouth/2020,Arsenal,1,1.0,0.0,3,8,2020,premier_league,6.0,12.0,0.2,4.2
134436,sheffield-united/arsenal/2020,Arsenal,0,4.0,0.0,0,9,2020,premier_league,9.0,13.0,0.8,4.2
11825,arsenal/crystal-palace-fc/2020,Arsenal,2,2.0,0.0,1,10,2020,premier_league,8.0,13.0,0.4,4.4


In [26]:
home_data_transformed = (scores_data_long_new_features[:home_data.shape[0]]
                            .set_index('match_id')
                            .drop(columns=['team', 'goals', 'yellow', 'red', 'points', 'match_round',
                                'season_year', 'league', 'cards_temp'])
                            .rename(columns = lambda col: 'home_' + col))

away_data_transformed = (scores_data_long_new_features[home_data.shape[0]:]
                            .set_index('match_id')
                            .drop(columns=['team', 'goals', 'yellow', 'red', 'points', 'match_round',
                                'season_year', 'league', 'cards_temp'])
                            .rename(columns = lambda col: 'away_' + col))
home_data_transformed.shape, away_data_transformed.shape



((122619, 3), (122619, 3))

In [27]:
transformed_df = home_away_features_df.join(home_data_transformed).join(away_data_transformed)
transformed_df.sample(5)

Unnamed: 0_level_0,home_team,away_team,score,link,season_year,match_round,league,home_goals,away_goals,result,...,home_team_home_form,away_team_away_form,home_team_home_total_goals,away_team_away_total_goals,home_form,home_total_goals,home_discipline,away_form,away_total_goals,away_discipline
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
barcelona/cadiz/1992,Barcelona,Cádiz,4-1,https://www.besoccer.com/match/barcelona/cadiz...,1992,15,primera_division,4,1,home_win,...,9.0,0.0,17.0,7.0,11.0,30.0,0.0,4.0,10.0,1.0
tenerife/malaga/2020,Tenerife,Málaga,0-0,https://www.besoccer.com/match/tenerife/malaga...,2020,33,segunda_division,0,0,draw,...,7.0,4.0,18.0,16.0,7.0,36.0,2.8,7.0,28.0,3.2
real-madrid/deportivo/2016,Real Madrid,Deportivo,5-0,https://www.besoccer.com/match/real-madrid/dep...,2016,19,primera_division,5,0,home_win,...,9.0,5.0,29.0,9.0,10.0,47.0,2.2,6.0,26.0,3.0
recreativo/real-murcia/2006,Recreativo,Real Murcia,1-0,https://www.besoccer.com/match/recreativo/real...,2006,32,segunda_division,1,0,home_win,...,7.0,3.0,27.0,9.0,8.0,44.0,0.0,6.0,27.0,0.0
ac-cesena/lecce/1993,Cesena,Lecce,1-1,https://www.besoccer.com/match/ac-cesena/lecce...,1993,3,serie_b,1,1,draw,...,3.0,0.0,4.0,1.0,3.0,5.0,0.0,3.0,2.0,0.0
