# Football Match Predictor - Feature Engineering

## Library Imports

In [1]:
# Add all the library imports required
import pandas as pd
import numpy as np
import pickle
import data_cleaning

## Download the ELO Data

In [2]:
!wget "https://aicore-files.s3.amazonaws.com/Data-Science/elo_dict.pkl"

--2022-05-26 19:55:35--  https://aicore-files.s3.amazonaws.com/Data-Science/elo_dict.pkl
Resolving aicore-files.s3.amazonaws.com (aicore-files.s3.amazonaws.com)... 52.217.233.193
Connecting to aicore-files.s3.amazonaws.com (aicore-files.s3.amazonaws.com)|52.217.233.193|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12235364 (12M) [binary/octet-stream]
Saving to: ‘elo_dict.pkl.1’


2022-05-26 19:55:44 (1.42 MB/s) - ‘elo_dict.pkl.1’ saved [12235364/12235364]



In [31]:
elo_dict = pickle.load(open('elo_dict.pkl', 'rb'))
print(list(elo_dict.keys())[0])
print(elo_dict[list(elo_dict.keys())[0]])

https://www.besoccer.com/match/saarbrucken/stuttgarter-kickers/19903487
{'Elo_home': 56.0, 'Elo_away': 59.0}


In [32]:
elo_link_list = []
elo_home_list = []
elo_away_list = []
for key, value in elo_dict.items():
    elo_link_list.append(key)
    elo_home_list.append(value['Elo_home'])
    elo_away_list.append(value['Elo_away'])

elo_df = pd.DataFrame({'link': elo_link_list, 'elo_home': elo_home_list, 'elo_away': elo_away_list})
elo_df.head()

Unnamed: 0,link,elo_home,elo_away
0,https://www.besoccer.com/match/saarbrucken/stu...,56.0,59.0
1,https://www.besoccer.com/match/sc-freiburg/unt...,53.0,55.0
2,https://www.besoccer.com/match/vfl-osnabruck/m...,52.0,53.0
3,https://www.besoccer.com/match/rot-weiss-essen...,53.0,62.0
4,https://www.besoccer.com/match/alemannia-aache...,57.0,52.0


## Download the other data and join into one dataframe

In [2]:
scores_df = data_cleaning.import_leagues()
scores_df = data_cleaning.create_home_away_goals_and_result_attributes(scores_df)
scores_df = data_cleaning.create_match_id_col_from_link(scores_df)
scores_df.head()

Unnamed: 0,home_team,away_team,score,link,year,round,league,home_goals,away_goals,result,match_id
0,Charlton Athletic,Derby County,0-0,https://www.besoccer.com/match/charlton-athlet...,1990,1,premier_league,0,0,draw,charlton-athletic-fc/derby-county-fc/1990
1,Tottenham Hotspur,Luton Town,2-1,https://www.besoccer.com/match/tottenham-hotsp...,1990,1,premier_league,2,1,home_win,tottenham-hotspur-fc/luton-town-fc/1990
2,Southampton,Millwall,1-2,https://www.besoccer.com/match/southampton-fc/...,1990,1,premier_league,1,2,away_win,southampton-fc/millwall-fc/1990
3,Sheffield Wednesday,Norwich City,0-2,https://www.besoccer.com/match/sheffield-wedne...,1990,1,premier_league,0,2,away_win,sheffield-wednesday-fc/norwich-city-fc/1990
4,Queens Park Rangers,Crystal Palace,2-0,https://www.besoccer.com/match/queens-park-ran...,1990,1,premier_league,2,0,home_win,queens-park-rangers-fc/crystal-palace-fc/1990


In [7]:
scores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129393 entries, 0 to 129392
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   home_team   129393 non-null  object
 1   away_team   129393 non-null  object
 2   score       129393 non-null  object
 3   link        129393 non-null  object
 4   year        129393 non-null  object
 5   round       129393 non-null  object
 6   league      129393 non-null  object
 7   home_goals  129393 non-null  int64 
 8   away_goals  129393 non-null  int64 
 9   result      129393 non-null  object
 10  match_id    129393 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.9+ MB


In [4]:
match_info_df = data_cleaning.import_match_info_data()
match_info_df = data_cleaning.create_match_id_col(match_info_df)
match_info_df.head()

Unnamed: 0,date,referee,home_yellow,home_red,away_yellow,away_red,match_id
0,1989-07-29 15:00:00,Hans-Jürgen Weber,0.0,0.0,3.0,0.0,saarbrucken/stuttgarter-kickers/1990
1,1989-07-29 15:00:00,Kurt Wittke,1.0,0.0,0.0,0.0,sc-freiburg/unterhaching/1990
2,1989-07-29 15:00:00,Werner Föckler,3.0,0.0,2.0,0.0,vfl-osnabruck/meppen/1990
3,1989-07-29 15:00:00,Heinz Werner,2.0,0.0,2.0,0.0,rot-weiss-essen/schalke-04/1990
4,1989-07-29 15:00:00,Hans-Peter Dellwing,1.0,0.0,1.0,0.0,alemannia-aachen/msv-duisburg/1990


In [8]:
match_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143348 entries, 0 to 143347
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   date         143348 non-null  datetime64[ns]
 1   referee      113748 non-null  object        
 2   home_yellow  122798 non-null  float64       
 3   home_red     122798 non-null  float64       
 4   away_yellow  122798 non-null  float64       
 5   away_red     122798 non-null  float64       
 6   match_id     143348 non-null  object        
dtypes: datetime64[ns](1), float64(4), object(2)
memory usage: 7.7+ MB


In [16]:
team_info_df = data_cleaning.import_team_info_data()
team_info_df.head()

Unnamed: 0,team,city,country,stadium,capacity,pitch
0,Wattenscheid 09,Bochum-Wattenscheid,Germany,Lohrheidestadion,16233,Natural
1,Hertha BSC,Berlín,Germany,Olympiastadion Berlin,76065,Natural
2,Unterhaching,Unterhaching,Germany,Sportpark Unterhaching,15053,Natural
3,Fortuna Köln,Cologne,Germany,Südstadion,14944,Natural
4,MSV Duisburg,Duisburgo,Germany,Schauinsland-Reisen-Arena,31514,Natural


In [5]:
scores_match_info_df = pd.merge(scores_df, match_info_df, how='outer', on="match_id")
scores_match_info_df.head()

Unnamed: 0,home_team,away_team,score,link,year,round,league,home_goals,away_goals,result,match_id,date,referee,home_yellow,home_red,away_yellow,away_red
0,Charlton Athletic,Derby County,0-0,https://www.besoccer.com/match/charlton-athlet...,1990,1,premier_league,0.0,0.0,draw,charlton-athletic-fc/derby-county-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0
1,Tottenham Hotspur,Luton Town,2-1,https://www.besoccer.com/match/tottenham-hotsp...,1990,1,premier_league,2.0,1.0,home_win,tottenham-hotspur-fc/luton-town-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0
2,Southampton,Millwall,1-2,https://www.besoccer.com/match/southampton-fc/...,1990,1,premier_league,1.0,2.0,away_win,southampton-fc/millwall-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0
3,Sheffield Wednesday,Norwich City,0-2,https://www.besoccer.com/match/sheffield-wedne...,1990,1,premier_league,0.0,2.0,away_win,sheffield-wednesday-fc/norwich-city-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0
4,Queens Park Rangers,Crystal Palace,2-0,https://www.besoccer.com/match/queens-park-ran...,1990,1,premier_league,2.0,0.0,home_win,queens-park-rangers-fc/crystal-palace-fc/1990,1989-08-19,,0.0,0.0,0.0,0.0


In [10]:
scores_match_info_df.dropna(subset=['score'], inplace=True) 
scores_match_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129543 entries, 0 to 129542
Data columns (total 17 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   home_team    129543 non-null  object        
 1   away_team    129543 non-null  object        
 2   score        129543 non-null  object        
 3   link         129543 non-null  object        
 4   year         129543 non-null  object        
 5   round        129543 non-null  object        
 6   league       129543 non-null  object        
 7   home_goals   129543 non-null  float64       
 8   away_goals   129543 non-null  float64       
 9   result       129543 non-null  object        
 10  match_id     129543 non-null  object        
 11  date         125762 non-null  datetime64[ns]
 12  referee      100146 non-null  object        
 13  home_yellow  108252 non-null  float64       
 14  home_red     108252 non-null  float64       
 15  away_yellow  108252 non-null  floa

In [13]:
date_na_idx = scores_match_info_df['date'].isna().to_list()
scores_match_info_df.loc[date_na_idx]

Unnamed: 0,home_team,away_team,score,link,year,round,league,home_goals,away_goals,result,match_id,date,referee,home_yellow,home_red,away_yellow,away_red
12042,Arsenal,Norwich City,4-0,https://www.besoccer.com/match/arsenal/norwich...,2020,32,premier_league,4.0,0.0,home_win,arsenal/norwich-city-fc/2020,NaT,,,,,
12043,AFC Bournemouth,Newcastle,1-4,https://www.besoccer.com/match/afc-bournemouth...,2020,32,premier_league,1.0,4.0,away_win,afc-bournemouth/newcastle-united-fc/2020,NaT,,,,,
12044,Everton,Leicester,2-1,https://www.besoccer.com/match/everton-fc/leic...,2020,32,premier_league,2.0,1.0,home_win,everton-fc/leicester-city-fc/2020,NaT,,,,,
12045,West Ham,Chelsea,3-2,https://www.besoccer.com/match/west-ham-united...,2020,32,premier_league,3.0,2.0,home_win,west-ham-united/chelsea-fc/2020,NaT,,,,,
12046,Sheffield United,Tottenham Hotspur,3-1,https://www.besoccer.com/match/sheffield-unite...,2020,32,premier_league,3.0,1.0,home_win,sheffield-united/tottenham-hotspur-fc/2020,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122952,PEC Zwolle,Twente 1965,1-0,https://www.besoccer.com/match/fc-zwolle/fc-tw...,2021,29,eredivisie,1.0,0.0,home_win,fc-zwolle/fc-twente-1965/2021,NaT,,,,,
122953,Utrecht,Feyenoord,1-2,https://www.besoccer.com/match/fc-utrecht/feye...,2021,29,eredivisie,1.0,2.0,away_win,fc-utrecht/feyenoord/2021,NaT,,,,,
122954,Groningen,Heerenveen,0-2,https://www.besoccer.com/match/fc-groningen/he...,2021,29,eredivisie,0.0,2.0,away_win,fc-groningen/heerenveen/2021,NaT,,,,,
122955,VVV Venlo,PSV,0-2,https://www.besoccer.com/match/vvv/psv/202113372,2021,29,eredivisie,0.0,2.0,away_win,vvv/psv/2021,NaT,,,,,


In [14]:
match_info_df[match_info_df.match_id == 'arsenal/norwich-city-fc/2020']

Unnamed: 0,date,referee,home_yellow,home_red,away_yellow,away_red,match_id


In [35]:
team_info_reduced_df = team_info_df.copy()
team_info_reduced_df = team_info_reduced_df[['team', 'capacity']]
team_info_reduced_df.rename(columns={'team': 'home_team'}, inplace=True)
# team_info_reduced_df.head()
scores_match_team_info_df = pd.merge(scores_match_info_df, team_info_reduced_df, how='outer', on="home_team")
scores_match_team_info_df.dropna(subset=['score'], inplace=True) 
scores_match_team_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129543 entries, 0 to 129542
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   home_team    129543 non-null  object        
 1   away_team    129543 non-null  object        
 2   score        129543 non-null  object        
 3   link         129543 non-null  object        
 4   year         129543 non-null  object        
 5   round        129543 non-null  object        
 6   league       129543 non-null  object        
 7   home_goals   129543 non-null  float64       
 8   away_goals   129543 non-null  float64       
 9   result       129543 non-null  object        
 10  match_id     129543 non-null  object        
 11  date         125762 non-null  datetime64[ns]
 12  referee      100146 non-null  object        
 13  home_yellow  108252 non-null  float64       
 14  home_red     108252 non-null  float64       
 15  away_yellow  108252 non-null  floa

In [36]:
capcity_na_idxs = scores_match_team_info_df['capacity'].isna().to_list()
scores_match_team_info_df.loc[capcity_na_idxs, 'home_team'].unique()

array(['Sheffield Wednesday', 'Queens Park Rangers',
       'Oldham Athletic AFC', 'West Bromwich Albion',
       'Brighton & Hove Albion', 'Peterborough United',
       'Gimnàstic Tarragona', 'Real Unión de Irún', 'Fortuna Düsseldorf',
       'Eintracht Frankfurt', 'B. Mönchengladbach', 'Stuttgarter Kickers',
       '1. FC Lokomotive Leipzig', 'Blau-Weiß 1890 Berlin',
       'FC Carl Zeiss Jena', 'Rot-Weiß Oberhausen',
       'Kickers Offenbach FC', 'SV Eintracht Trier',
       'SV Wacker Burghausen', 'Siegen Sportfreunde ',
       'SV Wehen Burghausen', 'Würzburger Kickers', 'Barletta', 'Licata',
       'AS Lucchese Libertas 1905', 'Taranto', 'Casertana',
       'Calcio Portogruaro-Summaga', 'Sporting Toulon Var',
       'Olympique Marseille', 'Evian Thonon Gaillard',
       'CS Louhans Cuiseaux', 'FC Libourne Saint Seurin'], dtype=object)

In [47]:
scores_match_team_info_elo_df = pd.merge(scores_match_team_info_df, elo_df, how='outer', on="link")
scores_match_team_info_elo_df.dropna(subset=['score'], inplace=True) 
scores_match_team_info_elo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129543 entries, 0 to 129542
Data columns (total 20 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   home_team    129543 non-null  object        
 1   away_team    129543 non-null  object        
 2   score        129543 non-null  object        
 3   link         129543 non-null  object        
 4   year         129543 non-null  object        
 5   round        129543 non-null  object        
 6   league       129543 non-null  object        
 7   home_goals   129543 non-null  float64       
 8   away_goals   129543 non-null  float64       
 9   result       129543 non-null  object        
 10  match_id     129543 non-null  object        
 11  date         125762 non-null  datetime64[ns]
 12  referee      100146 non-null  object        
 13  home_yellow  108252 non-null  float64       
 14  home_red     108252 non-null  float64       
 15  away_yellow  108252 non-null  floa

In [49]:
elo_na_idxs = scores_match_team_info_elo_df['elo_home'].isna().to_list()
scores_match_team_info_elo_df.loc[elo_na_idxs]
# scores_match_team_info_elo_df.loc[129543,'link']

Unnamed: 0,home_team,away_team,score,link,year,round,league,home_goals,away_goals,result,match_id,date,referee,home_yellow,home_red,away_yellow,away_red,capacity,elo_home,elo_away
143,Charlton Athletic,West Ham,2-0,https://www.besoccer.com/match/charlton-athlet...,2006,20,premier_league,2.0,0.0,home_win,charlton-athletic-fc/west-ham-united/2006,2005-12-31 15:00:00,Graham Poll,1.0,0.0,0.0,0.0,27111.0,,
263,Charlton Athletic,Barnsley,2-2,https://www.besoccer.com/match/charlton-athlet...,1995,2,championship,2.0,2.0,draw,charlton-athletic-fc/barnsley-fc/1995,1994-08-20 00:00:00,,,,,,27111.0,,
264,Charlton Athletic,Sheffield United,1-1,https://www.besoccer.com/match/charlton-athlet...,1995,4,championship,1.0,1.0,draw,charlton-athletic-fc/sheffield-united/1995,1994-08-30 00:00:00,,,,,,27111.0,,
265,Charlton Athletic,Bristol City,3-2,https://www.besoccer.com/match/charlton-athlet...,1995,5,championship,3.0,2.0,home_win,charlton-athletic-fc/bristol-city-fc/1995,1994-09-03 00:00:00,,,,,,27111.0,,
266,Charlton Athletic,Swindon Town,1-0,https://www.besoccer.com/match/charlton-athlet...,1995,8,championship,1.0,0.0,home_win,charlton-athletic-fc/swindon-town/1995,1994-09-17 00:00:00,,,,,,27111.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129086,FC Omniworld,BV Veendam,1-2,https://www.besoccer.com/match/omniworld/veend...,2006,34,eerste_divisie,1.0,2.0,away_win,omniworld/veendam/2006,2005-09-05 20:00:00,Mike van der Roest,2.0,0.0,3.0,0.0,4501.0,,
129087,FC Omniworld,BV Veendam,1-2,https://www.besoccer.com/match/omniworld/veend...,2006,35,eerste_divisie,1.0,2.0,away_win,omniworld/veendam/2006,2005-09-05 20:00:00,Mike van der Roest,2.0,0.0,3.0,0.0,4501.0,,
129088,FC Omniworld,BV Veendam,1-2,https://www.besoccer.com/match/omniworld/veend...,2006,36,eerste_divisie,1.0,2.0,away_win,omniworld/veendam/2006,2005-09-05 20:00:00,Mike van der Roest,2.0,0.0,3.0,0.0,4501.0,,
129089,FC Omniworld,BV Veendam,1-2,https://www.besoccer.com/match/omniworld/veend...,2006,37,eerste_divisie,1.0,2.0,away_win,omniworld/veendam/2006,2005-09-05 20:00:00,Mike van der Roest,2.0,0.0,3.0,0.0,4501.0,,


## Create new features

In [None]:
# Form, home and away, points gained from last 5 games, start from zero at start of every season
# Home/away form, points gained from last 3 home/away games, start from zero at start of every season
# Season goals, home and away, goals scored from the start of the season
# discipline, discipline record for last 5 games, n_reds + 0.2*n_yellows

In [109]:
league_names = ["premier_league", "championship", "primera_division", "segunda_division",
                "bundesliga", "2_liga", "serie_a", "serie_b", "ligue_1", "ligue_2", 
                "eredivisie", "eerste_divisie"]
years = list(range(1990,2022))


def get_season_table(data, league, year):
    data_season = data[(data['league'] == league) & (data['year'] == year)]
    return data_season

def get_team_names(season_data):
    all_teams = season_data["home_team"].unique()
    return sorted(all_teams)

def get_team_table(season_data, team, type):
    # type can be home, away or all
    if type == 'home':
        team_table = season_data[season_data['home_team'] == team]
    elif type == 'away':
        team_table = season_data[season_data['away_team'] == team]
    else:
        team_table = season_data[(season_data['home_team'] == team) | (season_data['away_team'] == team)]

    team_table = team_table.copy()
    team_table['team'] = [team] * team_table.shape[0]
    
    return team_table.sort_values(by=['round'])

def get_points(data):
    if data.result == 'draw':
        points = 1
    elif data.home_team == data.team:
        if data.result == 'home_win':
            points = 3
        else:
            points = 0
    else:
        if data.result == 'home_win':
            points = 0
        else:
            points = 3
    return points

def get_goals(data):
    if data.home_team == data.team:
        goals = data.home_goals
    else:
        goals = data.away_goals
    return goals

def get_cards_stat(data):
    if data.home_team == data.team:
        cards = data.home_red + 0.2 * data.home_yellow
    else:
        cards = data.away_red + 0.2 * data.away_yellow
    return cards

def get_team_stats(team_table):
    team_table['points'] = team_table.apply(get_points, axis = 1)
    team_table['form'] = team_table['points'].rolling(5, min_periods=1).sum()
    team_table['goals'] = team_table.apply(get_goals, axis=1)
    team_table['total_goals'] = team_table['goals'].cumsum()
    team_table['cards_stat'] = team_table.apply(get_cards_stat, axis=1)
    team_table['discipline'] = team_table['cards_stat'].rolling(5, min_periods=1).sum()
    cards_nan_idx = team_table['cards_stat'].isna()
    nan_array = nan_array = np.empty((cards_nan_idx.sum(),1))
    nan_array[:] = np.nan
    team_table.loc[cards_nan_idx, 'discipline'] = nan_array
    team_table.drop(['points', 'goals', 'cards_stat'], axis=1, inplace=True)

    return team_table

def drop_duplicate_columns(team_table):
    cols = list(team_table.columns)
    wanted_cols = ['home_team', 'away_team', 'match_id', 'team', 'form', 'total_goals', 'discipline']
    cols_to_drop = [ele for ele in cols if ele not in wanted_cols]
    team_table.drop(cols_to_drop, axis=1, inplace=True)
    return team_table

prem_2020 = get_season_table(scores_match_team_info_elo_df, 'premier_league', 2020)
prem_2020_teams = get_team_names(prem_2020)
team0_table = get_team_table(prem_2020, prem_2020_teams[0], 'all')
team0_table = get_team_stats(team0_table)
team0_table = drop_duplicate_columns(team0_table)
team0_table.head()

Unnamed: 0,home_team,away_team,match_id,team,form,total_goals,discipline
19057,AFC Bournemouth,Sheffield United,afc-bournemouth/sheffield-united/2020,AFC Bournemouth,1.0,1.0,0.4
7683,Aston Villa,AFC Bournemouth,aston-villa-fc/afc-bournemouth/2020,AFC Bournemouth,4.0,3.0,0.8
19058,AFC Bournemouth,Man. City,afc-bournemouth/manchester-city-fc/2020,AFC Bournemouth,4.0,4.0,1.0
13726,Leicester,AFC Bournemouth,leicester-city-fc/afc-bournemouth/2020,AFC Bournemouth,4.0,5.0,1.6
19059,AFC Bournemouth,Everton,afc-bournemouth/everton-fc/2020,AFC Bournemouth,7.0,8.0,1.6


In [101]:
def get_home_away_team_stats(team_table):
    team_table['points'] = team_table.apply(get_points, axis = 1)
    team_table['form'] = team_table['points'].rolling(3, min_periods=1).sum()
    team_table['goals'] = team_table.apply(get_goals, axis=1)
    team_table['total_goals'] = team_table['goals'].cumsum()
    team_table.drop(['points', 'goals'], axis=1, inplace=True)
    return team_table


team0_home_table = get_team_table(prem_2020, prem_2020_teams[0], 'home')
team0_home_table = get_home_away_team_stats(team0_home_table)
team0_home_table.head()

Unnamed: 0,home_team,away_team,score,link,year,round,league,home_goals,away_goals,result,...,home_yellow,home_red,away_yellow,away_red,capacity,elo_home,elo_away,team,form,total_goals
19057,AFC Bournemouth,Sheffield United,1-1,https://www.besoccer.com/match/afc-bournemouth...,2020,1,premier_league,1.0,1.0,draw,...,2.0,0.0,1.0,0.0,12000.0,72.0,66.0,AFC Bournemouth,1.0,1.0
19058,AFC Bournemouth,Man. City,1-3,https://www.besoccer.com/match/afc-bournemouth...,2020,3,premier_league,1.0,3.0,away_win,...,1.0,0.0,3.0,0.0,12000.0,72.0,97.0,AFC Bournemouth,1.0,2.0
19059,AFC Bournemouth,Everton,3-1,https://www.besoccer.com/match/afc-bournemouth...,2020,5,premier_league,3.0,1.0,home_win,...,0.0,0.0,4.0,0.0,12000.0,73.0,78.0,AFC Bournemouth,4.0,5.0
19060,AFC Bournemouth,West Ham,2-2,https://www.besoccer.com/match/afc-bournemouth...,2020,7,premier_league,2.0,2.0,draw,...,3.0,0.0,1.0,0.0,12000.0,73.0,77.0,AFC Bournemouth,4.0,7.0
19061,AFC Bournemouth,Norwich City,0-0,https://www.besoccer.com/match/afc-bournemouth...,2020,9,premier_league,0.0,0.0,draw,...,1.0,0.0,3.0,0.0,12000.0,73.0,71.0,AFC Bournemouth,5.0,7.0


In [102]:
team0_away_table = get_team_table(prem_2020, prem_2020_teams[0], 'away')
team0_away_table = get_home_away_team_stats(team0_away_table)
team0_away_table.head()

Unnamed: 0,home_team,away_team,score,link,year,round,league,home_goals,away_goals,result,...,home_yellow,home_red,away_yellow,away_red,capacity,elo_home,elo_away,team,form,total_goals
7683,Aston Villa,AFC Bournemouth,1-2,https://www.besoccer.com/match/aston-villa-fc/...,2020,2,premier_league,1.0,2.0,away_win,...,0.0,0.0,2.0,0.0,42788.0,73.0,72.0,AFC Bournemouth,3.0,2.0
13726,Leicester,AFC Bournemouth,3-1,https://www.besoccer.com/match/leicester-city-...,2020,4,premier_league,3.0,1.0,home_win,...,1.0,0.0,3.0,0.0,32500.0,81.0,72.0,AFC Bournemouth,3.0,3.0
1440,Southampton,AFC Bournemouth,1-3,https://www.besoccer.com/match/southampton-fc/...,2020,6,premier_league,1.0,3.0,away_win,...,1.0,0.0,3.0,0.0,32689.0,78.0,73.0,AFC Bournemouth,6.0,6.0
4968,Arsenal,AFC Bournemouth,1-0,https://www.besoccer.com/match/arsenal/afc-bou...,2020,8,premier_league,1.0,0.0,home_win,...,1.0,0.0,2.0,0.0,60355.0,91.0,73.0,AFC Bournemouth,3.0,6.0
14763,Watford,AFC Bournemouth,0-0,https://www.besoccer.com/match/watford-fc/afc-...,2020,10,premier_league,0.0,0.0,draw,...,5.0,0.0,3.0,0.0,19920.0,75.0,73.0,AFC Bournemouth,4.0,6.0


In [135]:
def create_new_features_for_season(season_data):
    season_data = season_data.copy()
    get_teams = get_team_names(season_data)
    new_features = ['home_form','away_form', 'home_total_goals', 'away_total_goals', 'home_discipline', 'away_discipline',
                    'home_team_home_form', 'home_team_home_total_goals', 'away_team_away_form', 'away_team_away_total_goals']
    season_data[new_features] = np.empty((season_data.shape[0],10))
    season_data[new_features] = np.nan
    for team in get_teams:
        team_table = get_team_table(season_data, team, 'all')
        team_table = get_team_stats(team_table)
        
        team_home_table = get_team_table(season_data, team, 'home')
        team_home_table = get_home_away_team_stats(team_home_table)

        team_away_table = get_team_table(season_data, team, 'away')
        team_away_table = get_home_away_team_stats(team_away_table)
        
        season_data.sort_values(by=['home_team', 'round'], inplace=True)
        season_data.loc[(season_data.home_team == team),['home_form', 'home_total_goals', 'home_discipline']] = team_table.loc[(team_table.home_team == team), ['form', 'total_goals', 'discipline']].values
        season_data.loc[(season_data.home_team == team),['home_team_home_form', 'home_team_home_total_goals']] = team_home_table.loc[: , ['form', 'total_goals']].values
        season_data.sort_values(by=['away_team', 'round'], inplace=True)
        season_data.loc[(season_data.away_team == team),['away_form', 'away_total_goals', 'away_discipline']] = team_table.loc[(team_table.away_team == team), ['form', 'total_goals', 'discipline']].values
        season_data.loc[(season_data.away_team == team),['away_team_away_form', 'away_team_away_total_goals']] = team_away_table.loc[: , ['form', 'total_goals']].values

    season_data.sort_values(by=['round','home_team'], inplace=True)
    return season_data

prem_2020 = get_season_table(scores_match_team_info_elo_df, 'premier_league', 2020)
prem_2020_new = create_new_features_for_season(prem_2020)
prem_2020_new.head()
# Hmm need to think about potential data leakage!!!

Unnamed: 0,home_team,away_team,score,link,year,round,league,home_goals,away_goals,result,...,home_form,away_form,home_total_goals,away_total_goals,home_discipline,away_discipline,home_team_home_form,home_team_home_total_goals,away_team_away_form,away_team_away_total_goals
19057,AFC Bournemouth,Sheffield United,1-1,https://www.besoccer.com/match/afc-bournemouth...,2020,1,premier_league,1.0,1.0,draw,...,1.0,1.0,1.0,1.0,0.4,0.2,1.0,1.0,1.0,1.0
18528,Burnley,Southampton,3-0,https://www.besoccer.com/match/burnley-fc/sout...,2020,1,premier_league,3.0,0.0,home_win,...,3.0,0.0,3.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0
6306,Crystal Palace,Everton,0-0,https://www.besoccer.com/match/crystal-palace-...,2020,1,premier_league,0.0,0.0,draw,...,1.0,1.0,0.0,0.0,0.4,1.6,1.0,0.0,1.0,0.0
13725,Leicester,Wolves,0-0,https://www.besoccer.com/match/leicester-city-...,2020,1,premier_league,0.0,0.0,draw,...,1.0,1.0,0.0,0.0,0.0,0.4,1.0,0.0,1.0,0.0
3694,Liverpool,Norwich City,4-1,https://www.besoccer.com/match/liverpool/norwi...,2020,1,premier_league,4.0,1.0,home_win,...,3.0,0.0,4.0,1.0,0.0,0.4,3.0,4.0,0.0,1.0
