# Import & Ingest

In [1]:
import sys
import pathlib
SOURCE_PATH = pathlib.Path.cwd().resolve().parent
sys.path.append(str(SOURCE_PATH))

In [2]:
from config import *
from utils import ingest_kaggle_database, join_player_attributes
import xgboost
from sklearn.linear_model import LogisticRegression
import joblib
import pandas as pd
import sqlite3

In [3]:
ingest_kaggle_database(SOCCER_DATABASE)



In [4]:
%load_ext sql
%sql sqlite:///../data/database.sqlite
%config SqlMagic.style = "_deprecated_default"

In [5]:
%%sql
SELECT name FROM sqlite_schema
WHERE type = "table"
AND name NOT LIKE "sqlite%"
ORDER BY 1;

 * sqlite:///../data/database.sqlite
Done.


name
Country
League
Match
Player
Player_Attributes
Team
Team_Attributes


# Relevant Metadata & Initializing DataFrames

## Metadata

In [6]:
%%sql
PRAGMA table_info(Country)

 * sqlite:///../data/database.sqlite
Done.


cid,name,type,notnull,dflt_value,pk
0,id,INTEGER,0,,1
1,name,TEXT,0,,0


In [7]:
%%sql
PRAGMA table_info(League)

 * sqlite:///../data/database.sqlite
Done.


cid,name,type,notnull,dflt_value,pk
0,id,INTEGER,0,,1
1,country_id,INTEGER,0,,0
2,name,TEXT,0,,0


In [8]:
%%sql
PRAGMA table_info(Match)

 * sqlite:///../data/database.sqlite
Done.


cid,name,type,notnull,dflt_value,pk
0,id,INTEGER,0,,1
1,country_id,INTEGER,0,,0
2,league_id,INTEGER,0,,0
3,season,TEXT,0,,0
4,stage,INTEGER,0,,0
5,date,TEXT,0,,0
6,match_api_id,INTEGER,0,,0
7,home_team_api_id,INTEGER,0,,0
8,away_team_api_id,INTEGER,0,,0
9,home_team_goal,INTEGER,0,,0


In [9]:
%%sql
PRAGMA table_info(Team_Attributes)

 * sqlite:///../data/database.sqlite
Done.


cid,name,type,notnull,dflt_value,pk
0,id,INTEGER,0,,1
1,team_fifa_api_id,INTEGER,0,,0
2,team_api_id,INTEGER,0,,0
3,date,TEXT,0,,0
4,buildUpPlaySpeed,INTEGER,0,,0
5,buildUpPlaySpeedClass,TEXT,0,,0
6,buildUpPlayDribbling,INTEGER,0,,0
7,buildUpPlayDribblingClass,TEXT,0,,0
8,buildUpPlayPassing,INTEGER,0,,0
9,buildUpPlayPassingClass,TEXT,0,,0


In [10]:
%%sql
PRAGMA table_info(Player_Attributes)

 * sqlite:///../data/database.sqlite
Done.


cid,name,type,notnull,dflt_value,pk
0,id,INTEGER,0,,1
1,player_fifa_api_id,INTEGER,0,,0
2,player_api_id,INTEGER,0,,0
3,date,TEXT,0,,0
4,overall_rating,INTEGER,0,,0
5,potential,INTEGER,0,,0
6,preferred_foot,TEXT,0,,0
7,attacking_work_rate,TEXT,0,,0
8,defensive_work_rate,TEXT,0,,0
9,crossing,INTEGER,0,,0


## DataFrames (Match, Team_Attributes, Player_Attributes)

In [11]:
%%sql
Match << SELECT * FROM Match;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable Match


In [12]:
df_match = pd.DataFrame(Match)
df_match.date = pd.to_datetime(df_match.date)
df_match.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17,492473,9987,9993,1,...,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16,492474,10000,9994,0,...,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16,492475,9984,8635,0,...,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17,492476,9991,9998,5,...,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16,492477,7947,9985,1,...,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [13]:
%%sql
ta << SELECT * FROM Team_Attributes;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable ta


In [14]:
df_ta = ta.DataFrame()
df_ta.date = pd.to_datetime(df_ta.date)
df_ta.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22,70,Fast,,Little,70,Long,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22,47,Balanced,,Little,52,Mixed,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover


In [15]:
%%sql
pa << SELECT * FROM Player_Attributes;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable pa


In [16]:
df_pa = pa.DataFrame()
df_pa.date = pd.to_datetime(df_pa.date)
df_pa.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


# Joining Team Attributes

- Need: Match DataFrame with relevant Match columns plus the difference between the teams' respective stats, with both sets of stats being the measurements closest to that date for that team BEFORE the date of the match; matches earlier than first stats for the majority of teams will be dropped; dropping teams entirely whose earliest stats come exceptionally late.
- Have: Match table with home_team_api_id, away_team_api_id, and date, plus Team_Attributes table with team_api_id
and repeated measurements of all team stats across several different dates.

## First Attempt: Analysis & Troubleshooting

In [17]:
%%sql
id << SELECT id FROM Match;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable id


In [18]:
%%sql
match_api_id << SELECT match_api_id FROM Match;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable match_api_id


In [19]:
#Number of unique matches? Any duplicates?
id = id.DataFrame()
match_api_id = match_api_id.DataFrame()
len(set(id.id)), len(set(match_api_id.match_api_id))

(25979, 25979)

In [20]:
%%sql
df_home << WITH MatchCols AS (SELECT country_id, league_id, season, stage, date, match_api_id, home_team_api_id, away_team_api_id, home_team_goal, away_team_goal FROM Match)                     
SELECT * FROM MatchCols m INNER JOIN Country c on m.country_id = c.id
INNER JOIN League l ON l.id = m.league_id
INNER JOIN Team_Attributes ta ON ta.team_api_id = m.home_team_api_id;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable df_home


In [21]:
%%sql
df_away << WITH AwayID AS (SELECT home_team_api_id, away_team_api_id FROM Match)
SELECT * FROM AwayID a INNER JOIN Team_Attributes ta
ON a.away_team_api_id = ta.team_api_id;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable df_away


In [22]:
df_home = df_home.DataFrame()
df_away = df_away.DataFrame()

df_home.shape, df_away.shape

((142093, 40), (142100, 27))

In [23]:
df_home[["home_team_api_id", "away_team_api_id"]].head()

Unnamed: 0,home_team_api_id,away_team_api_id
0,9987,9993
1,9987,9993
2,9987,9993
3,9987,9993
4,9987,9993


In [24]:
#df_home and df_away don't match in their correspondence of the home teams and away teams; something is going on.
df_away[["home_team_api_id", "away_team_api_id"]].head()

Unnamed: 0,home_team_api_id,away_team_api_id
0,9987,9993
1,9987,9993
2,9987,9993
3,9987,9993
4,10000,9994


In [25]:
#Indeed, between the two DataFrames, the away team only matches a fraction of the time (48%)
away_team_match_count = int((df_home.away_team_api_id == df_away.away_team_api_id.iloc[:len(df_home)]).sum())
away_team_match_perc = round(100 * away_team_match_count/len(df_home), 2)
print(f"{away_team_match_perc}%")

48.22%


In [26]:
df_ta.shape

(1458, 25)

### Dates Analysis

In [27]:
#Some of the player attributes are measured earlier than the eariest matches, but the matches have to wait from 2008 to 2010
#to pick up the earliest team attributes! Moreover, the latest dates for both team and player stat measurement come toward or even
#after the last matches. We will see if joining the latest pre-existing stats for each match leaves us with enough matches
#for predictive modeling or if it scrambles the whole project.
print("Match dates:\n", df_match.date.describe())
print("\nTeam attribute measurement dates:\n", df_ta.date.describe())
print("\nPlayer attribute measurement dates:\n", df_pa.date.describe())

Match dates:
 count                            25979
mean     2012-06-30 17:53:53.334616320
min                2008-07-18 00:00:00
25%                2010-05-09 00:00:00
50%                2012-05-13 00:00:00
75%                2014-08-17 00:00:00
max                2016-05-25 00:00:00
Name: date, dtype: object

Team attribute measurement dates:
 count                             1458
mean     2012-12-06 22:40:59.259259392
min                2010-02-22 00:00:00
25%                2011-02-22 00:00:00
50%                2013-09-20 00:00:00
75%                2014-09-19 00:00:00
max                2015-09-10 00:00:00
Name: date, dtype: object

Player attribute measurement dates:
 count                           183978
mean     2012-12-07 05:49:56.771353088
min                2007-02-22 00:00:00
25%                2011-02-22 00:00:00
50%                2013-09-20 00:00:00
75%                2014-12-19 00:00:00
max                2016-07-07 00:00:00
Name: date, dtype: object


In [28]:
df_ta.groupby("team_api_id")["date"].min().describe()

count                    288
mean     2010-08-17 23:10:00
min      2010-02-22 00:00:00
25%      2010-02-22 00:00:00
50%      2010-02-22 00:00:00
75%      2010-02-22 00:00:00
max      2015-09-10 00:00:00
Name: date, dtype: object

In [29]:
df_ta.date.value_counts()

date
2015-09-10    245
2014-09-19    244
2011-02-22    244
2012-02-22    242
2013-09-20    242
2010-02-22    241
Name: count, dtype: int64

In [30]:
#We will drop the matches from 2008 and 2009 for lack of stats, but that's okay, especially because
#2008 has fewer matches in the dataset than other years.
    #Using the list() approach because the .rename(columns = "...") method would rename
    #both columns named "date" and not distinguish match_date from (stats) date. .index gets the index of the first instance.
cols = list(df_home.columns)
cols[cols.index("date")] = "match_date"
df_home.columns = cols

df_home.match_date = pd.to_datetime(df_home.match_date)
100 * df_home.match_date.dt.year.value_counts()/len(df_home)

match_date
2012    12.884519
2011    12.797956
2010    12.677613
2015    12.565010
2009    12.536860
2013    12.268022
2014    12.217351
2016     6.048152
2008     6.004518
Name: count, dtype: float64

### Troubleshooting misalignment

In [31]:
%%sql
home << SELECT home_team_api_id FROM Match;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable home


In [32]:
home = home.DataFrame()

In [33]:
%%sql
away << SELECT away_team_api_id FROM Match;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable away


In [34]:
away = away.DataFrame()

In [35]:
#Lengths of home and away are the same
len(home), len(away)

(25979, 25979)

In [36]:
#There are 299 unique teams
len(home.groupby("home_team_api_id")), len(away.groupby("away_team_api_id"))

(299, 299)

In [37]:
#But only 288 teams have stats
len(df_ta.team_api_id.unique())

288

In [38]:
%%sql
home_stats << WITH Home AS (SELECT home_team_api_id FROM Match)
SELECT * FROM
Home h INNER JOIN Team_Attributes ta ON h.home_team_api_id = ta.team_api_id;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable home_stats


In [39]:
home_stats = home_stats.DataFrame()

In [40]:
%%sql
away_stats << WITH Away AS (SELECT away_team_api_id FROM Match)
SELECT * FROM
Away a INNER JOIN Team_Attributes ta ON a.away_team_api_id = ta.team_api_id;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable away_stats


In [41]:
away_stats = away_stats.DataFrame()

In [42]:
home_stats.shape, away_stats.shape

((142093, 26), (142100, 26))

LEFT JOIN: ((142271, 26), (142278, 26))

INNER JOIN: ((142093, 26), (142100, 26))

Seven rows difference either way--interesting

In [43]:
#Consistent with earlier finding that only 288 teams have stats
len(home_stats.groupby("home_team_api_id")), len(away_stats.groupby("away_team_api_id"))

(288, 288)

In [44]:
#The ID is just a counter, so we can use it for our counts
df_ta.id

0          1
1          2
2          3
3          4
4          5
        ... 
1453    1454
1454    1455
1455    1456
1456    1457
1457    1458
Name: id, Length: 1458, dtype: int64

In [45]:
#The number of stats measurements per team vary--possibly explaining the difference
df_ta.groupby("team_api_id")["id"].count().describe()

count    288.000000
mean       5.062500
std        1.540255
min        1.000000
25%        5.000000
50%        6.000000
75%        6.000000
max        6.000000
Name: id, dtype: float64

In [46]:
#Each team with stats shows up as both a home team and an away team in the match table
unique_home = sorted([int(val) for val in set(home_stats["home_team_api_id"].values)])
unique_away = sorted([int(val) for val in set(away_stats["away_team_api_id"].values)])
unique_home == unique_away, len(unique_home)

(True, 288)

In [47]:
#So every team is represented at least once, but different teams have different numbers
#of stats snapshots--possibly explaining the different numbers

## Team Attribute Join Pipeline

In [48]:
%%sql
matches_cl << WITH matches AS (SELECT match_api_id, country_id, league_id, season, stage, date, home_team_api_id, away_team_api_id, home_team_goal, away_team_goal FROM Match)
SELECT * FROM matches m INNER JOIN Country c ON m.country_id = c.id INNER JOIN League l on m.league_id = l.id;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable matches_cl


In [49]:
#Matches dataframe with country and league names joined
matches_cl = matches_cl.DataFrame()

In [50]:
cols = list(matches_cl.columns)
cols[cols.index("date")] = "match_date"
matches_cl.columns = cols
matches_cl.match_date = pd.to_datetime(matches_cl.match_date)
matches_cl.head()

Unnamed: 0,match_api_id,country_id,league_id,season,stage,match_date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,id,name,id.1,country_id.1,name.1
0,492473,1,1,2008/2009,1,2008-08-17,9987,9993,1,1,1,Belgium,1,1,Belgium Jupiler League
1,492474,1,1,2008/2009,1,2008-08-16,10000,9994,0,0,1,Belgium,1,1,Belgium Jupiler League
2,492475,1,1,2008/2009,1,2008-08-16,9984,8635,0,3,1,Belgium,1,1,Belgium Jupiler League
3,492476,1,1,2008/2009,1,2008-08-17,9991,9998,5,0,1,Belgium,1,1,Belgium Jupiler League
4,492477,1,1,2008/2009,1,2008-08-16,7947,9985,1,3,1,Belgium,1,1,Belgium Jupiler League


In [51]:
#Dropping extraneous ID's
matches_cl = matches_cl.drop(columns = ["country_id", "league_id", "id"])
matches_cl.head()

Unnamed: 0,match_api_id,season,stage,match_date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,name,name.1
0,492473,2008/2009,1,2008-08-17,9987,9993,1,1,Belgium,Belgium Jupiler League
1,492474,2008/2009,1,2008-08-16,10000,9994,0,0,Belgium,Belgium Jupiler League
2,492475,2008/2009,1,2008-08-16,9984,8635,0,3,Belgium,Belgium Jupiler League
3,492476,2008/2009,1,2008-08-17,9991,9998,5,0,Belgium,Belgium Jupiler League
4,492477,2008/2009,1,2008-08-16,7947,9985,1,3,Belgium,Belgium Jupiler League


In [52]:
#Renaming duplicate columns again (with the list() approach instead of .rename(columns = "..."))
cols = list(matches_cl.columns)
cols[cols.index("name")] = "Country"
matches_cl.columns = cols
matches_cl = matches_cl.rename(columns = {"name":"League"})
matches_cl.head()

Unnamed: 0,match_api_id,season,stage,match_date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,Country,League
0,492473,2008/2009,1,2008-08-17,9987,9993,1,1,Belgium,Belgium Jupiler League
1,492474,2008/2009,1,2008-08-16,10000,9994,0,0,Belgium,Belgium Jupiler League
2,492475,2008/2009,1,2008-08-16,9984,8635,0,3,Belgium,Belgium Jupiler League
3,492476,2008/2009,1,2008-08-17,9991,9998,5,0,Belgium,Belgium Jupiler League
4,492477,2008/2009,1,2008-08-16,7947,9985,1,3,Belgium,Belgium Jupiler League


In [53]:
%%sql
ta_info << PRAGMA table_info(Team_Attributes)

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable ta_info


In [54]:
ta_info = ta_info.DataFrame()

In [55]:
ta_info.iloc[4:, :][ta_info.type == "INTEGER"]

  ta_info.iloc[4:, :][ta_info.type == "INTEGER"]


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
4,4,buildUpPlaySpeed,INTEGER,0,,0
6,6,buildUpPlayDribbling,INTEGER,0,,0
8,8,buildUpPlayPassing,INTEGER,0,,0
11,11,chanceCreationPassing,INTEGER,0,,0
13,13,chanceCreationCrossing,INTEGER,0,,0
15,15,chanceCreationShooting,INTEGER,0,,0
18,18,defencePressure,INTEGER,0,,0
20,20,defenceAggression,INTEGER,0,,0
22,22,defenceTeamWidth,INTEGER,0,,0


In [56]:
ta_info

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,INTEGER,0,,1
1,1,team_fifa_api_id,INTEGER,0,,0
2,2,team_api_id,INTEGER,0,,0
3,3,date,TEXT,0,,0
4,4,buildUpPlaySpeed,INTEGER,0,,0
5,5,buildUpPlaySpeedClass,TEXT,0,,0
6,6,buildUpPlayDribbling,INTEGER,0,,0
7,7,buildUpPlayDribblingClass,TEXT,0,,0
8,8,buildUpPlayPassing,INTEGER,0,,0
9,9,buildUpPlayPassingClass,TEXT,0,,0


In [57]:
#Here I get all the stats while eliminating the redundant classes; I'll copy and paste the output for the below query.
team_stats = list(ta_info.name)[4:]
stat_cols = [stat for stat in team_stats if stat[:-5] not in team_stats]
class_stat_cols = [stat for stat in stat_cols if "class" in str(stat).lower()] #This is for later
stat_cols
for stat in stat_cols:
    print(stat, end = ", ")

buildUpPlaySpeed, buildUpPlayDribbling, buildUpPlayPassing, buildUpPlayPositioningClass, chanceCreationPassing, chanceCreationCrossing, chanceCreationShooting, chanceCreationPositioningClass, defencePressure, defenceAggression, defenceTeamWidth, defenceDefenderLineClass, 

In [58]:
%%sql
team_stats << SELECT team_api_id, date, buildUpPlaySpeed, buildUpPlayDribbling, buildUpPlayPassing, buildUpPlayPositioningClass, chanceCreationPassing, chanceCreationCrossing, chanceCreationShooting, chanceCreationPositioningClass, defencePressure, defenceAggression, defenceTeamWidth, defenceDefenderLineClass FROM Team_Attributes;

 * sqlite:///../data/database.sqlite
Done.
Returning data to local variable team_stats


In [59]:
team_stats = team_stats.DataFrame()

In [60]:
team_stats.date = pd.to_datetime(team_stats.date)
team_stats.head()

Unnamed: 0,team_api_id,date,buildUpPlaySpeed,buildUpPlayDribbling,buildUpPlayPassing,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,chanceCreationPositioningClass,defencePressure,defenceAggression,defenceTeamWidth,defenceDefenderLineClass
0,9930,2010-02-22,60,,50,Organised,60,65,55,Organised,50,55,45,Cover
1,9930,2014-09-19,52,48.0,56,Organised,54,63,64,Organised,47,44,54,Cover
2,9930,2015-09-10,47,41.0,54,Organised,54,63,64,Organised,47,44,54,Cover
3,8485,2010-02-22,70,,70,Organised,70,70,70,Organised,60,70,70,Cover
4,8485,2011-02-22,47,,52,Organised,53,48,52,Organised,47,47,52,Cover


In [61]:
#Sorting by date required for pd.merge_asof
matches_cl = matches_cl.sort_values(by = "match_date")
matches_cl.head()

Unnamed: 0,match_api_id,season,stage,match_date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,Country,League
24558,486263,2008/2009,1,2008-07-18,10192,9931,1,2,Switzerland,Switzerland Super League
24559,486264,2008/2009,1,2008-07-19,9930,10179,3,1,Switzerland,Switzerland Super League
24560,486265,2008/2009,1,2008-07-20,10199,9824,1,2,Switzerland,Switzerland Super League
24561,486266,2008/2009,1,2008-07-20,7955,10243,1,2,Switzerland,Switzerland Super League
24613,486268,2008/2009,2,2008-07-23,6493,7955,1,2,Switzerland,Switzerland Super League


In [62]:
team_stats = team_stats.sort_values(by = "date")
team_stats = team_stats.rename(columns = {"team_api_id":"home_team_api_id"})
team_stats.head()

Unnamed: 0,home_team_api_id,date,buildUpPlaySpeed,buildUpPlayDribbling,buildUpPlayPassing,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,chanceCreationPositioningClass,defencePressure,defenceAggression,defenceTeamWidth,defenceDefenderLineClass
0,9930,2010-02-22,60,,50,Organised,60,65,55,Organised,50,55,45,Cover
426,8674,2010-02-22,41,,32,Organised,40,47,69,Organised,30,30,30,Cover
1147,10189,2010-02-22,65,,55,Organised,55,70,70,Organised,70,45,70,Cover
419,8722,2010-02-22,55,,65,Organised,65,40,60,Organised,45,55,70,Cover
418,8596,2010-02-22,70,,70,Organised,60,70,70,Organised,60,70,70,Cover


In [63]:
#For each match, add the most recent home team stats before the date of that match
df_home = pd.merge_asof(matches_cl, team_stats, left_on = "match_date", right_on = "date", by = "home_team_api_id", direction = "backward")
df_home[df_home["defencePressure"] > 0].head(20)

Unnamed: 0,match_api_id,season,stage,match_date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,Country,League,...,buildUpPlayPassing,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,chanceCreationPositioningClass,defencePressure,defenceAggression,defenceTeamWidth,defenceDefenderLineClass
5469,684955,2009/2010,23,2010-02-22,10267,8305,2,1,Spain,Spain LIGA BBVA,...,30.0,Free Form,55.0,60.0,70.0,Organised,55.0,60.0,60.0,Offside Trap
5470,686171,2009/2010,20,2010-02-22,9807,10215,1,2,Portugal,Portugal Liga ZON Sagres,...,30.0,Organised,55.0,45.0,60.0,Organised,30.0,30.0,30.0,Offside Trap
5471,659091,2009/2010,28,2010-02-23,10260,8654,3,0,England,England Premier League,...,45.0,Organised,45.0,70.0,65.0,Free Form,40.0,50.0,40.0,Cover
5472,704654,2009/2010,17,2010-02-24,8600,8529,2,1,Italy,Italy Serie A,...,35.0,Organised,30.0,50.0,70.0,Organised,35.0,30.0,55.0,Offside Trap
5473,704632,2009/2010,17,2010-02-24,8535,8564,1,2,Italy,Italy Serie A,...,30.0,Free Form,50.0,60.0,60.0,Free Form,45.0,30.0,60.0,Offside Trap
5474,665677,2009/2010,23,2010-02-24,9984,9986,1,0,Belgium,Belgium Jupiler League,...,35.0,Organised,70.0,45.0,55.0,Organised,65.0,60.0,70.0,Cover
5475,665694,2009/2010,24,2010-02-24,9999,9987,1,1,Belgium,Belgium Jupiler League,...,60.0,Organised,50.0,35.0,40.0,Organised,65.0,70.0,70.0,Cover
5476,674600,2009/2010,18,2010-02-26,2186,8673,1,2,Poland,Poland Ekstraklasa,...,45.0,Organised,40.0,35.0,65.0,Organised,70.0,70.0,65.0,Cover
5477,686186,2009/2010,21,2010-02-26,9771,7844,0,1,Portugal,Portugal Liga ZON Sagres,...,50.0,Organised,50.0,50.0,50.0,Organised,50.0,50.0,50.0,Cover
5478,674483,2009/2010,24,2010-02-26,10189,9789,2,1,Germany,Germany 1. Bundesliga,...,55.0,Organised,55.0,70.0,70.0,Organised,70.0,45.0,70.0,Cover


In [64]:
dates_check = df_home.groupby(["home_team_api_id", "date"])[["match_date"]].min()
dates_check.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,match_date
home_team_api_id,date,Unnamed: 2_level_1
1601,2010-02-22,2010-03-06
1601,2011-02-22,2011-02-26
1601,2012-02-22,2012-02-24
1601,2013-09-20,2013-09-22
1601,2014-09-19,2014-09-21
1601,2015-09-10,2015-09-13
1773,2012-02-22,2012-03-03
1773,2014-09-19,2015-08-02
1957,2010-02-22,2010-03-12
1957,2011-02-22,2011-03-08


In [65]:
#Is every match date on or after the date of the stats snapshot? Did this method work properly?
stat_dates = [j for i, j in dates_check.index]
count = 0
for i in range(len(dates_check)):
    if list(dates_check.values)[i] >= stat_dates[i]: #That is, if the match date is equal to or greater than the date on which stats were last measured
        count += 1
print(f"{count == len(dates_check)}: {count, len(dates_check)}")

True: (1171, 1171)


In [66]:
team_stats = team_stats.rename(columns = {"home_team_api_id":"away_team_api_id"})
matches_cl.columns, team_stats.columns

(Index(['match_api_id', 'season', 'stage', 'match_date', 'home_team_api_id',
        'away_team_api_id', 'home_team_goal', 'away_team_goal', 'Country',
        'League'],
       dtype='object'),
 Index(['away_team_api_id', 'date', 'buildUpPlaySpeed', 'buildUpPlayDribbling',
        'buildUpPlayPassing', 'buildUpPlayPositioningClass',
        'chanceCreationPassing', 'chanceCreationCrossing',
        'chanceCreationShooting', 'chanceCreationPositioningClass',
        'defencePressure', 'defenceAggression', 'defenceTeamWidth',
        'defenceDefenderLineClass'],
       dtype='object'))

In [67]:
#We have a table joined by home_team_api_id. Let's get 
df_away = pd.merge_asof(matches_cl, team_stats, left_on = "match_date", right_on = "date", by = "away_team_api_id", direction = "backward")
df_away[df_away["defencePressure"] > 0].head()
df_away_stats = df_away.iloc[:, -12:]
df_away_stats

Unnamed: 0,buildUpPlaySpeed,buildUpPlayDribbling,buildUpPlayPassing,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,chanceCreationPositioningClass,defencePressure,defenceAggression,defenceTeamWidth,defenceDefenderLineClass
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
25974,50.0,60.0,54.0,Organised,52.0,54.0,46.0,Organised,36.0,43.0,47.0,Cover
25975,52.0,46.0,48.0,Organised,39.0,39.0,37.0,Organised,40.0,36.0,52.0,Cover
25976,61.0,57.0,37.0,Organised,62.0,50.0,53.0,Organised,45.0,47.0,54.0,Cover
25977,52.0,56.0,64.0,Organised,39.0,66.0,46.0,Organised,44.0,34.0,50.0,Cover


In [68]:
#Here we concatenate the joined DataFrames together to get one that has the team stats for both the home and away teams
df_away_stats = df_away_stats.add_prefix("away_") #Adding this prefix before concatenation to avoid duplicate stat columns
both_stats = pd.concat([df_home, df_away_stats], axis = 1)

#Adding "home_" prefix
home_stat_cols = [f"home_{stat}" for stat in stat_cols]
both_stats = both_stats.rename(columns = dict(zip(stat_cols, home_stat_cols)))

In [69]:
#Success
both_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25979 entries, 0 to 25978
Data columns (total 35 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   match_api_id                         25979 non-null  int64         
 1   season                               25979 non-null  object        
 2   stage                                25979 non-null  int64         
 3   match_date                           25979 non-null  datetime64[ns]
 4   home_team_api_id                     25979 non-null  int64         
 5   away_team_api_id                     25979 non-null  int64         
 6   home_team_goal                       25979 non-null  int64         
 7   away_team_goal                       25979 non-null  int64         
 8   Country                              25979 non-null  object        
 9   League                               25979 non-null  object        
 10  date      

In [70]:
#BuildUpPlayDribbling only has 5874 entries and should just be dropped
both_stats = both_stats.drop(columns = ["home_buildUpPlayDribbling", "away_buildUpPlayDribbling"])
stat_cols.remove("buildUpPlayDribbling")
home_stat_cols.remove("home_buildUpPlayDribbling")

In [71]:
#There are a ton of missing values in the stats columns, likely from cases where one or both teams didn't have stats measurements
#at all or any yet before the match date. With 23% missing, we may just drop these as well. It would be hard to meaningfully and
#usefully impute these without drawing on future stat measurements (e.g., imputing average stat by league and season would incorporate
#future data without elaborate data engineering gymnastics to maintain temporal integrity). The missing values are probably directly
#reflective of the data's fundamental temporal limits anyway. Better safe than sorry, and quality over quantity of records.

print(round(1 - 19916/25979, 2))
both_stats = both_stats.dropna()

0.23


In [72]:
both_stats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19355 entries, 5469 to 25978
Data columns (total 33 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   match_api_id                         19355 non-null  int64         
 1   season                               19355 non-null  object        
 2   stage                                19355 non-null  int64         
 3   match_date                           19355 non-null  datetime64[ns]
 4   home_team_api_id                     19355 non-null  int64         
 5   away_team_api_id                     19355 non-null  int64         
 6   home_team_goal                       19355 non-null  int64         
 7   away_team_goal                       19355 non-null  int64         
 8   Country                              19355 non-null  object        
 9   League                               19355 non-null  object        
 10  date        

In [73]:
both_stats.head()

Unnamed: 0,match_api_id,season,stage,match_date,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,Country,League,...,away_buildUpPlayPassing,away_buildUpPlayPositioningClass,away_chanceCreationPassing,away_chanceCreationCrossing,away_chanceCreationShooting,away_chanceCreationPositioningClass,away_defencePressure,away_defenceAggression,away_defenceTeamWidth,away_defenceDefenderLineClass
5469,684955,2009/2010,23,2010-02-22,10267,8305,2,1,Spain,Spain LIGA BBVA,...,35.0,Organised,35.0,50.0,70.0,Free Form,40.0,30.0,50.0,Offside Trap
5470,686171,2009/2010,20,2010-02-22,9807,10215,1,2,Portugal,Portugal Liga ZON Sagres,...,30.0,Organised,50.0,60.0,55.0,Organised,30.0,30.0,30.0,Offside Trap
5471,659091,2009/2010,28,2010-02-23,10260,8654,3,0,England,England Premier League,...,30.0,Organised,31.0,70.0,50.0,Organised,30.0,70.0,30.0,Cover
5472,704654,2009/2010,17,2010-02-24,8600,8529,2,1,Italy,Italy Serie A,...,30.0,Organised,30.0,35.0,55.0,Organised,40.0,55.0,60.0,Cover
5473,704632,2009/2010,17,2010-02-24,8535,8564,1,2,Italy,Italy Serie A,...,30.0,Free Form,55.0,45.0,70.0,Free Form,30.0,35.0,60.0,Offside Trap


In [74]:
both_stats_cols = list(both_stats.columns)
both_stats_cols

['match_api_id',
 'season',
 'stage',
 'match_date',
 'home_team_api_id',
 'away_team_api_id',
 'home_team_goal',
 'away_team_goal',
 'Country',
 'League',
 'date',
 'home_buildUpPlaySpeed',
 'home_buildUpPlayPassing',
 'home_buildUpPlayPositioningClass',
 'home_chanceCreationPassing',
 'home_chanceCreationCrossing',
 'home_chanceCreationShooting',
 'home_chanceCreationPositioningClass',
 'home_defencePressure',
 'home_defenceAggression',
 'home_defenceTeamWidth',
 'home_defenceDefenderLineClass',
 'away_buildUpPlaySpeed',
 'away_buildUpPlayPassing',
 'away_buildUpPlayPositioningClass',
 'away_chanceCreationPassing',
 'away_chanceCreationCrossing',
 'away_chanceCreationShooting',
 'away_chanceCreationPositioningClass',
 'away_defencePressure',
 'away_defenceAggression',
 'away_defenceTeamWidth',
 'away_defenceDefenderLineClass']

In [75]:
#Compute the stat differences and drop the raw columns
for stat in stat_cols:
    if ("int" in str(both_stats[f"home_{stat}"].dtype)) | ("float" in str(both_stats[f"home_{stat}"].dtype)):
        both_stats[f"{stat}_home_diff"] = both_stats[f"home_{stat}"] - both_stats[f"away_{stat}"]
        both_stats = both_stats.drop(columns = [f"home_{stat}", f"away_{stat}"])

In [76]:
#Success
both_stats = both_stats.drop(columns = ["home_team_api_id", "away_team_api_id", "date"])
both_stats.head()

Unnamed: 0,match_api_id,season,stage,match_date,home_team_goal,away_team_goal,Country,League,home_buildUpPlayPositioningClass,home_chanceCreationPositioningClass,...,away_chanceCreationPositioningClass,away_defenceDefenderLineClass,buildUpPlaySpeed_home_diff,buildUpPlayPassing_home_diff,chanceCreationPassing_home_diff,chanceCreationCrossing_home_diff,chanceCreationShooting_home_diff,defencePressure_home_diff,defenceAggression_home_diff,defenceTeamWidth_home_diff
5469,684955,2009/2010,23,2010-02-22,2,1,Spain,Spain LIGA BBVA,Free Form,Organised,...,Free Form,Offside Trap,0.0,-5.0,20.0,10.0,0.0,15.0,30.0,10.0
5470,686171,2009/2010,20,2010-02-22,1,2,Portugal,Portugal Liga ZON Sagres,Organised,Organised,...,Organised,Offside Trap,0.0,0.0,5.0,-15.0,5.0,0.0,0.0,0.0
5471,659091,2009/2010,28,2010-02-23,3,0,England,England Premier League,Organised,Free Form,...,Organised,Cover,12.0,15.0,14.0,0.0,15.0,10.0,-20.0,10.0
5472,704654,2009/2010,17,2010-02-24,2,1,Italy,Italy Serie A,Organised,Organised,...,Organised,Cover,24.0,5.0,0.0,15.0,15.0,-5.0,-25.0,-5.0
5473,704632,2009/2010,17,2010-02-24,1,2,Italy,Italy Serie A,Free Form,Free Form,...,Free Form,Offside Trap,22.0,0.0,-5.0,15.0,-10.0,15.0,-5.0,0.0


In [77]:
#Now, we calculate the home score advantage
both_stats["Home_Score_Adv"] = both_stats["home_team_goal"] - both_stats["away_team_goal"]
both_stats = both_stats.drop(columns = ["home_team_goal", "away_team_goal"])
both_stats = both_stats.reset_index().drop(columns = "index")

In [78]:
both_stats.head(10)

Unnamed: 0,match_api_id,season,stage,match_date,Country,League,home_buildUpPlayPositioningClass,home_chanceCreationPositioningClass,home_defenceDefenderLineClass,away_buildUpPlayPositioningClass,...,away_defenceDefenderLineClass,buildUpPlaySpeed_home_diff,buildUpPlayPassing_home_diff,chanceCreationPassing_home_diff,chanceCreationCrossing_home_diff,chanceCreationShooting_home_diff,defencePressure_home_diff,defenceAggression_home_diff,defenceTeamWidth_home_diff,Home_Score_Adv
0,684955,2009/2010,23,2010-02-22,Spain,Spain LIGA BBVA,Free Form,Organised,Offside Trap,Organised,...,Offside Trap,0.0,-5.0,20.0,10.0,0.0,15.0,30.0,10.0,1
1,686171,2009/2010,20,2010-02-22,Portugal,Portugal Liga ZON Sagres,Organised,Organised,Offside Trap,Organised,...,Offside Trap,0.0,0.0,5.0,-15.0,5.0,0.0,0.0,0.0,-1
2,659091,2009/2010,28,2010-02-23,England,England Premier League,Organised,Free Form,Cover,Organised,...,Cover,12.0,15.0,14.0,0.0,15.0,10.0,-20.0,10.0,3
3,704654,2009/2010,17,2010-02-24,Italy,Italy Serie A,Organised,Organised,Offside Trap,Organised,...,Cover,24.0,5.0,0.0,15.0,15.0,-5.0,-25.0,-5.0,1
4,704632,2009/2010,17,2010-02-24,Italy,Italy Serie A,Free Form,Free Form,Offside Trap,Free Form,...,Offside Trap,22.0,0.0,-5.0,15.0,-10.0,15.0,-5.0,0.0,-1
5,665677,2009/2010,23,2010-02-24,Belgium,Belgium Jupiler League,Organised,Organised,Cover,Organised,...,Cover,5.0,-15.0,25.0,2.0,-5.0,-5.0,-10.0,0.0,1
6,665694,2009/2010,24,2010-02-24,Belgium,Belgium Jupiler League,Organised,Organised,Cover,Organised,...,Cover,20.0,15.0,0.0,0.0,-20.0,-5.0,5.0,0.0,0
7,674600,2009/2010,18,2010-02-26,Poland,Poland Ekstraklasa,Organised,Organised,Cover,Organised,...,Cover,0.0,10.0,-15.0,-10.0,-5.0,5.0,0.0,5.0,-1
8,686186,2009/2010,21,2010-02-26,Portugal,Portugal Liga ZON Sagres,Organised,Organised,Cover,Organised,...,Offside Trap,20.0,20.0,-5.0,-5.0,-5.0,20.0,20.0,20.0,-1
9,674483,2009/2010,24,2010-02-26,Germany,Germany 1. Bundesliga,Organised,Organised,Cover,Organised,...,Offside Trap,-5.0,10.0,-15.0,36.0,4.0,16.0,-25.0,17.0,1


# Joining Player Attributes

In [79]:
#The Match table includes home_player_api_id and away_player_api_id. Player Attributes includes player_api_id,
#date (of stat measurement), and player stats.

#For each match, we want columns that represent the differences in the averages of each player stat across the teams
#(Home team average player stats minus away team average player stat for each player stat measured)

In [80]:
%%sql
SELECT * FROM Match
ORDER BY date
LIMIT 5;

 * sqlite:///../data/database.sqlite
Done.


id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
24559,24558,24558,2008/2009,1,2008-07-18 00:00:00,486263,10192,9931,1,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24560,24558,24558,2008/2009,1,2008-07-19 00:00:00,486264,9930,10179,3,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24561,24558,24558,2008/2009,1,2008-07-20 00:00:00,486265,10199,9824,1,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24562,24558,24558,2008/2009,1,2008-07-20 00:00:00,486266,7955,10243,1,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24613,24558,24558,2008/2009,2,2008-07-23 00:00:00,486267,9931,9956,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [81]:
%%sql
SELECT * FROM Player_Attributes
ORDER BY player_api_id, date
LIMIT 5;

 * sqlite:///../data/database.sqlite
Done.


id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
139857,148544,2625,2007-02-22 00:00:00,63,64,right,high,medium,48,48,47,64,38,57,50,46,67,57,67,64,59,52,49,61,56,78,56,59,72,52,55,56,46,64,66,63,14,11,67,9,10
139856,148544,2625,2007-08-30 00:00:00,63,64,right,high,medium,48,48,47,64,38,57,50,51,67,57,67,64,59,52,49,61,56,78,56,59,72,52,55,56,46,64,66,63,14,24,67,24,24
139855,148544,2625,2008-08-30 00:00:00,60,64,right,high,medium,48,48,47,64,38,57,50,51,67,57,67,64,59,52,49,61,56,78,56,59,72,52,55,56,46,64,66,63,14,24,67,24,24
139854,148544,2625,2010-08-30 00:00:00,60,64,right,high,medium,48,48,47,64,38,57,50,51,67,57,67,64,59,50,49,71,56,78,56,59,72,71,50,56,69,64,66,63,12,11,6,8,8
139853,148544,2625,2011-02-22 00:00:00,59,63,right,high,medium,52,47,46,63,37,56,49,50,66,58,66,63,58,49,48,68,55,77,55,58,71,70,49,55,66,63,63,62,12,11,6,8,8


In [82]:
%%sql
SELECT * FROM Match
ORDER BY date
LIMIT 5;

 * sqlite:///../data/database.sqlite
Done.


id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
24559,24558,24558,2008/2009,1,2008-07-18 00:00:00,486263,10192,9931,1,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24560,24558,24558,2008/2009,1,2008-07-19 00:00:00,486264,9930,10179,3,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24561,24558,24558,2008/2009,1,2008-07-20 00:00:00,486265,10199,9824,1,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24562,24558,24558,2008/2009,1,2008-07-20 00:00:00,486266,7955,10243,1,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24613,24558,24558,2008/2009,2,2008-07-23 00:00:00,486267,9931,9956,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [83]:
df_pa.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [84]:
#Now, we prepare to query and join all the player stats

In [85]:
#Can we determine which player is the goalkeeper via the player positions? Then, we can join goalkeeper attributes only for
#goalkeepers and compute the differences in the goalkeeper stats between teams without averaging over every player.

In [86]:
xy = df_match.dropna()[list(df_match.columns)[11:55]]
xy_agg = xy[list(xy.columns)[22:44]].agg(("min", "max"))
xy_agg.iloc[:, :11]

Unnamed: 0,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11
min,1.0,3.0,3.0,3.0,3.0,3.0,5.0,5.0,7.0,6.0,10.0
max,1.0,3.0,3.0,3.0,7.0,8.0,8.0,9.0,10.0,11.0,11.0


In [87]:
xy_agg.iloc[:, 11:]

Unnamed: 0,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11
min,1.0,3.0,3.0,3.0,3.0,3.0,5.0,5.0,6.0,6.0,10.0
max,1.0,3.0,3.0,3.0,7.0,7.0,8.0,9.0,10.0,11.0,11.0


In [88]:
len(xy.home_player_Y1.unique()) == len(xy.away_player_Y1.unique()) == 1
#This confirms that player 1 is always the goal keeper; uniquely, player 1 is always at y position 1, closest to the goal.

True

In [89]:
all_atts = list(df_pa.columns)[4:]
core_atts = list(df_pa.columns)[4:-5] #Without goalkeeper attributes
for object in ["preferred_foot", "attacking_work_rate", "defensive_work_rate"]:
    all_atts.remove(object)
    core_atts.remove(object)
    #Encoding the categorical player attributes somehow--turning that into some kind of aggregated
    #value (e.g., most common) for each team and even trying to derive some sort of relationship between teams--
    #doesn't seem useful for the model or worth the time and inter-team computation.

In [90]:
df = join_player_attributes(all_attributes = all_atts, core_attributes = core_atts)

In [91]:
df.head()

Unnamed: 0,match_api_id,match_date,home_1_stat_date,home_1_overall_rating,home_1_potential,home_1_crossing,home_1_finishing,home_1_heading_accuracy,home_1_short_passing,home_1_volleys,...,away_11_strength,away_11_long_shots,away_11_aggression,away_11_interceptions,away_11_positioning,away_11_vision,away_11_penalties,away_11_marking,away_11_standing_tackle,away_11_sliding_tackle
28,483134,2008-08-09,2007-02-22,69.0,75.0,61.0,30.0,62.0,72.0,32.0,...,71.0,60.0,55.0,33.0,65.0,57.0,71.0,15.0,21.0,11.0
43,483133,2008-08-10,2007-08-30,75.0,82.0,22.0,22.0,23.0,22.0,21.0,...,70.0,67.0,55.0,59.0,78.0,78.0,52.0,22.0,25.0,12.0
52,489050,2008-08-16,2007-08-30,68.0,79.0,21.0,20.0,26.0,27.0,21.0,...,78.0,75.0,59.0,70.0,76.0,72.0,72.0,37.0,50.0,28.0
77,489044,2008-08-16,2007-08-30,78.0,85.0,22.0,21.0,21.0,21.0,9.0,...,76.0,75.0,63.0,79.0,89.0,81.0,86.0,21.0,24.0,29.0
78,489043,2008-08-16,2007-08-30,75.0,78.0,22.0,22.0,22.0,22.0,10.0,...,80.0,45.0,66.0,59.0,63.0,54.0,57.0,22.0,31.0,20.0


In [92]:
#Now, let's verify temporal integrity
stat_dates = [f"{team}_{num}_stat_date" for team in ["home", "away"] for num in range(1,12)]
stat_dates.insert(0, "match_date")
df_dates = df[stat_dates]
df_dates

Unnamed: 0,match_date,home_1_stat_date,home_2_stat_date,home_3_stat_date,home_4_stat_date,home_5_stat_date,home_6_stat_date,home_7_stat_date,home_8_stat_date,home_9_stat_date,...,away_2_stat_date,away_3_stat_date,away_4_stat_date,away_5_stat_date,away_6_stat_date,away_7_stat_date,away_8_stat_date,away_9_stat_date,away_10_stat_date,away_11_stat_date
28,2008-08-09,2007-02-22,2007-08-30,2007-08-30,2008-02-22,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,...,2008-02-22,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2008-02-22
43,2008-08-10,2007-08-30,2007-08-30,2008-02-22,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-02-22,2007-08-30,...,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-02-22,2007-08-30
52,2008-08-16,2007-08-30,2007-08-30,2007-08-30,2008-02-22,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,...,2007-02-22,2008-02-22,2007-08-30,2007-08-30,2007-08-30,2008-02-22,2007-08-30,2007-08-30,2008-02-22,2007-08-30
77,2008-08-16,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2008-02-22,2007-08-30,2007-08-30,2008-02-22,2007-08-30,...,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-02-22,2007-08-30,2007-08-30,2007-08-30
78,2008-08-16,2007-08-30,2007-08-30,2007-08-30,2008-02-22,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,...,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-08-30,2007-02-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25974,2016-05-25,2015-10-02,2016-05-12,2016-05-12,2016-04-14,2016-05-12,2016-05-05,2016-04-21,2015-11-12,2016-03-17,...,2016-01-21,2016-03-10,2016-04-14,2016-03-10,2016-01-21,2015-10-30,2015-09-21,2015-10-23,2015-09-25,2016-05-19
25975,2016-05-25,2016-03-10,2015-12-24,2016-03-10,2015-11-06,2016-03-10,2016-04-28,2016-03-10,2016-01-07,2016-03-10,...,2016-01-07,2016-03-03,2016-04-28,2016-03-10,2015-10-02,2015-10-09,2016-05-12,2016-03-10,2015-10-09,2015-10-30
25976,2016-05-25,2016-03-10,2016-05-19,2016-03-17,2016-03-10,2016-03-10,2016-04-28,2016-03-10,2016-03-31,2016-03-10,...,2016-03-10,2016-03-31,2016-03-10,2016-03-10,2016-03-31,2015-09-21,2016-03-10,2016-04-14,2016-04-14,2016-02-04
25977,2016-05-25,2015-09-25,2015-09-21,2015-09-25,2015-09-25,2015-09-21,2016-03-10,2015-09-21,2015-09-21,2016-04-28,...,2015-09-21,2015-09-21,2015-10-09,2016-04-14,2015-09-21,2016-05-12,2016-03-24,2015-10-16,2015-10-09,2015-09-21


In [93]:
#This verifies that, for each match, all the players' stats measurements were made before the match date
sum(df_dates.iloc[:, 1:].max(axis = 1) <= df_dates.match_date) == len(df_dates)

True

In [94]:
df = df.drop(columns = stat_dates)

In [95]:
df.head()

Unnamed: 0,match_api_id,home_1_overall_rating,home_1_potential,home_1_crossing,home_1_finishing,home_1_heading_accuracy,home_1_short_passing,home_1_volleys,home_1_dribbling,home_1_curve,...,away_11_strength,away_11_long_shots,away_11_aggression,away_11_interceptions,away_11_positioning,away_11_vision,away_11_penalties,away_11_marking,away_11_standing_tackle,away_11_sliding_tackle
28,483134,69.0,75.0,61.0,30.0,62.0,72.0,32.0,44.0,42.0,...,71.0,60.0,55.0,33.0,65.0,57.0,71.0,15.0,21.0,11.0
43,483133,75.0,82.0,22.0,22.0,23.0,22.0,21.0,22.0,11.0,...,70.0,67.0,55.0,59.0,78.0,78.0,52.0,22.0,25.0,12.0
52,489050,68.0,79.0,21.0,20.0,26.0,27.0,21.0,30.0,8.0,...,78.0,75.0,59.0,70.0,76.0,72.0,72.0,37.0,50.0,28.0
77,489044,78.0,85.0,22.0,21.0,21.0,21.0,9.0,21.0,9.0,...,76.0,75.0,63.0,79.0,89.0,81.0,86.0,21.0,24.0,29.0
78,489043,75.0,78.0,22.0,22.0,22.0,22.0,10.0,22.0,11.0,...,80.0,45.0,66.0,59.0,63.0,54.0,57.0,22.0,31.0,20.0


In [96]:
#This verifies that only the 1 players have the five GK attributes, just as intended
gk_atts = [q for q in list(df.columns) if "gk" in str(q)]
gk_atts

['home_1_gk_diving',
 'home_1_gk_handling',
 'home_1_gk_kicking',
 'home_1_gk_positioning',
 'home_1_gk_reflexes',
 'away_1_gk_diving',
 'away_1_gk_handling',
 'away_1_gk_kicking',
 'away_1_gk_positioning',
 'away_1_gk_reflexes']

In [97]:
df = df.reset_index().drop(columns = "index")

In [98]:
avg_player_stats = pd.DataFrame(df.match_api_id)

In [99]:
for team in ["home", "away"]:
    for att in core_atts:
        cols = [col.strip() for col in list(df.columns) if (f"{team}" in col) & (f"{att}" in col)]
        avg_player_stats[f"avg_{team}_player_{att}"] = df[cols].mean(axis = 1)

In [100]:
avg_player_stats[["avg_home_player_overall_rating", "avg_away_player_overall_rating"]]

Unnamed: 0,avg_home_player_overall_rating,avg_away_player_overall_rating
0,67.909091,73.363636
1,78.818182,70.363636
2,68.272727,74.909091
3,73.727273,79.909091
4,78.000000,71.181818
...,...,...
18846,68.000000,64.545455
18847,59.363636,63.181818
18848,65.727273,67.636364
18849,60.181818,68.363636


In [101]:
#A test to confirm the means were computed correctly
idx = 4 #An example test of index 4

home_rating_test_cols = [col.strip() for col in df.columns if "home" in col and "overall_rating" in col]
print(home_rating_test_cols)

val_list = []
for val in df.loc[idx, home_rating_test_cols]:
    val_list.append(val)
    
print("DataFrame mean:", df.loc[idx, home_rating_test_cols].mean(), "Manual mean:", sum(val_list)/len(val_list)) #Indeed, the averages computed manually and taken
#from the DataFrame are the same! You can verify this works for any index you select (idx = ...)

print(val_list == [val for val in df.loc[idx, home_rating_test_cols]]) #And of course, the manually curated values and the ones
#from the DataFrame are the same

df.loc[idx, home_rating_test_cols]

['home_1_overall_rating', 'home_2_overall_rating', 'home_3_overall_rating', 'home_4_overall_rating', 'home_5_overall_rating', 'home_6_overall_rating', 'home_7_overall_rating', 'home_8_overall_rating', 'home_9_overall_rating', 'home_10_overall_rating', 'home_11_overall_rating']
DataFrame mean: 78.0 Manual mean: 78.0
True


home_1_overall_rating     75.0
home_2_overall_rating     76.0
home_3_overall_rating     85.0
home_4_overall_rating     74.0
home_5_overall_rating     77.0
home_6_overall_rating     72.0
home_7_overall_rating     79.0
home_8_overall_rating     75.0
home_9_overall_rating     85.0
home_10_overall_rating    87.0
home_11_overall_rating    73.0
Name: 4, dtype: float64

In [102]:
avg_player_stats.head()

Unnamed: 0,match_api_id,avg_home_player_overall_rating,avg_home_player_potential,avg_home_player_crossing,avg_home_player_finishing,avg_home_player_heading_accuracy,avg_home_player_short_passing,avg_home_player_volleys,avg_home_player_dribbling,avg_home_player_curve,...,avg_away_player_strength,avg_away_player_long_shots,avg_away_player_aggression,avg_away_player_interceptions,avg_away_player_positioning,avg_away_player_vision,avg_away_player_penalties,avg_away_player_marking,avg_away_player_standing_tackle,avg_away_player_sliding_tackle
0,483134,67.909091,75.454545,51.454545,44.545455,60.727273,61.545455,42.727273,49.636364,46.454545,...,69.0,50.818182,67.181818,68.363636,68.166667,70.363636,70.818182,54.363636,53.909091,54.090909
1,483133,78.818182,85.272727,67.818182,59.0,70.363636,73.363636,58.909091,64.818182,63.181818,...,69.363636,53.272727,64.181818,67.272727,66.083333,70.545455,58.636364,53.272727,54.909091,52.909091
2,489050,68.272727,74.363636,56.454545,50.636364,59.727273,58.454545,49.090909,54.636364,46.0,...,70.181818,60.545455,69.454545,75.818182,74.833333,69.181818,74.363636,53.545455,57.0,54.454545
3,489044,73.727273,78.727273,63.0,53.909091,57.818182,62.272727,52.090909,58.545455,58.545455,...,74.545455,55.090909,70.818182,77.454545,77.333333,73.818182,76.0,54.454545,58.090909,54.545455
4,489043,78.0,84.545455,60.545455,51.454545,64.909091,65.454545,59.181818,66.727273,60.090909,...,66.818182,51.636364,69.0,65.272727,64.75,63.909091,67.636364,53.272727,57.181818,54.545455


In [103]:
#Base DataFrame for adding columns
home_diffs = pd.DataFrame(df.match_api_id)

In [104]:
#Now, for each match, we want to average each specific stat for each home player (average overall rating, then average potential, etc.)

for team in ["home", "away"]:
    for att in core_atts:
        cols = [col.strip() for col in list(df.columns) if (f"{team}" in col) & (f"{att}" in col)]
        home_diffs[f"avg_{team}_player_{att}"] = df[cols].mean(axis = 1)
        if team == "away":
            home_diffs[f"home_diff_avg_player_{att}"] = home_diffs[f"avg_home_player_{att}"] - avg_player_stats[f"avg_away_player_{att}"]
            home_diffs = home_diffs.drop(columns = [f"avg_home_player_{att}", f"avg_away_player_{att}"])


In [105]:
gk_attributes = all_atts[-5:]
gk_attributes

['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']

In [106]:
for col in gk_attributes:
    home_diffs[f"home_diff_{col}"] = (df[f"home_1_{col}"] - df[f"away_1_{col}"])
    
    #These goalkeeper stats were not averaged because they only belong to one player per team, not averaged for each team.
    #I could divide the differences by 11 to scale them with the values that were divided by 11 as part of the averaging process,
    #but the inflation won't perturb XGBoost.

In [107]:
#Now, at last, we have both the joined team stats DataFrame and the aggregated and computed player stats DataFrame.
home_diffs.head()

Unnamed: 0,match_api_id,home_diff_avg_player_overall_rating,home_diff_avg_player_potential,home_diff_avg_player_crossing,home_diff_avg_player_finishing,home_diff_avg_player_heading_accuracy,home_diff_avg_player_short_passing,home_diff_avg_player_volleys,home_diff_avg_player_dribbling,home_diff_avg_player_curve,...,home_diff_avg_player_vision,home_diff_avg_player_penalties,home_diff_avg_player_marking,home_diff_avg_player_standing_tackle,home_diff_avg_player_sliding_tackle,home_diff_gk_diving,home_diff_gk_handling,home_diff_gk_kicking,home_diff_gk_positioning,home_diff_gk_reflexes
0,483134,-5.454545,-4.272727,-7.636364,-2.363636,1.545455,-3.818182,-7.636364,-10.272727,-10.272727,...,-10.818182,-15.636364,-3.0,-2.818182,-1.272727,-78.0,-63.0,-7.0,-62.0,-63.0
1,483133,8.454545,7.272727,12.090909,13.545455,7.727273,11.090909,12.909091,11.909091,7.181818,...,-0.272727,10.0,2.818182,3.454545,1.818182,7.0,-1.0,5.0,-5.0,6.0
2,489050,-6.636364,-4.545455,-4.272727,-5.363636,-7.272727,-9.636364,-11.363636,-1.909091,-13.181818,...,-8.272727,-11.818182,2.0,1.909091,0.454545,-3.0,-13.0,-3.0,-13.0,-10.0
3,489044,-6.181818,-3.727273,6.636364,-5.181818,-8.909091,-3.545455,-9.454545,3.727273,1.272727,...,-8.545455,-10.272727,-2.727273,-3.090909,-1.454545,-9.0,1.0,-6.0,-7.0,-3.0
4,489043,6.818182,8.0,6.909091,4.727273,6.0,3.727273,10.545455,12.727273,14.727273,...,4.454545,2.0,-0.090909,-1.0,6.090909,-11.0,-1.0,4.0,-5.0,-1.0


# Combining DataFrames and Saving .csv

In [108]:
#Final check
home_diffs

Unnamed: 0,match_api_id,home_diff_avg_player_overall_rating,home_diff_avg_player_potential,home_diff_avg_player_crossing,home_diff_avg_player_finishing,home_diff_avg_player_heading_accuracy,home_diff_avg_player_short_passing,home_diff_avg_player_volleys,home_diff_avg_player_dribbling,home_diff_avg_player_curve,...,home_diff_avg_player_vision,home_diff_avg_player_penalties,home_diff_avg_player_marking,home_diff_avg_player_standing_tackle,home_diff_avg_player_sliding_tackle,home_diff_gk_diving,home_diff_gk_handling,home_diff_gk_kicking,home_diff_gk_positioning,home_diff_gk_reflexes
0,483134,-5.454545,-4.272727,-7.636364,-2.363636,1.545455,-3.818182,-7.636364,-10.272727,-10.272727,...,-10.818182,-15.636364,-3.000000,-2.818182,-1.272727,-78.0,-63.0,-7.0,-62.0,-63.0
1,483133,8.454545,7.272727,12.090909,13.545455,7.727273,11.090909,12.909091,11.909091,7.181818,...,-0.272727,10.000000,2.818182,3.454545,1.818182,7.0,-1.0,5.0,-5.0,6.0
2,489050,-6.636364,-4.545455,-4.272727,-5.363636,-7.272727,-9.636364,-11.363636,-1.909091,-13.181818,...,-8.272727,-11.818182,2.000000,1.909091,0.454545,-3.0,-13.0,-3.0,-13.0,-10.0
3,489044,-6.181818,-3.727273,6.636364,-5.181818,-8.909091,-3.545455,-9.454545,3.727273,1.272727,...,-8.545455,-10.272727,-2.727273,-3.090909,-1.454545,-9.0,1.0,-6.0,-7.0,-3.0
4,489043,6.818182,8.000000,6.909091,4.727273,6.000000,3.727273,10.545455,12.727273,14.727273,...,4.454545,2.000000,-0.090909,-1.000000,6.090909,-11.0,-1.0,4.0,-5.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18846,1992225,3.454545,2.545455,1.181818,4.818182,2.181818,3.090909,3.636364,-3.454545,-4.363636,...,1.909091,1.727273,0.727273,3.363636,0.909091,-1.0,3.0,-3.0,4.0,-11.0
18847,1992226,-3.818182,-3.000000,-10.545455,-10.363636,-6.818182,-7.727273,-4.363636,-6.727273,-13.272727,...,-3.636364,-3.454545,2.000000,1.727273,0.181818,-5.0,10.0,5.0,-3.0,2.0
18848,1992227,-1.909091,-4.000000,1.727273,-4.727273,-4.000000,0.818182,-1.909091,-2.545455,-2.909091,...,-2.272727,-0.636364,-2.090909,-2.909091,-3.090909,0.0,7.0,0.0,-5.0,3.0
18849,1992228,-8.181818,-8.272727,-6.818182,-7.363636,-3.545455,-7.363636,-5.000000,-6.545455,-8.545455,...,-9.727273,-7.090909,-10.909091,-7.636364,-5.636364,-7.0,1.0,-22.0,-6.0,-8.0


In [109]:
#Final check
both_stats

Unnamed: 0,match_api_id,season,stage,match_date,Country,League,home_buildUpPlayPositioningClass,home_chanceCreationPositioningClass,home_defenceDefenderLineClass,away_buildUpPlayPositioningClass,...,away_defenceDefenderLineClass,buildUpPlaySpeed_home_diff,buildUpPlayPassing_home_diff,chanceCreationPassing_home_diff,chanceCreationCrossing_home_diff,chanceCreationShooting_home_diff,defencePressure_home_diff,defenceAggression_home_diff,defenceTeamWidth_home_diff,Home_Score_Adv
0,684955,2009/2010,23,2010-02-22,Spain,Spain LIGA BBVA,Free Form,Organised,Offside Trap,Organised,...,Offside Trap,0.0,-5.0,20.0,10.0,0.0,15.0,30.0,10.0,1
1,686171,2009/2010,20,2010-02-22,Portugal,Portugal Liga ZON Sagres,Organised,Organised,Offside Trap,Organised,...,Offside Trap,0.0,0.0,5.0,-15.0,5.0,0.0,0.0,0.0,-1
2,659091,2009/2010,28,2010-02-23,England,England Premier League,Organised,Free Form,Cover,Organised,...,Cover,12.0,15.0,14.0,0.0,15.0,10.0,-20.0,10.0,3
3,704654,2009/2010,17,2010-02-24,Italy,Italy Serie A,Organised,Organised,Offside Trap,Organised,...,Cover,24.0,5.0,0.0,15.0,15.0,-5.0,-25.0,-5.0,1
4,704632,2009/2010,17,2010-02-24,Italy,Italy Serie A,Free Form,Free Form,Offside Trap,Free Form,...,Offside Trap,22.0,0.0,-5.0,15.0,-10.0,15.0,-5.0,0.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19350,1992221,2015/2016,35,2016-05-22,Switzerland,Switzerland Super League,Organised,Organised,Cover,Organised,...,Cover,-8.0,19.0,-12.0,18.0,-20.0,-11.0,-2.0,-6.0,0
19351,1992225,2015/2016,36,2016-05-25,Switzerland,Switzerland Super League,Organised,Organised,Cover,Organised,...,Cover,11.0,-8.0,14.0,12.0,8.0,11.0,15.0,6.0,-1
19352,1992227,2015/2016,36,2016-05-25,Switzerland,Switzerland Super League,Organised,Organised,Cover,Organised,...,Cover,2.0,14.0,-8.0,-10.0,13.0,3.0,6.0,-4.0,0
19353,1992228,2015/2016,36,2016-05-25,Switzerland,Switzerland Super League,Organised,Organised,Cover,Organised,...,Cover,6.0,-29.0,25.0,-30.0,20.0,3.0,11.0,3.0,-3


In [110]:
#Merge
combined = pd.merge(both_stats, home_diffs, on = "match_api_id")
combined.head()

Unnamed: 0,match_api_id,season,stage,match_date,Country,League,home_buildUpPlayPositioningClass,home_chanceCreationPositioningClass,home_defenceDefenderLineClass,away_buildUpPlayPositioningClass,...,home_diff_avg_player_vision,home_diff_avg_player_penalties,home_diff_avg_player_marking,home_diff_avg_player_standing_tackle,home_diff_avg_player_sliding_tackle,home_diff_gk_diving,home_diff_gk_handling,home_diff_gk_kicking,home_diff_gk_positioning,home_diff_gk_reflexes
0,684955,2009/2010,23,2010-02-22,Spain,Spain LIGA BBVA,Free Form,Organised,Offside Trap,Organised,...,8.727273,9.181818,-1.636364,-4.909091,-0.636364,3.0,-5.0,-7.0,13.0,-6.0
1,659091,2009/2010,28,2010-02-23,England,England Premier League,Organised,Free Form,Cover,Organised,...,9.545455,6.727273,7.272727,3.727273,2.909091,-3.0,-3.0,-1.0,-10.0,-8.0
2,704654,2009/2010,17,2010-02-24,Italy,Italy Serie A,Organised,Organised,Offside Trap,Organised,...,0.909091,5.636364,6.181818,8.272727,1.727273,-4.0,0.0,1.0,-2.0,-7.0
3,704632,2009/2010,17,2010-02-24,Italy,Italy Serie A,Free Form,Free Form,Offside Trap,Free Form,...,-3.0,-4.363636,2.636364,3.090909,4.636364,7.0,4.0,-2.0,7.0,5.0
4,674484,2009/2010,24,2010-02-27,Germany,Germany 1. Bundesliga,Organised,Organised,Cover,Organised,...,1.454545,2.454545,-3.454545,-3.909091,-6.181818,-6.0,-1.0,0.0,1.0,-7.0


In [111]:
combined = combined.drop(columns = ["match_api_id", "season", "match_date"])
combined.info()

#Even more records are lost after concatenation--probably because of core misalignments resulting from the
#inherent structure and limits of the data. 16k records is less than I hoped for but plenty for scikit-learn.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16554 entries, 0 to 16553
Data columns (total 53 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   stage                                    16554 non-null  int64  
 1   Country                                  16554 non-null  object 
 2   League                                   16554 non-null  object 
 3   home_buildUpPlayPositioningClass         16554 non-null  object 
 4   home_chanceCreationPositioningClass      16554 non-null  object 
 5   home_defenceDefenderLineClass            16554 non-null  object 
 6   away_buildUpPlayPositioningClass         16554 non-null  object 
 7   away_chanceCreationPositioningClass      16554 non-null  object 
 8   away_defenceDefenderLineClass            16554 non-null  object 
 9   buildUpPlaySpeed_home_diff               16554 non-null  float64
 10  buildUpPlayPassing_home_diff             16554

In [112]:
#On to the next notebook
file_name = "match_predict.csv"
path = DATA_PATH.joinpath(file_name)
combined.to_csv(path, index = False)