In [58]:
import pandas as pd
import numpy as np
import os

# Use the fully-qualified option names. The bare 'max_columns' / 'max_rows' patterns are
# matched as regexes against every registered option, and in pandas >= 1.4 they also hit
# 'styler.render.max_columns' / 'styler.render.max_rows', which makes set_option raise
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)  # None: show every column
pd.set_option('display.max_rows', 2500)

# **Paired xT**

Here we self-join the event data (each event is matched with the event that follows it) to produce a summary statistic for actions that move the ball to another player on the same team.

The summary statistic will be the aggregate sum of xT between pairs of players, and we'll call this *paired xT*.

## **Loading Opta Bayesian xT Data**

In [15]:
%%time

df_opta_xT = pd.read_csv('/Users/christian/Desktop/University/Birkbeck MSc Applied Statistics/Project/Data/Analysis Ready/Opta Bayesian xT/Bayesian_Opta_xT.csv')

# converting the timestamp string to a datetime
df_opta_xT['timeStamp'] = pd.to_datetime(df_opta_xT.timeStamp, format='%Y-%m-%d %H:%M:%S.%f')
df_opta_xT['kickOffDateTime'] = pd.to_datetime(df_opta_xT.kickOffDateTime, format='%Y-%m-%d %H:%M:%S.%f')

print (f'{len(df_opta_xT)} rows loaded.\n')

df_opta_xT.head()

3126182 rows loaded.

CPU times: user 14 s, sys: 1.75 s, total: 15.7 s
Wall time: 16.1 s


Unnamed: 0,competition,season,seasonIndex,gameMonthIndex,matchId,playerId,playerName,position,detailedPosition,playerTeamId,minsPlayed,subIn,subOut,replacedReplacingPlayerId,booking,eventType,eventSubType,eventTypeId,x1,y1,x2,y2,gameTime,timeStamp,periodId,homeTeamName,homeTeamId,awayTeamName,awayTeamId,kickOffDateTime,minute,second,x1_m,y1_m,x2_m,y2_m,xT
0,English Premier League,2017/18,1,24212,918893,59966,Alexandre Lacazette,Forward,Striker,3,95,,,,,attack,Pass,1,50.0,50.7,28.8,30.1,0:1,2017-08-11 19:46:04.968,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,1,52.5,34.476,30.24,20.468,-0.003278
1,English Premier League,2017/18,1,24212,918893,156074,Rob Holding,Defender,FullBack,3,67,,1.0,,,attack,Pass,1,29.7,26.7,52.3,21.5,0:2,2017-08-11 19:46:05.554,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,2,31.185,18.156,54.915,14.62,0.003008
2,English Premier League,2017/18,1,24212,918893,37605,Mesut Özil,Forward,AttackingMidfielder,3,95,,,,,attack,Pass,1,52.8,21.3,44.3,20.7,0:5,2017-08-11 19:46:08.554,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,5,55.44,14.484,46.515,14.076,-0.001186
3,English Premier League,2017/18,1,24212,918893,153256,Mohamed Elneny,Midfielder,CentralMidfielder,3,66,,1.0,,,attack,Pass,1,44.0,19.6,50.3,4.2,0:7,2017-08-11 19:46:10.554,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,7,46.2,13.328,52.815,2.856,-0.000542
4,English Premier League,2017/18,1,24212,918893,98745,Héctor Bellerín,Midfielder,RightMidfielder,3,95,,,,,attack,Pass,1,51.0,4.2,70.5,5.0,0:9,2017-08-11 19:46:13.519,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,9,53.55,2.856,74.025,3.4,0.00585


## **Opta Pass Events**

In [23]:
# event sub-types treated as successful passes (crosses included)
opta_successful_pass_events = [
    '2nd Assist',
    'Assist',
    'Chance Created',
    'Cross',
    'Pass',
]

# event sub-types treated as failed passes
opta_failed_pass_events = [
    'Failed Pass',
    'Offside Pass',
]

## **Producing Pairs**

In [89]:
%%time

# really we just want to filter on all attack events, and then look for when a (successful) pass is followed up by a subsequent attack event by the same team.
df_attack = df_opta_xT.loc[df_opta_xT['eventType'] == 'attack'].sort_values(['seasonIndex','competition','matchId','timeStamp'], ascending=[True, True, True, True]).reset_index(drop=True)

# initialising two new columns: 1) receiver player and 2) receiver player id
df_attack['receiverPlayerName'] = None
df_attack['receiverPlayerId'] = None

# logic: different players, same match, same half, first event is a successful pass-like action, and the follow on event is still an attack (which is by the same team)
df_attack.loc[((df_attack['playerId'] != df_attack['playerId'].shift(-1)) & (df_attack['matchId'] == df_attack['matchId'].shift(-1)) & (df_attack['periodId'] == df_attack['periodId'].shift(-1)) & (df_attack['eventSubType'].isin(opta_successful_pass_events)) & (df_attack['eventType'] == 'attack')), 'receiverPlayerName'] = df_attack['playerName'].shift(-1)
df_attack.loc[((df_attack['playerId'] != df_attack['playerId'].shift(-1)) & (df_attack['matchId'] == df_attack['matchId'].shift(-1)) & (df_attack['periodId'] == df_attack['periodId'].shift(-1)) & (df_attack['eventSubType'].isin(opta_successful_pass_events)) & (df_attack['eventType'] == 'attack')), 'receiverPlayerId'] = df_attack['playerId'].shift(-1)

# getting rid of Nones where the pass was not successfully mapped
df_attack = df_attack.loc[pd.isna(df_attack['receiverPlayerName']) == False].reset_index(drop=True).copy()
df_attack['receiverPlayerId'] = df_attack.receiverPlayerId.astype(int)

# and now creating pair objects to group over (this is the bit that takes a little bit of time)
df_attack['pairName'] = df_attack.apply(lambda x: ('-').join(sorted([x.receiverPlayerName] + [x.playerName])), axis=1)
df_attack['pairIds'] = df_attack.apply(lambda x: sorted([x.receiverPlayerId] + [x.playerId]), axis=1)
df_attack['playerA'] = df_attack.pairIds.apply(lambda x: min(x))
df_attack['playerB'] = df_attack.pairIds.apply(lambda x: max(x))

CPU times: user 1min 11s, sys: 3.37 s, total: 1min 15s
Wall time: 1min 15s


## **Producing Pair Statistics**

In [102]:
# one frame of mapped passes per competition, each re-indexed from zero
df_epl = df_attack[df_attack['competition'].eq('English Premier League')].reset_index(drop=True)
df_cl = df_attack[df_attack['competition'].eq('Champions League')].reset_index(drop=True)

### Directional Pairs

In [91]:
# Disabled cell: the triple-quoted string below wraps the directional (passer -> receiver)
# aggregation so that it is not executed; the cell only evaluates to the string itself.
"""
df_pairs = df_epl.groupby(['competition','season','playerName','playerId','receiverPlayerName','receiverPlayerId'])\
            .agg({'xT':np.sum,'x1':'count'})\
            .reset_index()\
            .rename(columns={'x1':'numActions'})

df_pairs.loc[df_pairs['numActions'] > 50].sort_values(['xT'], ascending=[False]).head()
"""

"\ndf_pairs = df_epl.groupby(['competition','season','playerName','playerId','receiverPlayerName','receiverPlayerId'])            .agg({'xT':np.sum,'x1':'count'})            .reset_index()            .rename(columns={'x1':'numActions'})\n\ndf_pairs.loc[df_pairs['numActions'] > 50].sort_values(['xT'], ascending=[False]).head()\n"

### Directionless pairs: pure chemistry between pairs of players

#### 1. Premier League

In [131]:
# Total xT and number of mapped actions for every (team, season, pair) combination.
# The string alias 'sum' replaces np.sum: passing NumPy callables to groupby.agg is
# deprecated in pandas 2.x and emits a FutureWarning.
df_epl_pairs = df_epl.groupby(['competition','playerTeamId','season','pairName'])\
            .agg({'xT':'sum','x1':'count'})\
            .reset_index()\
            .rename(columns={'x1':'numActions'})

# season-by-season rank (1 = highest paired xT); cumcount is computed on the sorted frame and
# written back via index alignment, so df_epl_pairs itself keeps its original row order
df_epl_pairs['seasonRank'] = df_epl_pairs.sort_values(['season','xT'], ascending=[True, False]).groupby('season').cumcount() + 1

# top 20 pairs of the 2020/21 season, in rank order
df_epl_pairs.loc[(df_epl_pairs['seasonRank'] <= 20) & (df_epl_pairs['season'].isin(['2020/21']))].sort_values(['season', 'seasonRank'], ascending=[True, True]).reset_index(drop=True)

Unnamed: 0,competition,playerTeamId,season,pairName,xT,numActions,seasonRank
0,English Premier League,14,2020/21,Andrew Robertson-Sadio Mané,2.433447,567,1
1,English Premier League,14,2020/21,Mohamed Salah-Sadio Mané,2.349852,125,2
2,English Premier League,20,2020/21,James Ward-Prowse-Jannik Vestergaard,2.235552,421,3
3,English Premier League,90,2020/21,Ashley Westwood-James Tarkowski,2.162142,368,4
4,English Premier League,31,2020/21,Andros Townsend-Christian Benteke,2.107945,76,5
5,English Premier League,6,2020/21,Harry Kane-Son Heung-Min,1.950966,182,6
6,English Premier League,14,2020/21,Andrew Robertson-Roberto Firmino,1.950019,357,7
7,English Premier League,21,2020/21,Aaron Cresswell-Tomas Soucek,1.817882,217,8
8,English Premier League,90,2020/21,Dwight McNeil-James Tarkowski,1.809613,84,9
9,English Premier League,90,2020/21,Ashley Westwood-Chris Wood,1.798031,171,10


#### 2. Champions League

In [97]:
# Total xT and number of mapped actions for every (season, pair) combination.
# The string alias 'sum' replaces np.sum: passing NumPy callables to groupby.agg is
# deprecated in pandas 2.x and emits a FutureWarning.
df_cl_pairs = df_cl.groupby(['competition','season','pairName'])\
            .agg({'xT':'sum','x1':'count'})\
            .reset_index()\
            .rename(columns={'x1':'numActions'})

# season-by-season rank (1 = highest paired xT); cumcount is computed on the sorted frame and
# written back via index alignment, so df_cl_pairs itself keeps its original row order
df_cl_pairs['seasonRank'] = df_cl_pairs.sort_values(['season','xT'], ascending=[True, False]).groupby('season').cumcount() + 1

# top 20 pairs of every season, in rank order
df_cl_pairs.loc[df_cl_pairs['seasonRank'] <= 20].sort_values(['season', 'seasonRank'], ascending=[True, True]).reset_index(drop=True)

Unnamed: 0,competition,season,pairName,xT,numActions,seasonRank
0,Champions League,2017/18,Joshua Kimmich-Robert Lewandowski,1.108728,46,1
1,Champions League,2017/18,Mohamed Salah-Roberto Firmino,0.869003,82,2
2,Champions League,2017/18,Cristiano Ronaldo-Marcelo,0.783347,109,3
3,Champions League,2017/18,Cesc Fàbregas-Jan Oblak,0.736201,6,4
4,Champions League,2017/18,Anthony Martial-Romelu Lukaku,0.730775,19,5
5,Champions League,2017/18,Clément Lenglet-Éver Banega,0.729998,138,6
6,Champions League,2017/18,Cristiano Ronaldo-Isco,0.7128,73,7
7,Champions League,2017/18,Aleksandar Kolarov-Edin Dzeko,0.700249,68,8
8,Champions League,2017/18,Alex Telles-Iván Marcano,0.665525,90,9
9,Champions League,2017/18,Edin Dzeko-Stephan El Shaarawy,0.611083,18,10


## **Interesting Metric! Team Pair Concentration**

> How much xT is concentrated between pairs?

> Do teams share the load or do they create threat between just a few combinations of players?

In [136]:
# there's a small amount of noise in the mapping of these actions, so putting a min filter in there to get rid of that
# There is a small amount of noise in the mapping of these actions, so pairs with fewer than
# MIN_PAIR_ACTIONS mapped actions are dropped. The SAME filtered frame feeds both the team
# totals and the per-pair rows: previously only the totals were filtered, so noisy pairs were
# still ranked and a team's shares did not add up to 1.
MIN_PAIR_ACTIONS = 5
df_epl_pairs_filtered = df_epl_pairs.loc[df_epl_pairs['numActions'] >= MIN_PAIR_ACTIONS].reset_index(drop=True)

# team-level totals per season (the denominators); 'sum' replaces the deprecated np.sum callable
df_epl_denom = df_epl_pairs_filtered\
                .groupby(['competition','season','playerTeamId'])\
                .agg({'xT':'sum','numActions':'sum'})\
                .reset_index()\
                .rename(columns={'xT':'xT_Team','numActions':'numActions_Team'})

# attach the team totals to every remaining pair
df_epl_conc = df_epl_pairs_filtered.merge(df_epl_denom, how='inner', on=['competition','season','playerTeamId'])

# each pair's share of its team's xT and of its team's mapped actions
df_epl_conc['xT_Prop'] = df_epl_conc.xT / df_epl_conc.xT_Team
df_epl_conc['numActions_Prop'] = df_epl_conc.numActions / df_epl_conc.numActions_Team

# league-wide rank of the xT share within each season (1 = most concentrated pair)
df_epl_conc['xT_Prop_Rank'] = df_epl_conc.sort_values(['season','xT_Prop'], ascending=[True,False]).groupby('season').cumcount() + 1

In [149]:
# the 20 pairs with the largest share of their team's xT, for each of the four seasons
(
    df_epl_conc[
        df_epl_conc['xT_Prop_Rank'].le(20)
        & df_epl_conc['season'].isin(['2020/21','2019/20','2018/19','2017/18'])
    ]
    .sort_values(['season','xT_Prop_Rank'])
    .reset_index(drop=True)
)

Unnamed: 0,competition,playerTeamId,season,pairName,xT,numActions,seasonRank,xT_Team,numActions_Team,xT_Prop,numActions_Prop,xT_Prop_Rank
0,English Premier League,36,2017/18,Pascal Groß-Shane Duffy,2.426302,93,2,22.919044,11213,0.105864,0.008294,1
1,English Premier League,36,2017/18,Lewis Dunk-Pascal Groß,2.055232,93,3,22.919044,11213,0.089674,0.008294,2
2,English Premier League,38,2017/18,Florent Hadergjonaj-Steve Mounie,1.28606,89,33,16.321101,10164,0.078797,0.008756,3
3,English Premier League,6,2017/18,Christian Eriksen-Harry Kane,2.495666,197,1,32.870777,18802,0.075924,0.010478,4
4,English Premier League,31,2017/18,Andros Townsend-Christian Benteke,2.02199,103,4,27.797693,11366,0.072739,0.009062,5
5,English Premier League,13,2017/18,Harry Maguire-Riyad Mahrez,1.847349,136,6,25.785822,11930,0.071642,0.0114,6
6,English Premier League,57,2017/18,José Holebas-Richarlison,1.872073,214,5,27.90319,11943,0.067092,0.017918,7
7,English Premier League,110,2017/18,Ryan Shawcross-Xherdan Shaqiri,1.485159,74,21,24.199793,8544,0.061371,0.008661,8
8,English Premier League,13,2017/18,Harry Maguire-Marc Albrighton,1.514876,162,17,25.785822,11930,0.058748,0.013579,9
9,English Premier League,43,2017/18,Kevin De Bruyne-Raheem Sterling,1.71392,401,9,29.50293,25658,0.058093,0.015629,10


### **Kane-Son**

> Deeply impacted by loss of Christian Eriksen

In [152]:
# season-by-season rows for the Kane-Son and Eriksen-Kane pairs
(
    df_epl_conc[
        df_epl_conc['season'].isin(['2020/21','2019/20','2018/19','2017/18'])
        & (
            df_epl_conc['pairName'].str.contains('Harry Kane-Son Heung-Min')
            | df_epl_conc['pairName'].str.contains('Christian Eriksen-Harry Kane')
        )
    ]
    .sort_values(['season','xT_Prop_Rank'])
    .reset_index(drop=True)
)

Unnamed: 0,competition,playerTeamId,season,pairName,xT,numActions,seasonRank,xT_Team,numActions_Team,xT_Prop,numActions_Prop,xT_Prop_Rank
0,English Premier League,6,2017/18,Christian Eriksen-Harry Kane,2.495666,197,1,32.870777,18802,0.075924,0.010478,4
1,English Premier League,6,2017/18,Harry Kane-Son Heung-Min,0.332124,81,541,32.870777,18802,0.010104,0.004308,763
2,English Premier League,6,2018/19,Christian Eriksen-Harry Kane,1.248848,120,49,27.792189,18351,0.044935,0.006539,44
3,English Premier League,6,2018/19,Harry Kane-Son Heung-Min,0.347044,83,526,27.792189,18351,0.012487,0.004523,508
4,English Premier League,6,2019/20,Harry Kane-Son Heung-Min,1.054963,98,45,19.434139,15651,0.054284,0.006262,17
5,English Premier League,6,2019/20,Christian Eriksen-Harry Kane,0.488594,60,286,19.434139,15651,0.025141,0.003834,147
6,English Premier League,6,2020/21,Harry Kane-Son Heung-Min,1.950966,182,6,20.014417,16149,0.097478,0.01127,2


## Way of showing this off in report:

1. Provide tables per season (maybe top 5 pairs per season)
2. Show the specific changes at Spurs using grouped (rather than stacked) bar charts for Kane: one bar per partner (one for Eriksen, one for Son, plus a couple of others), repeated for each of the four seasons.