**3. Calculating metrics for passes**

The following tasks are taken into account in this notebook:

1. Cluster the defender lineups into sub categories based on the footedness pattern of the defender lineup starting from the right back (RB) position

    For example, **right-right-right-left (rrrl)** category indicates that this is a lineup of four defenders where -

    **right back (RB) is right footed**

    **right center back (RCB) is right footed**

    **left center back (LCB) is right footed**

    **left back (LB) is left footed**

2. Compute multiple passing based attributes for defenders for each match using match lineup data (from **match+def_lineup+footedness_ver2.pkl**) and events data (from **events_v2.pkl**)

The following are the resulting pickle files:

1. Cluster-wise pickle files (one per footedness pattern) containing the passing attributes of each defender for each match




In [1]:
import pandas as pd
import numpy as np
from unidecode import unidecode
from tqdm import tqdm
import re
from difflib import SequenceMatcher
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns",1000)

**Loading pickle file with Premier League 2017-18 events data (along with player roles i.e. whether the player is a goalkeeper (GKP), defender (DEF), midfielder (MID) or forward (FWD))**

In [2]:
# Event log for the league season; the frame also carries a per-player role column.
events_pickle_path = "../data/events/events_v2.pkl"
df_events_roles = pd.read_pickle(events_pickle_path)

**Loading the pickle file with defence lineup information for each team participating in a particular match.**

In [3]:
# Per-match defensive lineups together with each lineup's footedness pattern.
lineup_pickle_path = "../data/matches/match+def_lineup+footedness_ver2.pkl"
df_defence_footed = pd.read_pickle(lineup_pickle_path)

**Observing the unique footedness categories in the dataframe**

In [4]:
# Distinct footedness patterns present in the lineup data.
footedness_column = df_defence_footed["footedness"]
footedness_patterns = footedness_column.unique()

**Renaming certain positional columns for better understanding**

In [5]:
# Replace the hyphen in the centre-back column names with an underscore.
centre_back_renames = {'R-CB': 'R_CB', 'L-CB': 'L_CB'}
df_defence_footed.rename(columns=centre_back_renames, inplace=True)

In [6]:
# Preview the first five rows of the events data.
df_events_roles.head(n=5)

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,wyId,foot,playerName,role
0,8,Simple pass,[Accurate],25413,"[[50.96, 34.68], [32.24, 14.96]]",2499719,Pass,1609,1H,2.758649,85,177959171,25413.0,right,AlexandreLacazette,FWD
1,1,Air duel,"[Lost, Not accurate]",25413,"[[73.84, 31.28], [52.0, 27.88]]",2499719,Duel,1609,1H,22.551816,10,177959181,25413.0,right,AlexandreLacazette,FWD
2,10,Shot,"[Goal, Right foot, Opportunity, Position: Goal...",25413,"[[91.52, 40.12], [0.0, 68.0]]",2499719,Shot,1609,1H,94.595788,100,177959212,25413.0,right,AlexandreLacazette,FWD
3,8,Head pass,[Not accurate],25413,"[[73.84, 20.4], [73.84, 25.16]]",2499719,Pass,1609,1H,397.881307,82,177959276,25413.0,right,AlexandreLacazette,FWD
4,1,Ground defending duel,"[Take on left, Lost, Not accurate]",25413,"[[63.44, 24.48], [53.04, 8.84]]",2499719,Duel,1609,1H,494.461238,12,177959303,25413.0,right,AlexandreLacazette,FWD


**Filtering out pass data for defenders and finding league wise total passes and total accurate passes for defenders**

In [7]:
# Keep only the pass events made by defenders.
# Both conditions are evaluated on the same frame and combined into a single
# mask, instead of chaining a second .loc whose mask was built from the
# unfiltered frame (which only works through implicit index alignment).
# na=False treats a missing event name as "not a pass" rather than producing
# a mask that contains NaN.
is_pass_event = df_events_roles['eventName'].str.contains('Pass', na=False)
is_defender = df_events_roles['role'] == 'DEF'
df_events_pass = df_events_roles.loc[is_pass_event & is_defender]

In [8]:
# League-wide pass totals for defenders, keyed by metric name.
league_pass_info = {'totalpasses': len(df_events_pass)}

In [9]:
# df_events_pass['playerName'].value_counts()

In [10]:
# Count the defender passes whose tag collection includes "Accurate".
accurate_pass_count = sum(1 for pass_tags in df_events_pass['tags'] if "Accurate" in pass_tags)
league_pass_info['totalaccuratepasses'] = accurate_pass_count

In [11]:
# Display the league-wide defender pass totals.
league_pass_info

{'totalpasses': 138538, 'totalaccuratepasses': 114943}

**Creating separate dataframes for each defensive lineup category, based on the preferred foot of each defender**

In [12]:
# footedness_patterns

In [13]:
def _lineups_with_pattern(pattern):
    """Return the rows of df_defence_footed whose 'footedness' equals `pattern`."""
    return df_defence_footed.loc[df_defence_footed['footedness'] == pattern]


# One dataframe per footedness pattern; the variable suffix is the pattern's
# initials (r = right, l = left) in the same order as the pattern string.
df_rrrl = _lineups_with_pattern('right-right-right-left')
df_rrll = _lineups_with_pattern('right-right-left-left')
df_rrl = _lineups_with_pattern('right-right-left')
df_rrr = _lineups_with_pattern('right-right-right')
df_rll = _lineups_with_pattern('right-left-left')
df_rrrll = _lineups_with_pattern('right-right-right-left-left')
df_rrlr = _lineups_with_pattern('right-right-left-right')
df_rrrr = _lineups_with_pattern('right-right-right-right')
df_rrrrl = _lineups_with_pattern('right-right-right-right-left')
df_rlr = _lineups_with_pattern('right-left-right')
df_rrrlr = _lineups_with_pattern('right-right-right-left-right')
df_rrlll = _lineups_with_pattern('right-right-left-left-left')
df_rlll = _lineups_with_pattern('right-left-left-left')

**Creating a list of such dataframes**

In [14]:
# Per-pattern lineup frames collected in a fixed order; later cells pair this
# order positionally with a list of pattern abbreviations.
df_clusters = [
    df_rrrl,
    df_rrll,
    df_rrl,
    df_rrr,
    df_rll,
    df_rrrll,
    df_rrlr,
    df_rrrr,
    df_rrrrl,
    df_rlr,
    df_rrrlr,
    df_rrlll,
    df_rlll,
]

**Creating a dictionary that maps player names as they appear in the parsed Premier League lineup data to the corresponding names in the events data, for players whose names differ between the two sources**

In [15]:
# Lineup-data player name -> name used for the lookup in the events data.
# Only players whose names differ between the two sources are listed.
player_map = {
    "RamiroFunesMori": "JoseRamiroFunesMori",
    "KurtZouma": "KurtHappyZouma",
    "Danilo": "DaniloLuizdaSilva",
    "CesarAzpilicueta": "CesarAzpilicuetaTanco",
    "EzequielSchelotto": "MatiasEzequielSchelotto",
    "GaetanBong": "GaetanBongSongo",
    "HectorBellerin": "HectorBellerinMoruno",
    "AhmedHegazi": "AhmedHegazy",
    "JamaalLascelles": "JamalLascelles",
    "AngelRangel": "AngelRangelZaragoza",
    "Zanka": "MathiasJattahNjieJorgensen",
    "EricBailly": "EricBertrandBailly",
    "MarcosRojo": "FaustinoMarcosAlbertoRojo",
    "AngeloOgbonna": "AngeloObinzeOgbonna",
    "DavinsonSanchez": "DavinsonSanchezMina",
    "JavierManquillo": "JavierManquilloGaitan",
    "TommySmith": "TomSmith",
    "Bruno": "BrunoSaltorGrau",
    "JosephGomez": "JoeGomez",
    "AlbertoMoreno": "AlbertoMorenoPerez",
    "LuisAntonioValencia": "LuisAntonioValenciaMosquera",
    "NicolasOtamendi": "NicolasHernanOtamendi",
    "NachoMonreal": "IgnacioMonrealEraso",
    "CedricSoares": "CedricRicardoAlvesSoares",
    "JoelMatip": "JoelAndreJobMatip",
    "MiguelBritos": "MiguelAngelBritosCabrera",
    "VictorLindelof": "VictorNilssonLindelof",
    "JamesCollins": "JamesMichaelCollins",
    "CucoMartina": "RhuendlyMartina",
    "DavidLuiz": "DavidLuizMoreiraMarinho",
    "ChancelMbemba": "ChancelMbembaMangulu",
    "PabloZabaleta": "PabloJavierZabaletaGirod",
    "KikoFemenia": "FranciscoFemeniaFar",
    "JoseFonte": "JoseMigueldaRochaFonte",
    "JesusGamez": "JesusGamezDuarte",
}


**Creating a metrics collection function that takes in x (match_id) and y (player name) and returns the following metrics-**

**numpasses** - number of passes made by the player in the queried match

**numaccpasses** - number of accurate passes made by the player in the queried match

**numhighpasses** - number of high (aerial) passes made by the player in the queried match

**numhighaccpasses** - number of high (aerial) accurate passes made by the player in the queried match

**accpasslocs** - starting and ending coordinates of all the accurate passes made by the player in the queried match

**inaccpasslocs** - starting and ending coordinates of all the inaccurate passes made by the player in the queried match

**acchighpasslocs** - starting and ending coordinates of all the accurate high passes made by the player in the queried match

**inacchighpasslocs** - starting and ending coordinates of all the inaccurate high passes made by the player in the queried match

In [16]:
def getmetrics(x, y, events=None, name_map=None):
    """Collect passing metrics for one player in one match.

    Parameters
    ----------
    x : int-like
        Match id; converted with ``int()`` and compared with the ``matchId``
        column of the events frame.
    y : str
        Player name written without separators (e.g. ``"JoeGomez"``). It is
        first translated through ``name_map`` and then split at capital
        letters; a row matches when its ``playerName`` contains the last two
        name parts (or the only part, when the name has a single part).
    events : pandas.DataFrame, optional
        Pass events to search. Must provide the columns ``playerName``,
        ``matchId``, ``tags``, ``subEventName`` and ``positions``.
        Defaults to the notebook-level ``df_events_pass``.
    name_map : dict, optional
        Mapping applied to ``y`` before matching. Defaults to the
        notebook-level ``player_map``.

    Returns
    -------
    list
        ``[numpasses, numaccpasses, numhighpasses, numhighaccpasses,
        accpasslocs, inaccpasslocs, acchighpasslocs, inacchighpasslocs]``;
        the first four entries are ints, the last four are lists of the
        ``positions`` values of the corresponding passes.

    Raises
    ------
    TypeError
        If ``y`` (after mapping) is not a string.
    """
    if events is None:
        events = df_events_pass
    if name_map is None:
        name_map = player_map

    # Fall back to the given name when no alternative spelling is registered.
    y = name_map.get(y, y)
    name_parts = re.findall('[A-Z][^A-Z]*', y)

    selected = events['matchId'] == int(x)
    if name_parts:
        # Name fragments are matched literally (regex=False) so characters
        # such as "." in a name are not interpreted as regex syntax, and rows
        # with a missing player name never match (na=False).
        for part in name_parts[-2:]:
            selected = selected & events['playerName'].str.contains(part, regex=False, na=False)
    else:
        # A name without any capital letter yields no parts: match nothing.
        selected = selected & False

    pass_df = events.loc[selected]

    # Compute each row-wise condition once and reuse it for counts and locations.
    is_accurate = pass_df['tags'].apply(lambda a: "Accurate" in a).astype(bool)
    is_inaccurate = pass_df['tags'].apply(lambda a: "Not accurate" in a).astype(bool)
    is_high = pass_df['subEventName'] == 'High pass'
    locations = pass_df['positions']

    numpasses = len(pass_df)
    numaccpasses = int(is_accurate.sum())
    numhighpasses = int(is_high.sum())
    numhighaccpasses = int((is_high & is_accurate).sum())
    accpasslocs = locations[is_accurate].tolist()
    inaccpasslocs = locations[is_inaccurate].tolist()
    acchighpasslocs = locations[is_high & is_accurate].tolist()
    inacchighpasslocs = locations[is_high & is_inaccurate].tolist()
    return [numpasses,numaccpasses,numhighpasses,numhighaccpasses,accpasslocs,inaccpasslocs,acchighpasslocs,inacchighpasslocs]


In [17]:
# getmetrics(2500081,"Bruno")

In [18]:
# One "<position>_all" column per defender slot that can occur in a lineup.
defender_slots = ('RB', 'R_CB', 'L_CB', 'LB', 'RCB', 'CB', 'LCB', 'RWB', 'LWB')
new_cols = [f'{slot}_all' for slot in defender_slots]

**Collecting metrics for each defender location for various clusters**

In [19]:
# R_CB - right center back in a 4 defender formation
# RCB  - right center back in a 3 or 5 defender formation
# L_CB - left center back in a 4 defender formation
# LCB  - left center back in a 3 or 5 defender formation
#
# Defender slots to evaluate for each backline size.
slots_by_backline = {
    4.0: ['RB', 'R_CB', 'L_CB', 'LB'],
    3.0: ['RCB', 'CB', 'LCB'],
    5.0: ['RWB', 'RCB', 'CB', 'LCB', 'LWB'],
}

df_clusters_updated = list()
for cluster in tqdm(df_clusters):
    # Add the (initially empty) "<slot>_all" columns for every possible slot.
    cluster = cluster.reindex(columns=cluster.columns.tolist() + new_cols)

    # The backline size is read from the first row only.
    # NOTE(review): assumes every row of a footedness cluster shares it - confirm.
    slots = slots_by_backline.get(cluster.iloc[0]['backline'])
    if slots is None:
        # Clusters with any other backline size are left out of the result.
        continue

    for slot in slots:
        # Each cell receives the full metrics list returned by getmetrics.
        cluster[f'{slot}_all'] = cluster.apply(
            lambda row, slot=slot: getmetrics(row.wyId, row[slot]), axis=1
        )
    df_clusters_updated.append(cluster)

100%|██████████| 13/13 [07:58<00:00, 36.81s/it]


In [20]:
# df_players.loc[df_players['playerName'].str.contains("DavidLuizMoreiraMarinho")]

In [21]:
# df_clusters_updated[6].loc[df_clusters_updated[6]['RB_all'].apply(lambda p : p[0]==0)]

**Splitting the metrics into individual columns**

In [22]:
# Column suffixes for the eight values packed into every "<slot>_all" cell,
# in the order getmetrics returns them.
metric_suffixes = ['pass', 'accpass', 'highpass', 'acchighpass',
                   'accpassloc', 'inaccpassloc', 'acchighpassloc', 'inacchighpassloc']

# Slots to unpack per backline size; the order fixes the output column order.
split_slots_by_backline = {
    4.0: ['RB', 'R_CB', 'L_CB', 'LB'],
    3.0: ['RCB', 'CB', 'LCB'],
    5.0: ['RCB', 'CB', 'LCB', 'RWB', 'LWB'],
}

# Packed helper columns that are removed once their content is unpacked.
packed_metric_cols = ['RB_all', 'R_CB_all', 'L_CB_all', 'LB_all', 'RCB_all',
                      'LCB_all', 'CB_all', 'RWB_all', 'LWB_all']

df_clusters_metrics = list()
for packed in tqdm(df_clusters_updated):
    slots = split_slots_by_backline.get(packed.iloc[0]['backline'])
    if slots is None:
        # Clusters with any other backline size are left out of the result.
        continue

    # Work on a copy so the frames in df_clusters_updated keep their
    # "<slot>_all" columns; this keeps the cell re-runnable (dropping them in
    # place made a second run fail with a KeyError).
    unpacked = packed.copy()
    for slot in slots:
        metric_cols = [f'{slot}_{suffix}' for suffix in metric_suffixes]
        unpacked[metric_cols] = pd.DataFrame(unpacked[f'{slot}_all'].to_list(), index=unpacked.index)

    df_clusters_metrics.append(unpacked.drop(columns=packed_metric_cols))

100%|██████████| 13/13 [00:00<00:00, 32.23it/s]


In [24]:
# Pattern abbreviations, in the same order in which the cluster frames were built.
names = ['rrrl','rrll','rrl','rrr','rll','rrrll','rrlr','rrrr','rrrrl','rlr','rrrlr','rrlll','rlll']

# Frames and names are paired purely by position. If a cluster was left out
# upstream, every later frame would be written under the wrong pattern name,
# so refuse to write anything when the two lists do not line up.
if len(names) != len(df_clusters_metrics):
    raise ValueError(
        f'expected {len(names)} cluster dataframes, got {len(df_clusters_metrics)}; '
        'cluster names would not line up with the data'
    )

for name, df in zip(names, df_clusters_metrics):
    df.to_pickle(f'../data/clusters/clusters_v3/cluster_{name}.pkl')

In [6]:
# df_events_pass.loc[(df_events_pass['matchId']==2500065) & (df_events_pass['playerName'].str.contains('FedericoFernandez'))]