In [1]:
# Created: Ben Griffis (@BeGriffis on Twitter)
### apologies, but the full-season files for ISL & WSL are too large for GitHub.
### You can get StatsBomb's data several ways. This explains how to get it in mplsoccer:
##### https://mplsoccer.readthedocs.io/en/latest/gallery/statsbomb/plot_statsbomb_data.html

import pandas as pd

In [2]:
# Load our dataset. For this example code, I'll be using StatsBomb's 21/22 Indian Super League
df = pd.read_csv('ISL 21-22.csv')

# This will filter the data to only show  open-play passes
df = df[(df['type_name']=='Pass') & (df['sub_type_name'].isna())].reset_index(drop=True)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Get a list of all players in the league
players = df.player_name.unique().tolist()

# Create a blank df we will populate with all the passing connections
connection_df = pd.DataFrame(columns=['pass_recipient_name', 'passes', 'passer', 'completions'])

# Iterate the creation of our full connection_df
for i in range(len(players)):
    # create a new df with just one focal player
    df1 = df[df['player_name']==players[i]]
    
    # count up all that focal player's receivers & make it a pandas df
    a = df1.groupby(['pass_recipient_name'])['pass_recipient_name'].count()
    a = pd.DataFrame(a)
    a = a.rename(columns={"pass_recipient_name": "passes"}).reset_index()
    
    # make sure we add in our focal player's name!
    a['passer'] = players[i]
    
    # now we sum up their total completions by receiver
    df1['completions'] = df1.outcome_name.isnull().groupby(df1['pass_recipient_name']).transform('sum')
    b = df1.groupby(['pass_recipient_name'])['completions'].mean().astype(int)
    b = pd.DataFrame(b)
    b = b.rename(columns={"pass_recipient_name": "completions"}).reset_index()
    
    # and finally, we merge our pass attempts and completions & add that receiver to the connection_df
    c = a.merge(b)
    connection_df = connection_df.append(c)
    
# Clean up the index and calculate the completion % for each passer-receiver pair
connection_df.reset_index(drop=True, inplace=True)
connection_df['completion_pct'] = connection_df['completions'] / connection_df['passes']
connection_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['completions'] = df1.outcome_name.isnull().groupby(df1['pass_recipient_name']).transform('sum')


Unnamed: 0,pass_recipient_name,passes,passer,completions,completion_pct
0,Abhishek Dhananjay Suryavanshi,4,Pritam Kotal,4,1.0
1,Amrinder Singh,80,Pritam Kotal,80,1.0
2,Ashutosh Mehta,47,Pritam Kotal,41,0.87234
3,Bidyananda Singh Ningthoujam,3,Pritam Kotal,3,1.0
4,Carl Gerard McHugh,94,Pritam Kotal,90,0.957447
...,...,...,...,...,...
5002,Makan Winkle Chote,1,Hrithik Tiwari,0,0.0
5003,Sanson Pereira,1,Hrithik Tiwari,1,1.0
5004,Seriton Fernandes,1,Hrithik Tiwari,1,1.0
5005,Brison Fernandes,2,Danstan Randall Fernandes,2,1.0


In [4]:
# Fliter out connections with fewer than 10 passes
connection_df = connection_df[connection_df['passes']>=10].reset_index(drop=True)

# Create a blank df we'll use as the base of our ARR data
testing_df = pd.DataFrame(columns=['pass_recipient_name', 'exclusion', 'mean_completion_pct'])

# Create lists with the unique passers and receivers
all_recipients = connection_df.pass_recipient_name.unique().tolist()
all_passers = connection_df.passer.unique().tolist()

# This is where I KNOW I could make the code more efficient...
# Iterate through every single unique recevier to show their receiving % from all passers ('None') and the focal passer (deemed 'exclusion')
for i in range(len(all_recipients)):
    
    # this first for loop creates their total pass received %
    focal_p = all_recipients[i]
    focaldf = connection_df[connection_df['pass_recipient_name']==focal_p]
    mu = focaldf.completions.sum()/focaldf.passes.sum()
    ex = 'None'
    row = pd.DataFrame({'pass_recipient_name':[focal_p], 'exclusion':[ex], 'mean_completion_pct':[mu]})
    testing_df = testing_df.append(row)

    for j in range(len(all_passers)):
        
        # this for loop creates their pass received % EXCLUDING each player
        # I hnow this part could be more efficient since it also excludes players not on the same team....... oh well! We'll filter those out later
        ex2 = all_passers[j]
        focaldf2 = connection_df[(connection_df['pass_recipient_name']==focal_p) & (connection_df['passer']!=ex2)]
        
        # this try/except basically says, if there is just one player our receiver has a connection with, we don't want an error so mu = their completion %
        try:
            mu2 = focaldf2.completions.sum()/focaldf2.passes.sum()
        except:
            m2 = focaldf2.completion_pct.mean()
        row2 = pd.DataFrame({'pass_recipient_name':[focal_p], 'exclusion':[ex2], 'mean_completion_pct':[mu2]})
        testing_df = testing_df.append(row2)

testing_df


Unnamed: 0,pass_recipient_name,exclusion,mean_completion_pct
0,Amrinder Singh,,1.0
0,Amrinder Singh,Pritam Kotal,1.0
0,Amrinder Singh,Marko Lešković,1.0
0,Amrinder Singh,Subhasish Bose,1.0
0,Amrinder Singh,Jorge Rolando Pereyra Díaz,1.0
...,...,...,...
0,Lara Sharma,Deepak Devrani,1.0
0,Lara Sharma,PC Laldinpuia,1.0
0,Lara Sharma,Banana Yaya,1.0
0,Lara Sharma,Sandesh Jhingan,1.0


In [5]:
testing_df = testing_df.reset_index(drop=True)

# Create a column to note the excluded player's difference from the receiving player's overall mean %
testing_df['diff_from_all'] = 0.0

# Iterate through each player to get their diff_from_all value
for i in range(len(testing_df)):
    sample = testing_df[testing_df['pass_recipient_name']==testing_df.pass_recipient_name[i]]
    mu = sample[sample['exclusion']=='None'].mean_completion_pct.values[0]
    testing_df['diff_from_all'][i] = testing_df.mean_completion_pct.values[i] - mu
testing_df


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_df['diff_from_all'][i] = testing_df.mean_completion_pct.values[i] - mu


Unnamed: 0,pass_recipient_name,exclusion,mean_completion_pct,diff_from_all
0,Amrinder Singh,,1.0,0.0
1,Amrinder Singh,Pritam Kotal,1.0,0.0
2,Amrinder Singh,Marko Lešković,1.0,0.0
3,Amrinder Singh,Subhasish Bose,1.0,0.0
4,Amrinder Singh,Jorge Rolando Pereyra Díaz,1.0,0.0
...,...,...,...,...
51749,Lara Sharma,Deepak Devrani,1.0,0.0
51750,Lara Sharma,PC Laldinpuia,1.0,0.0
51751,Lara Sharma,Banana Yaya,1.0,0.0
51752,Lara Sharma,Sandesh Jhingan,1.0,0.0


In [6]:
# Drop all players with no difference from the mean
## This drops all players not on their team. Statistically speaking, we should see even a very minute difference in completion %
## In the future I want to figure a way to do this in the steps above, but can't think of a way at the moment.
testing_df = testing_df[testing_df['diff_from_all']!=0]
testing_df = testing_df.sort_values(by=['diff_from_all'], ascending=False).reset_index(drop=True)

# Now we are getting into our last stage. Grouping all our 'exclusian' players, i.e. the passers, and getting their average diff_from_all
## this average diff_from_all is their ARR! The average % chance greater(+) or less (-) than the average player that our focal passer gives the receiver to successfully control their passes
final = pd.DataFrame(testing_df.groupby(['exclusion'])['diff_from_all'].mean()).reset_index()

# Add in info on their total connections, coming into play next
final['connections'] = testing_df.groupby(['exclusion'])['pass_recipient_name'].count().tolist()

# Now we drop all players with fewer than five 10-pass connections
final = final[final['connections']>=5].sort_values(by=['diff_from_all'], ascending=False).reset_index(drop=True)

# Let's add their most-passed-from position (per StatsBomb)
final['position'] = ''
all_sample_players = final.exclusion.unique().tolist()
for i in range(len(final)):
    df_1 = df[df['player_name']==all_sample_players[i]]
    
    # long line, but it's getting the position the player passes from the most
    pos = pd.DataFrame(df_1.groupby(['position_name'])['position_id'].count()).reset_index().sort_values(by=['position_id'],ascending=False).position_name.values[0]
    final['position'][i] = pos

# Rename our 'exclusion' column to 'player' which is more accurate
final.rename(columns={'exclusion':'player'},inplace=True)
final



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['position'][i] = pos


Unnamed: 0,player,diff_from_all,connections,position
0,Provat Lakra,0.016250,13,Right Back
1,Prabhushkan Singh Gill,0.015762,7,Goalkeeper
2,Suhair Vadakkepeedika,0.014783,14,Left Wing
3,Jayesh Dilip Rane,0.014074,8,Left Center Midfield
4,Antonio Perošević,0.012961,8,Right Wing
...,...,...,...,...
168,Pritam Kotal,-0.009810,15,Right Center Back
169,Marko Lešković,-0.010254,17,Left Center Back
170,Alberto Noguera Ripoll,-0.010824,18,Center Attacking Midfield
171,Sunil Chhetri,-0.011316,10,Left Wing


In [7]:
### Use this block to see all the passing or receiving connections from (p) or to (r) a player

p = 'Provat Lakra'
connection_df[connection_df['passer']==p]

# r = 'Jorge Ortiz Mendoza'
# connection_df[connection_df['pass_recipient_name']==r]


Unnamed: 0,pass_recipient_name,passes,passer,completions,completion_pct
433,Deshorn Brown,26,Provat Lakra,10,0.384615
434,Hernán Daniel Santana Trujillo,28,Provat Lakra,26,0.928571
435,Imran Khan,14,Provat Lakra,10,0.714286
436,Khassa Camara,18,Provat Lakra,16,0.888889
437,Laldanmawia Ralte,19,Provat Lakra,10,0.526316
438,Marco Sahanek,10,Provat Lakra,6,0.6
439,Mashoor Shereef Thangalakath,24,Provat Lakra,21,0.875
440,Mathias Coureur,11,Provat Lakra,6,0.545455
441,Mohammed Irshad,11,Provat Lakra,9,0.818182
442,Patrick Flottmann,12,Provat Lakra,11,0.916667
