# How to Calculate Expected Threat (xT) in Python

Based on the [video](https://www.youtube.com/watch?v=cMVzNQ6nytU) by McKay Johns

### Importing

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from statsbombpy import sb

from mplsoccer.pitch import Pitch

### Getting Data

In [4]:
# Exploration helpers (left commented out), presumably used to find the ids below:
# sb.competitions()
# sb.matches(competition_id=55, season_id=43) # Euro 2020
# StatsBomb match id reused by the events and lineups requests in this notebook.
MATCH_ID = 3795506
events = sb.events(match_id=MATCH_ID) # Final Italy - England

credentials were not supplied. open data access only


In [5]:
# Inspect which columns the events frame offers before selecting the pass-related ones.
events.columns

Index(['ball_receipt_outcome', 'ball_recovery_recovery_failure',
       'carry_end_location', 'clearance_aerial_won', 'clearance_body_part',
       'clearance_head', 'clearance_left_foot', 'clearance_right_foot',
       'counterpress', 'dribble_nutmeg', 'dribble_outcome', 'dribble_overrun',
       'duel_outcome', 'duel_type', 'duration', 'foul_committed_advantage',
       'foul_committed_card', 'foul_committed_offensive',
       'foul_committed_type', 'foul_won_advantage', 'foul_won_defensive',
       'goalkeeper_body_part', 'goalkeeper_end_location', 'goalkeeper_outcome',
       'goalkeeper_penalty_saved_to_post', 'goalkeeper_position',
       'goalkeeper_shot_saved_off_target', 'goalkeeper_shot_saved_to_post',
       'goalkeeper_technique', 'goalkeeper_type', 'id', 'index',
       'injury_stoppage_in_chain', 'interception_outcome', 'location',
       'match_id', 'minute', 'miscontrol_aerial_won', 'off_camera', 'out',
       'pass_aerial_won', 'pass_angle', 'pass_assisted_shot_id',
  

### Filtering Data

In [17]:
def filter_pass_data_from_events(events, team, team_lineup):
    """Return the successful passes of ``team`` with flat coordinates and shirt numbers.

    Parameters
    ----------
    events : pandas.DataFrame
        Event table containing at least the columns ``type``, ``team``,
        ``player``, ``minute``, ``second``, ``pass_outcome``, ``location`` and
        ``pass_end_location``. The two location columns hold ``[x, y]`` pairs.
    team : str
        Value of ``events['team']`` whose passes are kept.
    team_lineup : pandas.DataFrame
        Lineup table with at least ``player_name`` and ``jersey_number``.
        It is left-joined on the passer's name; any other columns it carries
        are passed through to the result.

    Returns
    -------
    pandas.DataFrame
        An independent copy (safe to add columns to) holding one row per
        successful pass with the columns ``team``, ``player``, ``minute``,
        ``second``, ``type``, ``outcome``, ``x``, ``y``, ``endX``, ``endY``
        and ``passer``. Neither ``events`` nor ``team_lineup`` is modified.
        A team without any pass yields an empty frame with the same columns.
    """
    pass_columns = ['team', 'player', 'minute', 'second', 'type',
                    'pass_outcome', 'location', 'pass_end_location']

    # Create mask to filter events dataframe
    mask_team = (events['type'] == 'Pass') & (events['team'] == team)

    # Filter; copy so the assignments below never write into a view of `events`
    df_pass_team = events.loc[mask_team, pass_columns].copy()

    # Split the [x, y] pairs into separate columns. Building the coordinate
    # frames with explicit column names also works when there are zero rows,
    # where assigning an empty list to two columns would raise.
    start_xy = pd.DataFrame(df_pass_team['location'].to_list(),
                            columns=['x', 'y'], dtype=float)
    end_xy = pd.DataFrame(df_pass_team['pass_end_location'].to_list(),
                          columns=['endX', 'endY'], dtype=float)

    df_pass_team = df_pass_team.assign(
        x=start_xy['x'].to_numpy(),
        y=start_xy['y'].to_numpy(),
        endX=end_xy['endX'].to_numpy(),
        endY=end_xy['endY'].to_numpy(),
        # A missing pass_outcome is treated as a completed pass
        pass_outcome=df_pass_team['pass_outcome'].fillna('Successful'),
    )

    # Add passer number to dataframe
    df_pass_team = (
        pd.merge(df_pass_team, team_lineup,
                 left_on='player', right_on='player_name', how='left')
        .rename(columns={'pass_outcome': 'outcome', 'jersey_number': 'passer'})
        .drop(columns=['player_name', 'location', 'pass_end_location'])
    )

    # Keep successful passes only; .copy() returns a standalone frame so that
    # callers can add columns without SettingWithCopyWarning.
    mask_successful = df_pass_team['outcome'] == 'Successful'
    return df_pass_team.loc[mask_successful].copy()

In [18]:
TEAM = 'Italy'
# Italy lineup: keep only the two columns the pass table needs (the name to
# join on and the shirt number). Selecting them explicitly, rather than
# dropping a fixed list in place, stops any other lineup column from leaking
# into `passes` through the merge.
team_lineup = sb.lineups(match_id=MATCH_ID)[TEAM][['player_name', 'jersey_number']]

passes = filter_pass_data_from_events(events, TEAM, team_lineup)
passes

credentials were not supplied. open data access only


Unnamed: 0,team,player,minute,second,type,outcome,x,y,endX,endY,passer
0,Italy,Ciro Immobile,0,0,Pass,Successful,60.0,40.0,49.5,33.2,17
1,Italy,Marco Verratti,0,2,Pass,Successful,48.0,35.7,39.0,70.7,6
2,Italy,Giovanni Di Lorenzo,0,5,Pass,Successful,39.2,68.4,24.9,46.7,2
3,Italy,Leonardo Bonucci,0,8,Pass,Successful,25.3,46.4,24.4,17.7,19
4,Italy,Giorgio Chiellini,0,10,Pass,Successful,23.9,18.6,16.4,44.1,3
...,...,...,...,...,...,...,...,...,...,...,...
878,Italy,Bryan Cristante,121,51,Pass,Successful,66.7,26.6,90.3,2.6,16
879,Italy,Alessandro Florenzi,121,55,Pass,Successful,89.2,8.9,76.4,16.5,24
880,Italy,Bryan Cristante,121,57,Pass,Successful,74.9,19.1,78.1,32.1,16
881,Italy,Jorge Luiz Frello Filho,122,0,Pass,Successful,78.7,39.1,76.1,60.9,8


### Calculating xT

In [21]:
# Load the pre-computed xT grid from a local CSV. header=None because the
# file has no header row: every line is a row of grid values.
xT = pd.read_csv('xT_grid.csv', header = None)
xT

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.006383,0.007796,0.008449,0.009777,0.011263,0.012483,0.014736,0.017451,0.021221,0.027563,0.034851,0.037926
1,0.007501,0.008786,0.009424,0.010595,0.012147,0.013845,0.016118,0.018703,0.024015,0.029533,0.04067,0.046477
2,0.00888,0.009777,0.010013,0.011105,0.012692,0.014291,0.016856,0.019351,0.024122,0.028552,0.054911,0.064426
3,0.009411,0.010827,0.010165,0.011324,0.012626,0.014846,0.016895,0.019971,0.023851,0.035113,0.108051,0.257454
4,0.009411,0.010827,0.010165,0.011324,0.012626,0.014846,0.016895,0.019971,0.023851,0.035113,0.108051,0.257454
5,0.00888,0.009777,0.010013,0.011105,0.012692,0.014291,0.016856,0.019351,0.024122,0.028552,0.054911,0.064426
6,0.007501,0.008786,0.009424,0.010595,0.012147,0.013845,0.016118,0.018703,0.024015,0.029533,0.04067,0.046477
7,0.006383,0.007796,0.008449,0.009777,0.011263,0.012483,0.014736,0.017451,0.021221,0.027563,0.034851,0.037926


In [22]:
# Rebind xT from a DataFrame to a plain NumPy array so that zone values can be
# looked up by integer [row][column] position.
xT = np.array(xT)

In [24]:
# Display the grid again to confirm it is now a NumPy array.
xT

array([[0.00638303, 0.00779616, 0.00844854, 0.00977659, 0.01126267,
        0.01248344, 0.01473596, 0.0174506 , 0.02122129, 0.02756312,
        0.03485072, 0.0379259 ],
       [0.00750072, 0.00878589, 0.00942382, 0.0105949 , 0.01214719,
        0.0138454 , 0.01611813, 0.01870347, 0.02401521, 0.02953272,
        0.04066992, 0.04647721],
       [0.0088799 , 0.00977745, 0.01001304, 0.01110462, 0.01269174,
        0.01429128, 0.01685596, 0.01935132, 0.0241224 , 0.02855202,
        0.05491138, 0.06442595],
       [0.00941056, 0.01082722, 0.01016549, 0.01132376, 0.01262646,
        0.01484598, 0.01689528, 0.0199707 , 0.02385149, 0.03511326,
        0.10805102, 0.25745362],
       [0.00941056, 0.01082722, 0.01016549, 0.01132376, 0.01262646,
        0.01484598, 0.01689528, 0.0199707 , 0.02385149, 0.03511326,
        0.10805102, 0.25745362],
       [0.0088799 , 0.00977745, 0.01001304, 0.01110462, 0.01269174,
        0.01429128, 0.01685596, 0.01935132, 0.0241224 , 0.02855202,
        0.05491138,

In [25]:
# Grid dimensions: the rows are used below as the number of bins along y,
# the columns as the number of bins along x.
xT_rows, xT_cols = xT.shape

In [28]:
# Sanity check: number of grid columns (bins along x).
xT_cols

12

In [30]:
# StatsBomb pitch coordinates run from 0 to 120 along x and 0 to 80 along y.
PITCH_LENGTH, PITCH_WIDTH = 120, 80

# Explicit bin edges over the whole pitch. Passing an integer to pd.cut would
# instead spread the bins over the observed min/max of each column, so start
# and end coordinates would get different edges and the zones would not line
# up with the xT grid.
x_edges = np.linspace(0, PITCH_LENGTH, xT_cols + 1)
y_edges = np.linspace(0, PITCH_WIDTH, xT_rows + 1)

# clip() keeps coordinates lying marginally outside the pitch in the outermost
# zone instead of turning them into NaN bins; include_lowest=True puts
# coordinate 0 into the first bin.
passes['x1_bin'] = pd.cut(passes['x'].clip(0, PITCH_LENGTH), bins = x_edges, labels = False, include_lowest = True)
passes['y1_bin'] = pd.cut(passes['y'].clip(0, PITCH_WIDTH), bins = y_edges, labels = False, include_lowest = True)
passes['x2_bin'] = pd.cut(passes['endX'].clip(0, PITCH_LENGTH), bins = x_edges, labels = False, include_lowest = True)
passes['y2_bin'] = pd.cut(passes['endY'].clip(0, PITCH_WIDTH), bins = y_edges, labels = False, include_lowest = True)

In [35]:
# NOTE: this cell's saved output comes from a later execution (count 35) than
# the cells that follow it (counts 32 and 34), which is why it already shows
# the start_zone_value, end_zone_value and xT columns. On a fresh
# top-to-bottom run only the bin columns exist at this point.
passes.head()

Unnamed: 0,team,player,minute,second,type,outcome,x,y,endX,endY,passer,x1_bin,y1_bin,x2_bin,y2_bin,start_zone_value,end_zone_value,xT
0,Italy,Ciro Immobile,0,0,Pass,Successful,60.0,40.0,49.5,33.2,17,5,3,4,3,0.014846,0.012626,-0.00222
1,Italy,Marco Verratti,0,2,Pass,Successful,48.0,35.7,39.0,70.7,6,4,3,3,7,0.012626,0.009777,-0.00285
2,Italy,Giovanni Di Lorenzo,0,5,Pass,Successful,39.2,68.4,24.9,46.7,2,3,6,2,4,0.010595,0.010165,-0.000429
3,Italy,Leonardo Bonucci,0,8,Pass,Successful,25.3,46.4,24.4,17.7,19,2,4,2,1,0.010165,0.009424,-0.000742
4,Italy,Giorgio Chiellini,0,10,Pass,Successful,23.9,18.6,16.4,44.1,3,2,1,1,4,0.009424,0.010827,0.001403


In [32]:
# Look up the grid value of every pass origin and destination in one
# vectorised step; the grid is indexed [row, column] = [y_bin, x_bin].
# This avoids a row-wise apply that reads `pair[1]` / `pair[0]` positionally
# from a label-indexed row, an access pattern newer pandas deprecates.
# np.asarray makes the lookup work whether xT is still a DataFrame or already
# an array; to_numpy(dtype=int) fails loudly if any bin is missing (NaN).
xT_grid = np.asarray(xT)
passes['start_zone_value'] = xT_grid[passes['y1_bin'].to_numpy(dtype=int), passes['x1_bin'].to_numpy(dtype=int)]
passes['end_zone_value'] = xT_grid[passes['y2_bin'].to_numpy(dtype=int), passes['x2_bin'].to_numpy(dtype=int)]

In [34]:
# xT of a pass = grid value where it ends minus grid value where it starts,
# so passes into lower-valued zones come out negative.
passes['xT'] = passes['end_zone_value'].sub(passes['start_zone_value'])

In [38]:
# Total xT over all of the team's successful passes in the match.
passes['xT'].sum()

1.4524357700000001

In [42]:
# Per-player xT: group the pass table by passer name and add up each player's
# pass values. `grouped` is kept as a name because a later cell reads it.
grouped = passes.groupby(by='player')
grouped['xT'].agg('sum')

player
Alessandro Florenzi            0.010029
Andrea Belotti                -0.002997
Bryan Cristante                0.199321
Ciro Immobile                 -0.003213
Domenico Berardi               0.013758
Emerson Palmieri dos Santos    0.017367
Federico Bernardeschi          0.232105
Federico Chiesa                0.000689
Gianluigi Donnarumma           0.020018
Giorgio Chiellini              0.070386
Giovanni Di Lorenzo            0.309517
Jorge Luiz Frello Filho        0.185843
Leonardo Bonucci               0.271554
Lorenzo Insigne               -0.023914
Manuel Locatelli              -0.009738
Marco Verratti                 0.140804
Nicolò Barella                 0.020907
Name: xT, dtype: float64

In [44]:
# Selecting a column from a GroupBy is lazy: this displays only the
# SeriesGroupBy object's repr, not values. Call an aggregation on it
# (e.g. .sum()) to get numbers.
grouped['xT']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fc1c8962190>