In [1]:
# standard imports
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
from collections import Counter
#import json
import os

pd.set_option('display.max_rows', 2500)
pd.set_option('display.max_columns', 100)

# stats packages to fit classification models
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.calibration import calibration_curve

# importing xG xGils library
import xGils.xG as xG

# **xG Feature Engineering**

1. Load in pre-made Opta dataset.
2. Load in synthetic data.
2. Add additional features:
    * Basic features;
    * Added features;
    * Advanced features.
3. Fit logistic / probit regression model.

**Note, we'll have to construct some of the features before we construct a `df_shots` dataframe.**

**Will also want to integrate the synthetic shots and see if that improves things.**
(Will have to generate some dummy data for the synthetic shots).

## **1) Loading Opta dataset (which includes Bayesian xT)**

In [2]:
%%time

df = pd.read_csv('/Users/christian/Desktop/University/Birkbeck MSc Applied Statistics/Project/Data/Analysis Ready/Opta Bayesian xT/Bayesian_Opta_xT.csv')

# converting the timestamp string to a datetime
df['timeStamp'] = pd.to_datetime(df.timeStamp, format='%Y-%m-%d %H:%M:%S.%f')
df['kickOffDateTime'] = pd.to_datetime(df.kickOffDateTime, format='%Y-%m-%d %H:%M:%S.%f')

print (f'{len(df)} rows loaded.\n')

df.head()


3126182 rows loaded.

CPU times: user 13.4 s, sys: 1.19 s, total: 14.6 s
Wall time: 14.7 s


Unnamed: 0,competition,season,seasonIndex,gameMonthIndex,matchId,playerId,playerName,position,detailedPosition,playerTeamId,minsPlayed,subIn,subOut,replacedReplacingPlayerId,booking,eventType,eventSubType,eventTypeId,x1,y1,x2,y2,gameTime,timeStamp,periodId,homeTeamName,homeTeamId,awayTeamName,awayTeamId,kickOffDateTime,minute,second,x1_m,y1_m,x2_m,y2_m,xT
0,English Premier League,2017/18,1,24212,918893,59966,Alexandre Lacazette,Forward,Striker,3,95,,,,,attack,Pass,1,50.0,50.7,28.8,30.1,0:1,2017-08-11 19:46:04.968,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,1,52.5,34.476,30.24,20.468,-0.003278
1,English Premier League,2017/18,1,24212,918893,156074,Rob Holding,Defender,FullBack,3,67,,1.0,,,attack,Pass,1,29.7,26.7,52.3,21.5,0:2,2017-08-11 19:46:05.554,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,2,31.185,18.156,54.915,14.62,0.003008
2,English Premier League,2017/18,1,24212,918893,37605,Mesut Özil,Forward,AttackingMidfielder,3,95,,,,,attack,Pass,1,52.8,21.3,44.3,20.7,0:5,2017-08-11 19:46:08.554,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,5,55.44,14.484,46.515,14.076,-0.001186
3,English Premier League,2017/18,1,24212,918893,153256,Mohamed Elneny,Midfielder,CentralMidfielder,3,66,,1.0,,,attack,Pass,1,44.0,19.6,50.3,4.2,0:7,2017-08-11 19:46:10.554,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,7,46.2,13.328,52.815,2.856,-0.000542
4,English Premier League,2017/18,1,24212,918893,98745,Héctor Bellerín,Midfielder,RightMidfielder,3,95,,,,,attack,Pass,1,51.0,4.2,70.5,5.0,0:9,2017-08-11 19:46:13.519,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,9,53.55,2.856,74.025,3.4,0.00585


### **Setting useful event types (may not need this)**

In [3]:
# pass events (inc. crosses)
opta_successful_pass_events = ['2nd Assist','Assist','Chance Created','Cross','Pass']
opta_failed_pass_events = ['Failed Pass','Offside Pass']

# dribble events
opta_successful_dribble_events = ['Dribble']
opta_failed_dribble_events = ['Failed Dribble']

# shot events
opta_successful_shot_events = ['Goal']
opta_failed_shot_events = ['Hit Woodwork','Miss','Missed Penalty','Penalty Saved','Shot Blocked','Shot Saved']

opta_events_successful = opta_successful_pass_events + opta_successful_dribble_events + opta_successful_shot_events
opta_events_relevant = opta_successful_pass_events + opta_failed_pass_events + opta_successful_dribble_events + opta_failed_dribble_events + opta_successful_shot_events + opta_failed_shot_events

opta_events_relevant

['2nd Assist',
 'Assist',
 'Chance Created',
 'Cross',
 'Pass',
 'Failed Pass',
 'Offside Pass',
 'Dribble',
 'Failed Dribble',
 'Goal',
 'Hit Woodwork',
 'Miss',
 'Missed Penalty',
 'Penalty Saved',
 'Shot Blocked',
 'Shot Saved']

## **2) Loading in Synthetic Shot Data**

In [4]:
df_synthetic = pd.read_csv('/Users/christian/Desktop/University/Birkbeck MSc Applied Statistics/Project/Data/Synthetic/Synthetic_Shots.csv')

## **3) Feature Engineering**

#### Binary response variable
* Shot success = 1

#### Simple features:
* Initial $x$
* Initial $y$

#### Added features:
* Initial $x^2$
* Initial $y^2$
* Initial $xy$
* Shooting angle to centre of goal
* Distance to goal (metres), $D$
* $D^2$
* $D^3$
* Amount of goal the shooter can see (requires some trigonometry)

#### Contextual features:
* Binary home/away flag (home=1)
* Game state (the point-in-time difference in goals between the two sides)
* Headcount difference (e.g. is equal to 1 if 11 Vs 10)
* Player possession duration.
* Cumulative team possession sequence duration.
* Passing index within possession sequence.

### **Feature Engineering Functions**

In [None]:
%%time

df = xG.xG_contextual_feature_engineering(df)

In [None]:
df.columns

In [49]:
def create_eventId(df):
    """
    Sorts all events, then provides a unique row identifier in the order that the event occurred within a match
    """
    
    df = df.sort_values(['competition','season','matchId','periodId','gameTime'], ascending=[True,True,True,True,True])\
            .reset_index(drop=True)
    
    return df.index.values + 1


def possession_indicator(df):
    """
    Function which identifies which team is in possession.
    
    NaN's will be forward filled, so we just need to provide very clear sides that an event is on, 
    and the forward fill will work in between.
    
    """
    
    # team identifiers
    teamId = df['playerTeamId']
    homeTeamId = df['homeTeamId']
    awayTeamId = df['awayTeamId']
    teams = set([homeTeamId,awayTeamId])
    otherTeamId = list(teams - set([teamId]))[0]
    
    # events & subevents
    eventType = df['eventType']
    eventSubType = df['eventSubType']
    
    # assigning possessionTeamId
    ## Basically picking the things in DEFENCE that should actually be the TEAMID in possession
    ## (But including Pass because of it's prevelence to reduce having to go further down the loop)
    if eventSubType in ['Pass','Ball Recovery','Catch','Clearance','Interception','Ball Claim']:
        possessionTeamId = teamId
        
    # picking things in ATTACK that should actually be seen as the OTHERTEAMID in possession
    elif eventSubType in ['Lost Aerial Duel','Lost Possession']:
        possessionTeamId = otherTeamId
    
    elif eventSubType in  ['Aerial Duel']:
        possessionTeamId = np.NaN
    
    elif eventType in ['attack','shot']:
        possessionTeamId = teamId
    
    elif eventType in ['defence', 'press']:
        possessionTeamId = otherTeamId
        
    else:
        possessionTeamId = np.NaN
        
    return possessionTeamId


In [65]:
df.groupby(['eventType','eventSubType']).agg({'x1_m':'count'}).rename(columns={'x1_m':'actionCount'}).reset_index()

Unnamed: 0,eventType,eventSubType,actionCount
0,attack,2nd Assist,396
1,attack,Aerial Duel,27666
2,attack,Assist,3252
3,attack,Bad Touch,50985
4,attack,Chance Created,27989
5,attack,Cross,5903
6,attack,Dribble,33877
7,attack,Error,197
8,attack,Error,1458
9,attack,Failed Dribble,28725


In [29]:
df.eventSubType.drop_duplicates().values

array(['Pass', 'Failed Pass', 'Lost Aerial Duel', 'Aerial Duel',
       'Pressure on Pass', 'Clearance', 'Ball Recovery', 'Dribbled Past',
       'Dribble', 'Fouled', 'Foul', 'Cross', 'Assist', 'Goal',
       'Conceded Goal', 'Blocked Pass', 'Bad Touch', 'Miss', '2nd Assist',
       'Chance Created', 'Failed Dribble', 'Lost Possession', 'Tackle',
       'Interception', 'Failed Tackle', 'Save', 'Shot Saved',
       'Shield Ball Out', 'Offside Trap', 'Offside Pass', 'Shot Blocked',
       'Blocked Shot', 'Pressure on Shot', 'Error ', 'Catch',
       'Ball Claim', 'Yellow Card', 'Hit Woodwork', 'Own Goal',
       'Red Card', 'Punch', '2nd Yellow Card', 'Foul Throw',
       'Foul for Penalty', 'Conceded Penalty', 'Penalty Saved',
       'Saved Penalty', 'Missed Penalty', 'Error'], dtype=object)

In [94]:
df.loc[df['eventSubType'] == 'Red Card'].head()

df.loc[(df['eventId'] >= 3734-5) & (df['eventId'] <= 3734+5)]

Unnamed: 0,competition,season,seasonIndex,gameMonthIndex,matchId,playerId,playerName,position,detailedPosition,playerTeamId,minsPlayed,subIn,subOut,replacedReplacingPlayerId,booking,eventType,eventSubType,eventTypeId,x1,y1,x2,y2,gameTime,timeStamp,periodId,homeTeamName,homeTeamId,awayTeamName,awayTeamId,kickOffDateTime,minute,second,x1_m,y1_m,x2_m,y2_m,xT,eventId,possessionTeamId,possessionSequenceIndex,possessionStartTime,possessionTimeSec,playerPossessionTimeSec,goalScoredFlag,goalsConcededFlag,goalsScored,goalsConceded,goalDelta,redCardFlag,numReds
3728,English Premier League,2017/18,1,24212,918895,41328,César Azpilicueta,Midfielder,RightMidfielder,8,100,,,,,attack,Pass,1,46.2,0.0,44.0,4.6,12:45,2017-08-12 15:12:58.628,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,12,45,48.51,0.0,46.2,3.128,-0.0002,3729,8,44,2017-08-12 15:12:58.628,0.0,0.0,0,0,0,0,0,0,0
3729,English Premier League,2017/18,1,24212,918895,17878,Cesc Fàbregas,Midfielder,CentralMidfielder,8,81,,,,Second Yellow Card,attack,Pass,1,44.4,7.7,36.0,60.2,12:49,2017-08-12 15:13:02.645,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,12,49,46.62,5.236,37.8,40.936,0.00035,3730,8,44,2017-08-12 15:12:58.628,4.017,4.017,0,0,0,0,0,0,0
3730,English Premier League,2017/18,1,24212,918895,19419,Gary Cahill,Defender,FullBack,8,13,,,,Red Card,attack,Bad Touch,61,69.2,69.4,,,12:57,2017-08-12 15:13:09.901,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,12,57,72.66,47.192,72.66,47.192,0.0,3731,8,44,2017-08-12 15:12:58.628,11.273,7.256,0,0,0,0,0,0,0
3731,English Premier League,2017/18,1,24212,918895,19419,Gary Cahill,Defender,FullBack,8,13,,,,Red Card,defence,Foul,4,69.4,64.0,,,12:59,2017-08-12 15:13:11.981,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,12,59,72.87,43.52,72.87,43.52,0.0,3732,90,45,2017-08-12 15:13:11.981,0.0,0.0,0,0,0,0,0,0,0
3732,English Premier League,2017/18,1,24212,918895,39847,Steven Defour,Midfielder,CentralMidfielder,90,74,,1.0,,,attack,Fouled,4,30.6,36.0,,,12:59,2017-08-12 15:13:11.981,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,12,59,32.13,24.48,32.13,24.48,0.0,3733,90,45,2017-08-12 15:13:11.981,0.0,0.0,0,0,0,0,0,0,0
3733,English Premier League,2017/18,1,24212,918895,19419,Gary Cahill,Defender,FullBack,8,13,,,,Red Card,defence,Red Card,17,0.0,0.0,,,13:1,2017-08-12 15:13:14.401,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,13,1,0.0,0.0,0.0,0.0,0.0,3734,90,45,2017-08-12 15:13:11.981,2.42,2.42,0,0,0,0,0,-1,-1
3734,English Premier League,2017/18,1,24212,918895,21205,Tom Heaton,Goalkeeper,Goalkeeper,90,100,,,,,attack,Failed Pass,1,29.2,41.3,76.8,57.8,14:56,2017-08-12 15:15:09.690,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,14,56,30.66,28.084,80.64,39.304,-0.001419,3735,90,45,2017-08-12 15:13:11.981,117.709,115.289,0,0,0,0,0,1,1
3735,English Premier League,2017/18,1,24212,918895,17761,James Tarkowski,Defender,FullBack,90,100,,,,,attack,Lost Aerial Duel,44,79.4,62.8,,,14:58,2017-08-12 15:15:11.763,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,14,58,83.37,42.704,83.37,42.704,0.0,3736,8,46,2017-08-12 15:15:11.763,0.0,0.0,0,0,0,0,0,0,1
3736,English Premier League,2017/18,1,24212,918895,102380,Antonio Rüdiger,Defender,FullBack,8,100,,,,Yellow Card,defence,Aerial Duel,44,20.6,37.2,,,14:58,2017-08-12 15:15:11.773,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,14,58,21.63,25.296,21.63,25.296,0.0,3737,8,46,2017-08-12 15:15:11.763,0.01,0.01,0,0,0,0,0,0,-1
3737,English Premier League,2017/18,1,24212,918895,102380,Antonio Rüdiger,Defender,FullBack,8,100,,,,Yellow Card,defence,Clearance,12,18.1,32.2,31.8,34.5,14:59,2017-08-12 15:15:12.342,1,Chelsea,8,Burnley,90,2017-08-12 15:00:00,14,59,19.005,21.896,33.39,23.46,0.0,3738,8,46,2017-08-12 15:15:11.763,0.579,0.569,0,0,0,0,0,0,-1


### **Applying Feature Engineering Functions**

In [92]:
%%time

def xG_contextual_feature_engineering(df):

    # 1) producing eventId
    df['eventId'] = create_eventId(df)

    # 2) producing possessionTeamId marker -> quite a few other advanced features hang off of this
    df['possessionTeamId'] = df.apply(possession_indicator, axis=1)
    # forward filling NaNs
    df['possessionTeamId'] = df.possessionTeamId.fillna(method='ffill')
    # converting to int
    df['possessionTeamId'] = df['possessionTeamId'].astype(int)

    # 3) Sequencing the possessions (each possession  will have it's own index per match)
    #print ('Applying possessionSequenceIndex...')
    ## initiate sequence at 0
    df['possessionSequenceIndex'] = 0
    ## every time there's a change in sequence (or a change in half), you set a value of 1
    df.loc[( (df['possessionTeamId'] != df['possessionTeamId'].shift(1)) | (df['periodId'] != df['periodId'].shift(1)) | (df['matchId'] != df['matchId'].shift(1)) ), 'possessionSequenceIndex'] = 1
    ## take a cumulative sum of the 1s per match
    df['possessionSequenceIndex'] = df.groupby('matchId')['possessionSequenceIndex'].cumsum()

    # 4) Getting the time that the team has been in possession until the pass has been made (1) takes a while, but allows 2) to be vectorised)
    #print ('Applying possessionStartSec...')
    ## getting the time since the possession started
    df['possessionStartTime'] = df.loc[df.groupby(['matchId','possessionSequenceIndex'])['timeStamp'].transform('idxmin'), 'timeStamp'].values
    ## calculating the time of the posession
    df['possessionTimeSec'] = (df['timeStamp'] - df['possessionStartTime']) / pd.Timedelta(1, 's')


    # 5) Getting the time that the player has been in possession
    #print ('Applying playerPossessionTimeSec...')
    ## 1) initialising at 0
    df['playerPossessionTimeSec'] = 0
    ## 2) checks that the previous event was part of the same possession sequence within the same match, and if it is, calculates possession time in seconds
    df.loc[( (df['matchId'] == df['matchId'].shift(1)) & (df['possessionSequenceIndex'] == df['possessionSequenceIndex'].shift(1)) ), 'playerPossessionTimeSec'] = df['possessionTimeSec'] - df['possessionTimeSec'].shift(1)

    ################################################################################################
    ################################################################################################

    # 6) Game State (The +/- Number of Goals)
    # print ('Applying gameState...')
    ## getting goals scored flag
    df['goalScoredFlag'] = df.eventSubType.apply(lambda x: 1 if x == 'Goal' else 0)

    # querying goals
    df_goals = df.loc[df['goalScoredFlag'] == 1, ['matchId','eventId','playerTeamId']]

    # a list of eventId's that occur right after a goal for the other team that we'll be added a conceded flag
    lst_concededEventId = []

    # this is basically a really ugly cross apply
    for idx, cols in df_goals.iterrows():
        matchId, eventId, teamId = cols
        try:
            concededEventId = df.loc[(df['matchId'] == matchId) & (df['eventId'] > eventId) & \
                                     (df['playerTeamId'] != teamId)]\
                                .sort_values('eventId', ascending=True)\
                                .head(100)['eventId'].values[0]

            # appending eventId to list
            lst_concededEventId.append(concededEventId)
        except:
            continue

    # setting goals conceded flag
    df['goalsConcededFlag'] = 0
    df.loc[(df['eventId'].isin(lst_concededEventId)), 'goalsConcededFlag'] = 1

    ## Cumulatively summing the goals scored
    df['goalsScored'] = df.sort_values(['matchId','periodId','timeStamp'], ascending=[True, True, True])\
                                        .groupby(['matchId','playerTeamId'])\
                                        ['goalScoredFlag'].cumsum()

    ## Cumulatively summing the goals conceded
    df['goalsConceded'] = df.sort_values(['matchId','periodId','timeStamp'], ascending=[True, True, True])\
                                        .groupby(['matchId','playerTeamId'])\
                                        ['goalsConcededFlag'].cumsum()

    ## Calculating the goal delta
    df['goalDelta'] = df['goalsScored'] - df['goalsConceded']

    ################################################################################################
    ################################################################################################

    # 7) Number Red Cards (Very similar method above)
    # print ('Applying numReds...')
    ## Applying red card flag
    df['redCardFlag'] = df.eventSubType.apply(lambda x: -1 if x == 'Red Card' else 0)

    ## Applying Excess Player flag to the other team
    df_reds = df.loc[df['redCardFlag'] == -1, ['matchId','eventId','playerTeamId']]

    lst_redEventId = []

    for idx, cols in df_reds.iterrows():
        matchId, eventId, teamId = cols
        try:
            redEventId = df.loc[(df['matchId'] == matchId) & (df['eventId'] > eventId) & \
                                     (df['playerTeamId'] != teamId)]\
                                .sort_values('eventId', ascending=True)\
                                .head(100)['eventId'].values[0]

            lst_redEventId.append(redEventId)
        except:
            continue

    df.loc[df['eventId'].isin(lst_redEventId), 'redCardFlag'] = 1

    ## Cumulatively summing the number of red cards on a team throughout a game
    df['numReds'] = df.sort_values(['matchId','periodId','timeStamp'], ascending=[True, True, True])\
                                    .groupby(['matchId','playerTeamId'])\
                                    ['redCardFlag'].cumsum()
    
    return df
    
    

CPU times: user 5min 14s, sys: 13.9 s, total: 5min 28s
Wall time: 3min 35s


In [95]:
df.loc[df['matchId'] == 2128665, ['playerName','playerTeamId','homeTeamId','awayTeamId','possessionTeamId','eventType','eventSubType','gameTime','timeStamp'\
                                  ,'possessionSequenceIndex','possessionStartTime','possessionTimeSec','playerPossessionTimeSec','goalDelta','numReds']].head(20)



Unnamed: 0,playerName,playerTeamId,homeTeamId,awayTeamId,possessionTeamId,eventType,eventSubType,gameTime,timeStamp,possessionSequenceIndex,possessionStartTime,possessionTimeSec,playerPossessionTimeSec,goalDelta,numReds
3124487,Fábio Silva,39,39,1,39,attack,Pass,0:0,2021-05-23 16:00:31.727,1,2021-05-23 16:00:31.727,0.0,0.0,0,0
3124488,Rúben Neves,39,39,1,39,attack,Pass,0:1,2021-05-23 16:00:33.443,1,2021-05-23 16:00:31.727,1.716,1.716,0,0
3124489,Romain Saïss,39,39,1,39,attack,Pass,0:4,2021-05-23 16:00:36.586,1,2021-05-23 16:00:31.727,4.859,3.143,0,0
3124490,Conor Coady,39,39,1,39,attack,Pass,0:10,2021-05-23 16:00:42.266,1,2021-05-23 16:00:31.727,10.539,5.68,0,0
3124491,Rayan Aït-Nouri,39,39,1,39,attack,Pass,0:13,2021-05-23 16:00:45.563,1,2021-05-23 16:00:31.727,13.836,3.297,0,0
3124492,Willy Boly,39,39,1,39,attack,Pass,0:16,2021-05-23 16:00:48.293,1,2021-05-23 16:00:31.727,16.566,2.73,0,0
3124493,Conor Coady,39,39,1,39,attack,Pass,0:21,2021-05-23 16:00:53.520,1,2021-05-23 16:00:31.727,21.793,5.227,0,0
3124494,Rayan Aït-Nouri,39,39,1,39,attack,Pass,0:25,2021-05-23 16:00:56.871,1,2021-05-23 16:00:31.727,25.144,3.351,0,0
3124495,Willy Boly,39,39,1,39,attack,Pass,0:29,2021-05-23 16:01:00.838,1,2021-05-23 16:00:31.727,29.111,3.967,0,0
3124496,Conor Coady,39,39,1,39,attack,Pass,0:31,2021-05-23 16:01:02.645,1,2021-05-23 16:00:31.727,30.918,1.807,0,0


In [96]:
df.columns

Index(['competition', 'season', 'seasonIndex', 'gameMonthIndex', 'matchId',
       'playerId', 'playerName', 'position', 'detailedPosition',
       'playerTeamId', 'minsPlayed', 'subIn', 'subOut',
       'replacedReplacingPlayerId', 'booking', 'eventType', 'eventSubType',
       'eventTypeId', 'x1', 'y1', 'x2', 'y2', 'gameTime', 'timeStamp',
       'periodId', 'homeTeamName', 'homeTeamId', 'awayTeamName', 'awayTeamId',
       'kickOffDateTime', 'minute', 'second', 'x1_m', 'y1_m', 'x2_m', 'y2_m',
       'xT', 'eventId', 'possessionTeamId', 'possessionSequenceIndex',
       'possessionStartTime', 'possessionTimeSec', 'playerPossessionTimeSec',
       'goalScoredFlag', 'goalsConcededFlag', 'goalsScored', 'goalsConceded',
       'goalDelta', 'redCardFlag', 'numReds'],
      dtype='object')