In [1]:
# standard imports
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
from collections import Counter
#import json
import os

pd.set_option('display.max_rows', 2500)
pd.set_option('display.max_columns', 100)

# stats packages to fit classification models
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.calibration import calibration_curve

# importing the xG module from the xGils library
import xGils.xG as xG

# **xG Feature Engineering**

1. Load in pre-made Opta dataset.
2. Load in synthetic data.
3. Add additional features:
    * Simple features;
    * Added features;
    * Contextual features.
4. Fit logistic / probit regression model.

**Note, we'll have to construct some of the features before we construct a `df_shots` dataframe.**

**Will also want to integrate the synthetic shots and see if that improves things.**
(Will have to generate some dummy data for the synthetic shots).

## **1) Loading Opta dataset (which includes Bayesian xT)**

In [2]:
%%time

# Root folder of the project data. It defaults to the original author's folder, so the
# notebook behaves exactly as before; set the XG_DATA_DIR environment variable to run
# it on another machine without editing this cell.
XG_DATA_DIR = os.environ.get(
    'XG_DATA_DIR',
    '/Users/christian/Desktop/University/Birkbeck MSc Applied Statistics/Project/Data',
)
OPTA_XT_CSV = os.path.join(XG_DATA_DIR, 'Analysis Ready', 'Opta Bayesian xT', 'Bayesian_Opta_xT.csv')

# Fail early with a message that says how to fix the path, rather than a bare read error.
if not os.path.isfile(OPTA_XT_CSV):
    raise FileNotFoundError(
        f'Opta xT file not found at {OPTA_XT_CSV!r}; set the XG_DATA_DIR environment variable to your data folder.'
    )

df = pd.read_csv(OPTA_XT_CSV)

# converting the timestamp strings to datetimes
# NOTE(review): pandas >= 2.0 applies `format` strictly; if any kickOffDateTime string in the
# raw CSV lacks fractional seconds this call raises - confirm against the file / pandas version.
df['timeStamp'] = pd.to_datetime(df['timeStamp'], format='%Y-%m-%d %H:%M:%S.%f')
df['kickOffDateTime'] = pd.to_datetime(df['kickOffDateTime'], format='%Y-%m-%d %H:%M:%S.%f')

print (f'{len(df)} rows loaded.\n')

df.head()


3126182 rows loaded.

CPU times: user 14.1 s, sys: 1.26 s, total: 15.4 s
Wall time: 15.5 s


Unnamed: 0,competition,season,seasonIndex,gameMonthIndex,matchId,playerId,playerName,position,detailedPosition,playerTeamId,minsPlayed,subIn,subOut,replacedReplacingPlayerId,booking,eventType,eventSubType,eventTypeId,x1,y1,x2,y2,gameTime,timeStamp,periodId,homeTeamName,homeTeamId,awayTeamName,awayTeamId,kickOffDateTime,minute,second,x1_m,y1_m,x2_m,y2_m,xT
0,English Premier League,2017/18,1,24212,918893,59966,Alexandre Lacazette,Forward,Striker,3,95,,,,,attack,Pass,1,50.0,50.7,28.8,30.1,0:1,2017-08-11 19:46:04.968,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,1,52.5,34.476,30.24,20.468,-0.003278
1,English Premier League,2017/18,1,24212,918893,156074,Rob Holding,Defender,FullBack,3,67,,1.0,,,attack,Pass,1,29.7,26.7,52.3,21.5,0:2,2017-08-11 19:46:05.554,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,2,31.185,18.156,54.915,14.62,0.003008
2,English Premier League,2017/18,1,24212,918893,37605,Mesut Özil,Forward,AttackingMidfielder,3,95,,,,,attack,Pass,1,52.8,21.3,44.3,20.7,0:5,2017-08-11 19:46:08.554,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,5,55.44,14.484,46.515,14.076,-0.001186
3,English Premier League,2017/18,1,24212,918893,153256,Mohamed Elneny,Midfielder,CentralMidfielder,3,66,,1.0,,,attack,Pass,1,44.0,19.6,50.3,4.2,0:7,2017-08-11 19:46:10.554,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,7,46.2,13.328,52.815,2.856,-0.000542
4,English Premier League,2017/18,1,24212,918893,98745,Héctor Bellerín,Midfielder,RightMidfielder,3,95,,,,,attack,Pass,1,51.0,4.2,70.5,5.0,0:9,2017-08-11 19:46:13.519,1,Arsenal,3,Leicester City,13,2017-08-11 19:45:00,0,9,53.55,2.856,74.025,3.4,0.00585


### **Setting useful event types (may not need this)**

In [3]:
# Opta event labels grouped by action (pass / dribble / shot) and by outcome.
# The combined lists at the bottom keep the order pass -> dribble -> shot.

# --- passes (crosses included) ---
opta_successful_pass_events = [
    '2nd Assist',
    'Assist',
    'Chance Created',
    'Cross',
    'Pass',
]
opta_failed_pass_events = [
    'Failed Pass',
    'Offside Pass',
]

# --- dribbles ---
opta_successful_dribble_events = ['Dribble']
opta_failed_dribble_events = ['Failed Dribble']

# --- shots ---
opta_successful_shot_events = ['Goal']
opta_failed_shot_events = [
    'Hit Woodwork',
    'Miss',
    'Missed Penalty',
    'Penalty Saved',
    'Shot Blocked',
    'Shot Saved',
]

# every event that counts as a successful action
opta_events_successful = [
    *opta_successful_pass_events,
    *opta_successful_dribble_events,
    *opta_successful_shot_events,
]

# every event of interest: for each action type, successes first and then failures
opta_events_relevant = [
    *opta_successful_pass_events,
    *opta_failed_pass_events,
    *opta_successful_dribble_events,
    *opta_failed_dribble_events,
    *opta_successful_shot_events,
    *opta_failed_shot_events,
]

opta_events_relevant

['2nd Assist',
 'Assist',
 'Chance Created',
 'Cross',
 'Pass',
 'Failed Pass',
 'Offside Pass',
 'Dribble',
 'Failed Dribble',
 'Goal',
 'Hit Woodwork',
 'Miss',
 'Missed Penalty',
 'Penalty Saved',
 'Shot Blocked',
 'Shot Saved']

## **2) Loading in Synthetic Shot Data**

In [4]:
# Root folder of the project data. It defaults to the original author's folder, so the
# notebook behaves exactly as before; set the XG_DATA_DIR environment variable to run
# it on another machine without editing this cell.
XG_DATA_DIR = os.environ.get(
    'XG_DATA_DIR',
    '/Users/christian/Desktop/University/Birkbeck MSc Applied Statistics/Project/Data',
)
SYNTHETIC_SHOTS_CSV = os.path.join(XG_DATA_DIR, 'Synthetic', 'Synthetic_Shots.csv')

# Fail early with a message that says how to fix the path, rather than a bare read error.
if not os.path.isfile(SYNTHETIC_SHOTS_CSV):
    raise FileNotFoundError(
        f'Synthetic shots file not found at {SYNTHETIC_SHOTS_CSV!r}; set the XG_DATA_DIR environment variable to your data folder.'
    )

df_synthetic = pd.read_csv(SYNTHETIC_SHOTS_CSV)

## **3) Feature Engineering**

#### Binary response variable
* Shot success (goal scored) = 1, otherwise 0

#### Simple features:
* Initial $x$
* Initial $y$

#### Added features:
* Initial $x^2$
* Initial $y^2$
* Initial $xy$
* Shooting angle to centre of goal
* Distance to goal (metres), $D$
* $D^2$
* $D^3$
* Amount of goal the shooter can see (requires some trigonometry)

#### Contextual features:
* Binary home/away flag (home=1)
* Game state (the point-in-time difference in goals between the two sides)
* Headcount difference (e.g. equal to 1 when it is 11 vs 10)
* Player possession duration.
* Cumulative team possession sequence duration.
* Passing index within possession sequence.

### **Feature Engineering Functions**

In [None]:
%%time

# Rebinds `df` to whatever xGils' xG_contextual_feature_engineering returns for it.
# NOTE(review): presumably this is the step that adds the possession / goal-state / red-card
# columns inspected further down (possessionTimeSec, goalDelta, numReds, ...) - confirm in xGils.
# NOTE(review): `df` is overwritten with its own transform, so re-running this cell feeds
# already-engineered data back in - confirm the function tolerates that, or reload `df` first.
df = xG.xG_contextual_feature_engineering(df)

In [None]:
# list the columns currently on `df` (run after the feature-engineering call)
df.columns

### **Applying Feature Engineering Functions**

In [95]:
# Spot-check the contextual columns on one match: first 20 events, key columns only.
_spot_check_match_id = 2128665
_spot_check_columns = [
    # who is on the ball and which teams are involved
    'playerName', 'playerTeamId', 'homeTeamId', 'awayTeamId', 'possessionTeamId',
    # what happened and when
    'eventType', 'eventSubType', 'gameTime', 'timeStamp',
    # possession-sequence columns
    'possessionSequenceIndex', 'possessionStartTime', 'possessionTimeSec', 'playerPossessionTimeSec',
    # game-state columns
    'goalDelta', 'numReds',
]

df.loc[df['matchId'] == _spot_check_match_id, _spot_check_columns].head(20)



Unnamed: 0,playerName,playerTeamId,homeTeamId,awayTeamId,possessionTeamId,eventType,eventSubType,gameTime,timeStamp,possessionSequenceIndex,possessionStartTime,possessionTimeSec,playerPossessionTimeSec,goalDelta,numReds
3124487,Fábio Silva,39,39,1,39,attack,Pass,0:0,2021-05-23 16:00:31.727,1,2021-05-23 16:00:31.727,0.0,0.0,0,0
3124488,Rúben Neves,39,39,1,39,attack,Pass,0:1,2021-05-23 16:00:33.443,1,2021-05-23 16:00:31.727,1.716,1.716,0,0
3124489,Romain Saïss,39,39,1,39,attack,Pass,0:4,2021-05-23 16:00:36.586,1,2021-05-23 16:00:31.727,4.859,3.143,0,0
3124490,Conor Coady,39,39,1,39,attack,Pass,0:10,2021-05-23 16:00:42.266,1,2021-05-23 16:00:31.727,10.539,5.68,0,0
3124491,Rayan Aït-Nouri,39,39,1,39,attack,Pass,0:13,2021-05-23 16:00:45.563,1,2021-05-23 16:00:31.727,13.836,3.297,0,0
3124492,Willy Boly,39,39,1,39,attack,Pass,0:16,2021-05-23 16:00:48.293,1,2021-05-23 16:00:31.727,16.566,2.73,0,0
3124493,Conor Coady,39,39,1,39,attack,Pass,0:21,2021-05-23 16:00:53.520,1,2021-05-23 16:00:31.727,21.793,5.227,0,0
3124494,Rayan Aït-Nouri,39,39,1,39,attack,Pass,0:25,2021-05-23 16:00:56.871,1,2021-05-23 16:00:31.727,25.144,3.351,0,0
3124495,Willy Boly,39,39,1,39,attack,Pass,0:29,2021-05-23 16:01:00.838,1,2021-05-23 16:00:31.727,29.111,3.967,0,0
3124496,Conor Coady,39,39,1,39,attack,Pass,0:31,2021-05-23 16:01:02.645,1,2021-05-23 16:00:31.727,30.918,1.807,0,0


In [96]:
# column check: the output below lists the contextual columns (eventId, possession*, goals*,
# goalDelta, redCardFlag, numReds) after the original columns ending in `xT`
df.columns

Index(['competition', 'season', 'seasonIndex', 'gameMonthIndex', 'matchId',
       'playerId', 'playerName', 'position', 'detailedPosition',
       'playerTeamId', 'minsPlayed', 'subIn', 'subOut',
       'replacedReplacingPlayerId', 'booking', 'eventType', 'eventSubType',
       'eventTypeId', 'x1', 'y1', 'x2', 'y2', 'gameTime', 'timeStamp',
       'periodId', 'homeTeamName', 'homeTeamId', 'awayTeamName', 'awayTeamId',
       'kickOffDateTime', 'minute', 'second', 'x1_m', 'y1_m', 'x2_m', 'y2_m',
       'xT', 'eventId', 'possessionTeamId', 'possessionSequenceIndex',
       'possessionStartTime', 'possessionTimeSec', 'playerPossessionTimeSec',
       'goalScoredFlag', 'goalsConcededFlag', 'goalsScored', 'goalsConceded',
       'goalDelta', 'redCardFlag', 'numReds'],
      dtype='object')