In [14]:
# https://github.com/statsbomb/open-data
# https://github.com/wswager/womens_soccer_expected_goals_model/blob/main/data_preprocessing/expected_goals_data_preprocessing_notebook.ipynb
# https://github.com/Friends-of-Tracking-Data-FoTD/SoccermaticsForPython/tree/master
# https://soccermatics.readthedocs.io/en/latest/gallery/lesson2/plot_xGModelFit.html
# https://www.youtube.com/watch?v=310_eW0hUqQ&ab_channel=FriendsofTracking
# https://twitter.com/LanusStats/status/1456381238252605443
# https://theanalyst.com/eu/2021/06/que-son-los-goles-esperados-xg/

from statsbombpy import sb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mplsoccer import Pitch, VerticalPitch
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

In [3]:
tiros_statsbomb = pd.read_csv('tirosStatsBomb_modeloxG_v2.csv')

In [4]:
tiros_statsbomb.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'bad_behaviour_card',
       'ball_receipt_outcome', 'ball_recovery_recovery_failure',
       'block_deflection', 'block_offensive', 'carry_end_location',
       'clearance_aerial_won', 'clearance_body_part',
       ...
       'goalkeeper_lost_in_play', 'pass_backheel', 'x', 'y',
       'player_off_permanent', 'goalkeeper_lost_out', 'goalkeeper_success_out',
       'half_end_early_video_end', 'goalkeeper_penalty_saved_to_post',
       'goalkeeper_saved_to_post'],
      dtype='object', length=119)

In [5]:
tiros_statsbomb.columns.str.contains('shot')

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False,  True,  True, False,  True,  True, False,
        True, False, False, False, False, False, False, False, False,
       False, False])

In [8]:
booleanos = pd.Series(tiros_statsbomb.columns).str.contains('shot')
df_test = pd.concat([pd.Series(tiros_statsbomb.columns), booleanos], axis=1)
df_test[df_test.iloc[:,1] == True]

Unnamed: 0,0,1
41,pass_assisted_shot_id,True
54,pass_shot_assist,True
67,shot_aerial_won,True
68,shot_body_part,True
69,shot_deflected,True
70,shot_end_location,True
71,shot_first_time,True
72,shot_freeze_frame,True
73,shot_key_pass_id,True
74,shot_one_on_one,True


In [13]:
tiros_filt = tiros_statsbomb[['shot_aerial_won',             
'shot_body_part',               
'shot_first_time',
'shot_deflected',             
'shot_one_on_one',             
'shot_open_goal',              
'shot_outcome',                 
'shot_technique',               
'shot_type',              
'play_pattern',                 
'x',
'y'
]].reset_index(drop=True)
tiros_filt['goal'] = np.where(tiros_filt.shot_outcome == 'Goal', 1,0)
tiros_filt['Distance'] = np.sqrt(np.square(120-tiros_filt['x']) + np.square(40 - tiros_filt['y']))
tiros_filt['angulo'] = np.arctan(7.32 *tiros_filt['x'] /(tiros_filt['x']**2 + tiros_filt['y']**2 - (7.32/2)**2))
tiros_filt.shot_aerial_won = tiros_filt.shot_aerial_won.fillna(False)
tiros_filt.shot_first_time = tiros_filt.shot_first_time.fillna(False)
tiros_filt.shot_one_on_one = tiros_filt.shot_one_on_one.fillna(False)
tiros_filt.shot_open_goal = tiros_filt.shot_open_goal.fillna(False)
tiros_filt.shot_deflected = tiros_filt.shot_open_goal.fillna(False)
tiros_filt.head()

Unnamed: 0,shot_aerial_won,shot_body_part,shot_first_time,shot_deflected,shot_one_on_one,shot_open_goal,shot_outcome,shot_technique,shot_type,play_pattern,x,y,goal,Distance,angulo
0,False,Left Foot,True,False,False,False,Blocked,Half Volley,Open Play,From Kick Off,96.0,38.8,0,24.029981,0.065531
1,False,Left Foot,True,False,False,False,Saved,Half Volley,Open Play,From Kick Off,113.1,40.7,0,6.935416,0.057292
2,False,Right Foot,True,False,False,False,Saved,Half Volley,Open Play,From Kick Off,103.8,41.9,0,16.311039,0.06063
3,False,Head,False,False,False,False,Off T,Normal,Open Play,From Corner,112.2,36.8,0,8.430896,0.058893
4,False,Left Foot,False,False,False,False,Post,Normal,Open Play,Regular Play,97.8,51.5,0,25.0018,0.058595


In [19]:
tiros_filt.head()

Unnamed: 0,shot_aerial_won,shot_body_part,shot_first_time,shot_deflected,shot_one_on_one,shot_open_goal,shot_outcome,shot_technique,shot_type,play_pattern,x,y,goal,Distance,angulo
0,False,Left Foot,True,False,False,False,Blocked,Half Volley,Open Play,From Kick Off,96.0,38.8,0,24.029981,0.065531
1,False,Left Foot,True,False,False,False,Saved,Half Volley,Open Play,From Kick Off,113.1,40.7,0,6.935416,0.057292
2,False,Right Foot,True,False,False,False,Saved,Half Volley,Open Play,From Kick Off,103.8,41.9,0,16.311039,0.06063
3,False,Head,False,False,False,False,Off T,Normal,Open Play,From Corner,112.2,36.8,0,8.430896,0.058893
4,False,Left Foot,False,False,False,False,Post,Normal,Open Play,Regular Play,97.8,51.5,0,25.0018,0.058595


In [22]:
y = tiros_filt['goal']
X = tiros_filt.drop(columns=['goal', 'shot_outcome'])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    stratify = y,
                                                    random_state = 13)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['shot_body_part', 'shot_technique', 'shot_type', 'play_pattern'])
    ],
    remainder= 'passthrough'
)

model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

model.fit(X_test, y_test)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['shot_body_part',
                                                   'shot_technique',
                                                   'shot_type',
                                                   'play_pattern'])])),
                ('classifier', LogisticRegression())])

In [24]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
from sklearn.metrics import confusion_matrix
preds = model.predict(X_test)
df_cm = pd.DataFrame(confusion_matrix(y_test, preds))  # return  tn, fp, fn, tp

VP = df_cm.iloc[1,1]
VN = df_cm.iloc[0,0]
FN = df_cm.iloc[1,0]
FP = df_cm.iloc[0,1]
exactitud = accuracy = (VP + VN)/(VP + VN + FN + FP)
precision = VP/(VP + FP)
sensibilidad = VP/(VP + FN)
especifidad = VN/(FP + VN)

print('La exactitud del modelo elegido es: ' + str(exactitud))
print('La precision del modelo elegido es: ' + str(precision))
print('La sensibilidad del modelo elegido es: ' + str(sensibilidad))
print('La especifidad del modelo elegido es: ' + str(especifidad))

La exactitud del modelo elegido es: 0.9009216589861752
La precision del modelo elegido es: 0.6691729323308271
La sensibilidad del modelo elegido es: 0.1718146718146718
La especifidad del modelo elegido es: 0.9896616541353384


In [30]:
probabilities = model.predict_proba(X)[:,1]
probabilities = pd.DataFrame(probabilities, columns=['xG'])
X_with_prob = pd.concat([X.reset_index(drop=True), probabilities], axis=1)
X_with_prob

Unnamed: 0,shot_aerial_won,shot_body_part,shot_first_time,shot_deflected,shot_one_on_one,shot_open_goal,shot_technique,shot_type,play_pattern,x,y,Distance,angulo,xG
0,False,Left Foot,True,False,False,False,Half Volley,Open Play,From Kick Off,96.0,38.8,24.029981,0.065531,0.037184
1,False,Left Foot,True,False,False,False,Half Volley,Open Play,From Kick Off,113.1,40.7,6.935416,0.057292,0.353327
2,False,Right Foot,True,False,False,False,Half Volley,Open Play,From Kick Off,103.8,41.9,16.311039,0.060630,0.121333
3,False,Head,False,False,False,False,Normal,Open Play,From Corner,112.2,36.8,8.430896,0.058893,0.113592
4,False,Left Foot,False,False,False,False,Normal,Open Play,Regular Play,97.8,51.5,25.001800,0.058595,0.034569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23862,False,Right Foot,False,False,False,False,Normal,Open Play,Regular Play,109.1,29.8,14.928161,0.062420,0.131528
23863,False,Right Foot,False,False,True,False,Normal,Open Play,From Counter,114.6,51.4,12.614278,0.053172,0.304743
23864,False,Right Foot,False,False,False,False,Normal,Open Play,From Throw In,98.3,56.9,27.504545,0.055777,0.024224
23865,False,Left Foot,False,False,False,False,Normal,Open Play,Regular Play,102.9,58.1,24.900201,0.053940,0.036927


In [34]:
X_with_prob[X_with_prob['shot_type'] == 'Penalty']

Unnamed: 0,shot_aerial_won,shot_body_part,shot_first_time,shot_deflected,shot_one_on_one,shot_open_goal,shot_technique,shot_type,play_pattern,x,y,Distance,angulo,xG
128,False,Right Foot,False,False,False,False,Normal,Penalty,Other,108.0,40.0,12.000000,0.059592,0.643942
147,False,Left Foot,False,False,False,False,Normal,Penalty,Other,108.0,40.0,12.000000,0.059592,0.630049
161,False,Right Foot,False,False,False,False,Normal,Penalty,Other,108.1,40.1,11.900420,0.059514,0.647633
162,False,Left Foot,False,False,False,False,Normal,Penalty,Other,108.1,40.1,11.900420,0.059514,0.633802
163,False,Left Foot,False,False,False,False,Normal,Penalty,Other,108.1,40.1,11.900420,0.059514,0.633802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23242,False,Right Foot,False,False,True,False,Normal,Penalty,Other,107.7,39.9,12.300406,0.059753,0.660743
23269,False,Right Foot,False,False,False,False,Normal,Penalty,Other,108.3,39.9,11.700427,0.059502,0.654278
23438,False,Right Foot,False,False,False,False,Normal,Penalty,Other,108.0,40.0,12.000000,0.059592,0.643942
23648,False,Left Foot,False,False,False,False,Normal,Penalty,Other,108.0,40.0,12.000000,0.059592,0.630049
