### Packages

In [3]:
import os
import ast
import pandas as pd
import numpy as np

### Data

In [100]:
# read shots.csv file 
shots = pd.read_csv('.data/shots.csv')
shots.loc[:, 'freeze_frame'] = shots.loc[:, 'freeze_frame'].apply(ast.literal_eval)
shots.loc[:, 'gk_loc'] = shots.loc[:, 'gk_loc'].apply(ast.literal_eval)
shots.loc[:, 'end_loc'] = shots.loc[:, 'end_loc'].apply(ast.literal_eval)

# rename existing 'outcome' column to 'outcome_type' 
shots = shots.rename(columns = {'outcome': 'outcome_type'})
# save binary results into a newly created 'outcome' column
shots.loc[:, 'outcome'] = shots.loc[:, 'outcome_type'].apply(lambda x: 1 if x == 'Goal' else 0)

### Features

In [101]:
#| code-fold: false
# Distance Feature calculation

# define goal center for 'statsbomb'
goal_center = np.array([120, 40])

# calculate distance between a shot coordinate and goal centerline coordinate
shots['distance'] = np.sqrt((shots['x_start'] - goal_center[0])**2 + (shots['y_start'] - goal_center[1])**2)
shots['distance'] = shots['distance'].round(decimals = 2)

In [102]:
#| code-fold: false
# Angle Feature calculation

# transform (x, y) coordinates from percentiles to field length coordinates (105 meters x 68 meters)
x = shots['x_start'] * 105/120
y = shots['y_start'] * 68/80 

# Use trigonometric formula to find an angle between two sides (a,b) of a triangle where the third side (c) 
# is a goal line of length 7.32 meters.
a = np.sqrt((x - 105)**2 + (y - 30.34)**2) # length between right post and (x, y) shot coordinate
b = np.sqrt((x - 105)**2 + (y - 37.66)**2) # length between left post and (x, y) shot coordinate
c = 7.32 # goal line length in meters
cos_alpha = (a**2 + b**2 - c**2)/(2*a*b)
cos_alpha = np.round(cos_alpha, decimals = 4)

# remember to leave angle in radians (if you want to transfer to degree multiply cos_alpha by 180/pi)
shots['angle'] = np.arccos(cos_alpha)

### Outliers

In [95]:
shots = shots.loc[~((shots['play_pattern_name'] == 'Other') | (shots['play_pattern_name'] == 'From Keeper' ) 
| (shots['play_pattern_name'] == 'From Kick Off') | (shots['body_part'] == 'Other')),  :]

### Transforming and Splitting Data

In [103]:
from sklearn.model_selection import train_test_split
# Prepare features and labels from available data
X = shots.loc[:, ['play_pattern_name','under_pressure', 'distance', 'angle', 'gk_loc_x', 'gk_loc_y',
                   'follows_dribble', 'first_time', 'open_goal', 'technique', 'body_part']]
y = shots.loc[:, 'outcome']

# split data
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [104]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Build a column transformer
column_trans = ColumnTransformer(
    [('encode_bodyparts', OneHotEncoder(dtype='int'), ['play_pattern_name', 'technique', 'body_part']),
    ('std_coords', StandardScaler(), ['distance', 'angle', 'gk_loc_x'])],
    remainder = 'passthrough', verbose_feature_names_out = True)

# Transform feature columns
#X_train = column_trans.fit_transform(X_train)
#X_test = column_trans.transform(X_test)
X = column_trans.fit_transform(X)

### Base Model

In [66]:
from sklearn.model_selection import train_test_split
# Prepare features and labels from available data
X = shots.loc[:, ['distance', 'angle']]
y = shots.loc[:, 'outcome']

# split data
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [67]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Build a column transformer
column_trans = ColumnTransformer([('std_coords', StandardScaler(), ['distance', 'angle'])],
    remainder = 'passthrough', verbose_feature_names_out = True)

# Transform feature columns
X = column_trans.fit_transform(X)
#X_test = column_trans.transform(X_test)

### Model Testing

In [53]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, brier_score_loss

classifier = XGBClassifier(gamma = 6, min_child_weight = 60, max_depth = 10)
classifier.fit(X_train, y_train)


# Evaluate on train data
predictions = classifier.predict_proba(X_train)[:, 1]
print('Brier score = ', brier_score_loss(y_train, predictions))
print('ROC-AUC = ', roc_auc_score(y_train, predictions))


# Evaluate on test data
predictions = classifier.predict_proba(X_test)[:, 1]
print('Brier score = ', brier_score_loss(y_test, predictions))
print('ROC-AUC = ', roc_auc_score(y_test, predictions))



Brier score =  0.08049908253545875
ROC-AUC =  0.7985446298517084
Brier score =  0.0784908283739884
ROC-AUC =  0.7823180065349857


In [105]:
from sklearn.linear_model import LogisticRegression

# Model
classifier = LogisticRegression( penalty = 'none', max_iter = 400, random_state = 42)

# Hyperparameters
#parameters = dict(C = uniform(loc = 0, scale = 4), 
#                  penalty = ['l2', 'l1'])

# Classifier
#classifier = RandomizedSearchCV(model, parameters, random_state = 42, 
#                                cv = 10, scoring = 'neg_brier_score')

#classifier.fit(X_train, y_train)
#print('Optimal parameters are:\n', classifier.best_params_)

# Evaluate on train data
# predictions = classifier.predict_proba(X_train)[:, 1]
# print('Brier score = ', brier_score_loss(y_train, predictions))
# print('ROC-AUC = ', roc_auc_score(y_train, predictions))


# # Evaluate on test data
# predictions = classifier.predict_proba(X_test)[:, 1]
# print('Brier score = ', brier_score_loss(y_test, predictions))
# print('ROC-AUC = ', roc_auc_score(y_test, predictions))

In [106]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(classifier, X, y, cv = 10, scoring = 'roc_auc')

In [94]:
cv_results['test_score'].mean()

0.7885714437322404

In [107]:
cv_results['test_score'].mean()

0.786948576771249