In [1]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from scipy.spatial import distance

from sklearn.naive_bayes import MultinomialNB 

In [2]:
import re
import pickle

In [3]:
import statsbomb as sb

In [4]:
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.io import output_notebook, output_file

### loading dataset

In [5]:
path_dataset = os.path.abspath(os.path.join(os.sep, os.getcwd(), os.pardir, 'data', 'scisports-shots.parquet'))

df_dataset = pd.read_parquet(path_dataset)

In [6]:
df_dataset.columns

Index(['action_game_id', 'action_team_id', 'action_player_id', 'action_period',
       'action_seconds', 'action_type_id', 'action_type_name',
       'action_body_part_id', 'action_result', 'action_start_x',
       'action_start_y', 'action_end_x', 'action_end_y', 'action1_game_id',
       'action1_team_id', 'action1_player_id', 'action1_period',
       'action1_seconds', 'action1_type_id', 'action1_type_name',
       'action1_body_part_id', 'action1_result', 'action1_start_x',
       'action1_start_y', 'action1_end_x', 'action1_end_y', 'action2_game_id',
       'action2_team_id', 'action2_player_id', 'action2_period',
       'action2_seconds', 'action2_type_id', 'action2_type_name',
       'action2_body_part_id', 'action2_result', 'action2_start_x',
       'action2_start_y', 'action2_end_x', 'action2_end_y'],
      dtype='object')

### normalize

In [7]:
for action in ['action', 'action1', 'action2']:
    for side in ['start', 'end']:
        
        # Normalize the X location
        key_x = '{}_{}_x'.format(action, side)
        df_dataset[key_x] = df_dataset[key_x] / 105
               
        # Normalize the Y location
        key_y = '{}_{}_y'.format(action, side)
        df_dataset[key_y] = df_dataset[key_y] / 68

### construct test

In [8]:
goal = (1, 0.5)

In [9]:
for action in ['action', 'action1', 'action2']:
    key_start_x = '{action}_start_x'.format(action=action)
    key_start_y = '{action}_start_y'.format(action=action)
    key_start_distance = '{action}_start_distance'.format(action=action)

    df_dataset[key_start_distance] = df_dataset.apply(lambda s: distance.euclidean((s[key_start_x], s[key_start_y]), goal), axis=1)

In [10]:
# columns_features = ['action_start_x', 'action_start_y', 'action_body_part_id', 'action_start_distance', 'action1_start_distance', 'action2_start_distance']
columns_features = ['action_start_x', 'action_start_y', 'action_body_part_id', 'action_start_distance']

column_target = 'action_result'

In [11]:
X = df_dataset[columns_features]
y = df_dataset[column_target]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0) # test_size=0.10)

### model

In [13]:
b_classifier = MultinomialNB()
b_classifier = b_classifier.fit(X_train, y_train)

In [14]:
with open('b_classifier.pickle.dat', 'wb') as f:
    pickle.dump(b_classifier, f)

### evaluate

In [15]:
# y_pred = b_classifier.predict_proba(X_test)

In [16]:
# y_pred[:, 1]

### statsbomb test

In [17]:
BvsE = sb.Events(event_id='8657')

In [18]:
# df_pass = BvsE.get_dataframe(event_type='pass')

In [19]:
# df_pass.columns

In [20]:
# df_pass.to_csv('sb_passes.csv', sep=';')
# del df_pass

In [21]:
df_shot = BvsE.get_dataframe(event_type='shot')

In [22]:
df_shot.columns

Index(['event_type', 'id', 'index', 'period', 'timestamp', 'minute', 'second',
       'possession', 'possession_team', 'play_pattern', 'off_camera', 'team',
       'player', 'position', 'duration', 'under_pressure', 'statsbomb_xg',
       'key_pass_id', 'body_part', 'type', 'outcome', 'technique',
       'first_time', 'follows_dribble', 'redirect', 'one_on_one', 'open_goal',
       'deflected', 'start_location_x', 'start_location_y', 'end_location_x',
       'end_location_y', 'end_location_z'],
      dtype='object')

In [23]:
# df_shot.to_csv('sb_shots.csv', sep=';')

In [24]:
for side in ['start', 'end']:
    key_x = '{0}_location_x_normalized'.format(side)
    key_y = '{0}_location_y_normalized'.format(side)
    
    df_shot[key_x] = df_shot[key_x[:-11]] / max(df_shot.start_location_x)
    df_shot[key_y] = df_shot[key_y[:-11]] / max(df_shot.start_location_y)

In [25]:
df_shot.body_part = df_shot.body_part.apply(lambda part: 0 if re.findall(r'Foot', part) else 1 if re.findall(r'Head', part) else 2)

In [26]:
key_start_x = 'start_location_x_normalized'
key_start_y = 'start_location_y_normalized'
key_start_distance = 'start_distance'.format(action=action)

df_shot[key_start_distance] = df_shot.apply(lambda s: distance.euclidean((s[key_start_x], s[key_start_y]), goal), axis=1)

In [27]:
res = {}
fields = ['start_location_x_normalized', 'start_location_y_normalized', 'body_part', 'start_distance']
df = df_shot[fields]

for i in range(df_shot.index.size-1):
    key = '{0}'.format(df_shot.player[i])
    data = pd.DataFrame([([df[df.columns[j]][i] for j in range(df.columns.size)])], columns=fields)
    val = round(b_classifier.predict_proba(data)[:, 1][0], 7)
    res[key] = (val, df_shot.statsbomb_xg[i], df_shot.start_location_x[i], df_shot.start_location_y[i])

In [28]:
res

{'Dries Mertens': (0.1063195, 0.05515628, 104.0, 30.0),
 'Eden Hazard': (0.0936771, 0.0795171, 106.0, 51.0),
 'Eric Jeremy Edgar Dier': (0.1219629, 0.0589808, 109.0, 43.0),
 'Fabian Delph': (0.100927, 0.019933008, 91.0, 31.0),
 'Harry Kane': (0.0949321, 0.030152736, 100.0, 48.0),
 'Harry Maguire': (0.1309105, 0.035002366, 111.0, 33.0),
 'Jesse Lingard': (0.0891503, 0.022281447, 107.0, 58.0),
 'Kevin De Bruyne': (0.0928133, 0.061722323, 104.0, 52.0),
 'Marcus Rashford': (0.102992, 0.027446777, 99.0, 23.0),
 'Raheem Shaquille Sterling': (0.103535, 0.034227762, 97.0, 29.0),
 'Romelu Lukaku Menama': (0.1063195, 0.06716168, 104.0, 30.0),
 'Ruben Loftus-Cheek': (0.120167, 0.044667743, 108.0, 45.0),
 'Thomas Meunier': (0.091247, 0.06443123, 109.0, 55.0),
 'Toby Albertine Maurits Alderweireld': (0.1060919, 0.111274794, 110.0, 34.0),
 'Youri Tielemans': (0.0973381, 0.02890598, 96.0, 43.0)}

In [29]:
x = [res[key][2] for key in res.keys()]
y = [res[key][3] for key in res.keys()]

output_notebook()
output_file("BayG.html")

source = ColumnDataSource(data=dict(
    x = x,
    y = y,
    player = [key for key in res.keys()],
    xg = [str(round(res[key][0], 2)) for key in res.keys()],
    sbxg = [str(round(res[key][1], 2)) for key in res.keys()],
    rad = [res[key][0]*10 for key in res.keys()],
    colors = ["#%02x%02x%02x" % (int(r), int(g), 150) for r, g in zip(10*np.array(x), 10*np.array(y))],
))

TOOLTIPS = [
    ("index", "$index"),
    ("player", "@player"),
    ("xg", "@xg"),
    ("sbxg", "@sbxg")
]

p = figure(x_range=(0, 120), y_range=(0, 90), plot_width=120*5, plot_height=90*5, tooltips=TOOLTIPS,
           title="xG")

p.circle('x', 'y', radius='rad', fill_color='colors', source=source) 

show(p)