In [1]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from scipy.spatial import distance

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

from tqdm import tqdm

In [2]:
import re
import pickle

In [3]:
import statsbomb as sb

  columns = yaml.load(open(os.path.join(os.path.dirname(__file__), 'events.yaml')))


In [4]:
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.io import output_notebook, output_file

### Loading Dataset

In [5]:
path_dataset = os.path.abspath(os.path.join(os.sep, os.getcwd(), os.pardir, 'data', 'scisports-shots.parquet'))

df_dataset = pd.read_parquet(path_dataset)

### Normalize

In [6]:
for action in ['action', 'action1', 'action2']:
    for side in ['start', 'end']:
        
        # Normalize the X location
        key_x = '{}_{}_x'.format(action, side)
        df_dataset[key_x] = df_dataset[key_x] / 105
               
        # Normalize the Y location
        key_y = '{}_{}_y'.format(action, side)
        df_dataset[key_y] = df_dataset[key_y] / 68

In [7]:
goal = (1, 0.5)

In [8]:
for action in ['action', 'action1', 'action2']:
    key_start_x = '{action}_start_x'.format(action=action)
    key_start_y = '{action}_start_y'.format(action=action)
    key_start_distance = '{action}_start_distance'.format(action=action)

    df_dataset[key_start_distance] = df_dataset.apply(lambda s: distance.euclidean((s[key_start_x], s[key_start_y]), goal), axis=1)

In [9]:
columns_features = ['action_start_x', 'action_start_y', 'action_body_part_id', 'action_start_distance']

column_target = 'action_result'

In [10]:
X = df_dataset[columns_features]
y = df_dataset[column_target]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001) # test_size=0.10)

In [18]:
X_train[0:1].values, y_train[0:1].values

(array([[0.75114286, 0.45176471, 0.        , 0.2534887 ]]), array([0]))

In [152]:
class SimpleXG(nn.Module):

    def __init__(self, embedding_dim, input_size, n_classes):
        super(SimpleXG, self).__init__()

        self.fc1 = nn.Linear(input_size, embedding_dim)

        self.fc2 = nn.Linear(embedding_dim, embedding_dim)
        
        self.fc3 = nn.Linear(embedding_dim, n_classes)
        
        self.sm = sm = nn.Softmax(dim=1)

    def forward(self, x):
        z1 = self.fc1(x)
        z2 = self.fc2(z1)
        z3 = self.fc3(z2)
        output = self.sm(z3)
        return out

In [153]:
def train(model, X, Y, loss_function, optimizer, num_epoch=2):
    avg_loss = []

    for epoch in range(num_epoch): # tqdm(range(200)):

        running_loss = []
        for i in range(X.index.size):
            batch = torch.tensor(np.array(X.iloc[i])).view(1, -1).type('torch.FloatTensor')
            targets = torch.tensor(np.array(y_train.iloc[i])).view(-1).type('torch.LongTensor')
            model.zero_grad()

            tag_scores = model(batch)

            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

            for p in model.parameters(): # making weights non-negative
                p.data.clamp_(0)

            running_loss.append(loss.item())

        avg_loss.append(round(sum(running_loss) / len(running_loss), 4))
        print('epo: {0}, loss: {1}'.format(epoch, avg_loss[-1]))
    return model, avg_loss

In [154]:
EMBEDDING_DIM = 4
N_CLASSES = 2
INPUT_SIZE = len(columns_features)

model = SimpleXG(EMBEDDING_DIM, INPUT_SIZE, N_CLASSES)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

In [108]:
data = {}
for i in range(X_train.index.size):
    data[i] = (X_train.iloc[i][0], X_train.iloc[i][1], 
               X_train.iloc[i][2], X_train.iloc[i][3],
               y_train.iloc[i])

In [110]:
df = pd.DataFrame(data.values(), columns=columns_features+[column_target])

In [114]:
df[:1]

Unnamed: 0,action_start_x,action_start_y,action_body_part_id,action_start_distance,action_result
0,0.751143,0.451765,0.0,0.253489,0


In [115]:
df1 = df[df.action_result == 1]
df0 = df[df.action_result == 0].sample(df1.index.size)
dfj = pd.concat([df0, df1])
dfj = dfj.sample(frac=1).reset_index(drop=True)

In [116]:
xj = dfj[columns_features]
yj = dfj[[column_target]]

In [155]:
model, avg_loss = train(model, X_train, y_train, loss_function, optimizer, num_epoch=10)

ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 4])

In [144]:
def proba(x, model):
    batch = torch.tensor(x).type('torch.FloatTensor')
    with torch.no_grad():
        pred = model(batch)
    return round(float(pred[0][1]), 4)

### statsbomb test

In [145]:
BvsE = sb.Events(event_id='8657')

df_shot = BvsE.get_dataframe(event_type='shot')

for side in ['start', 'end']:
    key_x = '{0}_location_x_normalized'.format(side)
    key_y = '{0}_location_y_normalized'.format(side)
    
    df_shot[key_x] = df_shot[key_x[:-11]] / max(df_shot.start_location_x)
    df_shot[key_y] = df_shot[key_y[:-11]] / max(df_shot.start_location_y)
    
df_shot.body_part = df_shot.body_part.apply(lambda part: 0 if re.findall(r'Foot', part) else 1 if re.findall(r'Head', part) else 2)

key_start_x = 'start_location_x_normalized'
key_start_y = 'start_location_y_normalized'
key_start_distance = 'start_distance'.format(action=action)

df_shot[key_start_distance] = df_shot.apply(lambda s: distance.euclidean((s[key_start_x], s[key_start_y]), goal), axis=1)

res = {}
fields = ['start_location_x_normalized', 'start_location_y_normalized', 'body_part', 'start_distance']
df = df_shot[fields]

for i in range(df_shot.index.size-1):
    key = '{0}'.format(df_shot.player[i])
    data = pd.DataFrame([([df[df.columns[j]][i] for j in range(df.columns.size)])], columns=fields)
    val = round(proba(data[0:1].values, model), 7)
    res[key] = (val, df_shot.statsbomb_xg[i], df_shot.start_location_x[i], df_shot.start_location_y[i])

In [146]:
x = [res[key][2] for key in res.keys()]
y = [res[key][3] for key in res.keys()]

output_notebook()
# output_file("BayG.html")

source = ColumnDataSource(data=dict(
    x = x,
    y = y,
    player = [key for key in res.keys()],
    xg = [str(round(res[key][0], 2)) for key in res.keys()],
    sbxg = [str(round(res[key][1], 2)) for key in res.keys()],
    rad = [res[key][0]*10 for key in res.keys()],
    colors = ["#%02x%02x%02x" % (int(r), int(g), 150) for r, g in zip(10*np.array(x), 10*np.array(y))],
))

TOOLTIPS = [
    ("index", "$index"),
    ("player", "@player"),
    ("xg", "@xg"),
    ("sbxg", "@sbxg")
]

p = figure(x_range=(0, 120), y_range=(0, 90), plot_width=120*5, plot_height=90*5, tooltips=TOOLTIPS,
           title="xG")

p.circle('x', 'y', radius='rad', fill_color='colors', source=source) 

show(p)

 50%|████▉     | 63569/127515 [01:20<01:12, 886.21it/s]

In [83]:
res

{'Thomas Meunier': (0.0, 0.0670284, 109.0, 55.0),
 'Raheem Shaquille Sterling': (0.0, 0.029468331, 97.0, 29.0),
 'Kevin De Bruyne': (0.0, 0.051837448, 104.0, 52.0),
 'Fabian Delph': (0.0, 0.014406141, 91.0, 31.0),
 'Ruben Loftus-Cheek': (0.0, 0.04835619, 108.0, 45.0),
 'Harry Maguire': (0.0, 0.036430676, 111.0, 33.0),
 'Harry Kane': (0.0, 0.03146625, 100.0, 48.0),
 'Youri Tielemans': (0.0, 0.022493094, 96.0, 43.0),
 'Eden Hazard': (0.0, 0.0906536, 106.0, 51.0),
 'Toby Alderweireld': (0.0, 0.07563523, 110.0, 34.0),
 'Romelu Lukaku Menama': (0.0, 0.078560986, 104.0, 30.0),
 'Jesse Lingard': (0.0, 0.014216057, 107.0, 58.0),
 'Eric Dier': (0.0, 0.06710211, 109.0, 43.0),
 'Dries Mertens': (0.0, 0.0634691, 104.0, 30.0),
 'Marcus Rashford': (0.0, 0.03826514, 99.0, 23.0)}

In [None]:
events = ['19714', '19715', '19716', '19717', '19718', '19719', '19720',
          '19722', '19723', '19724', '19725', '19726', '19727', '19728',
          '19730', '19731', '19732', '19733', '19734', '19735', '19736',
          '19738', '19739', '19740', '19741', '19742', '19743', '19744',
          '19745', '19746', '19747', '19748', '19749', '19750', '19751',
          '19752', '19753', '19759', '19760', '19761', '19762', '19763',
          '19765', '19766', '19767', '19768', '7298', '7443', '7444', '7445',
          '7456', '7457', '7471', '7472', '7473', '7474', '7475', '7476',
          '7477', '7478', '7479', '7480', '7482', '7483', '7484', '7485',
          '7486', '7487', '7490', '7492', '7493', '7494', '7496', '7497',
          '7500', '7519', '7520', '7521', '7522', '7523', '7524', '7525',
          '7529', '7530', '7531', '7532', '7533', '7534', '7535', '7536',
          '7537', '7538', '7539', '7540', '7541', '7542', '7543', '7544',
          '7545', '7546', '7547', '7548', '7549', '7550', '7551', '7552',
          '7553', '7554', '7555', '7556', '7557', '7558', '7559', '7560',
          '7561', '7562', '7563', '7564', '7565', '7566', '7567', '7568',
          '7569', '7570', '7571', '7572', '7576', '7577', '7578', '7579',
          '7580', '7581', '7582', '7583', '7584', '7585', '7586', '8649',
          '8650', '8651', '8652', '8655', '8656', '8657', '8658']

seq = []
for event in tqdm(events):
    BvsE = sb.Events(event_id=event)
    df_shot = BvsE.get_dataframe(event_type='shot')
    
    for side in ['start', 'end']:
        key_x = '{0}_location_x_normalized'.format(side)
        key_y = '{0}_location_y_normalized'.format(side)

        df_shot[key_x] = df_shot[key_x[:-11]] / max(df_shot.start_location_x)
        df_shot[key_y] = df_shot[key_y[:-11]] / max(df_shot.start_location_y) 
    df_shot.body_part = df_shot.body_part.apply(lambda part: 0 if re.findall(r'Foot', part) else 1 if re.findall(r'Head', part) else 2)
    
    key_start_x = 'start_location_x_normalized'
    key_start_y = 'start_location_y_normalized'
    key_start_distance = 'start_distance'.format(action=action)
    df_shot[key_start_distance] = df_shot.apply(lambda s: distance.euclidean((s[key_start_x], s[key_start_y]), goal), axis=1)

    res = {}
    fields = ['start_location_x_normalized', 'start_location_y_normalized', 'body_part', 'start_distance']
    df = df_shot[fields]

    for i in range(df_shot.index.size-1):
        key = '{0}'.format(df_shot.player[i])
        data = pd.DataFrame([([df[df.columns[j]][i] for j in range(df.columns.size)])], columns=fields)
        val = round(proba(data[0:1].values, model), 7)
        res[key] = (val, df_shot.statsbomb_xg[i], df_shot.start_location_x[i], df_shot.start_location_y[i])
    
    for name, (xg, sbxg, _, _) in res.items():
        seq.append((xg, sbxg))