In [21]:
import os
import json
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import matplotlib.pyplot as plt
from mplsoccer import Pitch
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense, ReLU, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, TerminateOnNaN, ReduceLROnPlateau, EarlyStopping, CSVLogger
import shap
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import timedelta

### Based on simple model by Paul Minogue : https://paulminogue.com/posts/8e4e1914-4cd9-4a0f-9cf0-9dbf3c47e1ce 
#### "Building a simple expected pass completion (xP) model using Keras"

#### First test just with man city data

Building pass event dataframe

In [22]:
event_data_dir = '../data/events'
pass_data = []

# iterate through each file in the events data directory and pull each event if it is a passing event
for filename in tqdm(os.listdir(event_data_dir)):
    if filename.endswith(".json"): 
        with open(os.path.join(event_data_dir, filename)) as json_file:
            match_id = int(filename[-6])
            data = json.load(json_file)
            for event in data:
                if event.get('type', {}).get('name', None) == 'Pass':
                    pass_data.append(
                        {
                            'match_id': match_id,
                            'event_id': event['id'],
                            'timestamp': event['timestamp'],
                            'period' : event['period'],
                            'passing_team': event.get('possession_team', {}).get('name', None),
                            'recipient_name': event.get('pass', {}).get('recipient', {}).get('name', None),
                            'passing_player': event.get('player', {}).get('name', None),
                            'passing_player_x_location': event['location'][0],
                            'passing_player_y_location': event['location'][1],
                            'pass_end_location_x': event.get('pass', {})['end_location'][0],
                            'pass_end_location_y': event.get('pass', {})['end_location'][1],
                            'pass_height_category': event.get('pass', {}).get('height', {}).get('name', None),
                            'body_part': event.get('pass', {}).get('body_part', {}).get('name', None),
                            'outcome': event.get('pass', {}).get('outcome', {}).get('name', None)
                        }
                    )
# Note that "None" outcome means complete
pass_df = pd.DataFrame(pass_data)
pass_df.head()

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,match_id,event_id,timestamp,period,passing_team,recipient_name,passing_player,passing_player_x_location,passing_player_y_location,pass_end_location_x,pass_end_location_y,pass_height_category,body_part,outcome
0,2,bc23853b-0aa8-42ac-9daa-deefa00d9de2,00:00:00.250,1,Aston Villa,Lucy Staniforth,Rachel Daly,60.0,40.0,49.8,38.5,Ground Pass,Right Foot,
1,2,1fe69497-98e8-4d4f-84c2-4ec034eb97ec,00:00:02.056,1,Aston Villa,Mayumi Pacheco,Lucy Staniforth,48.4,35.9,40.3,19.0,Ground Pass,Right Foot,
2,2,13549a51-9ecd-4bdb-94fd-c4982c1fded6,00:00:04.507,1,Aston Villa,Danielle Turner,Mayumi Pacheco,39.7,18.7,30.2,22.1,Ground Pass,Left Foot,
3,2,711c01d2-7bdb-4a2f-bf3c-a5a0baa809d5,00:00:07.681,1,Aston Villa,Anna Patten,Danielle Turner,28.0,25.5,23.5,53.6,Ground Pass,Left Foot,
4,2,32629e9e-92ae-4184-abec-bbc050519402,00:00:11.728,1,Aston Villa,Danielle Turner,Anna Patten,28.7,57.8,26.6,30.5,Ground Pass,Right Foot,


In [23]:
# Step 1: remove pass events we don't want
modelling_df = pass_df.loc[
    ~pass_df['outcome'].isin(['Injury Clearance', 'Pass Offside', 'Unknown'])
]

# Step 2: create one hot variables
# pass height and body part
one_hot_pass_height_variables = pd.get_dummies(modelling_df['pass_height_category'])
one_hot_body_part_variables = pd.get_dummies(modelling_df['body_part'])

# tidies up naming before appending row wise
one_hot_pass_height_variables.columns = [
    col.lower().replace(' ', '_') for col in one_hot_pass_height_variables.columns
]
one_hot_body_part_variables.columns = [
    col.lower().replace(' ', '_') for col in one_hot_body_part_variables.columns
]

modelling_df = pd.concat([modelling_df, one_hot_pass_height_variables], axis=1)
modelling_df = pd.concat([modelling_df, one_hot_body_part_variables], axis=1)

# Step 3: create binary pass complete column
modelling_df['completed'] = 0
modelling_df.loc[modelling_df['outcome'].isna(), 'completed'] = 1

# Step 4: finally filter down to the columns we want
modelling_cols = (
    [
        'event_id',
        'passing_player_x_location', 
        'passing_player_y_location', 
        'pass_end_location_x',
        'pass_end_location_y'
    ] + 
    list(one_hot_pass_height_variables.columns) +
    list(one_hot_body_part_variables.columns) + 
    ['completed']
)

modelling_df = modelling_df[modelling_cols]

In [24]:
modelling_df.to_pickle("../data/modelling_df_city.pkl")

Create Train and Test dataset

In [25]:
# Here X represents our predictors (location, height, body part) and y represents our target (completed)
X = modelling_df.iloc[:, 1:-1]
y = modelling_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=123, stratify=y
)


X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.25, random_state=123, stratify=y_test
)

### Oversample minority class

To review

In [26]:
negative_samples = y_train.value_counts()[0]
positive_samples = y_train.value_counts()[1]
current_negative_samples = pd.concat([X_train, y_train], axis=1)
current_negative_samples = current_negative_samples.loc[current_negative_samples['completed'] == 0].iloc[:, :-1]
additional_negative_samples = []
y_vals = []

print("Nombre de passes complétées: " + str(positive_samples))
print("Nombre de passes ratées: " + str(negative_samples))

while negative_samples < positive_samples:
    sample_for_smote = current_negative_samples.sample().to_dict(orient='records')[0]
    smote_player_x_loc = sample_for_smote['passing_player_x_location'] + random.uniform(-4, 4)
    # do a small bit of smote
    sample_for_smote['passing_player_x_location'] = (
        sample_for_smote['passing_player_x_location'] + random.uniform(-4, 4)
    )
    sample_for_smote['passing_player_y_location'] = (
        sample_for_smote['passing_player_y_location'] + random.uniform(-4, 4)
    )
    sample_for_smote['pass_end_location_x'] = (
        sample_for_smote['pass_end_location_x'] + random.uniform(-4, 4)
    )
    sample_for_smote['pass_end_location_y'] = (
        sample_for_smote['pass_end_location_y'] + random.uniform(-4, 4)
    )
    additional_negative_samples.append(sample_for_smote)
    y_vals.append(0)
    negative_samples += 1
#    print(str(negative_samples))
    
X_train = pd.concat([X_train, pd.DataFrame(additional_negative_samples)]).reset_index(drop=True)
y_train = pd.concat([y_train, pd.Series(y_vals)]).reset_index(drop=True)

Nombre de passes complétées: 624
Nombre de passes ratées: 148


### Train our first simple model

In [27]:
def create_model():
    model = Sequential()
    
    model.add(BatchNormalization())
    model.add(Dense(128))
    model.add(ReLU())
    model.add(Dropout(0.5))
    
    model.add(Dense(64))
    model.add(ReLU())
    model.add(Dropout(0.5))

    model.add(Dense(8))
    model.add(ReLU())
    model.add(Dropout(0.5))

    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(learning_rate=0.001)

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [29]:
first_model = create_model()

# some model callbacks to improve results on monitor training
save_best_model = ModelCheckpoint("../model/pass_model.hdf5",monitor="val_loss")
terminate_on_nan = TerminateOnNaN()
csv_logger = CSVLogger('training.log')
dynamic_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=15, verbose=0, mode='auto', cooldown=0, min_lr=0
)
stop_early = EarlyStopping(
    monitor='val_loss', min_delta=0, patience=25, verbose=0, mode='auto'
)

# Here is where the model actually trains
model_history = first_model.fit(
    np.array(X_train).astype('float32'),
    np.array(y_train).astype('float32'),
    batch_size=128,
    epochs=100,
    verbose = 1,
    callbacks=[save_best_model, terminate_on_nan, dynamic_lr, stop_early, csv_logger],
    validation_data=(
        np.array(X_val).astype('float32'), 
        np.array(y_val).astype('float32')
    ),
    shuffle=True
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100


### Assess Results


In [30]:
# load the model
model = load_model("../model/pass_model.hdf5")

y_pred = model.predict(np.array(X_test))
y_prob = y_pred
y_pred = y_pred > 0.5
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.44      0.73      0.55       446
           1       0.92      0.77      0.84      1873

    accuracy                           0.77      2319
   macro avg       0.68      0.75      0.69      2319
weighted avg       0.83      0.77      0.79      2319

