# Imports 

In [1]:
!pip install tables==3.6.1
!pip install socceraction==0.2.0



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib.patches import Ellipse
import seaborn as sns
from math import *
import matplotlib.pylab as pyl
import pickle
import swifter
import warnings
import plotly.express as px
from itertools import chain
import scipy.stats as sps
from tqdm import tqdm
from unidecode import unidecode
import re
from io import BytesIO
from pathlib import Path
from tqdm.notebook import tqdm
from urllib.parse import urlparse
from urllib.request import urlopen, urlretrieve
from zipfile import ZipFile, is_zipfile
import pandas as pd
from sklearn.metrics import brier_score_loss, roc_auc_score  # version 0.22.2
from xgboost import XGBClassifier  # version 1.0.2

import socceraction.vaep.features as features
import socceraction.vaep.labels as labels

from socceraction.spadl.wyscout import convert_to_spadl
from socceraction.vaep.formula import value

In [3]:
# Raise the notebook display limits so wide and long frames are shown in full.
# Optional: pd.set_option('max_colwidth', 999) additionally shows full cell contents.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 3000

In [4]:
import warnings
# Ignore every pandas PerformanceWarning raised from here on (the class is reached via pd.io.pytables).
warnings.filterwarnings('ignore', category=pd.io.pytables.PerformanceWarning)

# Preprocess the data

## Preprocess the Wyscout data

In [5]:
def read_json_file(filename):
    """Return the contents of ``filename`` as text decoded with 'unicode_escape'.

    The file is read as raw bytes and decoded with the ``unicode_escape``
    codec, so backslash escape sequences in the file (e.g. ``\\u00e9``) are
    expanded before the text is handed to a JSON parser.
    """
    raw_bytes = Path(filename).read_bytes()
    return raw_bytes.decode('unicode_escape')

### Teams

In [6]:
# Read the teams file as escape-decoded text, then parse that text into a DataFrame.
json_teams = read_json_file('teams.json')
df_teams = pd.read_json(json_teams)

In [7]:
df_teams.head(10)

Unnamed: 0,city,name,wyId,officialName,area,type
0,Newcastle upon Tyne,Newcastle United,1613,Newcastle United FC,"{'name': 'England', 'id': '0', 'alpha3code': '...",club
1,Vigo,Celta de Vigo,692,Real Club Celta de Vigo,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
2,Barcelona,Espanyol,691,Reial Club Deportiu Espanyol,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
3,Vitoria-Gasteiz,Deportivo Alavés,696,Deportivo Alavés,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
4,Valencia,Levante,695,Levante UD,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
5,Troyes,Troyes,3795,Espérance Sportive Troyes Aube Champagne,"{'name': 'France', 'id': '250', 'alpha3code': ...",club
6,Getafe (Madrid),Getafe,698,Getafe Club de Fútbol,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
7,Mönchengladbach,Borussia M'gladbach,2454,Borussia VfL Mönchengladbach,"{'name': 'Germany', 'id': '276', 'alpha3code':...",club
8,"Huddersfield, West Yorkshire",Huddersfield Town,1673,Huddersfield Town FC,"{'name': 'England', 'id': '0', 'alpha3code': '...",club
9,Bilbao,Athletic Club,678,Athletic Club Bilbao,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club


In [8]:
# mode='w' starts a fresh 'wyscout.h5' (any existing file is replaced); the following cells append to it.
df_teams.to_hdf('wyscout.h5', key='teams', mode='w')

### Players

In [9]:
# Read the players file as escape-decoded text, then parse that text into a DataFrame.
json_players = read_json_file('players.json')
df_players = pd.read_json(json_players)

In [10]:
df_players.head(10)

Unnamed: 0,passportArea,weight,firstName,middleName,lastName,currentTeamId,birthDate,height,role,birthArea,wyId,foot,shortName,currentNationalTeamId
0,"{'name': 'Turkey', 'id': '792', 'alpha3code': ...",78,Harun,,Tekin,4502,1989-06-17,187,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...","{'name': 'Turkey', 'id': '792', 'alpha3code': ...",32777,right,H. Tekin,4687.0
1,"{'name': 'Senegal', 'id': '686', 'alpha3code':...",73,Malang,,Sarr,3775,1999-01-23,182,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'France', 'id': '250', 'alpha3code': ...",393228,left,M. Sarr,4423.0
2,"{'name': 'France', 'id': '250', 'alpha3code': ...",72,Over,,Mandanda,3772,1998-10-26,176,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...","{'name': 'France', 'id': '250', 'alpha3code': ...",393230,,O. Mandanda,
3,"{'name': 'Senegal', 'id': '686', 'alpha3code':...",82,Alfred John Momar,,N'Diaye,683,1990-03-06,187,"{'code2': 'MD', 'code3': 'MID', 'name': 'Midfi...","{'name': 'France', 'id': '250', 'alpha3code': ...",32793,right,A. N'Diaye,19314.0
4,"{'name': 'France', 'id': '250', 'alpha3code': ...",84,Ibrahima,,Konaté,2975,1999-05-25,192,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'France', 'id': '250', 'alpha3code': ...",393247,right,I. Konaté,
5,"{'name': 'Netherlands', 'id': '528', 'alpha3co...",83,Jasper,,Cillessen,676,1989-04-22,185,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...","{'name': 'Netherlands', 'id': '528', 'alpha3co...",33,right,J. Cillessen,664.0
6,"{'name': 'Belgium', 'id': '56', 'alpha3code': ...",91,Toby,,Alderweireld,1624,1989-03-02,187,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'Belgium', 'id': '56', 'alpha3code': ...",36,right,T. Alderweireld,5629.0
7,"{'name': 'Belgium', 'id': '56', 'alpha3code': ...",88,Jan,,Vertonghen,1624,1987-04-24,189,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'Belgium', 'id': '56', 'alpha3code': ...",48,left,J. Vertonghen,5629.0
8,"{'name': 'France', 'id': '250', 'alpha3code': ...",74,Alexander,,Djiku,3783,1994-08-09,182,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'France', 'id': '250', 'alpha3code': ...",229427,right,A. Djiku,
9,"{'name': 'Denmark', 'id': '208', 'alpha3code':...",76,Christian,,Dannemann Eriksen,1624,1992-02-14,180,"{'code2': 'MD', 'code3': 'MID', 'name': 'Midfi...","{'name': 'Denmark', 'id': '208', 'alpha3code':...",54,right,C. Eriksen,7712.0


In [11]:
# Append the players table to the existing 'wyscout.h5' store (mode='a').
df_players.to_hdf('wyscout.h5', key='players', mode='a')

### Matches

In [12]:
# Competitions to load; the cells below read 'matches_<name>.json' and
# 'events_<name>.json' for each entry (spaces replaced by underscores).
competitions = [
    'England',
    'France',
    'Germany',
    'Italy',
    'Spain',
    'European Championship',
    'World Cup',
]

In [13]:
# Load the matches file of every competition and stack the frames into one.
dfs_matches = []
for competition in competitions:
    # File names use underscores where the competition name has spaces.
    competition_name = competition.replace(' ', '_')
    file_matches = 'matches_{}.json'.format(competition_name)
    json_matches = read_json_file(file_matches)
    df_matches = pd.read_json(json_matches)
    dfs_matches += [df_matches]
df_matches = pd.concat(dfs_matches)

In [14]:
df_matches.head(10)

Unnamed: 0,status,roundId,gameweek,teamsData,seasonId,dateutc,winner,venue,wyId,label,date,referees,duration,competitionId,groupName
0,Played,4405654,38,"{'1646': {'scoreET': 0, 'coachId': 8880, 'side...",181150,2018-05-13 14:00:00,1659,Turf Moor,2500089,"Burnley - AFC Bournemouth, 1 - 2","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 385705, 'role': 'referee'}, {'r...",Regular,364,
1,Played,4405654,38,"{'1628': {'scoreET': 0, 'coachId': 8357, 'side...",181150,2018-05-13 14:00:00,1628,Selhurst Park,2500090,"Crystal Palace - West Bromwich Albion, 2 - 0","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 381851, 'role': 'referee'}, {'r...",Regular,364,
2,Played,4405654,38,"{'1609': {'scoreET': 0, 'coachId': 7845, 'side...",181150,2018-05-13 14:00:00,1609,The John Smith's Stadium,2500091,"Huddersfield Town - Arsenal, 0 - 1","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 384965, 'role': 'referee'}, {'r...",Regular,364,
3,Played,4405654,38,"{'1651': {'scoreET': 0, 'coachId': 8093, 'side...",181150,2018-05-13 14:00:00,1612,Anfield,2500092,"Liverpool - Brighton & Hove Albion, 4 - 0","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 385704, 'role': 'referee'}, {'r...",Regular,364,
4,Played,4405654,38,"{'1644': {'scoreET': 0, 'coachId': 93112, 'sid...",181150,2018-05-13 14:00:00,1611,Old Trafford,2500093,"Manchester United - Watford, 1 - 0","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 381853, 'role': 'referee'}, {'r...",Regular,364,
5,Played,4405654,38,"{'1613': {'scoreET': 0, 'coachId': 210700, 'si...",181150,2018-05-13 14:00:00,1613,St. James' Park,2500094,"Newcastle United - Chelsea, 3 - 0","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 384888, 'role': 'referee'}, {'r...",Regular,364,
6,Played,4405654,38,"{'1625': {'scoreET': 0, 'coachId': 267136, 'si...",181150,2018-05-13 14:00:00,1625,St. Mary's Stadium,2500095,"Southampton - Manchester City, 0 - 1","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 385911, 'role': 'referee'}, {'r...",Regular,364,
7,Played,4405654,38,"{'10531': {'scoreET': 0, 'coachId': 32573, 'si...",181150,2018-05-13 14:00:00,1639,Liberty Stadium,2500096,"Swansea City - Stoke City, 1 - 2","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 378952, 'role': 'referee'}, {'r...",Regular,364,
8,Played,4405654,38,"{'1631': {'scoreET': 0, 'coachId': 209010, 'si...",181150,2018-05-13 14:00:00,1624,Wembley Stadium,2500097,"Tottenham Hotspur - Leicester City, 5 - 4","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 378951, 'role': 'referee'}, {'r...",Regular,364,
9,Played,4405654,38,"{'1623': {'scoreET': 0, 'coachId': 8541, 'side...",181150,2018-05-13 14:00:00,1633,London Stadium,2500098,"West Ham United - Everton, 3 - 1","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 408156, 'role': 'referee'}, {'r...",Regular,364,


In [15]:
# Append the combined matches table to the existing 'wyscout.h5' store (mode='a').
df_matches.to_hdf('wyscout.h5', key='matches', mode='a')

### Events

In [16]:
# For every competition, parse its events file and write one HDF5 entry per match
# ('events/match_<matchId>') into 'wyscout.h5'.
for competition in competitions:
    competition_name = competition.replace(' ', '_')
    file_events = f'events_{competition_name}.json'
    json_events = read_json_file(file_events)
    df_events = pd.read_json(json_events)
    # Iterating the groupby yields (matchId, events-of-that-match) pairs.
    df_events_matches = df_events.groupby('matchId', as_index=False)
    for match_id, df_events_match in df_events_matches:
        df_events_match.to_hdf('wyscout.h5', key=f'events/match_{match_id}', mode='a')

## Convert the Wyscout data to the SPADL representation

In [17]:
# Convert the Wyscout store into a SPADL store ('spadl.h5').
# Slow step: the recorded progress bars below show roughly 1.5 hours for 1941 games.
convert_to_spadl('wyscout.h5', 'spadl.h5')

...Inserting actiontypes
...Inserting bodyparts
...Inserting results
...Converting games


  0%|          | 0/1941 [00:00<?, ?game/s]

...Converting players
...Converting teams
...Generating player_games


100%|██████████| 1941/1941 [01:30<00:00, 21.38game/s]
  0%|          | 0/1941 [00:00<?, ?game/s]

...Converting events to actions


100%|██████████| 1941/1941 [1:24:20<00:00,  2.61s/game]


# Value game states

In [18]:
# Load the games table and the three lookup tables from the SPADL store.
df_games, df_actiontypes, df_bodyparts, df_results = (
    pd.read_hdf('spadl.h5', key=store_key)
    for store_key in ('games', 'actiontypes', 'bodyparts', 'results')
)

In [19]:
# Number of actions per game state, passed to the socceraction feature helpers below;
# the generated feature columns carry the suffixes _a0, _a1 and _a2.
nb_prev_actions = 3

## Generate game state features

In [20]:
# Feature generators from socceraction.vaep.features. Each one is applied to a game's
# game states and the resulting frames are concatenated column-wise.
functions_features = [
    features.actiontype_onehot,
    features.bodypart_onehot,
    features.result_onehot,
    features.goalscore,
    features.startlocation,
    features.endlocation,
    features.movement,
    features.space_delta,
    features.startpolar,
    features.endpolar,
    features.team,
    features.time_delta
]

In [21]:
# Load the actions of one game (id 2500089); the loop in the next cell rebinds df_actions.
df_actions = pd.read_hdf('spadl.h5', key='actions/game_2500089')

In [22]:
# Build the feature frame of every game and store it under its own key in 'features.h5'.
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
    game_id = game['game_id']
    df_actions = pd.read_hdf('spadl.h5', key=f'actions/game_{game_id}')

    # Attach the lookup tables one after another (left joins on the shared columns).
    for df_lookup in (df_actiontypes, df_results, df_bodyparts):
        df_actions = df_actions.merge(df_lookup, how='left')
    df_actions = df_actions.reset_index(drop=True)

    dfs_gamestates = features.play_left_to_right(
        features.gamestates(df_actions, nb_prev_actions=nb_prev_actions),
        game['home_team_id'],
    )

    feature_frames = [function(dfs_gamestates) for function in functions_features]
    df_features = pd.concat(feature_frames, axis=1)
    df_features.to_hdf('features.h5', key=f'game_{game_id}')

HBox(children=(FloatProgress(value=0.0, max=1941.0), HTML(value='')))




## Generate game state labels

In [23]:
# Label generators from socceraction.vaep.labels. Each one is applied to a game's
# actions and the resulting frames are concatenated column-wise.
functions_labels = [
    labels.scores,
    labels.concedes
]

In [24]:
# Build the label frame of every game and store it under its own key in 'labels.h5'.
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
    game_id = game['game_id']
    df_actions = pd.read_hdf('spadl.h5', key=f'actions/game_{game_id}')

    # Attach the lookup tables one after another (left joins on the shared columns).
    for df_lookup in (df_actiontypes, df_results, df_bodyparts):
        df_actions = df_actions.merge(df_lookup, how='left')
    df_actions = df_actions.reset_index(drop=True)

    label_frames = [function(df_actions) for function in functions_labels]
    df_labels = pd.concat(label_frames, axis=1)
    df_labels.to_hdf('labels.h5', key=f'game_{game_id}')

HBox(children=(FloatProgress(value=0.0, max=1941.0), HTML(value='')))




## Generate dataset

In [25]:
# Names of the feature columns produced by `functions_features`; used below to select
# the columns when the per-game frames are read back from 'features.h5'.
columns_features = features.feature_column_names(functions_features, nb_prev_actions=nb_prev_actions)

In [26]:
# Read every game's stored features back and stack them into one frame with a fresh 0..n-1 index.
game_rows = tqdm(df_games.iterrows(), total=len(df_games))
dfs_features = [
    pd.read_hdf('features.h5', key=f"game_{game['game_id']}")[columns_features]
    for _, game in game_rows
]
df_features = pd.concat(dfs_features).reset_index(drop=True)

HBox(children=(FloatProgress(value=0.0, max=1941.0), HTML(value='')))




In [27]:
df_features.head(10)

Unnamed: 0,type_pass_a0,type_cross_a0,type_throw_in_a0,type_freekick_crossed_a0,type_freekick_short_a0,type_corner_crossed_a0,type_corner_short_a0,type_take_on_a0,type_foul_a0,type_tackle_a0,type_interception_a0,type_shot_a0,type_shot_penalty_a0,type_shot_freekick_a0,type_keeper_save_a0,type_keeper_claim_a0,type_keeper_punch_a0,type_keeper_pick_up_a0,type_clearance_a0,type_bad_touch_a0,type_non_action_a0,type_dribble_a0,type_goalkick_a0,type_pass_a1,type_cross_a1,type_throw_in_a1,type_freekick_crossed_a1,type_freekick_short_a1,type_corner_crossed_a1,type_corner_short_a1,type_take_on_a1,type_foul_a1,type_tackle_a1,type_interception_a1,type_shot_a1,type_shot_penalty_a1,type_shot_freekick_a1,type_keeper_save_a1,type_keeper_claim_a1,type_keeper_punch_a1,type_keeper_pick_up_a1,type_clearance_a1,type_bad_touch_a1,type_non_action_a1,type_dribble_a1,type_goalkick_a1,type_pass_a2,type_cross_a2,type_throw_in_a2,type_freekick_crossed_a2,type_freekick_short_a2,type_corner_crossed_a2,type_corner_short_a2,type_take_on_a2,type_foul_a2,type_tackle_a2,type_interception_a2,type_shot_a2,type_shot_penalty_a2,type_shot_freekick_a2,type_keeper_save_a2,type_keeper_claim_a2,type_keeper_punch_a2,type_keeper_pick_up_a2,type_clearance_a2,type_bad_touch_a2,type_non_action_a2,type_dribble_a2,type_goalkick_a2,bodypart_foot_a0,bodypart_head_a0,bodypart_other_a0,bodypart_foot_a1,bodypart_head_a1,bodypart_other_a1,bodypart_foot_a2,bodypart_head_a2,bodypart_other_a2,result_fail_a0,result_success_a0,result_offside_a0,result_owngoal_a0,result_yellow_card_a0,result_red_card_a0,result_fail_a1,result_success_a1,result_offside_a1,result_owngoal_a1,result_yellow_card_a1,result_red_card_a1,result_fail_a2,result_success_a2,result_offside_a2,result_owngoal_a2,result_yellow_card_a2,result_red_card_a2,goalscore_team,goalscore_opponent,goalscore_diff,start_x_a0,start_y_a0,start_x_a1,start_y_a1,start_x_a2,start_y_a2,end_x_a0,end_y_a0,end_x_a1,end_y_a1,end_x_a2,end_y_a2,dx_a0,dy_a0,movement_a0,dx_a1,dy_a1,movement
_a1,dx_a2,dy_a2,movement_a2,dx_a01,dy_a01,mov_a01,dx_a02,dy_a02,mov_a02,start_dist_to_goal_a0,start_angle_to_goal_a0,start_dist_to_goal_a1,start_angle_to_goal_a1,start_dist_to_goal_a2,start_angle_to_goal_a2,end_dist_to_goal_a0,end_angle_to_goal_a0,end_dist_to_goal_a1,end_angle_to_goal_a1,end_dist_to_goal_a2,end_angle_to_goal_a2,team_1,team_2,time_delta_1,time_delta_2
0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,52.5,34.0,52.5,34.0,52.5,34.0,42.0,37.4,42.0,37.4,42.0,37.4,-10.5,3.4,11.036757,-10.5,3.4,11.036757,-10.5,3.4,11.036757,-10.5,3.4,11.03676,-10.5,3.4,11.036757,52.5,0.0,52.5,0.0,52.5,0.0,63.091679,0.053916,63.091679,0.053916,63.091679,0.053916,True,True,0.0,0.0
1,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,42.0,37.4,52.5,34.0,52.5,34.0,40.95,57.8,42.0,37.4,42.0,37.4,-1.05,20.4,20.427004,-10.5,3.4,11.036757,-10.5,3.4,11.036757,0.0,0.0,0.0,0.0,0.0,0.0,63.091679,0.053916,52.5,0.0,52.5,0.0,68.328929,0.355773,63.091679,0.053916,63.091679,0.053916,True,True,1.997756,1.997756
2,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,40.95,57.8,42.0,37.4,52.5,34.0,32.55,47.6,40.95,57.8,42.0,37.4,-8.4,-10.2,13.213629,-1.05,20.4,20.427004,-10.5,3.4,11.036757,0.0,0.0,0.0,1.05,-20.4,20.427004,68.328929,0.355773,63.091679,0.053916,52.5,0.0,73.715416,0.185556,68.328929,0.355773,63.091679,0.053916,True,True,0.771744,2.7695
3,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,32.55,47.6,40.95,57.8,42.0,37.4,69.3,48.96,32.55,47.6,40.95,57.8,36.75,1.36,36.775156,-8.4,-10.2,13.213629,-1.05,20.4,20.427004,0.0,0.0,0.0,8.4,10.2,13.213629,73.715416,0.185556,68.328929,0.355773,63.091679,0.053916,38.707772,0.396818,73.715416,0.185556,68.328929,0.355773,True,True,2.174464,2.946208
4,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,69.3,48.96,32.55,47.6,40.95,57.8,74.55,55.76,69.3,48.96,32.55,47.6,5.25,6.8,8.590838,36.75,1.36,36.775156,-8.4,-10.2,13.213629,0.0,0.0,0.0,-36.75,-1.36,36.775156,38.707772,0.396818,73.715416,0.185556,68.328929,0.355773,37.425928,0.620467,38.707772,0.396818,73.715416,0.185556,True,True,3.907382,6.081846
5,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,74.55,55.76,69.3,48.96,32.55,47.6,95.55,63.24,74.55,55.76,69.3,48.96,21.0,7.48,22.292384,5.25,6.8,8.590838,36.75,1.36,36.775156,0.0,0.0,0.0,-5.25,-6.8,8.590838,37.425928,0.620467,38.707772,0.396818,73.715416,0.185556,30.729141,1.258205,37.425928,0.620467,38.707772,0.396818,True,True,3.75873,7.666112
6,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,95.55,63.24,74.55,55.76,69.3,48.96,96.6,34.0,95.55,63.24,74.55,55.76,1.05,-29.24,29.258847,21.0,7.48,22.292384,5.25,6.8,8.590838,0.0,0.0,0.0,-21.0,-7.48,22.292384,30.729141,1.258205,37.425928,0.620467,38.707772,0.396818,8.4,0.0,30.729141,1.258205,37.425928,0.620467,True,True,2.210584,5.969314
7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,0,0,0,8.4,34.0,9.45,4.76,30.45,12.24,2.1,8.16,8.4,34.0,9.45,4.76,-6.3,-25.84,26.59691,-1.05,29.24,29.258847,-21.0,-7.48,22.292384,5.329071e-15,0.0,5.329071e-15,1.05,-29.24,29.258847,96.6,0.0,99.923872,0.296969,77.660802,0.283995,106.094842,0.24603,96.6,0.0,99.923872,0.296969,False,False,1.756122,3.966706
8,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,0,0,0,102.9,59.84,96.6,34.0,95.55,63.24,102.9,59.84,102.9,59.84,96.6,34.0,0.0,0.0,0.0,6.3,25.84,26.59691,1.05,-29.24,29.258847,0.0,0.0,0.0,-6.3,-25.84,26.59691,25.925192,1.489705,8.4,0.0,30.729141,1.258205,25.925192,1.489705,25.925192,1.489705,8.4,0.0,False,True,2.095783,3.851905
9,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,0,0,0,6.3,8.16,2.1,8.16,8.4,34.0,22.05,0.0,2.1,8.16,2.1,8.16,15.75,-8.16,17.738323,0.0,0.0,0.0,-6.3,-25.84,26.59691,-4.2,-3.552714e-15,4.2,-4.2,-3.552714e-15,4.2,102.026446,0.256057,106.094842,0.24603,96.6,0.0,89.647658,0.388999,106.094842,0.24603,106.094842,0.24603,False,True,3.034782,5.130565


In [28]:
# Label columns to model: one classifier is trained per entry.
columns_labels = ['scores', 'concedes']

In [29]:
# Read every game's stored labels back and stack them into one frame with a fresh 0..n-1 index.
game_rows = tqdm(df_games.iterrows(), total=len(df_games))
dfs_labels = [
    pd.read_hdf('labels.h5', key=f"game_{game['game_id']}")[columns_labels]
    for _, game in game_rows
]
df_labels = pd.concat(dfs_labels).reset_index(drop=True)

HBox(children=(FloatProgress(value=0.0, max=1941.0), HTML(value='')))




In [31]:
df_labels.tail(10)

Unnamed: 0,scores,concedes
2465146,True,False
2465147,True,False
2465148,True,False
2465149,True,False
2465150,True,False
2465151,True,False
2465152,False,True
2465153,False,True
2465154,True,False
2465155,False,False


## Train classifiers

In [32]:
# Train one XGBoost classifier per label column and keep them in `models`, keyed by label name.
# NOTE(review): the classifiers are fit on all rows of df_features, and the "Estimate
# probabilities" section predicts on those same rows; no held-out split is visible here.
models = {}
for column_labels in columns_labels:
    model = XGBClassifier(n_estimators=100, max_depth=4)
    model.fit(df_features, df_labels[column_labels])
    models[column_labels] = model

In [33]:
filename = 'finalised_model.pkl'
# NOTE(review): `model` is whichever classifier the training loop fitted last, so only
# that one is persisted; the other entry of `models` is not saved. Confirm this is intended.
# The context manager closes (and flushes) the file handle, which the bare open() call leaked.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Estimate probabilities

In [34]:
# Predict, for every action, the probability of each label with the matching classifier.
dfs_predictions = {}
for column_labels in columns_labels:
    model = models[column_labels]
    # Keep only the second column of predict_proba (probability that the label is True).
    predictions = model.predict_proba(df_features)[:, 1]
    dfs_predictions[column_labels] = pd.Series(predictions)
df_predictions = pd.concat(dfs_predictions, axis=1)

In [35]:
df_predictions.head(10)

Unnamed: 0,scores,concedes
0,0.003628,0.001496
1,0.005049,0.00175
2,0.003883,0.002318
3,0.01461,0.002321
4,0.018392,0.001865
5,0.020405,0.001477
6,0.011381,0.002902
7,0.001521,0.02292
8,0.021142,0.0026
9,0.001294,0.008486


The following cell obtains the `game_id` for each action in order to store the predictions per game.

In [36]:
# Collect the game_id column of every game's actions, in df_games order, as one integer Series.
dfs_game_ids = [
    pd.read_hdf('spadl.h5', key=f"actions/game_{game['game_id']}")['game_id']
    for _, game in tqdm(df_games.iterrows(), total=len(df_games))
]
df_game_ids = pd.concat(dfs_game_ids, axis=0).astype('int').reset_index(drop=True)

HBox(children=(FloatProgress(value=0.0, max=1941.0), HTML(value='')))




In [37]:
# Append the game_id column; both inputs carry a fresh 0..n-1 index, so rows pair up by position.
# NOTE(review): re-running this cell appends another game_id column to df_predictions.
df_predictions = pd.concat([df_predictions, df_game_ids], axis=1)

In [38]:
df_predictions.head(10)

Unnamed: 0,scores,concedes,game_id
0,0.003628,0.001496,2500089
1,0.005049,0.00175,2500089
2,0.003883,0.002318,2500089
3,0.01461,0.002321,2500089
4,0.018392,0.001865,2500089
5,0.020405,0.001477,2500089
6,0.011381,0.002902,2500089
7,0.001521,0.02292,2500089
8,0.021142,0.0026,2500089
9,0.001294,0.008486,2500089


In [39]:
# Group the predictions by game so they can be written to the store one game at a time.
df_predictions_per_game = df_predictions.groupby('game_id')

In [40]:
# Store each game's predictions under its own key in 'predictions.h5'.
# A dedicated loop variable is used so the combined `df_predictions` frame is not
# overwritten by the last game's slice, which keeps the surrounding cells re-runnable.
for game_id, df_predictions_game in tqdm(df_predictions_per_game):
    df_predictions_game = df_predictions_game.reset_index(drop=True)
    df_predictions_game[columns_labels].to_hdf('predictions.h5', key=f'game_{game_id}')

HBox(children=(FloatProgress(value=0.0, max=1941.0), HTML(value='')))




# Value on-the-ball actions - VAEP Values

<img src="./expl_photo/1.JPG">
<img src="./expl_photo/2.JPG">
<img src="./expl_photo/3.JPG">
<img src="./expl_photo/4.JPG">

In [41]:
# Load players and teams from the SPADL store. This rebinds df_players / df_teams,
# which earlier in the notebook held the frames parsed from the Wyscout JSON files.
df_players = pd.read_hdf('spadl.h5', key='players')
df_teams = pd.read_hdf('spadl.h5', key='teams')

In [42]:
# For every game: enrich the actions with lookup, player and team data, load the stored
# predictions, compute the action values and keep everything side by side.
dfs_values = []
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
    game_id = game['game_id']
    df_actions = pd.read_hdf('spadl.h5', key=f'actions/game_{game_id}')

    # Left-join the tables one after another on their shared columns.
    for df_lookup in (df_actiontypes, df_results, df_bodyparts, df_players, df_teams):
        df_actions = df_actions.merge(df_lookup, how='left')
    df_actions = df_actions.reset_index(drop=True)

    df_predictions = pd.read_hdf('predictions.h5', key=f'game_{game_id}')
    p_scores = df_predictions['scores']
    p_concedes = df_predictions['concedes']
    df_values = value(df_actions, p_scores, p_concedes)

    # Column-wise concat: the three frames are paired up by their index.
    df_all = pd.concat([df_actions, df_predictions, df_values], axis=1)
    dfs_values.append(df_all)

HBox(children=(FloatProgress(value=0.0, max=1941.0), HTML(value='')))




In [43]:
# Stack the per-game frames and order all actions chronologically within each game.
df_values = pd.concat(dfs_values)
df_values = df_values.sort_values(['game_id', 'period_id', 'time_seconds'])
df_values = df_values.reset_index(drop=True)

In [44]:
# Short team names used below to filter the actions.
pl_teams = [
    'Arsenal',
    'Leicester City',
    'Manchester City',
    'Brighton & Hove Albion',
    'Burnley',
    'Chelsea',
    'Crystal Palace',
    'Huddersfield Town',
    'Everton',
    'Stoke City',
    'Manchester United',
    'West Ham United',
    'Newcastle United',
    'Tottenham Hotspur',
    'Swansea City',
    'Southampton',
    'Watford',
    'Liverpool',
    'West Bromwich Albion',
    'AFC Bournemouth',
]

In [46]:
# Keep only the actions whose team's short name is listed in pl_teams.
df_values_pl = df_values.loc[df_values['short_team_name'].isin(pl_teams)]

In [50]:
df_values_pl.loc[(df_values_pl['game_id']==2499719.0)]

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,bodypart_id,type_id,result_id,type_name,result_name,bodypart_name,short_name,first_name,last_name,birth_date,short_team_name,team_name,scores,concedes,offensive_value,defensive_value,vaep_value
143676,2499719.0,1.0,2.758649,1609.0,25413.0,51.45,34.68,32.55,14.96,0,0,1,pass,success,foot,A. Lacazette,Alexandre,Lacazette,1991-05-28,Arsenal,Arsenal FC,0.003792,0.000771,0.0,-0.0,0.0
143677,2499719.0,1.0,4.94685,1609.0,370224.0,32.55,14.96,53.55,17.0,0,0,1,pass,success,foot,R. Holding,Rob,Holding,1995-09-20,Arsenal,Arsenal FC,0.006808,0.001816,0.003016,-0.001045622,0.001971
143678,2499719.0,1.0,6.542188,1609.0,3319.0,53.55,17.0,36.75,19.72,1,0,1,pass,success,head,M. Özil,Mesut,Özil,1988-10-15,Arsenal,Arsenal FC,0.005715,0.001805,-0.001093,1.148973e-05,-0.001082
143679,2499719.0,1.0,8.143395,1609.0,120339.0,36.75,19.72,43.05,3.4,1,0,1,pass,success,head,Mohamed Elneny,Mohamed Naser,Elsayed Elneny,1992-07-11,Arsenal,Arsenal FC,0.005394,0.001754,-0.000321,5.096453e-05,-0.00027
143680,2499719.0,1.0,10.302366,1609.0,167145.0,43.05,3.4,75.6,8.16,0,0,1,pass,success,foot,Bellerín,Héctor,Bellerín Moruno,1995-03-19,Arsenal,Arsenal FC,0.013657,0.001785,0.008263,-3.088184e-05,0.008232
143681,2499719.0,1.0,12.548934,1609.0,3319.0,75.6,8.16,80.85,17.0,0,0,0,pass,fail,foot,M. Özil,Mesut,Özil,1988-10-15,Arsenal,Arsenal FC,0.008465,0.003832,-0.005192,-0.002047073,-0.007239
143682,2499719.0,1.0,13.961228,1631.0,8653.0,80.85,17.0,64.05,10.2,1,0,1,pass,success,head,H. Maguire,Harry,Maguire,1993-03-05,Leicester City,Leicester City FC,0.006062,0.0032,0.00223,0.00526586,0.007496
143683,2499719.0,1.0,15.320341,1609.0,167145.0,70.35,13.6,70.35,13.6,0,10,1,interception,success,foot,Bellerín,Héctor,Bellerín Moruno,1995-03-19,Arsenal,Arsenal FC,0.010396,0.00215,0.007196,0.003911996,0.011108
143684,2499719.0,1.0,15.320341,1609.0,167145.0,70.35,13.6,61.95,26.52,1,0,1,pass,success,head,Bellerín,Héctor,Bellerín Moruno,1995-03-19,Arsenal,Arsenal FC,0.00871,0.00242,-0.001686,-0.0002704116,-0.001957
143685,2499719.0,1.0,18.051875,1609.0,49876.0,61.95,26.52,47.25,37.4,1,0,1,pass,success,head,G. Xhaka,Granit,Xhaka,1992-09-27,Arsenal,Arsenal FC,0.007581,0.001843,-0.001129,0.0005771858,-0.000552


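* `scores` = $P_{scores}(S_i, x)$: the probability of scoring in the next 10 actions, obtained from the ML model for the particular `state`.
* `concedes` = $P_{concedes}(S_i, x)$: the probability of conceding in the next 10 actions, obtained from the ML model for the particular `state`.
* `offensive_value` = $P_{scores}(S_i) - P_{scores}(S_{i-1})$: the change in scoring probability attributed to the particular `action`.
* `defensive_value` = $P_{concedes}(S_{i-1}) - P_{concedes}(S_i)$: the reduction in conceding probability attributed to the particular `action`.
* `vaep_value` = `offensive_value` + `defensive_value` (e.g. in the second row above: $0.003016 - 0.001046 \approx 0.001971$).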

In [51]:
# Persist the filtered values; the "VAEP for Defenders" section reads this file back.
df_values_pl.to_pickle('./vaep_values.pkl')

# VAEP for Defenders 

## Pre-Processing 

In [85]:
# Reload the saved VAEP values and a second pickle that lives outside this notebook's directory.
# NOTE(review): judging by its file name and the columns used below ('wyId', 'team',
# 'footedness'), match_def holds per-match defensive line-ups. Confirm its schema.
vaep_values = pd.read_pickle('./vaep_values.pkl')
match_def = pd.read_pickle("../../data/matches/match+def_lineup+footedness_ver2.pkl")

In [86]:
# Shorten the team names. Presumably these are the spellings used in match_def['team'],
# since the join key built below combines them. Verify against match_def.
team_name_aliases = {
    'Manchester United': 'Man Utd',
    'Tottenham Hotspur': 'Spurs',
    'West Ham United': 'West Ham',
    'Manchester City': 'Man City',
    'Brighton & Hove Albion': 'Brighton',
    'Stoke City': 'Stoke',
    'AFC Bournemouth': 'Bournemouth',
    'West Bromwich Albion': 'West Brom',
    'Leicester City': 'Leicester',
    'Swansea City': 'Swansea',
    'Huddersfield Town': 'Huddersfield',
    'Newcastle United': 'Newcastle',
}
vaep_values = vaep_values.replace({'short_team_name': team_name_aliases})

# Build the same '<match id><team name>' join key on both frames.
vaep_values['game_id'] = vaep_values['game_id'].astype(int)
vaep_values['temp'] = vaep_values['game_id'].astype(str) + vaep_values['short_team_name']
match_def['temp'] = match_def['wyId'].astype(str) + match_def['team']

In [87]:
# Bring in `footedness` via the temporary join key, then discard the key column.
vaep_values = (
    vaep_values
    .merge(match_def[['temp', 'footedness']], on='temp', how='left')
    .drop(columns=['temp'])
)

In [88]:
def _compact_ascii_name(full_name):
    """Transliterate a name with unidecode, then remove hyphens and spaces."""
    return unidecode(full_name).replace('-', '').replace(' ', '')


# first_name + last_name, cast to str, squeezed into a single ASCII token.
vaep_values['name'] = (
    (vaep_values['first_name'] + vaep_values['last_name'])
    .astype(str)
    .map(_compact_ascii_name)
)
# vaep_values.loc[(vaep_values['last_name'].str.contains('PhilJa'))]

In [89]:
# Row key: game_id, time_seconds (rounded to 6 decimals) and player name, joined
# as strings without a separator. The Wyscout key further down is built the
# same way so that the two frames can be merged on it.
vaep_values['key'] = (
    vaep_values['game_id'].astype(str)
    + vaep_values['time_seconds'].round(6).astype(str)
    + vaep_values['name'].astype(str)
)

In [103]:
# Spot check: passes whose player name contains 'AaronCre' in match 2499724.
vaep_name_hit = vaep_values['name'].str.contains('AaronCre')
vaep_pass_hit = vaep_values['type_name'] == 'pass'
vaep_match_hit = vaep_values['game_id'] == 2499724
vaep_values.loc[vaep_name_hit & vaep_pass_hit & vaep_match_hit]


Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,bodypart_id,type_id,result_id,type_name,result_name,bodypart_name,short_name,first_name,last_name,birth_date,short_team_name,team_name,scores,concedes,offensive_value,defensive_value,vaep_value,footedness,name,key
7060,2499724,2.0,2271.502895,1633.0,8582.0,34.65,5.44,18.9,7.48,0,0,1,pass,success,foot,A. Cresswell,Aaron,Cresswell,1989-12-15,West Ham,West Ham United FC,0.01846,0.003054,0.005759,0.000818,0.006577,right-right-left-left,AaronCresswell,24997242271.502895AaronCresswell
7062,2499724,2.0,2276.573331,1633.0,8582.0,37.8,8.16,36.75,14.96,0,0,1,pass,success,foot,A. Cresswell,Aaron,Cresswell,1989-12-15,West Ham,West Ham United FC,0.012047,0.004725,0.003157,-0.002174,0.000983,right-right-left-left,AaronCresswell,24997242276.573331AaronCresswell
7104,2499724,2.0,2424.362406,1633.0,8582.0,92.4,19.72,35.7,31.28,0,0,0,pass,fail,foot,A. Cresswell,Aaron,Cresswell,1989-12-15,West Ham,West Ham United FC,0.004589,0.007457,0.000659,-0.001177,-0.000518,right-right-left-left,AaronCresswell,24997242424.362406AaronCresswell
7148,2499724,2.0,2621.957344,1633.0,8582.0,11.55,8.16,40.95,8.84,0,0,1,pass,success,foot,A. Cresswell,Aaron,Cresswell,1989-12-15,West Ham,West Ham United FC,0.007866,0.011057,-0.006992,-0.007752,-0.014743,right-right-left-left,AaronCresswell,24997242621.957344AaronCresswell
7167,2499724,2.0,2738.543658,1633.0,8582.0,70.35,18.36,72.45,44.88,0,0,1,pass,success,foot,A. Cresswell,Aaron,Cresswell,1989-12-15,West Ham,West Ham United FC,0.005145,0.005596,0.001155,0.000255,0.00141,right-right-left-left,AaronCresswell,24997242738.543658AaronCresswell
7179,2499724,2.0,2771.290094,1633.0,8582.0,68.25,13.6,44.1,5.44,0,0,1,pass,success,foot,A. Cresswell,Aaron,Cresswell,1989-12-15,West Ham,West Ham United FC,0.005961,0.006131,0.000698,0.00163,0.002329,right-right-left-left,AaronCresswell,24997242771.290094AaronCresswell
7181,2499724,2.0,2774.938118,1633.0,8582.0,67.2,10.2,68.25,43.52,0,0,1,pass,success,foot,A. Cresswell,Aaron,Cresswell,1989-12-15,West Ham,West Ham United FC,0.005488,0.004266,0.000984,0.004179,0.005163,right-right-left-left,AaronCresswell,24997242774.938118AaronCresswell
7184,2499724,2.0,2789.353844,1633.0,8582.0,70.35,8.84,72.45,42.84,0,0,1,pass,success,foot,A. Cresswell,Aaron,Cresswell,1989-12-15,West Ham,West Ham United FC,0.004298,0.00506,0.000137,0.000516,0.000653,right-right-left-left,AaronCresswell,24997242789.353844AaronCresswell
7190,2499724,2.0,2803.477512,1633.0,8582.0,56.7,9.52,59.85,19.04,0,0,1,pass,success,foot,A. Cresswell,Aaron,Cresswell,1989-12-15,West Ham,West Ham United FC,0.005818,0.007772,-0.000972,-0.000294,-0.001266,right-right-left-left,AaronCresswell,24997242803.477512AaronCresswell


In [57]:
# Keep only the actions whose type_name is 'pass' or 'cross'
pass_like_types = ['pass', 'cross']
vaep_values_pass = vaep_values.loc[vaep_values['type_name'].isin(pass_like_types)]

In [58]:
# Add each player's role code (the 'code3' entry of the 'role' record) to the
# SPADL pass events and keep the events played by defenders ('DEF').
df_players = pd.read_pickle('../../data/players/players.pkl')
roles = [role_record['code3'] for role_record in df_players['role'].values]
df_players_roles = pd.DataFrame(
    {
        'role': roles,
        'playerId': df_players['wyId'].tolist(),
        'playerName1': df_players['playerName'].tolist(),
    },
    columns=['role', 'playerId', 'playerName1'],
)
vaep_values_pass_proles = (
    vaep_values_pass
    .merge(df_players_roles, left_on='player_id', right_on='playerId')
    .drop(columns=['playerName1'])
)
played_by_defender = vaep_values_pass_proles['role'] == 'DEF'
vaep_values_pass_def = vaep_values_pass_proles.loc[played_by_defender]

In [59]:
# Count how often each key occurs among the defenders' passes and crosses.
# Some keys are not unique: roughly 30 passes share their key attributes
# (game_id, time_seconds and player name) because the same player is recorded
# with two different passes from the same start location to different end
# locations at the same time stamp.
# The analysis intends to ignore these passes. NOTE: this cell only counts the
# keys; it does not remove any rows from vaep_values_pass_def.
vaep_values_pass_def['key'].value_counts()

24999722087.498654NicolasHernanOtamendi          2
2499949222.306174LaurentKoscielny                2
2499949483.766015ShkodranMustafi                 2
2499949222.464338HectorBellerinMoruno            2
2499949234.956542HectorBellerinMoruno            2
                                                ..
25000452642.370602LuisAntonioValenciaMosquera    1
2499735864.19906SeadKolasinac                    1
25000302423.861461MatiasEzequielSchelotto        1
24999001195.061528ConnorGoldson                  1
2499811430.548821LewisDunk                       1
Name: key, Length: 141940, dtype: int64

In [60]:
# vaep_values_pass_def.loc[vaep_values_pass_def['key'].str.contains('2499949256.961055LaurentKoscielny')]

In [62]:
# Importing Wyscout events data
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df_events_wyscout = pd.read_pickle('../../data/events/events_v2.pkl')

In [63]:
# Keep only the Wyscout events that are passes ('Pass') played by defenders ('DEF')
wyscout_is_pass = df_events_wyscout['eventName'] == 'Pass'
wyscout_by_defender = df_events_wyscout['role'] == 'DEF'
df_events_wyscout_pass = df_events_wyscout.loc[wyscout_is_pass & wyscout_by_defender]

In [64]:
# Creating a key to identify each row using matchId, eventSec (rounded to 6
# decimals) and player name, joined as strings without a separator.
# df_events_wyscout_pass is a .loc selection of df_events_wyscout; writing a
# column into it directly triggers pandas' SettingWithCopyWarning. .assign
# returns a new DataFrame that carries the extra column instead.
df_events_wyscout_pass = df_events_wyscout_pass.assign(
    key=(
        df_events_wyscout_pass['matchId'].astype(str)
        + np.round(df_events_wyscout_pass['eventSec'], 6).astype(str)
        + df_events_wyscout_pass['playerName'].astype(str)
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [65]:
# df_events_wyscout_pass.loc[df_events_wyscout_pass['key'].str.contains('2499949256.961055LaurentKoscielny')]

In [66]:
# Appending VAEP values along with offensive and defensive values to Wyscout events data
# from VAEP data by joining the two dataframes on 'key'.
# A key that occurs more than once on the VAEP side cannot be matched to a single
# Wyscout event: a left merge would repeat the Wyscout row once per duplicate and
# attach the values of the wrong pass. Those ambiguous keys are therefore removed
# from the lookup (keep=False drops every occurrence), so the affected Wyscout
# passes stay in the frame exactly once, with missing (NaN) VAEP values.
vaep_lookup = (
    vaep_values_pass_def[['key', 'vaep_value', 'offensive_value', 'defensive_value']]
    .drop_duplicates(subset='key', keep=False)
)
df_events_vaep = df_events_wyscout_pass.merge(vaep_lookup, how='left', on='key')

In [112]:
# Spot check of the merge: pass events whose player name contains "AaronCre",
# shown together with the VAEP columns that were joined on.
merged_name_hit = df_events_vaep['playerName'].str.contains("AaronCre")
merged_pass_hit = df_events_vaep['eventName'].str.contains("Pass")
df_events_vaep.loc[merged_name_hit & merged_pass_hit]

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,wyId,foot,playerName,role,key,vaep_value,offensive_value,defensive_value
58583,8,Simple pass,[Accurate],8582,"[[69.68, 62.56], [85.28, 60.52]]",2499724,Pass,1633,2H,2271.502895,85,178495888,8582.0,left,AaronCresswell,DEF,24997242271.502895AaronCresswell,0.006577,0.005759,0.0008183604
58584,8,Simple pass,[Accurate],8582,"[[66.56, 59.84], [67.6, 53.04]]",2499724,Pass,1633,2H,2276.573331,85,178495891,8582.0,left,AaronCresswell,DEF,24997242276.573331AaronCresswell,0.000983,0.003157,-0.002174121
58585,8,Cross,"[Left foot, Blocked, Not accurate]",8582,"[[95.68, 63.92], [104.0, 0.0]]",2499724,Pass,1633,2H,2286.733895,80,178495905,8582.0,left,AaronCresswell,DEF,24997242286.733895AaronCresswell,-0.014032,-0.012448,-0.001583422
58586,8,Cross,"[Left foot, High, Not accurate]",8582,"[[99.84, 61.88], [104.0, 55.08]]",2499724,Pass,1633,2H,2307.982089,80,178495916,8582.0,left,AaronCresswell,DEF,24997242307.982089AaronCresswell,-0.011479,-0.011386,-9.318232e-05
58587,8,High pass,[Not accurate],8582,"[[12.48, 48.28], [68.64, 36.72]]",2499724,Pass,1633,2H,2424.362406,83,178495935,8582.0,left,AaronCresswell,DEF,24997242424.362406AaronCresswell,-0.000518,0.000659,-0.001176674
58588,8,Simple pass,[Accurate],8582,"[[92.56, 59.84], [63.44, 59.16]]",2499724,Pass,1633,2H,2621.957344,85,178495971,8582.0,left,AaronCresswell,DEF,24997242621.957344AaronCresswell,-0.014743,-0.006992,-0.007751575
58589,8,Simple pass,[Accurate],8582,"[[34.32, 49.64], [32.24, 23.12]]",2499724,Pass,1633,2H,2738.543658,85,178495981,8582.0,left,AaronCresswell,DEF,24997242738.543658AaronCresswell,0.00141,0.001155,0.0002548443
58590,8,Simple pass,[Accurate],8582,"[[36.4, 54.4], [60.32, 62.56]]",2499724,Pass,1633,2H,2771.290094,85,178495994,8582.0,left,AaronCresswell,DEF,24997242771.290094AaronCresswell,0.002329,0.000698,0.001630347
58591,8,Simple pass,[Accurate],8582,"[[37.44, 57.8], [36.4, 24.48]]",2499724,Pass,1633,2H,2774.938118,85,178495996,8582.0,left,AaronCresswell,DEF,24997242774.938118AaronCresswell,0.005163,0.000984,0.00417919
58592,8,Simple pass,[Accurate],8582,"[[34.32, 59.16], [32.24, 25.16]]",2499724,Pass,1633,2H,2789.353844,85,178496000,8582.0,left,AaronCresswell,DEF,24997242789.353844AaronCresswell,0.000653,0.000137,0.0005162684


## Metrics Collection & Clustering

In [68]:
# Distinct values of the 'footedness' column in the line-up table.
footedness_patterns = match_def["footedness"].unique()

In [69]:
# Replace the hyphen in the centre-back column names with an underscore.
cb_column_renames = {'R-CB': 'R_CB', 'L-CB': 'L_CB'}
match_def = match_def.rename(columns=cb_column_renames)

**Creating separate dataframes for each defensive lineup, based on the preferred foot of each defender**

In [70]:
def _lineups_with_footedness(pattern):
    """Return the rows of match_def whose 'footedness' equals ``pattern``."""
    return match_def.loc[match_def['footedness'] == pattern]


# One frame per footedness pattern; the suffix spells the pattern with the
# first letter of every foot (r = right, l = left).
df_rrrl = _lineups_with_footedness('right-right-right-left')
df_rrll = _lineups_with_footedness('right-right-left-left')
df_rrl = _lineups_with_footedness('right-right-left')
df_rrr = _lineups_with_footedness('right-right-right')
df_rll = _lineups_with_footedness('right-left-left')
df_rrrll = _lineups_with_footedness('right-right-right-left-left')
df_rrlr = _lineups_with_footedness('right-right-left-right')
df_rrrr = _lineups_with_footedness('right-right-right-right')
df_rrrrl = _lineups_with_footedness('right-right-right-right-left')
df_rlr = _lineups_with_footedness('right-left-right')
df_rrrlr = _lineups_with_footedness('right-right-right-left-right')
df_rrlll = _lineups_with_footedness('right-right-left-left-left')
df_rlll = _lineups_with_footedness('right-left-left-left')

**Creating a list of such dataframes**

In [71]:
# Order matters: the cluster names used when saving the pickles follow it.
df_clusters = [
    df_rrrl, df_rrll, df_rrl, df_rrr, df_rll,
    df_rrrll, df_rrlr, df_rrrr, df_rrrrl, df_rlr,
    df_rrrlr, df_rrlll, df_rlll,
]

**Creating a dictionary that maps player names from the Premier League parsed data to their spelling in the events data, for players whose names do not match**

In [72]:
# Line-up spelling of a player name -> spelling looked up in the events data.
# Several keys are variants of the same player (for example with and without
# the accented letters), so different keys may share one value.
player_map = {
    'RamiroFunesMori': 'JoseRamiroFunesMori',
    'KurtZouma': 'KurtHappyZouma',
    'Danilo': 'DaniloLuizdaSilva',
    'CesarAzpilicueta': 'CesarAzpilicuetaTanco',
    'EzequielSchelotto': 'MatiasEzequielSchelotto',
    'GaetanBong': 'GaetanBongSongo',
    'HectorBellerin': 'HectorBellerinMoruno',
    'AhmedHegazi': 'AhmedHegazy',
    'JamaalLascelles': 'JamalLascelles',
    'AngelRangel': 'AngelRangelZaragoza',
    'Zanka': 'MathiasJattahNjieJorgensen',
    'ChrisLwe': 'ChrisLowe',
    'EricBailly': 'EricBertrandBailly',
    'MarcosRojo': 'FaustinoMarcosAlbertoRojo',
    'CdricSoares': 'CedricRicardoAlvesSoares',
    'AngeloOgbonna': 'AngeloObinzeOgbonna',
    'HctorBellern': 'HectorBellerinMoruno',
    'DavinsonSanchez': 'DavinsonSanchezMina',
    'JavierManquillo': 'JavierManquilloGaitan',
    'TommySmith': 'TomSmith',
    'Bruno': 'BrunoSaltorGrau',
    'GatanBong': 'GaetanBongSongo',
    'NicolsOtamendi': 'NicolasHernanOtamendi',
    'CsarAzpilicueta': 'CesarAzpilicuetaTanco',
    'AntonioRdiger': 'AntonioRudiger',
    'JosHolebas': 'JoseHolebas',
    'SamusColeman': 'SeamusColeman',
    'AllanRomoNyom': 'AllanRomeoNyom',
    'NathanAk': 'NathanAke',
    'JosephGomez': 'JoeGomez',
    'AlbertoMoreno': 'AlbertoMorenoPerez',
    'LuisAntonioValencia': 'LuisAntonioValenciaMosquera',
    'VictorLindelf': 'VictorNilssonLindelof',
    'DavinsonSnchez': 'DavinsonSanchezMina',
    'NicolasOtamendi': 'NicolasHernanOtamendi',
    'NachoMonreal': 'IgnacioMonrealEraso',
    'FedericoFernndez': 'FedericoFernandez',
    'SebastianPrdl': 'SebastianProdl',
    'CedricSoares': 'CedricRicardoAlvesSoares',
    'JoelMatip': 'JoelAndreJobMatip',
    'MiguelBritos': 'MiguelAngelBritosCabrera',
    'VictorLindelof': 'VictorNilssonLindelof',
    'JamesCollins': 'JamesMichaelCollins',
    'CucoMartina': 'RhuendlyMartina',
    'DavidLuiz': 'DavidLuizMoreiraMarinho',
    'MollaWagu': 'MollaWague',
    'JrmyPied': 'JeremyPied',
    'ChancelMbemba': 'ChancelMbembaMangulu',
    'PabloZabaleta': 'PabloJavierZabaletaGirod',
    'KikoFemenia': 'FranciscoFemeniaFar',
    'CheikhouKouyat': 'CheikhouKouyate',
    'KikoFemena': 'FranciscoFemeniaFar',
    'JoseFonte': 'JoseMigueldaRochaFonte',
    'JosFonte': 'JoseMigueldaRochaFonte',
    'JesusGamez': 'JesusGamezDuarte',
}


**Creating a metrics collection function that takes x (match id) and y (player name) and returns the standard metrics**

In [73]:
def getmetrics(x,y):
    """Collect the pass metrics of one player in one match from ``df_events_vaep``.

    Parameters
    ----------
    x
        Match id; converted with ``int`` and compared with
        ``df_events_vaep['matchId']``.
    y : str
        Player name written as concatenated capitalised words. If the name
        is a key of ``player_map`` the mapped spelling is used instead.

    Returns
    -------
    list
        Ten items, in this order: number of passes, number of passes tagged
        "Accurate", 'positions' of the accurate passes, 'positions' of the
        passes tagged "Not accurate", then the 'vaep_value',
        'offensive_value' and 'defensive_value' lists, each given first for
        the accurate and then for the inaccurate passes.

    Raises
    ------
    IndexError
        If ``y`` contains no upper-case letter, so no name token can be
        extracted from it.
    """
    # Names without an entry in the mapping are used unchanged.
    y = player_map.get(y, y)
    # Split the concatenated name at every upper-case letter.
    split_y = re.findall('[A-Z][^A-Z]*',y)

    player_names = df_events_vaep['playerName']
    # regex=False: the tokens are literal name parts, not patterns.
    # na=False: rows without a player name never match.
    name_match = player_names.str.contains(split_y[-1], regex=False, na=False)
    if len(split_y) >= 2:
        # With two or more tokens the last two must both occur in the name;
        # a single-token name is matched on that token alone.
        name_match = name_match & player_names.str.contains(split_y[-2], regex=False, na=False)
    pass_df = df_events_vaep.loc[name_match & (df_events_vaep['matchId']==int(x))]

    # Evaluate each tag test once and reuse the resulting rows for every metric.
    accurate = pass_df.loc[pass_df['tags'].apply(lambda a: "Accurate" in a).astype(bool)]
    inaccurate = pass_df.loc[pass_df['tags'].apply(lambda a: "Not accurate" in a).astype(bool)]

    numpasses = len(pass_df)
    numaccpasses = len(accurate)
    accpasslocs = accurate['positions'].tolist()
    inaccpasslocs = inaccurate['positions'].tolist()
    acc_vaep_values = accurate['vaep_value'].tolist()
    inacc_vaep_values = inaccurate['vaep_value'].tolist()
    acc_off_values = accurate['offensive_value'].tolist()
    inacc_off_values = inaccurate['offensive_value'].tolist()
    acc_def_values = accurate['defensive_value'].tolist()
    inacc_def_values = inaccurate['defensive_value'].tolist()
    return [numpasses, numaccpasses, accpasslocs, inaccpasslocs, acc_vaep_values,
            inacc_vaep_values, acc_off_values, inacc_off_values, acc_def_values,
            inacc_def_values]


In [74]:
# getmetrics(2500081,"Bruno")

In [75]:
# One '<position>_all' column per defensive position; each will hold the raw
# list returned by getmetrics before it is split into separate columns.
new_cols = [
    f'{position}_all'
    for position in ('RB', 'R_CB', 'L_CB', 'LB', 'RCB', 'CB', 'LCB', 'RWB', 'LWB')
]

**Collecting metrics for each defender location for various clusters**

In [76]:
#R_CB - Right center back for 4 defender formation
#RCB - Right center back for 3 or 5 defender formation
#L_CB - Left center back for 4 defender formation
#LCB - Left center back for 3 or 5 defender formation
# Positions whose metrics are collected, keyed by the size of the back line.
positions_by_backline = {
    4.0: ['RB', 'R_CB', 'L_CB', 'LB'],
    3.0: ['RCB', 'CB', 'LCB'],
    5.0: ['RWB', 'RCB', 'CB', 'LCB', 'LWB'],
}

df_clusters_updated = list()
for df in tqdm(df_clusters):
    # reindex returns a new frame that also has the (still empty) '<position>_all' columns
    df = df.reindex(columns = df.columns.tolist() + new_cols)
    # The back-line size of the first row decides which positions are read for the whole cluster
    backline = df.iloc[0]['backline']
    if backline not in positions_by_backline:
        # Skipping the cluster silently would shorten the result list and shift
        # every later cluster against the position-based names used when saving.
        raise ValueError(f'Unexpected backline value {backline!r}; expected 3, 4 or 5')
    for position in positions_by_backline[backline]:
        # position=position binds the current value inside the lambda
        df[position + '_all'] = df.apply(
            lambda row, position=position: getmetrics(row.wyId, row[position]), axis=1)
    df_clusters_updated.append(df)

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




**Splitting the metrics into individual columns**

In [77]:
# Column suffixes, in the order of the ten items returned by getmetrics.
metric_suffixes = ['pass', 'accpass', 'accpassloc', 'inaccpassloc', 'accvaep',
                   'inaccvaep', 'accoff', 'inaccoff', 'accdef', 'inaccdef']
# Positions to expand for each back-line size; the order fixes the order of the new columns.
split_order_by_backline = {
    4.0: ['RB', 'R_CB', 'L_CB', 'LB'],
    3.0: ['RCB', 'CB', 'LCB'],
    5.0: ['RCB', 'CB', 'LCB', 'RWB', 'LWB'],
}
# Raw '<position>_all' columns that are removed once they have been expanded.
raw_metric_cols = ['RB_all', 'R_CB_all', 'L_CB_all', 'LB_all', 'RCB_all',
                   'LCB_all', 'CB_all', 'RWB_all', 'LWB_all']

df_clusters_metrics = list()
for df in tqdm(df_clusters_updated):
    backline = df.iloc[0]['backline']
    if backline not in split_order_by_backline:
        # A silently skipped cluster would shift the position-based names used when saving.
        raise ValueError(f'Unexpected backline value {backline!r}; expected 3, 4 or 5')
    # Work on a copy so that the frames in df_clusters_updated keep their
    # '<position>_all' columns and this cell can be run again.
    df = df.copy()
    for position in split_order_by_backline[backline]:
        # Each cell of '<position>_all' is a list of ten items; spread it over ten columns.
        df[[f'{position}_{suffix}' for suffix in metric_suffixes]] = pd.DataFrame(
            df[f'{position}_all'].to_list(), index=df.index)
    df_clusters_metrics.append(df.drop(columns=raw_metric_cols))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))




In [78]:
# Preview the first rows of the first cluster's metrics table.
df_clusters_metrics[0].head()

In [79]:
# Saving the clusters into individual pickle files
names = ['rrrl','rrll','rrl','rrr','rll','rrrll','rrlr','rrrr','rrrrl','rlr','rrrlr','rrlll','rlll']

# The names are paired with the frames purely by position, so a missing or
# extra cluster would write frames under the wrong file name. Refuse to save
# when the two lists do not line up.
if len(names) != len(df_clusters_metrics):
    raise ValueError(
        f'{len(df_clusters_metrics)} clusters to save but {len(names)} names given')

for name, df in zip(names, df_clusters_metrics):
    df.to_pickle(f'../../data/clusters/clusters_vaep/cluster_{name}.pkl')

In [71]:
# rrrl = pd.read_pickle(f'../../data/clusters/clusters_vaep/cluster_rrrl.pkl')
# rrrl.head()