**6. Computing VAEP from Wyscout data and comabining it with our metrics dataset (as generated from '3_metricscollection' notebook)**

The following tasks were taken into account in this notebook:

1. To convert the Wyscout data to SPADL format and to compute VAEP values using the Socceractions framework

2. Combine the VAEP values (as well as offensive and defensive values) with the existing metrics dataframe

The following results were saved as pickle files:

1. Cluster wise dataframes with VAEP, offensive and defensive value as additional features


# Imports 

In [1]:
!pip install tables==3.6.1
!pip install socceraction==0.2.0



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib.patches import Ellipse
import seaborn as sns
from math import *
import matplotlib.pylab as pyl
import pickle
import swifter
import warnings
import plotly.express as px
from itertools import chain
import scipy.stats as sps
from tqdm import tqdm
from unidecode import unidecode
import re
from io import BytesIO
from pathlib import Path
from tqdm.notebook import tqdm
from urllib.parse import urlparse
from urllib.request import urlopen, urlretrieve
from zipfile import ZipFile, is_zipfile
import pandas as pd
from sklearn.metrics import brier_score_loss, roc_auc_score  # version 0.22.2
from xgboost import XGBClassifier  # version 1.0.2

import socceraction.vaep.features as features
import socceraction.vaep.labels as labels

from socceraction.spadl.wyscout import convert_to_spadl
from socceraction.vaep.formula import value

In [3]:
#pd.set_option('max_colwidth', 999)
pd.set_option('display.max_columns', 1000)
pd.set_option("display.max_rows", 3000)

In [4]:
import warnings
warnings.filterwarnings('ignore', category=pd.io.pytables.PerformanceWarning)

# Preprocess the data

## Preprocess the Wyscout data

In [5]:
def read_json_file(filename):
    with open(filename, 'rb') as json_file:
        return BytesIO(json_file.read()).getvalue().decode('unicode_escape')

### Teams

In [6]:
json_teams = read_json_file('../data_top5/teams/teams.json')
df_teams = pd.read_json(json_teams)

In [7]:
df_teams.head(10)

Unnamed: 0,city,name,wyId,officialName,area,type
0,Newcastle upon Tyne,Newcastle United,1613,Newcastle United FC,"{'name': 'England', 'id': '0', 'alpha3code': '...",club
1,Vigo,Celta de Vigo,692,Real Club Celta de Vigo,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
2,Barcelona,Espanyol,691,Reial Club Deportiu Espanyol,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
3,Vitoria-Gasteiz,Deportivo Alavés,696,Deportivo Alavés,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
4,Valencia,Levante,695,Levante UD,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
5,Troyes,Troyes,3795,Espérance Sportive Troyes Aube Champagne,"{'name': 'France', 'id': '250', 'alpha3code': ...",club
6,Getafe (Madrid),Getafe,698,Getafe Club de Fútbol,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
7,Mönchengladbach,Borussia M'gladbach,2454,Borussia VfL Mönchengladbach,"{'name': 'Germany', 'id': '276', 'alpha3code':...",club
8,"Huddersfield, West Yorkshire",Huddersfield Town,1673,Huddersfield Town FC,"{'name': 'England', 'id': '0', 'alpha3code': '...",club
9,Bilbao,Athletic Club,678,Athletic Club Bilbao,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club


In [8]:
df_teams.to_hdf('../data_top5/vaep/wyscout.h5', key='teams', mode='w')

### Players

In [9]:
json_players = read_json_file('../data_top5/players/players.json')
df_players = pd.read_json(json_players)

In [10]:
df_players.head(10)

Unnamed: 0,passportArea,weight,firstName,middleName,lastName,currentTeamId,birthDate,height,role,birthArea,wyId,foot,shortName,currentNationalTeamId
0,"{'name': 'Turkey', 'id': '792', 'alpha3code': ...",78,Harun,,Tekin,4502,1989-06-17,187,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...","{'name': 'Turkey', 'id': '792', 'alpha3code': ...",32777,right,H. Tekin,4687.0
1,"{'name': 'Senegal', 'id': '686', 'alpha3code':...",73,Malang,,Sarr,3775,1999-01-23,182,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'France', 'id': '250', 'alpha3code': ...",393228,left,M. Sarr,4423.0
2,"{'name': 'France', 'id': '250', 'alpha3code': ...",72,Over,,Mandanda,3772,1998-10-26,176,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...","{'name': 'France', 'id': '250', 'alpha3code': ...",393230,,O. Mandanda,
3,"{'name': 'Senegal', 'id': '686', 'alpha3code':...",82,Alfred John Momar,,N'Diaye,683,1990-03-06,187,"{'code2': 'MD', 'code3': 'MID', 'name': 'Midfi...","{'name': 'France', 'id': '250', 'alpha3code': ...",32793,right,A. N'Diaye,19314.0
4,"{'name': 'France', 'id': '250', 'alpha3code': ...",84,Ibrahima,,Konaté,2975,1999-05-25,192,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'France', 'id': '250', 'alpha3code': ...",393247,right,I. Konaté,
5,"{'name': 'Netherlands', 'id': '528', 'alpha3co...",83,Jasper,,Cillessen,676,1989-04-22,185,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...","{'name': 'Netherlands', 'id': '528', 'alpha3co...",33,right,J. Cillessen,664.0
6,"{'name': 'Belgium', 'id': '56', 'alpha3code': ...",91,Toby,,Alderweireld,1624,1989-03-02,187,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'Belgium', 'id': '56', 'alpha3code': ...",36,right,T. Alderweireld,5629.0
7,"{'name': 'Belgium', 'id': '56', 'alpha3code': ...",88,Jan,,Vertonghen,1624,1987-04-24,189,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'Belgium', 'id': '56', 'alpha3code': ...",48,left,J. Vertonghen,5629.0
8,"{'name': 'France', 'id': '250', 'alpha3code': ...",74,Alexander,,Djiku,3783,1994-08-09,182,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...","{'name': 'France', 'id': '250', 'alpha3code': ...",229427,right,A. Djiku,
9,"{'name': 'Denmark', 'id': '208', 'alpha3code':...",76,Christian,,Dannemann Eriksen,1624,1992-02-14,180,"{'code2': 'MD', 'code3': 'MID', 'name': 'Midfi...","{'name': 'Denmark', 'id': '208', 'alpha3code':...",54,right,C. Eriksen,7712.0


In [11]:
df_players.to_hdf('../data_top5/vaep/wyscout.h5', key='players', mode='a')

### Matches

In [12]:
competitions = [
     'England',
     'France',
     'Germany',
     'Italy',
     'Spain',
    'European Championship',
     'World Cup'
]

In [13]:
dfs_matches = []
for competition in competitions:
    competition_name = competition.replace(' ', '_')
    file_matches = f'../../data_top5/matches/matches_{competition_name}.json'
    json_matches = read_json_file(file_matches)
    df_matches = pd.read_json(json_matches)
    dfs_matches.append(df_matches)
df_matches = pd.concat(dfs_matches)

In [14]:
df_matches.head(10)

Unnamed: 0,status,roundId,gameweek,teamsData,seasonId,dateutc,winner,venue,wyId,label,date,referees,duration,competitionId,groupName
0,Played,4405654,38,"{'1646': {'scoreET': 0, 'coachId': 8880, 'side...",181150,2018-05-13 14:00:00,1659,Turf Moor,2500089,"Burnley - AFC Bournemouth, 1 - 2","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 385705, 'role': 'referee'}, {'r...",Regular,364,
1,Played,4405654,38,"{'1628': {'scoreET': 0, 'coachId': 8357, 'side...",181150,2018-05-13 14:00:00,1628,Selhurst Park,2500090,"Crystal Palace - West Bromwich Albion, 2 - 0","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 381851, 'role': 'referee'}, {'r...",Regular,364,
2,Played,4405654,38,"{'1609': {'scoreET': 0, 'coachId': 7845, 'side...",181150,2018-05-13 14:00:00,1609,The John Smith's Stadium,2500091,"Huddersfield Town - Arsenal, 0 - 1","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 384965, 'role': 'referee'}, {'r...",Regular,364,
3,Played,4405654,38,"{'1651': {'scoreET': 0, 'coachId': 8093, 'side...",181150,2018-05-13 14:00:00,1612,Anfield,2500092,"Liverpool - Brighton & Hove Albion, 4 - 0","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 385704, 'role': 'referee'}, {'r...",Regular,364,
4,Played,4405654,38,"{'1644': {'scoreET': 0, 'coachId': 93112, 'sid...",181150,2018-05-13 14:00:00,1611,Old Trafford,2500093,"Manchester United - Watford, 1 - 0","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 381853, 'role': 'referee'}, {'r...",Regular,364,
5,Played,4405654,38,"{'1613': {'scoreET': 0, 'coachId': 210700, 'si...",181150,2018-05-13 14:00:00,1613,St. James' Park,2500094,"Newcastle United - Chelsea, 3 - 0","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 384888, 'role': 'referee'}, {'r...",Regular,364,
6,Played,4405654,38,"{'1625': {'scoreET': 0, 'coachId': 267136, 'si...",181150,2018-05-13 14:00:00,1625,St. Mary's Stadium,2500095,"Southampton - Manchester City, 0 - 1","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 385911, 'role': 'referee'}, {'r...",Regular,364,
7,Played,4405654,38,"{'10531': {'scoreET': 0, 'coachId': 32573, 'si...",181150,2018-05-13 14:00:00,1639,Liberty Stadium,2500096,"Swansea City - Stoke City, 1 - 2","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 378952, 'role': 'referee'}, {'r...",Regular,364,
8,Played,4405654,38,"{'1631': {'scoreET': 0, 'coachId': 209010, 'si...",181150,2018-05-13 14:00:00,1624,Wembley Stadium,2500097,"Tottenham Hotspur - Leicester City, 5 - 4","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 378951, 'role': 'referee'}, {'r...",Regular,364,
9,Played,4405654,38,"{'1623': {'scoreET': 0, 'coachId': 8541, 'side...",181150,2018-05-13 14:00:00,1633,London Stadium,2500098,"West Ham United - Everton, 3 - 1","May 13, 2018 at 4:00:00 PM GMT+2","[{'refereeId': 408156, 'role': 'referee'}, {'r...",Regular,364,


In [15]:
df_matches.to_hdf('../data_top5/vaep/wyscout.h5', key='matches', mode='a')

### Events

In [16]:
for competition in competitions:
    competition_name = competition.replace(' ', '_')
    file_events = f'../data_top5/events/events_{competition_name}.json'
    json_events = read_json_file(file_events)
    df_events = pd.read_json(json_events)
    df_events_matches = df_events.groupby('matchId', as_index=False)
    for match_id, df_events_match in df_events_matches:
        df_events_match.to_hdf('../data_top5/vaep/wyscout.h5', key=f'events/match_{match_id}', mode='a')

## Convert the Wyscout data to the SPADL representation

In [17]:
convert_to_spadl('../data_top5/vaep/wyscout.h5', '../../data_top5/vaep/spadl.h5')

...Inserting actiontypes
...Inserting bodyparts
...Inserting results
...Converting games
...Converting players


  0%|          | 0/1941 [00:00<?, ?game/s]

...Converting teams
...Generating player_games


100%|██████████| 1941/1941 [04:53<00:00,  6.60game/s]
  0%|          | 0/1941 [00:00<?, ?game/s]

...Converting events to actions


100%|██████████| 1941/1941 [28:57<00:00,  1.12game/s]


# Value game states

In [18]:
df_games = pd.read_hdf('../data_top5/vaep/spadl.h5', key='games')
df_actiontypes = pd.read_hdf('../data_top5/vaep/spadl.h5', key='actiontypes')
df_bodyparts = pd.read_hdf('../data_top5/vaep/spadl.h5', key='bodyparts')
df_results = pd.read_hdf('../data_top5/vaep/spadl.h5', key='results')

In [19]:
nb_prev_actions = 3

## Generate game state features

In [20]:
functions_features = [
    features.actiontype_onehot,
    features.bodypart_onehot,
    features.result_onehot,
    features.goalscore,
    features.startlocation,
    features.endlocation,
    features.movement,
    features.space_delta,
    features.startpolar,
    features.endpolar,
    features.team,
    features.time_delta
]

In [21]:
df_actions = pd.read_hdf('../data_top5/vaep/spadl.h5', key=f'actions/game_{2500089}')

In [22]:
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
    game_id = game['game_id']
    df_actions = pd.read_hdf('../data_top5/vaep/spadl.h5', key=f'actions/game_{game_id}')
    df_actions = (df_actions
        .merge(df_actiontypes, how='left')
        .merge(df_results, how='left')
        .merge(df_bodyparts, how='left')
        .reset_index(drop=True)
    )
    
    dfs_gamestates = features.gamestates(df_actions, nb_prev_actions=nb_prev_actions)
    dfs_gamestates = features.play_left_to_right(dfs_gamestates, game['home_team_id'])
    
    df_features = pd.concat([function(dfs_gamestates) for function in functions_features], axis=1)
    df_features.to_hdf('../data_top5/vaep/features.h5', key=f'game_{game_id}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1941.0), HTML(value='')))




## Generate game state labels

In [23]:
functions_labels = [
    labels.scores,
    labels.concedes
]

In [24]:
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
    game_id = game['game_id']
    df_actions = pd.read_hdf('../data_top5/vaep/spadl.h5', key=f'actions/game_{game_id}')
    df_actions = (df_actions
        .merge(df_actiontypes, how='left')
        .merge(df_results, how='left')
        .merge(df_bodyparts, how='left')
        .reset_index(drop=True)
    )
    
    df_labels = pd.concat([function(df_actions) for function in functions_labels], axis=1)
    df_labels.to_hdf('../data_top5/vaep/labels.h5', key=f'game_{game_id}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1941.0), HTML(value='')))




## Generate dataset

In [25]:
columns_features = features.feature_column_names(functions_features, nb_prev_actions=nb_prev_actions)

In [26]:
dfs_features = []
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
    game_id = game['game_id']
    df_features = pd.read_hdf('../data_top5/vaep/features.h5', key=f'game_{game_id}')
    dfs_features.append(df_features[columns_features])
df_features = pd.concat(dfs_features).reset_index(drop=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1941.0), HTML(value='')))




In [27]:
df_features.head(10)

Unnamed: 0,type_pass_a0,type_cross_a0,type_throw_in_a0,type_freekick_crossed_a0,type_freekick_short_a0,type_corner_crossed_a0,type_corner_short_a0,type_take_on_a0,type_foul_a0,type_tackle_a0,type_interception_a0,type_shot_a0,type_shot_penalty_a0,type_shot_freekick_a0,type_keeper_save_a0,type_keeper_claim_a0,type_keeper_punch_a0,type_keeper_pick_up_a0,type_clearance_a0,type_bad_touch_a0,type_non_action_a0,type_dribble_a0,type_goalkick_a0,type_pass_a1,type_cross_a1,type_throw_in_a1,type_freekick_crossed_a1,type_freekick_short_a1,type_corner_crossed_a1,type_corner_short_a1,type_take_on_a1,type_foul_a1,type_tackle_a1,type_interception_a1,type_shot_a1,type_shot_penalty_a1,type_shot_freekick_a1,type_keeper_save_a1,type_keeper_claim_a1,type_keeper_punch_a1,type_keeper_pick_up_a1,type_clearance_a1,type_bad_touch_a1,type_non_action_a1,type_dribble_a1,type_goalkick_a1,type_pass_a2,type_cross_a2,type_throw_in_a2,type_freekick_crossed_a2,type_freekick_short_a2,type_corner_crossed_a2,type_corner_short_a2,type_take_on_a2,type_foul_a2,type_tackle_a2,type_interception_a2,type_shot_a2,type_shot_penalty_a2,type_shot_freekick_a2,type_keeper_save_a2,type_keeper_claim_a2,type_keeper_punch_a2,type_keeper_pick_up_a2,type_clearance_a2,type_bad_touch_a2,type_non_action_a2,type_dribble_a2,type_goalkick_a2,bodypart_foot_a0,bodypart_head_a0,bodypart_other_a0,bodypart_foot_a1,bodypart_head_a1,bodypart_other_a1,bodypart_foot_a2,bodypart_head_a2,bodypart_other_a2,result_fail_a0,result_success_a0,result_offside_a0,result_owngoal_a0,result_yellow_card_a0,result_red_card_a0,result_fail_a1,result_success_a1,result_offside_a1,result_owngoal_a1,result_yellow_card_a1,result_red_card_a1,result_fail_a2,result_success_a2,result_offside_a2,result_owngoal_a2,result_yellow_card_a2,result_red_card_a2,goalscore_team,goalscore_opponent,goalscore_diff,start_x_a0,start_y_a0,start_x_a1,start_y_a1,start_x_a2,start_y_a2,end_x_a0,end_y_a0,end_x_a1,end_y_a1,end_x_a2,end_y_a2,dx_a0,dy_a0,movement_a0,dx_a1,dy_a1,movement_a1,dx_a2,dy_a2,movement_a2,dx_a01,dy_a01,mov_a01,dx_a02,dy_a02,mov_a02,start_dist_to_goal_a0,start_angle_to_goal_a0,start_dist_to_goal_a1,start_angle_to_goal_a1,start_dist_to_goal_a2,start_angle_to_goal_a2,end_dist_to_goal_a0,end_angle_to_goal_a0,end_dist_to_goal_a1,end_angle_to_goal_a1,end_dist_to_goal_a2,end_angle_to_goal_a2,team_1,team_2,time_delta_1,time_delta_2
0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,52.5,34.0,52.5,34.0,52.5,34.0,42.0,37.4,42.0,37.4,42.0,37.4,-10.5,3.4,11.036757,-10.5,3.4,11.036757,-10.5,3.4,11.036757,-10.5,3.4,11.03676,-10.5,3.4,11.036757,52.5,0.0,52.5,0.0,52.5,0.0,63.091679,0.053916,63.091679,0.053916,63.091679,0.053916,True,True,0.0,0.0
1,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,42.0,37.4,52.5,34.0,52.5,34.0,40.95,57.8,42.0,37.4,42.0,37.4,-1.05,20.4,20.427004,-10.5,3.4,11.036757,-10.5,3.4,11.036757,0.0,0.0,0.0,0.0,0.0,0.0,63.091679,0.053916,52.5,0.0,52.5,0.0,68.328929,0.355773,63.091679,0.053916,63.091679,0.053916,True,True,1.997756,1.997756
2,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,40.95,57.8,42.0,37.4,52.5,34.0,32.55,47.6,40.95,57.8,42.0,37.4,-8.4,-10.2,13.213629,-1.05,20.4,20.427004,-10.5,3.4,11.036757,0.0,0.0,0.0,1.05,-20.4,20.427004,68.328929,0.355773,63.091679,0.053916,52.5,0.0,73.715416,0.185556,68.328929,0.355773,63.091679,0.053916,True,True,0.771744,2.7695
3,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,32.55,47.6,40.95,57.8,42.0,37.4,69.3,48.96,32.55,47.6,40.95,57.8,36.75,1.36,36.775156,-8.4,-10.2,13.213629,-1.05,20.4,20.427004,0.0,0.0,0.0,8.4,10.2,13.213629,73.715416,0.185556,68.328929,0.355773,63.091679,0.053916,38.707772,0.396818,73.715416,0.185556,68.328929,0.355773,True,True,2.174464,2.946208
4,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,69.3,48.96,32.55,47.6,40.95,57.8,74.55,55.76,69.3,48.96,32.55,47.6,5.25,6.8,8.590838,36.75,1.36,36.775156,-8.4,-10.2,13.213629,0.0,0.0,0.0,-36.75,-1.36,36.775156,38.707772,0.396818,73.715416,0.185556,68.328929,0.355773,37.425928,0.620467,38.707772,0.396818,73.715416,0.185556,True,True,3.907382,6.081846
5,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,74.55,55.76,69.3,48.96,32.55,47.6,95.55,63.24,74.55,55.76,69.3,48.96,21.0,7.48,22.292384,5.25,6.8,8.590838,36.75,1.36,36.775156,0.0,0.0,0.0,-5.25,-6.8,8.590838,37.425928,0.620467,38.707772,0.396818,73.715416,0.185556,30.729141,1.258205,37.425928,0.620467,38.707772,0.396818,True,True,3.75873,7.666112
6,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,0,0,0,95.55,63.24,74.55,55.76,69.3,48.96,96.6,34.0,95.55,63.24,74.55,55.76,1.05,-29.24,29.258847,21.0,7.48,22.292384,5.25,6.8,8.590838,0.0,0.0,0.0,-21.0,-7.48,22.292384,30.729141,1.258205,37.425928,0.620467,38.707772,0.396818,8.4,0.0,30.729141,1.258205,37.425928,0.620467,True,True,2.210584,5.969314
7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,0,0,0,8.4,34.0,9.45,4.76,30.45,12.24,2.1,8.16,8.4,34.0,9.45,4.76,-6.3,-25.84,26.59691,-1.05,29.24,29.258847,-21.0,-7.48,22.292384,5.329071e-15,0.0,5.329071e-15,1.05,-29.24,29.258847,96.6,0.0,99.923872,0.296969,77.660802,0.283995,106.094842,0.24603,96.6,0.0,99.923872,0.296969,False,False,1.756122,3.966706
8,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,0,0,0,102.9,59.84,96.6,34.0,95.55,63.24,102.9,59.84,102.9,59.84,96.6,34.0,0.0,0.0,0.0,6.3,25.84,26.59691,1.05,-29.24,29.258847,0.0,0.0,0.0,-6.3,-25.84,26.59691,25.925192,1.489705,8.4,0.0,30.729141,1.258205,25.925192,1.489705,25.925192,1.489705,8.4,0.0,False,True,2.095783,3.851905
9,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,0,0,0,6.3,8.16,2.1,8.16,8.4,34.0,22.05,0.0,2.1,8.16,2.1,8.16,15.75,-8.16,17.738323,0.0,0.0,0.0,-6.3,-25.84,26.59691,-4.2,-3.552714e-15,4.2,-4.2,-3.552714e-15,4.2,102.026446,0.256057,106.094842,0.24603,96.6,0.0,89.647658,0.388999,106.094842,0.24603,106.094842,0.24603,False,True,3.034782,5.130565


In [28]:
columns_labels = [
    'scores',
    'concedes'
]

In [29]:
dfs_labels = []
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
    game_id = game['game_id']
    df_labels = pd.read_hdf('../data_top5/vaep/labels.h5', key=f'game_{game_id}')
    dfs_labels.append(df_labels[columns_labels])
df_labels = pd.concat(dfs_labels).reset_index(drop=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1941.0), HTML(value='')))




In [30]:
df_labels.tail(10)

Unnamed: 0,scores,concedes
2465146,True,False
2465147,True,False
2465148,True,False
2465149,True,False
2465150,True,False
2465151,True,False
2465152,False,True
2465153,False,True
2465154,True,False
2465155,False,False


## Train classifiers

In [31]:
models = {}
for column_labels in columns_labels:
    model = XGBClassifier(n_estimators=100, max_depth=4)
    model.fit(df_features, df_labels[column_labels])
    models[column_labels] = model

In [32]:
filename = '../data_top5/vaep/finalised_vaep_model.pkl'
pickle.dump(model,open(filename,'wb'))

## Estimate probabilities

In [33]:
dfs_predictions = {}
for column_labels in columns_labels:
    model = models[column_labels]
    probabilities = model.predict_proba(df_features)
    predictions = probabilities[:, 1]
    dfs_predictions[column_labels] = pd.Series(predictions)
df_predictions = pd.concat(dfs_predictions, axis=1)

In [34]:
df_predictions.head(10)

Unnamed: 0,scores,concedes
0,0.003628,0.001496
1,0.005049,0.00175
2,0.003883,0.002318
3,0.01461,0.002321
4,0.018392,0.001865
5,0.020405,0.001477
6,0.011381,0.002902
7,0.001521,0.02292
8,0.021142,0.0026
9,0.001294,0.008486


The following cell obtains the `game_id` for each action in order to store the predictions per game.

In [35]:
dfs_game_ids = []
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
    game_id = game['game_id']
    df_actions = pd.read_hdf('../data_top5/vaep/spadl.h5', key=f'actions/game_{game_id}')
    dfs_game_ids.append(df_actions['game_id'])
df_game_ids = pd.concat(dfs_game_ids, axis=0).astype('int').reset_index(drop=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1941.0), HTML(value='')))




In [36]:
df_predictions = pd.concat([df_predictions, df_game_ids], axis=1)

In [37]:
df_predictions.head(10)

Unnamed: 0,scores,concedes,game_id
0,0.003628,0.001496,2500089
1,0.005049,0.00175,2500089
2,0.003883,0.002318,2500089
3,0.01461,0.002321,2500089
4,0.018392,0.001865,2500089
5,0.020405,0.001477,2500089
6,0.011381,0.002902,2500089
7,0.001521,0.02292,2500089
8,0.021142,0.0026,2500089
9,0.001294,0.008486,2500089


In [38]:
df_predictions_per_game = df_predictions.groupby('game_id')

In [39]:
for game_id, df_predictions in tqdm(df_predictions_per_game):
    df_predictions = df_predictions.reset_index(drop=True)
    df_predictions[columns_labels].to_hdf('../data_top5/vaep/predictions.h5', key=f'game_{game_id}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1941.0), HTML(value='')))




# Value on-the-ball actions - VAEP Values

In [40]:
df_players = pd.read_hdf('../data_top5/vaep/spadl.h5', key='players')
df_teams = pd.read_hdf('../data_top5/vaep/spadl.h5', key='teams')

In [41]:
dfs_values = []
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
    game_id = game['game_id']
    df_actions = pd.read_hdf('../data_top5/vaep/spadl.h5', key=f'actions/game_{game_id}')
    df_actions = (df_actions
        .merge(df_actiontypes, how='left')
        .merge(df_results, how='left')
        .merge(df_bodyparts, how='left')
        .merge(df_players, how='left')
        .merge(df_teams, how='left')
        .reset_index(drop=True)
    )
    
    df_predictions = pd.read_hdf('../data_top5/vaep/predictions.h5', key=f'game_{game_id}')
    df_values = value(df_actions, df_predictions['scores'], df_predictions['concedes'])
    
    df_all = pd.concat([df_actions, df_predictions, df_values], axis=1)
    dfs_values.append(df_all)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1941.0), HTML(value='')))




In [42]:
df_values = (pd.concat(dfs_values)
    .sort_values(['game_id', 'period_id', 'time_seconds'])
    .reset_index(drop=True)
)

In [44]:
# df_values['short_team_name'].unique()[44:]

In [45]:
# pl_teams = ['Arsenal','Leicester City','Manchester City','Brighton & Hove Albion','Burnley','Chelsea',
#             'Crystal Palace','Huddersfield Town','Everton','Stoke City','Manchester United','West Ham United',
#             'Newcastle United','Tottenham Hotspur','Swansea City','Southampton','Watford','Liverpool',
#             'West Bromwich Albion','AFC Bournemouth']

In [46]:
df_values_clubs = df_values.loc[df_values['short_team_name'].isin(df_values['short_team_name'].unique()[44:])]

In [47]:
# df_values_pl.loc[(df_values_pl['game_id']==2499719.0)]

* scores = P_scores(Si, x) - prob(scoring in the next 10 actions) - obtained from the ML Model - given for the particular `state`
* concedes = P_concedes(Si, x) - prob(concedes in the next 10 actions) - obtained from the ML Model - given for the particular `state`
* offensive_value = P_scores(Si) - P_scores(Si-1) - given for the particular `action`
* defensive_value = P_concedes(Si-1) - P_concedes(Si) - given for the particular `action`

In [48]:
df_values_clubs.to_pickle('../data_top5/vaep/vaep_values.pkl')

# VAEP for Defenders 

## Pre-Processing 

In [49]:
vaep_values = pd.read_pickle('../data_top5/vaep/vaep_values.pkl')
match_def = pd.read_pickle("../data_top5/matches/match+def_lineup+footedness_ver2_top5.pkl")

In [50]:
vaep_values['short_team_name'].unique()

array(['Arsenal', 'Leicester City', 'Manchester City',
       'Brighton & Hove Albion', 'Burnley', 'Chelsea', 'Crystal Palace',
       'Huddersfield Town', 'Everton', 'Stoke City', 'Manchester United',
       'West Ham United', 'Newcastle United', 'Tottenham Hotspur',
       'Swansea City', 'Southampton', 'Watford', 'Liverpool',
       'West Bromwich Albion', 'AFC Bournemouth', 'Angers', 'Bordeaux',
       'Nantes', 'Lille', 'Strasbourg', 'Olympique Lyonnais', 'Dijon',
       'Olympique Marseille', 'Metz', 'Guingamp', 'Monaco', 'Toulouse',
       'Montpellier', 'Caen', 'Amiens SC', 'PSG', 'Nice', 'Saint-Étienne',
       'Troyes', 'Rennes', 'Bayer Leverkusen', 'Bayern München',
       'Werder Bremen', 'Hoffenheim', 'Hertha BSC', 'Stuttgart',
       'Freiburg', 'Eintracht Frankfurt', "Borussia M'gladbach", 'Köln',
       'Schalke 04', 'RB Leipzig', 'Augsburg', 'Hamburger SV',
       'Hannover 96', 'Mainz 05', 'Borussia Dortmund', 'Wolfsburg',
       'Villarreal', 'Levante', 'Real Socieda

In [51]:
match_def['team'].unique()

array(['Arsenal', 'Leicester City', 'Brighton', 'Manchester City',
       'Burnley', 'Chelsea', 'Crystal Palace', 'Huddersfield', 'Everton',
       'Stoke City', 'Manchester Utd', 'West Ham', 'Newcastle Utd',
       'Tottenham', 'Southampton', 'Swansea City', 'Liverpool', 'Watford',
       'Bournemouth', 'West Brom', 'Angers', 'Bordeaux', 'Lille',
       'Nantes', 'Lyon', 'Strasbourg', 'Dijon', 'Marseille', 'Guingamp',
       'Metz', 'Monaco', 'Toulouse', 'Caen', 'Montpellier', 'Amiens',
       'Paris S-G', 'Nice', 'Saint-Étienne', 'Rennes', 'Troyes',
       'Bayern Munich', 'Leverkusen', 'Hoffenheim', 'Werder Bremen',
       'Hertha BSC', 'Stuttgart', 'Eint Frankfurt', 'Freiburg', 'Köln',
       "M'Gladbach", 'RB Leipzig', 'Schalke 04', 'Augsburg',
       'Hamburger SV', 'Hannover 96', 'Mainz 05', 'Dortmund', 'Wolfsburg',
       'Levante', 'Villarreal', 'Celta Vigo', 'Real Sociedad',
       'Athletic Club', 'Getafe', 'Atlético Madrid', 'Girona', 'Espanyol',
       'Sevilla', 'Eibar', 

In [52]:
vaep_values = vaep_values.replace({'short_team_name':{
        'Brighton & Hove Albion': 'Brighton',
        'AFC Bournemouth': 'Bournemouth',
        'Huddersfield Town': 'Huddersfield',
        'Manchester United': 'Manchester Utd',
        'Newcastle United': 'Newcastle Utd',
        'Tottenham Hotspur': 'Tottenham',
        'West Bromwich Albion': 'West Brom',
        'West Ham United': 'West Ham',
        'Bayer Leverkusen': 'Leverkusen',
        'Bayern München': 'Bayern Munich',
        'Borussia Dortmund': 'Dortmund',
        "Borussia M'gladbach": "M'Gladbach",
        'Eintracht Frankfurt': 'Eint Frankfurt',
        'Amiens SC': 'Amiens',
        'Angers SCO': 'Angers',
        'Olympique Lyonnais': 'Lyon',
        'PSG': 'Paris S-G',
        'Olympique Marseille': 'Marseille',
        'Deportivo Alavés': 'Alavés',
        'Real Betis': 'Betis',
        'Celta de Vigo': 'Celta Vigo',
        'Deportivo La Coruña': 'La Coruña',
        'Internazionale': 'Inter'
    }}
)

vaep_values['game_id'] = vaep_values['game_id'].astype(int)
vaep_values['temp'] = vaep_values['game_id'].astype(str) + vaep_values['short_team_name']
match_def['temp'] = match_def['wyId'].astype(str) + match_def['team']

In [53]:
vaep_values = vaep_values.merge(match_def[['temp', 'footedness']], left_on='temp', right_on='temp', how='left')
vaep_values.drop(columns = ['temp'], inplace=True)

In [54]:
vaep_values['name'] = vaep_values['first_name']+vaep_values['last_name']
vaep_values['name'] = vaep_values['name'].astype(str).apply(lambda x: unidecode(x))
vaep_values['name'] = vaep_values['name'].apply(lambda x: x.replace('-', ''))
vaep_values['name'] = vaep_values['name'].apply(lambda x: x.replace(' ', ''))
# vaep_values.loc[(vaep_values['last_name'].str.contains('PhilJa'))]

In [55]:
# Creating a key to identify each row using game_id, time_seconds and player name
vaep_values['key'] = vaep_values['game_id'].astype(str) + np.round(vaep_values['time_seconds'],6).astype(str) +vaep_values['name'].astype(str)

In [56]:
# vaep_values.loc[(vaep_values['name'].str.contains('AaronCre'))& (vaep_values['type_name']=='pass') &(vaep_values['game_id']==2499724)]


In [57]:
# Filtering the events that are labelled as pass and cross
vaep_values_pass = vaep_values.loc[(vaep_values['type_name'] == 'pass') | (vaep_values['type_name'] =='cross') ]

In [58]:
# Merging player roles to SPADL events data
df_players = pd.read_pickle('../data_top5/players/players.pkl')
roles_temp = df_players['role'].values
roles = list()
for i in roles_temp:
    roles.append(i['code3'])
players_roles = list(zip(roles,df_players['wyId'],df_players['playerName']))
df_players_roles = pd.DataFrame(players_roles,columns = ['role','playerId','playerName1'])
vaep_values_pass_proles = vaep_values_pass.merge(df_players_roles, left_on = 'player_id', right_on = 'playerId')
vaep_values_pass_proles.drop(['playerName1'], axis = 1, inplace = True)
vaep_values_pass_def = vaep_values_pass_proles.loc[vaep_values_pass_proles['role']=='DEF']

In [59]:
# Counting duplicate keys in the VAEP filtered dataframe. 
# Note: Some events in the dataframe have same key attributes (game_id, time_seconds and player name)
# When checked at a deeper level, these passes (roughly 32 of them) have the same player performing two 
# different passes from the same starting location to different end locations at the same time stamp.
# We have dropped the duplicates from vaep_values_pass_def
vaep_values_pass_def['key'].value_counts().tolist().count(2)

32

In [60]:
vaep_values_pass_def = vaep_values_pass_def.drop_duplicates(subset=['key'],keep='first')

In [61]:
vaep_values_pass_def.loc[vaep_values_pass_def['key'].str.contains('25656942426.710948JonathanCastroOtto')]

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,bodypart_id,type_id,result_id,type_name,result_name,bodypart_name,short_name,first_name,last_name,birth_date,short_team_name,team_name,scores,concedes,offensive_value,defensive_value,vaep_value,footedness,name,key,role,playerId
985923,2565694,1.0,2426.710948,692.0,4424.0,82.95,8.16,80.85,7.48,0,0,1,pass,success,foot,Jonny,Jonathan,Castro Otto,1994-03-03,Celta Vigo,Real Club Celta de Vigo,0.003708,0.00914,4.5e-05,-0.005529,-0.005484,right-right-left-right,JonathanCastroOtto,25656942426.710948JonathanCastroOtto,DEF,4424


In [62]:
# Importing Wyscout events data
df_events_wyscout = pd.read_pickle('../data_top5/events/events_com.pkl')

In [63]:
# Filtering out passes events which were performed by defenders in Wyscout events data
df_events_wyscout_pass = df_events_wyscout.loc[(df_events_wyscout['eventName']=='Pass') & (df_events_wyscout['role']=='DEF')]

In [64]:
# Creating a key to identify each row using matchid, event_seconds and player name
df_events_wyscout_pass['key'] = df_events_wyscout_pass['matchId'].astype(str) + np.round(df_events_wyscout_pass['eventSec'],6).astype(str) + df_events_wyscout_pass['playerName'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [65]:
df_events_wyscout_pass[df_events_wyscout_pass['key']=='25656942426.710948JonathanCastroOtto']

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,wyId,foot,playerName,role,key
56094,8,Simple pass,[Accurate],4424,"[[21.84, 59.84], [23.92, 60.52]]",2565694,Pass,692,1H,2426.710948,85,213121395,4424.0,right,JonathanCastroOtto,DEF,25656942426.710948JonathanCastroOtto
56095,8,Simple pass,[Accurate],4424,"[[23.92, 60.52], [13.52, 60.52]]",2565694,Pass,692,1H,2426.710948,85,213121396,4424.0,right,JonathanCastroOtto,DEF,25656942426.710948JonathanCastroOtto


In [66]:
# Counting duplicate keys in the Wyscout filtered dataframe. 
# Note: Some events in the dataframe have same key attributes (game_id, time_seconds and player name)
# When checked at a deeper level, these passes (roughly 32 of them) have the same player performing two 
# different passes from the same starting location to different end locations at the same time stamp.
# We have dropped the duplicates from df_events_wyscout_pass
df_events_wyscout_pass = df_events_wyscout_pass.drop_duplicates(subset=['key'],keep='first')

In [67]:
# Appending VAEP values along with offensive and defensive values to Wyscout events data from VAEP data
# by joining the two dataframes
df_events_vaep = df_events_wyscout_pass.merge(vaep_values_pass_def[[
    'key', 'vaep_value', 'offensive_value', 'defensive_value'
]],
                                              how='left',
                                              left_on='key',
                                              right_on='key')

In [68]:
# df_events_vaep.groupby(['teamId']).count()['eventId']

In [69]:
df_events_vaep[df_events_vaep['key']=='25656942426.710948JonathanCastroOtto']

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,wyId,foot,playerName,role,key,vaep_value,offensive_value,defensive_value
11938,8,Simple pass,[Accurate],4424,"[[21.84, 59.84], [23.92, 60.52]]",2565694,Pass,692,1H,2426.710948,85,213121395,4424.0,right,JonathanCastroOtto,DEF,25656942426.710948JonathanCastroOtto,-0.005484,4.5e-05,-0.005529


In [70]:
len(df_events_vaep)

660023

In [71]:
# Total number of passes in Wyscout events data['']
# df_events_vaep.loc[(df_events_vaep['playerName'].str.contains("AaronCres")) & ((df_events_vaep['eventName'].str.contains("Pass"))) & (df_events_vaep['matchId']==2500098)]

## Metrics Collection & Clustering

In [72]:
footedness_patterns = match_def["footedness"].unique()

In [73]:
match_def.rename(columns={'R-CB':'R_CB',"L-CB":'L_CB'},inplace=True)

**Creating seperate dataframes for four and three/five defenders formations**

In [74]:
df_four_defs = match_def[match_def['backline']==4]
df_three_defs = match_def[match_def['backline']==3]
df_five_defs = match_def[match_def['backline']==5]

**Creating a list of such dataframes**

In [75]:
df_clusters = [df_four_defs,df_three_defs,df_five_defs]

**Creating a metrics collection function that takes in x (match_id) and y (player name) and returns the standard metrics-**

In [76]:
def getmetrics(x, y):
    split_y = re.findall('[A-Z][^A-Z]*',y)
    try:
        pass_df = df_events_vaep.loc[(df_events_vaep['playerName'].str.contains(split_y[-1]))&
                                     (df_events_vaep['playerName'].str.contains(split_y[-2]))&
                                     (df_events_vaep['playerName'].str.contains(split_y[-3]))&
                                     (df_events_vaep['matchId']==int(x))]
    except:
        try:
            pass_df = df_events_vaep.loc[(df_events_vaep['playerName'].str.contains(split_y[-1]))&
                                     (df_events_vaep['playerName'].str.contains(split_y[-2]))&
                                     (df_events_vaep['matchId']==int(x))]
        except:
            pass_df = df_events_vaep.loc[(df_events_vaep['playerName'].str.contains(split_y[-1]))&
                                             (df_events_vaep['matchId']==int(x))]
    numpasses = len(pass_df)
    numaccpasses = len(
        pass_df.loc[pass_df['tags'].apply(lambda a: "Accurate" in a)])
    accpasslocs = pass_df.loc[pass_df['tags'].apply(
        lambda a: "Accurate" in a)]['positions'].tolist()
    inaccpasslocs = pass_df.loc[pass_df['tags'].apply(
        lambda a: "Not accurate" in a)]['positions'].tolist()
    acc_vaep_values = pass_df.loc[pass_df['tags'].apply(
        lambda a: "Accurate" in a)]['vaep_value'].tolist()
    inacc_vaep_values = pass_df.loc[pass_df['tags'].apply(
        lambda a: "Not accurate" in a)]['vaep_value'].tolist()
    acc_off_values = pass_df.loc[pass_df['tags'].apply(
        lambda a: "Accurate" in a)]['offensive_value'].tolist()
    inacc_off_values = pass_df.loc[pass_df['tags'].apply(
        lambda a: "Not accurate" in a)]['offensive_value'].tolist()
    acc_def_values = pass_df.loc[pass_df['tags'].apply(
        lambda a: "Accurate" in a)]['defensive_value'].tolist()
    inacc_def_values = pass_df.loc[pass_df['tags'].apply(
        lambda a: "Not accurate" in a)]['defensive_value'].tolist()

    return [
        numpasses, numaccpasses, accpasslocs, inaccpasslocs, acc_vaep_values,
        inacc_vaep_values, acc_off_values, inacc_off_values, acc_def_values,
        inacc_def_values
    ]

In [77]:
# getmetrics(2500081,"Bruno")

In [78]:
new_cols = ['RB_all',
            'R_CB_all',
            'L_CB_all',
            'LB_all',
            'RCB_all',
            'CB_all',
            'LCB_all',
            'RWB_all',
            'LWB_all']

**Collecting metrics for each defender location for various clusters**

In [79]:
#R_CB - Right center back for 4 defender formation
#RCB - Right center back for 3 or 5 defender formation
#L_CB - Left center back for 4 defender formation
#LCB - Left center back for 3 or 5 defender formation
df_clusters_updated = list()
for df in tqdm(df_clusters):
    df = df.reindex(columns = df.columns.tolist() + new_cols)
    if df.iloc[0]['backline'] == 4.0:     
        df['RB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.RB), axis=1)
        df['R_CB_all'] = df.apply(lambda x: getmetrics(x.wyId,x['R_CB']), axis=1)
        df['L_CB_all'] = df.apply(lambda x: getmetrics(x.wyId,x['L_CB']), axis=1)
        df['LB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.LB), axis=1)
        df_clusters_updated.append(df)
    
    elif df.iloc[0]['backline'] == 3.0:
        df['RCB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.RCB), axis=1)
        df['CB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.CB), axis=1)
        df['LCB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.LCB), axis=1)
        df_clusters_updated.append(df)
        
    elif df.iloc[0]['backline'] == 5.0:
        df['RWB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.RWB), axis=1)
        df['RCB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.RCB), axis=1)
        df['CB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.CB), axis=1)
        df['LCB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.LCB), axis=1)
        df['LWB_all'] = df.apply(lambda x: getmetrics(x.wyId,x.LWB), axis=1)
        df_clusters_updated.append(df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




**Splitting the metrics into individual columns**

In [81]:
df_clusters_metrics = list()
for df in tqdm(df_clusters_updated):
    if df.iloc[0]['backline'] == 4.0:
        df[[
            'RB_pass', 'RB_accpass', 'RB_accpassloc', 'RB_inaccpassloc',
            'RB_accvaep', 'RB_inaccvaep', 'RB_accoff', 'RB_inaccoff',
            'RB_accdef', 'RB_inaccdef'
        ]] = pd.DataFrame(df['RB_all'].to_list(), index=df.index)
        df[[
            'R_CB_pass', 'R_CB_accpass', 'R_CB_accpassloc',
            'R_CB_inaccpassloc', 'R_CB_accvaep', 'R_CB_inaccvaep',
            'R_CB_accoff', 'R_CB_inaccoff', 'R_CB_accdef', 'R_CB_inaccdef'
        ]] = pd.DataFrame(df['R_CB_all'].to_list(), index=df.index)
        df[[
            'L_CB_pass', 'L_CB_accpass', 'L_CB_accpassloc',
            'L_CB_inaccpassloc', 'L_CB_accvaep', 'L_CB_inaccvaep',
            'L_CB_accoff', 'L_CB_inaccoff', 'L_CB_accdef', 'L_CB_inaccdef'
        ]] = pd.DataFrame(df['L_CB_all'].to_list(), index=df.index)
        df[[
            'LB_pass', 'LB_accpass', 'LB_accpassloc', 'LB_inaccpassloc',
            'LB_accvaep', 'LB_inaccvaep', 'LB_accoff', 'LB_inaccoff',
            'LB_accdef', 'LB_inaccdef'
        ]] = pd.DataFrame(df['LB_all'].to_list(), index=df.index)
        df.drop([
            'RB_all', 'R_CB_all', 'L_CB_all', 'LB_all', 'RCB_all', 'LCB_all',
            'CB_all', 'RWB_all', 'LWB_all'
        ],
                axis=1,
                inplace=True)
        df_clusters_metrics.append(df)

    elif df.iloc[0]['backline'] == 3.0:
        df[[
            'RCB_pass', 'RCB_accpass', 'RCB_accpassloc', 'RCB_inaccpassloc',
            'RCB_accvaep', 'RCB_inaccvaep', 'RCB_accoff', 'RCB_inaccoff',
            'RCB_accdef', 'RCB_inaccdef'
        ]] = pd.DataFrame(df['RCB_all'].to_list(), index=df.index)
        df[[
            'CB_pass', 'CB_accpass', 'CB_accpassloc', 'CB_inaccpassloc',
            'CB_accvaep', 'CB_inaccvaep', 'CB_accoff', 'CB_inaccoff',
            'CB_accdef', 'CB_inaccdef'
        ]] = pd.DataFrame(df['CB_all'].to_list(), index=df.index)
        df[[
            'LCB_pass', 'LCB_accpass', 'LCB_accpassloc', 'LCB_inaccpassloc',
            'LCB_accvaep', 'LCB_inaccvaep', 'LCB_accoff', 'LCB_inaccoff',
            'LCB_accdef', 'LCB_inaccdef'
        ]] = pd.DataFrame(df['LCB_all'].to_list(), index=df.index)
        df.drop([
            'RB_all', 'R_CB_all', 'L_CB_all', 'LB_all', 'RCB_all', 'LCB_all',
            'CB_all', 'RWB_all', 'LWB_all'
        ],
                axis=1,
                inplace=True)
        df_clusters_metrics.append(df)

    elif df.iloc[0]['backline'] == 5.0:
        df[[
            'RCB_pass', 'RCB_accpass', 'RCB_accpassloc', 'RCB_inaccpassloc',
            'RCB_accvaep', 'RCB_inaccvaep', 'RCB_accoff', 'RCB_inaccoff',
            'RCB_accdef', 'RCB_inaccdef'
        ]] = pd.DataFrame(df['RCB_all'].to_list(), index=df.index)
        df[[
            'CB_pass', 'CB_accpass', 'CB_accpassloc', 'CB_inaccpassloc',
            'CB_accvaep', 'CB_inaccvaep', 'CB_accoff', 'CB_inaccoff',
            'CB_accdef', 'CB_inaccdef'
        ]] = pd.DataFrame(df['CB_all'].to_list(), index=df.index)
        df[[
            'LCB_pass', 'LCB_accpass', 'LCB_accpassloc', 'LCB_inaccpassloc',
            'LCB_accvaep', 'LCB_inaccvaep', 'LCB_accoff', 'LCB_inaccoff',
            'LCB_accdef', 'LCB_inaccdef'
        ]] = pd.DataFrame(df['LCB_all'].to_list(), index=df.index)
        df[[
            'RWB_pass', 'RWB_accpass', 'RWB_accpassloc', 'RWB_inaccpassloc',
            'RWB_accvaep', 'RWB_inaccvaep', 'RWB_accoff', 'RWB_inaccoff',
            'RWB_accdef', 'RWB_inaccdef'
        ]] = pd.DataFrame(df['RWB_all'].to_list(), index=df.index)
        df[[
            'LWB_pass', 'LWB_accpass', 'LWB_accpassloc', 'LWB_inaccpassloc',
            'LWB_accvaep', 'LWB_inaccvaep', 'LWB_accoff', 'LWB_inaccoff',
            'LWB_accdef', 'LWB_inaccdef'
        ]] = pd.DataFrame(df['LWB_all'].to_list(), index=df.index)
        df.drop([
            'RB_all', 'R_CB_all', 'L_CB_all', 'LB_all', 'RCB_all', 'LCB_all',
            'CB_all', 'RWB_all', 'LWB_all'
        ],
                axis=1,
                inplace=True)
        df_clusters_metrics.append(df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [83]:
# df_clusters_metrics[0].loc[df_clusters_metrics[0]['wyId']==2500089]

In [84]:
df_clusters_metrics_combined = list()
df_clusters_metrics_combined.append(df_clusters_metrics[0])
df_clusters_metrics_combined.append(pd.concat([df_clusters_metrics[1],df_clusters_metrics[2]]))

In [85]:
# Saving the clusters into individual pickle files
names = ['four_defs','three_five_defs']

for i,df in enumerate(df_clusters_metrics_combined):
    df.to_pickle(f'../../data_top5/clusters/clusters_vaep/cluster_{names[i]}.pkl')

# Value sums per Team per Match 

In [86]:
vaep_values = pd.read_pickle('../../data_top5/vaep/vaep_values.pkl')
match_def = pd.read_pickle("../../data_top5/matches/match+def_lineup+footedness_ver2_top5.pkl")

In [87]:
vaep_values = vaep_values.replace({'short_team_name':{
        'Brighton & Hove Albion': 'Brighton',
        'AFC Bournemouth': 'Bournemouth',
        'Huddersfield Town': 'Huddersfield',
        'Manchester United': 'Manchester Utd',
        'Newcastle United': 'Newcastle Utd',
        'Tottenham Hotspur': 'Tottenham',
        'West Bromwich Albion': 'West Brom',
        'West Ham United': 'West Ham',
        'Bayer Leverkusen': 'Leverkusen',
        'Bayern München': 'Bayern Munich',
        'Borussia Dortmund': 'Dortmund',
        "Borussia M'gladbach": "M'Gladbach",
        'Eintracht Frankfurt': 'Eint Frankfurt',
        'Amiens SC': 'Amiens',
        'Angers SCO': 'Angers',
        'Olympique Lyonnais': 'Lyon',
        'PSG': 'Paris S-G',
        'Olympique Marseille': 'Marseille',
        'Deportivo Alavés': 'Alavés',
        'Real Betis': 'Betis',
        'Celta de Vigo': 'Celta Vigo',
        'Deportivo La Coruña': 'La Coruña',
        'Internazionale': 'Inter'
    }}
)

vaep_values['game_id'] = vaep_values['game_id'].astype(int)
vaep_values['temp'] = vaep_values['game_id'].astype(str) + vaep_values['short_team_name']
match_def['temp'] = match_def['wyId'].astype(str) + match_def['team']

In [88]:
vaep_values = vaep_values.merge(match_def[['temp', 'footedness']], left_on='temp', right_on='temp', how='left')
vaep_values.drop(columns = ['temp'], inplace=True)

In [89]:
vaep_values['name'] = vaep_values['first_name']+vaep_values['last_name']
vaep_values['name'] = vaep_values['name'].astype(str).apply(lambda x: unidecode(x))
vaep_values['name'] = vaep_values['name'].apply(lambda x: x.replace('-', ''))
vaep_values['name'] = vaep_values['name'].apply(lambda x: x.replace(' ', ''))

In [90]:
# Creating a key to identify each row using game_id, time_seconds and player name
vaep_values['key'] = vaep_values['game_id'].astype(str) + np.round(vaep_values['time_seconds'],6).astype(str) +vaep_values['name'].astype(str)

In [91]:
# vaep_values_pass = vaep_values#.loc[(vaep_values['type_name'] == 'pass') | (vaep_values['type_name'] =='cross') ]

In [92]:
# Merging player roles to SPADL events data
df_players = pd.read_pickle('../../data_top5/players/players.pkl')
roles_temp = df_players['role'].values
roles = list()
for i in roles_temp:
    roles.append(i['code3'])
players_roles = list(zip(roles,df_players['wyId'],df_players['playerName']))
df_players_roles = pd.DataFrame(players_roles,columns = ['role','playerId','playerName1'])
vaep_values_proles = vaep_values.merge(df_players_roles, left_on = 'player_id', right_on = 'playerId')
vaep_values_proles.drop(['playerName1'], axis = 1, inplace = True)
# vaep_values_pass_def = vaep_values_pass_proles#.loc[vaep_values_pass_proles['role']=='DEF']

In [93]:
# Removing duplicate key entries (i.e. different actions at the same timestamp by the same player)
vaep_values_proles = vaep_values_proles.drop_duplicates(subset=['key'],keep='first')

In [94]:
# Importing Wyscout events data
df_events_wyscout = pd.read_pickle('../../data_top5/events/events_com.pkl')

In [95]:
# Filtering out passes events which were performed by defenders in Wyscout events data
# df_events_wyscout_pass = df_events_wyscout#.loc[(df_events_wyscout['eventName']=='Pass') & (df_events_wyscout['role']=='DEF')]

In [96]:
# Creating a key to identify each row using matchid, event_seconds and player name
df_events_wyscout['key'] = df_events_wyscout['matchId'].astype(str) + np.round(df_events_wyscout['eventSec'],6).astype(str) + df_events_wyscout['playerName'].astype(str)

In [97]:
# Removing duplicate key entries (i.e. different actions at the same timestamp by the same player)
df_events_wyscout = df_events_wyscout.drop_duplicates(subset=['key'],keep='first')

In [98]:
# Appending VAEP values along with offensive and defensive values to Wyscout events data from VAEP data
# by joining the two dataframes
df_events_vaep = df_events_wyscout.merge(vaep_values_proles[[
    'key', 'vaep_value', 'offensive_value', 'defensive_value'
]],
                                              how='left',
                                              left_on='key',
                                              right_on='key')

In [99]:
len(df_events_vaep)

2843958

**Creating DF for Value sums per team per match**

In [100]:
value_sums = pd.DataFrame(df_events_vaep.groupby(['matchId', 'teamId']))

In [101]:
value_sums.head()

Unnamed: 0,0,1
0,"(2499719, 1609)",eventId subEventName \ 2...
1,"(2499719, 1631)",eventId subEventName \ 22...
2,"(2499720, 1625)",eventId subEventName \ 2...
3,"(2499720, 1651)",eventId subEventName \ 23...
4,"(2499721, 1610)",eventId subEventName \ 16...


In [102]:
def value_sum_regionwise(df):
    off_sum = [0,0,0,0]
    vaep_sum = [0,0,0,0]
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    for i in range(len(df)):
        try:
            if (df['positions'][i][1][0] >= 52) and (df['positions'][i][1][1] <= 17):
                off_sum[0]+= df['offensive_value'].loc[i]
                vaep_sum[0]+= df['vaep_value'].loc[i]
            elif (df['positions'][i][1][0] >= 52) and (df['positions'][i][1][1] > 17) and (df['positions'][i][1][1] <= 34):
                off_sum[1]+= df['offensive_value'].loc[i]
                vaep_sum[1]+= df['vaep_value'].loc[i]
            elif (df['positions'][i][1][0] >= 52) and (df['positions'][i][1][1] > 34) and (df['positions'][i][1][1] <= 51):
                off_sum[2]+= df['offensive_value'].loc[i]
                vaep_sum[2]+= df['vaep_value'].loc[i]
            elif (df['positions'][i][1][0] >= 52) and (df['positions'][i][1][1] > 51):
                off_sum[3]+= df['offensive_value'].loc[i]
                vaep_sum[3]+= df['vaep_value'].loc[i]
            else:
                continue
        except:
            print(i)
            
    return off_sum, vaep_sum


In [103]:
value_sums['offsum_regionwise'] = value_sums[1].apply(lambda x: value_sum_regionwise(x)[0])
value_sums['vaepsum_regionwise'] = value_sums[1].apply(lambda x: value_sum_regionwise(x)[1])

146
181
341
369
221
347
235
518
369
563
5
161
607
536
709
121
305
101
482
135
271
62
277
478
364
469
519
278
297
319
374
356
423
578
616
318
396
283
510
179
208
401
108
373
233
415
130
64
284
226
131
196
156
67
130
738
81
272
101
126
462
600
286
234
390
608
207
302
75
143
9
369
172
202
331
207
117
411
368
280
244
170
436
243
413
397
957
63
300
284
176
17
74
256
22
512
175
18
285
416
263
123
222
261
473
451
321
242
418
170
407
65
168
44
374
99
301
333
480
97
131
369
361
80
204
246
224
201
264
378
491
13
238
334
415
292
411
412
480
44
159
200
171
202
36
176
263
246
198
274
338
114
255
720
490
356
357
174
248
150
331
357
469
128
274
138
270
484
175
220
252
444
488
58
771
652
49
3
188
265
313
80
234
266
155
90
26
51
391
598
89
157
253
255
66
186
112
434
219
432
618
380
84
137
409
456
238
643
187
118
272
58
132
286
311
493
310
84
104
864
407
144
461
574
464
246
278
596
92
407
325
218
326
197
33
173
214
87
269
274
823
368
457
222
317
434
537
367
112
32
141
15
253
441
387
195
15
96
22
509
403

In [104]:
value_sums.head()

Unnamed: 0,0,1,offsum_regionwise,vaepsum_regionwise
0,"(2499719, 1609)",eventId subEventName \ 0 ...,"[0.010938184102997184, 0.34169068839401007, 0....","[0.006355275749228895, 0.3151636745315045, 0.7..."
1,"(2499719, 1631)",eventId subEventName \ 0 ...,"[2.1650491072796285, 0.6080674800323322, 0.395...","[2.0625359127297997, 0.5934236499015242, 0.319..."
2,"(2499720, 1625)",eventId subEventName \ 0 ...,"[0.4876551937777549, 0.9191598648903891, 0.622...","[0.33298436366021633, 0.8706628917716444, 0.57..."
3,"(2499720, 1651)",eventId subEventName \ 0 ...,"[-0.029110628413036466, 0.13315539667382836, 0...","[-0.0759745518444106, 0.08726157562341541, 0.0..."
4,"(2499721, 1610)",eventId subEventName \ 0 ...,"[0.08428994892165065, 0.6808061979245394, 0.78...","[0.12251864897552878, 0.5640728371217847, 0.70..."


In [105]:
teams = pd.read_json('../../data_top5/teams/teams.json')

In [106]:
value_sums['team_name'] = value_sums[0].apply(lambda x: teams[teams['wyId']==x[1]]['name'].values[0])

In [107]:
value_sums['match_id'] = value_sums[0].apply(lambda x: x[0])

In [108]:
value_sums = value_sums.replace({'team_name':{
        'Brighton & Hove Albion': 'Brighton',
        'AFC Bournemouth': 'Bournemouth',
        'Huddersfield Town': 'Huddersfield',
        'Manchester United': 'Manchester Utd',
        'Newcastle United': 'Newcastle Utd',
        'Tottenham Hotspur': 'Tottenham',
        'West Bromwich Albion': 'West Brom',
        'West Ham United': 'West Ham',
        'Bayer Leverkusen': 'Leverkusen',
        'Bayern München': 'Bayern Munich',
        'Borussia Dortmund': 'Dortmund',
        "Borussia M'gladbach": "M'Gladbach",
        'Eintracht Frankfurt': 'Eint Frankfurt',
        'Amiens SC': 'Amiens',
        'Angers SCO': 'Angers',
        'Olympique Lyonnais': 'Lyon',
        'PSG': 'Paris S-G',
        'Olympique Marseille': 'Marseille',
        'Deportivo Alavés': 'Alavés',
        'Real Betis': 'Betis',
        'Celta de Vigo': 'Celta Vigo',
        'Deportivo La Coruña': 'La Coruña',
        'Internazionale': 'Inter'
    }}
)

In [109]:
value_sums.drop(columns=[0,1], inplace=True)
value_sums.tail()

Unnamed: 0,offsum_regionwise,vaepsum_regionwise,team_name,match_id
3647,"[0.11343531752936542, 0.6142485038144514, 0.47...","[0.10669509379658848, 0.5678160234820098, 0.45...",Sassuolo,2576336
3648,"[0.6080314521677792, 0.43118888954631984, 0.60...","[-0.06507996458094567, 0.40493164549116045, 0....",Sampdoria,2576337
3649,"[0.13377808569930494, 0.5273727693129331, 0.68...","[0.12469712854363024, 0.5012582537019625, 0.69...",SPAL,2576337
3650,"[1.194976296275854, 0.9772993098013103, 0.3403...","[1.1921986722154543, 0.9372680665692315, 0.367...",Torino,2576338
3651,"[0.014245298807509243, 0.8883648137561977, 0.6...","[0.004369883798062801, 0.7049350732704625, 0.6...",Genoa,2576338


In [63]:
value_sums['team_name'] = value_sums['team_name'].swifter.set_npartitions(
    8).apply(lambda x: x.encode().decode('unicode_escape').replace('\xad', ''))

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=3652.0, style=ProgressStyle(descriptio…




In [64]:
value_sums = value_sums.replace({
    'team_name': {
        'Brighton & Hove Albion': 'Brighton',
        'AFC Bournemouth': 'Bournemouth',
        'Huddersfield Town': 'Huddersfield',
        'Manchester United': 'Manchester Utd',
        'Newcastle United': 'Newcastle Utd',
        'Tottenham Hotspur': 'Tottenham',
        'West Bromwich Albion': 'West Brom',
        'West Ham United': 'West Ham',
        'Bayer Leverkusen': 'Leverkusen',
        'Bayern München': 'Bayern Munich',
        'Borussia Dortmund': 'Dortmund',
        "Borussia M'gladbach": "M'Gladbach",
        'Eintracht Frankfurt': 'Eint Frankfurt',
        'Amiens SC': 'Amiens',
        'Angers SCO': 'Angers',
        'Olympique Lyonnais': 'Lyon',
        'PSG': 'Paris S-G',
        'Olympique Marseille': 'Marseille',
        'Deportivo Alavés': 'Alavés',
        'Real Betis': 'Betis',
        'Celta de Vigo': 'Celta Vigo',
        'Deportivo La Coruña': 'La Coruña',
        'Internazionale': 'Inter'
    }
})

In [110]:
value_sums.to_pickle('../../data_top5/vaep/value_sums.pkl')