# Compute Players Vectors
In this notebook we load the raw data from WyScout and we extract the different characteristics that compose our 79-length vectors describing each player's style.
For this we write 3 different functions:
- extract_characteristics: extracts all attributes related to player's actions, location, etc. In other words, everything except the attributes related to Social Network Analysis. In total these are 63 attributes. Some of these were inspired by the work of Decroos & Davis (2019)
- extract_player_motifs: extracts all 15 network motifs intensity of each player. Inspired by the work of Bekkers (2019)
- extract_flow_centrality: extracts the flow centrality of each player. That is, the fraction of his team's total possession plays in which he was involved with at least one pass or shot.

In [1]:
import json
from collections import Counter
import numpy as np
import operator
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib.patches import Ellipse
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import pandas as pd
import networkx as nx
import base64
from collections import defaultdict
import sys, os
import math
import random
import operator
import csv
import matplotlib.pylab as pyl
import itertools
import scipy as sp
from scipy import stats
import scipy.stats as stats
from scipy import optimize
from scipy.integrate import quad
import networkx as nx
import re
import warnings
from IPython import display

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 180)
from tqdm.notebook import tqdm

In [2]:
def load_data(nations):
    events_idx = {}
    for nation in nations:
        with open("events_%s.json" % nation) as json_data:
            events_idx[nation] = json.load(json_data)
    matches_idx = {}
    for nation in nations:
        with open("matches_%s.json" % nation) as json_data:
            matches_idx[nation] = json.load(json_data)
    players_idx = {}
    with open("players.json") as json_data:
        players_idx = json.load(json_data)
    competitions_idx = {}
    with open("competitions.json") as json_data:
        competitions_idx = json.load(json_data)
    teams_idx = {}
    with open("teams.json") as json_data:
        teams_idx = json.load(json_data)
    events = pd.DataFrame()
    matches = pd.DataFrame()
    for nation in nations:
        events = events.append(events_idx[nation])
        matches = matches.append(matches_idx[nation])
    players = pd.DataFrame(players_idx)
    competitions = pd.DataFrame(competitions_idx)
    teams = pd.DataFrame(teams_idx)
    events=events[['eventId', 
                   'eventName', 
                   'eventSec', 
                   'id', 
                   'matchId', 
                   'matchPeriod', 
                   'playerId', 
                   'positions', 
                   'subEventId',
                  'subEventName', 
                   'tags', 
                   'teamId'
                    ]]
    return events, matches, players, competitions, teams

In [3]:
dict_chars = {"\\u00c0": "À", "\\u00c1": "Á", "\\u00c2": "Â", "\\u00c3": "Ã", "\\u00c4": "Ä", "\\u00c5": "Å", "\\u00c6": "Æ", 
            "\\u00c7": "Ç", "\\u00c8": "È", "\\u00c9": "É", "\\u00ca": "Ê", "\\u00cb": "Ë", "\\u00cc": "Ì", "\\u00cd": "Í",
             "\\u00ce": "Î", "\\u00cf": "Ï", "\\u00d1": "Ñ", "\\u00d2": "Ò", "\\u00d3": "Ó", "\\u00d4": "Ô", "\\u00d5": "Õ", 
             "\\u00d6": "Ö", "\\u00d8": "Ø", "\\u00d9": "Ù", "\\u00da": "Ú", "\\u00db": "Û", "\\u00dc": "Ü", "\\u00dc": "Ü",
             "\\u00dd": "Ý","\\u00df": "ß", "\\u00e0": "à", "\\u00e1": "á", "\\u00e2": "â", "\\u00e3": "ã", "\\u00e4": "ä", 
             "\\u00e5": "å", "\\u00e6": "æ", "\\u00e7": "ç", "\\u00e8": "è", "\\u00e9": "é", "\\u00ea": "ê", "\\u00eb": "ë",
             "\\u00ec": "ì", "\\u00ed": "í", "\\u00ee": "î", "\\u00ef": "ï", "\\u00f0": "ð", "\\u00f1": "ñ", "\\u00f2": "ò",
             "\\u00f3": "ó", "\\u00f4": "ô", "\\u00f5": "õ", "\\u00f6": "ö", "\\u00f8": "ø", "\\u00f9": "ù", "\\u00fa": "ú", 
             "\\u00fb": "û", "\\u00fc": "ü", "\\u00fd": "ý", "\\u00ff": "ÿ", "\\u00ad": "", "\\u00fe": "b", "\\u0144": "ń", 
             "\\u0110": "D", "\\u010d": "c", "\\u0107": "ć", "\\u0148": "n", "\\u0119": "ę", "\\u0146": "n", "\\u0141": "L",
             "\\u0106": "Ć", "\\u015e": "S", "\\u0131": "i", "\\u010d": "č", "\\u015e": "S", "\\u0160": "S", "\\u0103": "a",
             "\\u0161": "s", "\\u010e": "D", "\\u017e": "z", "\\u017d": "Z", "\\u0142": "l", "\\u011f": "g", "\\u0130": "I",
             "\\u0131": "i", "\\u0159": "r", "\\u0163": "t", "\\u0219": "ș", "\\u0105": "a", "\\u021b": "t", "\\u014c": "O",
             "\\u015f": "s", "\\u014d": "o", "\\u013d": "L", "\\u010c": "C"}

def fix_errors_char(column):
    for i, j in dict_chars.items():
        column = column.apply(lambda x: x.replace(i, j))
    return column

In [4]:
def extract_characteristics(events):
    df= events.copy()
    df['teamId_prev']=0; df['playerId_prev']=0
    df['teamId_prev'][1:] = df.teamId[0:-1].reset_index(drop=True)
    df.fillna(0, inplace=True)
    df.teamId_prev = df.teamId_prev.astype(int)
    df['playerId_prev'][1:] = df.playerId[0:-1].reset_index(drop=True)
    df.fillna(0, inplace=True)
    df.playerId_prev = df.playerId_prev.astype(int)
    
    df['same_team_passes'] = [1 if ((x==y) & (a!=b)) else 0 for x,y,a,b in zip(df.teamId, df.teamId_prev, df.playerId, df.playerId_prev)] 
    df.drop(['teamId_prev', 'playerId_prev'], axis=1, inplace=True)

    # Moving info inside dictionaries inside lists to df columns
    df['tags_values'] = [[y['id'] for y in x]  for x in df.tags]
    df['X'] = [[x['x'] for x in start] for start in df.positions]
    df['Y'] = [[x['y'] for x in end] for end in df.positions]
    df['X_start'] = [x[0] for x in df.X]
    df['X_end'] = [x[1] if len(x)>1 else 0 for x in df.X]
    df['Y_start'] = [y[0] for y in df.Y]
    df['Y_end'] = [y[1] if len(y)>1 else 0 for y in df.Y]
    df.drop(['X', 'Y'], axis=1, inplace=True)

    #PASSING
    df['total_passes'] = [1 if x==8 else 0 for x in df.eventId]
    df['accurate_passes'] = [1 if ((x==8) & (1801 in y)) else 0 for x,y in zip(df.eventId, df.tags_values)]
    df['smart_passes'] = [1 if ((x==1) & (y==86)) else 0 for x,y in zip(df.accurate_passes, df.subEventId)]
    df['forward_passes'] = [1 if ((z==1) & ((x2!=0) | (y2!=0)) & (x2>x1)) else 0 for x1,x2,y2,z in zip(df.X_start, df.X_end, df.Y_end, df.accurate_passes)]
    df['X_pass'] = [abs(x2-x1) if ((z==1) & ((x2!=0) | (y2!=0))) else 0 for x1,x2,y2,z in zip(df.X_start, df.X_end, df.Y_end, df.accurate_passes)]
    df['Y_pass'] = [abs(y2-y1) if ((z==1) & ((x2!=0) | (y2!=0))) else 0 for x2,y1,y2,z in zip(df.X_end, df.Y_start, df.Y_end, df.accurate_passes)]
    df['pass_end_z1'] = [1 if ((z==1) & (0<x<50) & (0<y<34)) else 0 for x,y,z in zip(df.X_end, df.Y_end, df.accurate_passes)]
    df['pass_end_z2'] = [1 if ((z==1) & (0<x<50) & (34<=y<67)) else 0 for x,y,z in zip(df.X_end, df.Y_end, df.accurate_passes)]
    df['pass_end_z3'] = [1 if ((z==1) & (0<x<50) & (67<=y<101)) else 0 for x,y,z in zip(df.X_end, df.Y_end, df.accurate_passes)]
    df['pass_end_z4'] = [1 if ((z==1) & (50<=x<75) & (0<y<34)) else 0 for x,y,z in zip(df.X_end, df.Y_end, df.accurate_passes)]
    df['pass_end_z5'] = [1 if ((z==1) & (50<=x<75) & (34<=y<67)) else 0 for x,y,z in zip(df.X_end, df.Y_end, df.accurate_passes)]
    df['pass_end_z6'] = [1 if ((z==1) & (50<=x<75) & (67<=y<101)) else 0 for x,y,z in zip(df.X_end, df.Y_end, df.accurate_passes)]
    df['pass_end_z7'] = [1 if ((z==1) & (75<=x<101) & (0<y<34)) else 0 for x,y,z in zip(df.X_end, df.Y_end, df.accurate_passes)]
    df['pass_end_z8'] = [1 if ((z==1) & (75<=x<101) & (34<=y<67)) else 0 for x,y,z in zip(df.X_end, df.Y_end, df.accurate_passes)]
    df['pass_end_z9'] = [1 if ((z==1) & (75<=x<101) & (67<=y<101)) else 0 for x,y,z in zip(df.X_end, df.Y_end, df.accurate_passes)]
    df['pass_reception_z1'] = [1 if ((z==1) & (0<x<50) & (0<y<34)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.same_team_passes)]
    df['pass_reception_z2'] = [1 if ((z==1) & (0<x<50) & (34<=y<67)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.same_team_passes)]
    df['pass_reception_z3'] = [1 if ((z==1) & (0<x<50) & (67<=y<101)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.same_team_passes)]
    df['pass_reception_z4'] = [1 if ((z==1) & (50<=x<75) & (0<y<34)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.same_team_passes)]
    df['pass_reception_z5'] = [1 if ((z==1) & (50<=x<75) & (34<=y<67)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.same_team_passes)]
    df['pass_reception_z6'] = [1 if ((z==1) & (50<=x<75) & (67<=y<101)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.same_team_passes)]
    df['pass_reception_z7'] = [1 if ((z==1) & (75<=x<101) & (0<y<34)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.same_team_passes)]
    df['pass_reception_z8'] = [1 if ((z==1) & (75<=x<101) & (34<=y<67)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.same_team_passes)]
    df['pass_reception_z9'] = [1 if ((z==1) & (75<=x<101) & (67<=y<101)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.same_team_passes)]

    #SHOTS
    df['total_shots'] = [1 if z==10 else 0 for z in df.eventId]
    df['shot_z1'] = [1 if ((z==10) & (0<x<86)) else 0 for x,z in zip(df.X_start, df.eventId)]
    df['shot_z2'] = [1 if ((z==10) & (x>=86) & (y<43)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.eventId)]
    df['shot_z3'] = [1 if ((z==10) & (x>=86) & (42<y<59)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.eventId)]
    df['shot_z4'] = [1 if ((z==10) & (x>=86) & (58<y<101)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.eventId)] 
    df['goal'] = [1 if ((z==10) & (101 in y)) else 0 for z,y in zip(df.eventId, df.tags_values)]
    df['shot_wleft'] = [1 if ((z==10) & (401 in y)) else 0 for z,y in zip(df.eventId, df.tags_values)]
    df['shot_wright'] = [1 if ((z==10) & (402 in y)) else 0 for z,y in zip(df.eventId, df.tags_values)]
    df['shot_whead'] = [1 if ((z==10) & (403 in y)) else 0 for z,y in zip(df.eventId, df.tags_values)]

    #DRIBBLES
    df['dribble']=  [1 if ((x==11) & (703 in y) & ((503 in y) | (504 in y))) else 0 for x,y in zip(df.subEventId, df.tags_values)]
    df['dribble_z1'] = [1 if ((z==1) & (x<50) & (y<34)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.dribble)]
    df['dribble_z2'] = [1 if ((z==1) & (x<50) & (34<=y<67)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.dribble)]
    df['dribble_z3'] = [1 if ((z==1) & (x<50) & (67<=y<101)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.dribble)]
    df['dribble_z4'] = [1 if ((z==1) & (50<=x<75) & (0<=y<34)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.dribble)]
    df['dribble_z5'] = [1 if ((z==1) & (50<=x<75) & (34<=y<67)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.dribble)]
    df['dribble_z6'] = [1 if ((z==1) & (50<=x<75) & (67<=y<101)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.dribble)]
    df['dribble_z7'] = [1 if ((z==1) & (75<=x<101) & (0<=y<34)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.dribble)]
    df['dribble_z8'] = [1 if ((z==1) & (75<=x<101) & (34<=y<67)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.dribble)]
    df['dribble_z9'] = [1 if ((z==1) & (75<=x<101) & (67<=y<101)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.dribble)]

    #CROSSES
    df['cross_z1'] = [1 if ((z==80) & (x<93) & (y<45)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['cross_z2'] = [1 if ((z==80) & (x<93) & (y>55)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['cross_z3'] = [1 if ((z==80) & (x>=93) & (y<45)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['cross_z4'] = [1 if ((z==80) & (x>=93) & (y>55)) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]

    #DEFENSIVE ACTIONS
    df['clearances'] = [1 if x==71 else 0 for x in df.subEventId]
    df['interceptions'] = [1 if (1401 in x) else 0 for x in df.tags_values]
    df['tackles'] = [1 if ((x==12) & (703 in y)) else 0 for x,y in zip(df.subEventId, df.tags_values)]
    df['loose_ball_won'] = [1 if ((x==13) & (703 in y)) else 0 for x,y in zip(df.subEventId, df.tags_values)]
    df['final_third_recovery'] = [1 if ((x>60) & ((a==1) | (b==1) | (c==1))) else 0 for x,a,b,c in zip(df.X_start, df.interceptions, df.tackles, df.loose_ball_won)]
    df['air_duels'] = [1 if x==10 else 0 for x in df.subEventId]
    df['air_duels_won'] = [1 if ((x==1) & (703 in y)) else 0 for x,y in zip(df.air_duels, df.tags_values)]
    df['headers'] = [1 if ((x==82) | (y==1) | (z==1)) else 0 for x,y,z in zip(df.subEventId, df.shot_whead, df.air_duels_won)]
    df['defense_z1'] = [1 if (((z==12) | (z==13)) & ((x<25) & (y<34))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z2'] = [1 if (((z==12) | (z==13)) & ((x<25) & (34<=y<67))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z3'] = [1 if (((z==12) | (z==13)) & ((x<25) & (67<=y<101))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z4'] = [1 if (((z==12) | (z==13)) & ((25<=x<50) & (y<34))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z5'] = [1 if (((z==12) | (z==13)) & ((25<=x<50) & (34<=y<67))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z6'] = [1 if (((z==12) | (z==13)) & ((25<=x<50) & (67<=y<101))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z7'] = [1 if (((z==12) | (z==13)) & ((50<=x<75) & (y<34))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z8'] = [1 if (((z==12) | (z==13)) & ((50<=x<75) & (34<=y<67))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z9'] = [1 if (((z==12) | (z==13)) & ((50<=x<75) & (67<=y<101))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z10'] = [1 if (((z==12) | (z==13)) & ((75<=x<101) & (y<34))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z11'] = [1 if (((z==12) | (z==13)) & ((75<=x<101) & (34<=y<67))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]
    df['defense_z12'] = [1 if (((z==12) | (z==13)) & ((75<=x<101) & (67<=y<101))) else 0 for x,y,z in zip(df.X_start, df.Y_start, df.subEventId)]

    #MINUTES PLAYED          
    minutes = df.groupby(['playerId', 'matchId'], as_index=False).agg({'goal': 'sum'})
    minutes = pd.merge(minutes[['playerId', 'matchId']], playerank[['playerId', 'matchId', 'minutesPlayed']], how='left')
    minutes = minutes.groupby('playerId', as_index=False).agg({'minutesPlayed': 'sum'})
    
    # TOTALS BY PLAYER
    players_stats = df.groupby(['playerId'], as_index=False).agg({'goal': 'sum', 
                                                    'X_pass':['mean', 'std'], 
                                                    'Y_pass': ['mean', 'std'],
                                                    'forward_passes': 'sum', 
                                                    'accurate_passes': 'sum', 
                                                    'total_passes': 'sum',
                                                    'smart_passes': 'sum',
                                                    'pass_end_z1': 'sum', 'pass_end_z2': 'sum', 'pass_end_z3': 'sum',
                                                    'pass_end_z4': 'sum', 'pass_end_z5': 'sum', 'pass_end_z6': 'sum',
                                                    'pass_end_z7': 'sum', 'pass_end_z8': 'sum', 'pass_end_z9': 'sum',
                                                    'pass_reception_z1': 'sum', 'pass_reception_z2': 'sum', 'pass_reception_z3': 'sum',
                                                    'pass_reception_z4': 'sum', 'pass_reception_z5': 'sum', 'pass_reception_z6': 'sum',
                                                    'pass_reception_z7': 'sum', 'pass_reception_z8': 'sum', 'pass_reception_z9': 'sum',
                                                    'dribble_z1': 'sum', 'dribble_z2': 'sum', 'dribble_z3': 'sum',
                                                    'dribble_z4': 'sum', 'dribble_z5': 'sum', 'dribble_z6': 'sum',
                                                    'dribble_z7': 'sum', 'dribble_z8': 'sum', 'dribble_z9': 'sum',
                                                    'cross_z1': 'sum', 'cross_z2': 'sum', 'cross_z3': 'sum', 'cross_z4': 'sum',
                                                    'total_shots': 'sum', 
                                                    'shot_z1': 'sum', 'shot_z2':'sum', 'shot_z3':'sum', 'shot_z4':'sum',
                                                    'shot_wleft':'sum','shot_wright':'sum','shot_whead':'sum',
                                                    'clearances':'sum', 'interceptions':'sum', 'tackles':'sum',
                                                    'loose_ball_won':'sum', 'final_third_recovery':'sum', 'defense_z1':'sum',
                                                    'defense_z2':'sum','defense_z3':'sum','defense_z4':'sum',
                                                    'defense_z5':'sum','defense_z6':'sum','defense_z7':'sum','defense_z8':'sum',
                                                    'defense_z9':'sum','defense_z10':'sum','defense_z11':'sum','defense_z12':'sum',
                                                    'headers': 'sum', 'air_duels':'sum', 'air_duels_won':'sum',
                                                    }) 
    #RENAME COLUMNS
    players_stats.columns=['playerId', 'goals', 'X_pass_mean', 'X_pass_std', 'Y_pass_mean', 'Y_pass_std', 'forward_passes',
                           'acc_passes', 'total_passes', 'smart_passes',
                           'pass_end_z1', 'pass_end_z2', 'pass_end_z3', 'pass_end_z4', 'pass_end_z5',
                          'pass_end_z6', 'pass_end_z7', 'pass_end_z8', 'pass_end_z9',
                           'pass_reception_z1', 'pass_reception_z2', 'pass_reception_z3', 'pass_reception_z4', 'pass_reception_z5',
                          'pass_reception_z6', 'pass_reception_z7', 'pass_reception_z8', 'pass_reception_z9',
                          'dribble_z1', 'dribble_z2', 'dribble_z3', 'dribble_z4', 'dribble_z5', 'dribble_z6',
                          'dribble_z7', 'dribble_z8', 'dribble_z9', 'cross_z1',
                          'cross_z2', 'cross_z3', 'cross_z4', 'total_shots', 'shot_z1', 'shot_z2', 'shot_z3','shot_z4', \
                          'shot_wleft', 'shot_wright', 'shot_whead', 'clearances', 'interceptions', 'tackles',\
                          'loose_ball_won', 'final_third_recovery', 'defense_z1', 'defense_z2', 'defense_z3', 'defense_z4', \
                          'defense_z5', 'defense_z6', 'defense_z7', 'defense_z8', 'defense_z9', 'defense_z10', 'defense_z11',\
                          'defense_z12','headers', 'air_duels', 'air_duels_won']
    
    # MERGE MINUTES PLAYED AND PLAYERS INFO TO DF
    players_stats.reset_index(inplace=True, drop=True)
    players_stats = pd.merge(players_stats, players[['playerId', 'shortName','team','position','height','weight','foot']], how='left', on='playerId')
    players_stats = pd.merge(players_stats, minutes, how='left', on='playerId')
    
    #COMPUTE CHARACTERISTICS PER 90 MINUTES OR RELATIVE FREQUENCY, ACCORDING TO EACH CASE
    players_stats['goals_per90'] = (players_stats['goals'] / players_stats['minutesPlayed']) * 90
    players_stats['clearances_per90'] = (players_stats['clearances'] / players_stats['minutesPlayed']) * 90
    players_stats['interceptions_per90'] = (players_stats['interceptions'] / players_stats['minutesPlayed']) * 90
    players_stats['tackles_per90'] = (players_stats['tackles'] / players_stats['minutesPlayed']) * 90
    players_stats['loose_ball_won_per90'] = (players_stats['loose_ball_won'] / players_stats['minutesPlayed']) * 90
    players_stats['final_third_recovery_per90'] = (players_stats['final_third_recovery'] / players_stats['minutesPlayed']) * 90
    players_stats['forward_passes'] = players_stats['forward_passes'] / players_stats ['total_passes']
    players_stats['smart_passes'] = players_stats['smart_passes'] / players_stats ['acc_passes']
    players_stats['pass_end_z1'] = players_stats['pass_end_z1'] / players_stats ['acc_passes']
    players_stats['pass_end_z2'] = players_stats['pass_end_z2'] / players_stats ['acc_passes']
    players_stats['pass_end_z3'] = players_stats['pass_end_z3'] / players_stats ['acc_passes']
    players_stats['pass_end_z4'] = players_stats['pass_end_z4'] / players_stats ['acc_passes']
    players_stats['pass_end_z5'] = players_stats['pass_end_z5'] / players_stats ['acc_passes']
    players_stats['pass_end_z6'] = players_stats['pass_end_z6'] / players_stats ['acc_passes']
    players_stats['pass_end_z7'] = players_stats['pass_end_z7'] / players_stats ['acc_passes']
    players_stats['pass_end_z8'] = players_stats['pass_end_z8'] / players_stats ['acc_passes']
    players_stats['pass_end_z9'] = players_stats['pass_end_z9'] / players_stats ['acc_passes']
    players_stats['total_passes_received'] = players_stats['pass_reception_z1'] + players_stats['pass_reception_z2'] + \
    players_stats['pass_reception_z3'] + players_stats['pass_reception_z4'] + players_stats['pass_reception_z5'] + \
    players_stats['pass_reception_z6'] + players_stats['pass_reception_z7'] + players_stats['pass_reception_z8'] + \
    players_stats['pass_reception_z9']
   
    players_stats['pass_reception_z1'] = players_stats['pass_reception_z1'] / players_stats ['total_passes_received']
    players_stats['pass_reception_z2'] = players_stats['pass_reception_z2'] / players_stats ['total_passes_received']
    players_stats['pass_reception_z3'] = players_stats['pass_reception_z3'] / players_stats ['total_passes_received']
    players_stats['pass_reception_z4'] = players_stats['pass_reception_z4'] / players_stats ['total_passes_received']
    players_stats['pass_reception_z5'] = players_stats['pass_reception_z5'] / players_stats ['total_passes_received']
    players_stats['pass_reception_z6'] = players_stats['pass_reception_z6'] / players_stats ['total_passes_received']
    players_stats['pass_reception_z7'] = players_stats['pass_reception_z7'] / players_stats ['total_passes_received']
    players_stats['pass_reception_z8'] = players_stats['pass_reception_z8'] / players_stats ['total_passes_received']
    players_stats['pass_reception_z9'] = players_stats['pass_reception_z9'] / players_stats ['total_passes_received']

    players_stats['dribble_z1_per90'] = players_stats['dribble_z1'] / players_stats['minutesPlayed'] * 90
    players_stats['dribble_z2_per90'] = players_stats['dribble_z2'] / players_stats['minutesPlayed'] * 90
    players_stats['dribble_z3_per90'] = players_stats['dribble_z3'] / players_stats['minutesPlayed'] * 90
    players_stats['dribble_z4_per90'] = players_stats['dribble_z4'] / players_stats['minutesPlayed'] * 90
    players_stats['dribble_z5_per90'] = players_stats['dribble_z5'] / players_stats['minutesPlayed'] * 90
    players_stats['dribble_z6_per90'] = players_stats['dribble_z6'] / players_stats['minutesPlayed'] * 90
    players_stats['dribble_z7_per90'] = players_stats['dribble_z7'] / players_stats['minutesPlayed'] * 90
    players_stats['dribble_z8_per90'] = players_stats['dribble_z8'] / players_stats['minutesPlayed'] * 90
    players_stats['dribble_z9_per90'] = players_stats['dribble_z9'] / players_stats['minutesPlayed'] * 90
    
    players_stats['shot_z1_per90'] = players_stats['shot_z1'] / players_stats['minutesPlayed'] * 90
    players_stats['shot_z2_per90'] = players_stats['shot_z2'] / players_stats['minutesPlayed'] * 90
    players_stats['shot_z3_per90'] = players_stats['shot_z3'] / players_stats['minutesPlayed'] * 90
    players_stats['shot_z4_per90'] = players_stats['shot_z4'] / players_stats['minutesPlayed'] * 90
    players_stats['shot_wleft'] = players_stats['shot_wleft'] / players_stats ['total_shots']
    players_stats['shot_wright'] = players_stats['shot_wright'] / players_stats ['total_shots']
    players_stats['shot_whead'] = players_stats['shot_whead'] / players_stats ['total_shots']
    
    players_stats['cross_z1_per90'] = players_stats['cross_z1'] / players_stats['minutesPlayed'] * 90
    players_stats['cross_z2_per90'] = players_stats['cross_z2'] / players_stats['minutesPlayed'] * 90
    players_stats['cross_z3_per90'] = players_stats['cross_z3'] / players_stats['minutesPlayed'] * 90
    players_stats['cross_z4_per90'] = players_stats['cross_z4'] / players_stats['minutesPlayed'] * 90
    
    players_stats['total_defense']=players_stats['defense_z1']+players_stats['defense_z2']+players_stats['defense_z3']+ \
    players_stats['defense_z4']+players_stats['defense_z5']+players_stats['defense_z6']+players_stats['defense_z7']+ \
    players_stats['defense_z8']+players_stats['defense_z9']+players_stats['defense_z10']+players_stats['defense_z11']+ \
    players_stats['defense_z12']
    
    players_stats['defense_z1'] = players_stats['defense_z1']/players_stats['total_defense']
    players_stats['defense_z2'] = players_stats['defense_z2']/players_stats['total_defense']
    players_stats['defense_z3'] = players_stats['defense_z3']/players_stats['total_defense']
    players_stats['defense_z4'] = players_stats['defense_z4']/players_stats['total_defense']
    players_stats['defense_z5'] = players_stats['defense_z5']/players_stats['total_defense']
    players_stats['defense_z6'] = players_stats['defense_z6']/players_stats['total_defense']
    players_stats['defense_z7'] = players_stats['defense_z7']/players_stats['total_defense']
    players_stats['defense_z8'] = players_stats['defense_z8']/players_stats['total_defense']
    players_stats['defense_z9'] = players_stats['defense_z9']/players_stats['total_defense']
    players_stats['defense_z10'] = players_stats['defense_z10']/players_stats['total_defense']
    players_stats['defense_z11'] = players_stats['defense_z11']/players_stats['total_defense']
    players_stats['defense_z12'] = players_stats['defense_z12']/players_stats['total_defense']
    
    players_stats['air_success']=players_stats['air_duels_won']/players_stats['air_duels']
    players_stats['headers_per90'] = (players_stats['headers'] / players_stats['minutesPlayed']) * 90

    # REORDER DATAFRAME AND FINISH
    players_stats = players_stats[['playerId', 'shortName', 'team', 'position', 'height', 'foot', 'minutesPlayed',
                                  'goals_per90', 'pass_reception_z1', 'pass_reception_z2', 'pass_reception_z3',
                                  'pass_reception_z4', 'pass_reception_z5', 'pass_reception_z6', 'pass_reception_z7',
                                  'pass_reception_z8', 'pass_reception_z9', 'defense_z1', 'defense_z2', 'defense_z3',
                                  'defense_z4', 'defense_z5', 'defense_z6', 'defense_z7', 'defense_z8', 'defense_z9',
                                  'defense_z10', 'defense_z11', 'defense_z12', 'X_pass_mean', 'X_pass_std', 'Y_pass_mean',
                                  'Y_pass_std', 'forward_passes', 'smart_passes', 'pass_end_z1', 'pass_end_z2', 
                                  'pass_end_z3', 'pass_end_z4', 'pass_end_z5', 'pass_end_z6', 'pass_end_z7', 
                                  'pass_end_z8', 'pass_end_z9', 'dribble_z1_per90', 'dribble_z2_per90',
                                  'dribble_z3_per90', 'dribble_z4_per90', 'dribble_z5_per90', 'dribble_z6_per90',
                                  'dribble_z7_per90', 'dribble_z8_per90', 'dribble_z9_per90', 'cross_z1_per90',
                                  'cross_z2_per90', 'cross_z3_per90', 'cross_z4_per90', 'shot_z1_per90', 'shot_z2_per90',
                                  'shot_z3_per90', 'shot_z4_per90', 'shot_wleft', 'shot_wright', 'shot_whead',
                                  'clearances_per90', 'interceptions_per90', 'tackles_per90', 'loose_ball_won_per90',
                                  'final_third_recovery_per90', 'headers_per90', 'air_success']]
    players_stats.fillna(0,inplace=True)
    players_stats = players_stats[players_stats['minutesPlayed']!=0].reset_index(drop=True)
    return players_stats


In [5]:
def extract_player_motifs(events):
    # LOAD IN BATCHES TO REDUCE MEMORY DEMANDS
    c=0
    batch=100000
    total=pd.DataFrame()
    while c<len(events):
        if (len(events)-c) >= batch:
            df = events.iloc[c:c+batch,:]
        else:
            df = events.iloc[c:,:]
            
        df['teamId_next']=0; df['playerId_next']=0; df['matchId_next']=0; df['matchPeriod_next']=0; df['eventId_next']=0
        df['teamId_next'][:-1] = df.teamId[1:].reset_index(drop=True)
        df.teamId_next.fillna(0, inplace=True)
        df.teamId_next = df.teamId_next.astype(int)
        df['playerId_next'][:-1] = df.playerId[1:].reset_index(drop=True)
        df.playerId_next.fillna(0, inplace=True)
        df.playerId_next = df.playerId_next.astype(int)
        df['matchId_next'][:-1] = df.matchId[1:].reset_index(drop=True)
        df.matchId_next.fillna(0, inplace=True)
        df.matchId_next = df.matchId_next.astype(int)
        df['matchPeriod_next'][:-1] = df.matchPeriod[1:].reset_index(drop=True)
        df.matchPeriod_next.fillna(0, inplace=True)
        df['eventId_next'][:-1] = df.eventId[1:].reset_index(drop=True)
        df.eventId_next.fillna(0, inplace=True)
        df.eventId_next = df.eventId_next.astype(int)
        
        # Moving info inside dictionaries inside lists to df columns
        df['tags_values'] = [[y['id'] for y in x]  for x in df.tags]
    
        # Remove rows we don't need
        df= df[((df.matchPeriod!='P') & 
                (df.eventId !=4) & 
                (df.eventId !=6) & 
                (df.eventId !=9) & 
                (df.subEventId !=70) & 
                (df.subEventId !=72)
                )]
        df['remove'] = [1 if ((x==1) & ((701 in y) | (702 in y))) else 0 for x,y in zip(df.eventId, df.tags_values)]
        df = df[df.remove==0].reset_index(drop=True)
        
        df['end_play'] = [1 if ((a!=b) | (c!=d) | (e!=f) | (x==10) | (y==3)) else 0 for a,b,c,d,e,f,x,y in \
                          zip(df.matchId, df.matchId_next, df.matchPeriod, df.matchPeriod_next, df.teamId, df.teamId_next, df.eventId, df.eventId_next)]
        
        # NUMBER EACH PASSING SEQUENCE (PLAY_NUM)
        df['play_num']=1
        for i in tqdm(range(0,len(df))):
            if i==0: continue
            if df.loc[i, "matchId"] != df.loc[i-1, "matchId"]:
                df.at[i, "play_num"]=1
            else:
                if df.loc[i-1, "end_play"]==1:
                    df.at[i, "play_num"] = df.loc[i-1, "play_num"]+1
                else:
                    df.at[i, "play_num"] = df.loc[i-1, "play_num"]

        # NOW THAT PASSING SEQUENCES ARE DEFINED, LEAVE ONLY PASSES AND SHOTS IN DF
        passes = df[(df['eventId']==8) | (df['eventId']==10)].reset_index(drop=True)
        
        # Extracting Network Motifs (sequences of 4 consecutive passes)
        #motif ABCD
        passes['ABCD']=0; passes['BACD']=0; passes['BCAD']=0; passes['BCDA']=0 
        #motif ABAB
        passes['ABAB']=0; passes['BABA']=0
        #motif ABAC
        passes['ABAC']=0; passes['BABC']=0; passes['BCBA']=0
        #motif ABCA
        passes['ABCA']=0; passes['BACB']=0; passes['BCAB']=0  #23
        #motif ABCB
        passes['ABCB']=0; passes['BACA']=0; passes['BCAC']=0  #26
        
        for i in tqdm(range(0,(len(passes)-3))):
            if ((passes.loc[i, "play_num"]==passes.loc[i+1,"play_num"]) & (passes.loc[i,"play_num"]==passes.loc[i+2,"play_num"]) & (passes.loc[i,"play_num"]==passes.loc[i+3,"play_num"])):
                if ((passes.loc[i, "teamId"]==passes.loc[i+1,"teamId"]==passes.loc[i+2,"teamId"]==passes.loc[i+3,"teamId"])):
                    if ((passes.loc[i, "eventId"]==passes.loc[i+1, "eventId"]) & (passes.loc[i+1, "eventId"]==passes.loc[i+2, "eventId"]) & (passes.loc[i+2, "eventId"]==8)):
                        if ((passes.loc[i,"playerId"] != passes.loc[i+1,"playerId"]) and\
                            (passes.loc[i,"playerId"]!= passes.loc[i+2,"playerId"]) and\
                            (passes.loc[i,"playerId"]!= passes.loc[i+3, "playerId"]) and\
                            (passes.loc[i+1,"playerId"]!=passes.loc[i+2,"playerId"]) and\
                            (passes.loc[i+2,"playerId"]!=passes.loc[i+3,"playerId"]) and\
                            (passes.loc[i+1,"playerId"]!=passes.loc[i+3,"playerId"])):
                            passes.at[i,"ABCD"]=1; passes.at[i+1,"BACD"]=1; passes.at[i+2,"BCAD"]=1; passes.at[i+3,"BCDA"]=1
                        elif passes.loc[i, "playerId"]==passes.loc[i+2, "playerId"]:
                            if passes.loc[i+1,"playerId"]==passes.loc[i+3,"playerId"]:
                                passes.at[i,"ABAB"]=1; passes.at[i+1,"BABA"]=1
                            else:
                                passes.at[i,"ABAC"]=1; passes.at[i+1,"BABC"]=1; passes.at[i+3,"BCBA"]=1
                        elif passes.loc[i,"playerId"]==passes.loc[i+3,"playerId"]:
                            passes.at[i, "ABCA"]=1; passes.at[i+1, "BACB"]=1; passes.at[i+2,"BCAB"]=1
                        elif passes.loc[i+1,"playerId"]==passes.loc[i+3,"playerId"]:
                            passes.at[i, "ABCB"]=1; passes.at[i+1, "BACA"]=1; passes.at[i+2, "BCAC"]=1
        total = total.append(passes)
        c=c+batch

    total = total[['playerId', 'ABCD','BACD','BCAD','BCDA','ABAB','BABA','ABAC','BABC','BCBA','ABCA',\
                                  'BACB', 'BCAB','ABCB','BACA','BCAC']]
                                                    
    player_motifs = total.groupby('playerId', as_index=False).sum()
    return player_motifs

In [6]:
def extract_flow_centrality(data):
    df=data.copy()
    df['teamId_next']=0; df['playerId_next']=0; df['matchId_next']=0; df['matchPeriod_next']=0; df['eventId_next']=0
    df['teamId_next'][:-1] = df.teamId[1:].reset_index(drop=True)
    df.teamId_next.fillna(0, inplace=True)
    df.teamId_next = df.teamId_next.astype(int)
    df['playerId_next'][:-1] = df.playerId[1:].reset_index(drop=True)
    df.playerId_next.fillna(0, inplace=True)
    df.playerId_next = df.playerId_next.astype(int)
    df['matchId_next'][:-1] = df.matchId[1:].reset_index(drop=True)
    df.matchId_next.fillna(0, inplace=True)
    df.matchId_next = df.matchId_next.astype(int)
    df['matchPeriod_next'][:-1] = df.matchPeriod[1:].reset_index(drop=True)
    df.matchPeriod_next.fillna(0, inplace=True)
    df['eventId_next'][:-1] = df.eventId[1:].reset_index(drop=True)
    df.eventId_next.fillna(0, inplace=True)
    df.eventId_next = df.eventId_next.astype(int)

    # Moving info inside dictionaries inside lists to df columns
    df['tags_values'] = [[y['id'] for y in x]  for x in df.tags]

    # Remove rows we don't need
    df= df[((df.matchPeriod!='P') & 
            (df.eventId !=4) & 
            (df.eventId !=6) & 
            (df.eventId !=9) & 
            (df.subEventId !=70) & 
            (df.subEventId !=72)
            )]
    df['remove'] = [1 if ((x==1) & ((701 in y) | (702 in y))) else 0 for x,y in zip(df.eventId, df.tags_values)]
    df = df[df.remove==0].reset_index(drop=True)

    df['end_play'] = [1 if ((a!=b) | (c!=d) | (e!=f) | (x==10) | (y==3)) else 0 for a,b,c,d,e,f,x,y in \
                      zip(df.matchId, df.matchId_next, df.matchPeriod, df.matchPeriod_next, df.teamId, df.teamId_next, df.eventId, df.eventId_next)]

    # NUMBER EACH PASSING SEQUENCE (PLAY_NUM)
    df['play_num']=1
    for i in tqdm(range(0,len(df))):
        if i==0: continue
        if df.loc[i, "matchId"] != df.loc[i-1, "matchId"]:
            df.at[i, "play_num"]=1
        else:
            if df.loc[i-1, "end_play"]==1:
                df.at[i, "play_num"] = df.loc[i-1, "play_num"]+1
            else:
                df.at[i, "play_num"] = df.loc[i-1, "play_num"]

    # NOW THAT PASSING SEQUENCES ARE DEFINED, LEAVE ONLY PASSES AND SHOTS IN DF
    passes = df[(df['eventId']==8) | (df['eventId']==10)].reset_index(drop=True)

    # WE COUNT THE NUMBER OF PLAYS (N_PASSES>3) PER TEAM ON THE ENTIRE DATA
    by_play = passes.groupby(['matchId', 'teamId', 'play_num'], as_index=False)['eventId'].count().rename(columns={'eventId': 'n_passes'})
    by_play = by_play[by_play.n_passes>=4].reset_index(drop=True)
    by_team = by_play.groupby(['teamId', 'matchId'], as_index=False)['play_num'].count().rename(columns={'play_num': 'n_plays'})
    by_team = by_team.groupby('teamId', as_index=False)['n_plays'].sum()

    #NOW WE COUNT BY PLAYER
    group_plays = passes.groupby(['matchId', 'play_num'], as_index=False)['eventId'].count().rename(columns={'eventId': 'num_passes'})
    passes = pd.merge(passes, group_plays[['matchId', 'play_num', 'num_passes']], how='left', on=['matchId', 'play_num'])
    passes_filtered = passes[passes['num_passes']>3] #plays with less than 4 passes are not possession plays
    passes_filtered.drop_duplicates(['matchId', 'play_num', 'playerId'], inplace=True) #players count only once per play even if they made more
    by_player_match = passes_filtered.groupby(['matchId', 'playerId', 'teamId'], as_index=False)['play_num'].count().rename(columns={'play_num': 'num_plays_player'})
    by_player = by_player_match.groupby(['playerId', 'teamId'], as_index=False)['num_plays_player'].sum()

    # WE HAVE ONE DATAFRAME WITH TOTAL PLAYS PER TEAM AND ANOTHER ONE WITH TOTAL PLAYS PER PLAYER.
    # WE MERGE BOTH AND FLOW CENTRALITY IS N_PLAYS_PLAYER / N_PLAYS_TEAM
    fc = by_player.merge(by_team, on='teamId', how='left')

    # MINUTES PLAYED
    minutes = playerank[playerank.matchId.isin(list(df.matchId.unique()))].reset_index(drop=True)
    minutes = minutes.groupby('playerId', as_index=False)['minutesPlayed'].sum().reset_index(drop=True)

    # WE ADD TOTAL MINUTES PLAYED PER PLAYER
    fc = fc.merge(minutes, on='playerId', how='left')
    fc = fc[~fc.minutesPlayed.isna()]

    # WE NORMALIZE TOTAL TEAM PLAYS BY NUM MINUTES PLAYED BY THE PLAYER
    fc['n_plays_team_norm'] = fc['n_plays'] * fc['minutesPlayed'] / fc['minutesPlayed'].max()
    fc['flow_centrality'] = fc['num_plays_player'] / fc['n_plays_team_norm']
    fc.flow_centrality = [1 if x>1 else x for x in fc.flow_centrality]
    fc = fc[['playerId', 'flow_centrality']]
    return fc

## 1. Load data from WyScout and fix character encoding errors

In [7]:
# Load data: WyScout raw files belonging to Seasons 2017-18 of these leagues should be in the same folder as the notebook
events, matches, players, competitions, teams = load_data(nations=['Spain', 'Italy', 'France', 'Germany', 'England'])

In [8]:
players['shortName'] = fix_errors_char(players['shortName'])
players['lastName'] = fix_errors_char(players['lastName'])
players['firstName'] = fix_errors_char(players['firstName'])
teams['name'] = fix_errors_char(teams['name'])

In [9]:
players.rename(columns={'wyId': 'playerId', 'currentTeamId': 'teamId'}, inplace=True)
teams.rename(columns={'wyId': 'teamId', 'name': 'team'}, inplace=True)
players = pd.merge(players,teams[['teamId', 'team']], how='left', on='teamId')

In [10]:
players['position'] = [x['name'] for x in players.role]

In [11]:
playerank = pd.read_json('playerank.json')

In [12]:
events.head(1)

Unnamed: 0,eventId,eventName,eventSec,id,matchId,matchPeriod,playerId,positions,subEventId,subEventName,tags,teamId
0,8,Pass,2.994582,180864419,2565548,1H,3542,"[{'y': 61, 'x': 37}, {'y': 50, 'x': 50}]",85,Simple pass,[{'id': 1801}],682


In [13]:
matches.rename(columns={'wyId': 'matchId'}, inplace=True)

In [14]:
events = events.merge(matches[['gameweek', 'competitionId', 'matchId']], on='matchId', how='left')

## 2. Prepare H1 & H2 data sets for Distance2Clone calculation

In [234]:
# IF WE DIVIDE FIRST AND SECOND HALF OF THE SEASON:
eventsH1 = pd.concat([events[(events.gameweek<=19) & (events.competitionId!=426)], 
                     events[(events.gameweek<=17) & (events.competitionId==426)]]).reset_index(drop=True)

eventsH2 = pd.concat([events[(events.gameweek>19) & (events.competitionId!=426)], 
                     events[(events.gameweek>17) & (events.competitionId==426)]]).reset_index(drop=True)

In [None]:
# IF WE DIVIDE RANDOMLY:
matches_H2 = matches.groupby('competitionId', group_keys=False).apply(lambda x: x.sample(min(len(x), 190)))
all_matches = matches['matchId'].unique()
matches_H2 = np.array(matches_H2.matchId)
matches_H1 = np.setdiff1d(all_matches, matches_H2)

eventsH1 = events[events['matchId'].isin(matches_H1)]
eventsH2 = events[events['matchId'].isin(matches_H2)]

## 3. Extract all characteristics and merge to obtain one vector per player

We need to do this 3 times: One for our entire data set, so that we have 1 unique vector per player that we later use for searching for similar players to Player X. And two for each random half of the data. We will need these two halves later for computing the Distance2Clone of every player. That is, how similar he is to himself at other matches. We use this metric as validation to check whether our model works well.

In [104]:
players_stats_H1 = extract_characteristics(eventsH1)

In [105]:
players_stats_H2 = extract_characteristics(eventsH2)

In [267]:
players_stats_full = extract_characteristics(events)

We now have 63 of the 79 elements of our players vectors computed. We still need the 15 network motifs (passing sequences patterns) and Flow Centrality. We have to compute these with different functions after creating networks for each match.

In [132]:
motifs_H1 = extract_player_motifs(eventsH1)

HBox(children=(FloatProgress(value=0.0, max=77099.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52667.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76231.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51902.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76304.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51891.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77820.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=54938.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77225.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=54757.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76373.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52630.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76523.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52372.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75981.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51059.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51409.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76605.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52433.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76871.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=53594.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75890.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52013.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76706.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52750.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76582.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51847.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75941.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51832.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19767.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=13155.0), HTML(value='')))




In [133]:
motifs_H2 = extract_player_motifs(eventsH2)

HBox(children=(FloatProgress(value=0.0, max=75893.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51155.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76177.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51846.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75761.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51833.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76723.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=53326.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76808.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=53625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76118.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52345.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75668.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51670.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76096.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52036.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76180.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52809.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75539.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51556.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75591.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51648.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75802.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52033.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75711.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52373.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75589.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51333.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75538.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51373.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34100.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=23511.0), HTML(value='')))




In [268]:
motifs_full = extract_player_motifs(events)

HBox(children=(FloatProgress(value=0.0, max=77099.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52667.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76231.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51902.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76304.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51891.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76063.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51527.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75991.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51540.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75699.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51485.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77955.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=55590.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77172.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=54748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76442.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52642.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76510.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52691.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76456.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52839.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76440.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52742.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76362.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=53055.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76380.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51922.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75542.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50630.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76107.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51836.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75939.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51807.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75858.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51912.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75946.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52222.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77391.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=53247.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75986.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51285.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75632.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51123.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75513.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51184.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75526.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51496.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76603.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52856.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=76503.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=53082.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75924.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52104.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75980.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52363.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75409.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=51264.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=75985.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=52398.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=54062.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=37674.0), HTML(value='')))




In [221]:
fc_H1 = extract_flow_centrality(eventsH1)

HBox(children=(FloatProgress(value=0.0, max=1167716.0), HTML(value='')))




In [247]:
fc_H2 = extract_flow_centrality(eventsH2)

HBox(children=(FloatProgress(value=0.0, max=1173294.0), HTML(value='')))




In [269]:
fc_full = extract_flow_centrality(events)

HBox(children=(FloatProgress(value=0.0, max=2341010.0), HTML(value='')))




In [259]:
H1_data = players_stats_H1.merge(motifs_H1, on='playerId', how='left').merge(fc_H1, on='playerId', how='left')

In [260]:
H2_data = players_stats_H2.merge(motifs_H2, on='playerId', how='left').merge(fc_H2, on='playerId', how='left')

In [270]:
full_data = players_stats_full.merge(motifs_full, on='playerId', how='left').merge(fc_full, on='playerId', how='left')

In [271]:
# Normalize Network Motifs per 90 minutes played
for column in ['ABCD', 'BACD', 'BCAD', 'BCDA', 'ABAB', 'BABA', 'ABAC', 'BABC', 'BCBA', 'ABCA', 'BACB', 'BCAB', 'ABCB', 'BACA', 'BCAC']:
    H1_data[column] = H1_data[column] / H1_data.minutesPlayed * 90
    H2_data[column] = H2_data[column] / H2_data.minutesPlayed * 90
    full_data[column] = full_data[column] / full_data.minutesPlayed * 90

In [264]:
# Export the data
H1_data.to_csv('final_data/H1_random.csv', index=False)
H2_data.to_csv('final_data/H2_random.csv', index=False)

In [273]:
full_data.to_csv('final_data/full_data.csv', index=False)