In [503]:
import pandas as pd
import networkx as nx
import numpy as np
import json
from collections import defaultdict


# Lift pandas' display truncation: passing None removes the limit, so every
# column and every row is rendered when a DataFrame is shown.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [504]:
# Load the raw World Cup tables as DataFrames.
# Forward slashes are used on purpose: they work on every OS (Windows
# included), whereas 'events\events_...' relied on '\e' / '\m' being left
# untouched as invalid escape sequences and only resolved on Windows.
events = pd.read_json('events/events_World_Cup.json')
matches = pd.read_json('matches/matches_World_Cup.json')
teams = pd.read_json('teams.json')

In [505]:
def load_public_dataset(tournament='World_Cup', data_folder='.'):
    """
    Load the json files with the matches, events, players and teams

    Parameters
    ----------
    tournament : str, optional
        the tournament suffix used in the file names; the function reads
        events/events_<tournament>.json and matches/matches_<tournament>.json.

    data_folder : str, optional
        the path to the folder where json files are stored. Defaults to the
        current working directory, which is where the files were always
        looked up before this parameter existed.

    Returns
    -------
    tuple
        a tuple of four defaultdicts, in this order:
        match_id2match   (match 'wyId'    -> match record),
        match_id2events  (event 'matchId' -> list of event records, file order),
        player_id2player (player 'wyId'   -> player record),
        team_id2team     (team 'wyId'     -> team record).
        Because they are defaultdicts, looking up an unknown id inserts and
        returns an empty list/dict instead of raising KeyError.

    """
    def _read_json(relative_path):
        # Single place that opens and parses a file, so each handle is
        # closed as soon as its content has been read.
        with open('%s/%s' % (data_folder, relative_path)) as json_data:
            return json.load(json_data)

    # loading the matches and events data
    events = _read_json('events/events_%s.json' % tournament)
    matches = _read_json('matches/matches_%s.json' % tournament)

    match_id2events = defaultdict(list)
    for event in events:
        match_id2events[event['matchId']].append(event)

    match_id2match = defaultdict(dict)
    for match in matches:
        match_id2match[match['wyId']] = match

    # loading the players data
    player_id2player = defaultdict(dict)
    for player in _read_json('players.json'):
        player_id2player[player['wyId']] = player

    # loading the teams data
    team_id2team = defaultdict(dict)
    for team in _read_json('teams.json'):
        team_id2team[team['wyId']] = team

    return match_id2match, match_id2events, player_id2player, team_id2team

In [506]:
# Build the id-keyed lookup tables by calling the loader with its defaults.
(
    match_id2match,
    match_id2events,
    player_id2player,
    team_id2team,
) = load_public_dataset()

In [507]:
# Preview the first rows of the events table to check its columns.
events.head()

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],122671,"[{'y': 50, 'x': 50}, {'y': 53, 'x': 35}]",2057954,Pass,16521,1H,1.656214,85,258612104
1,8,High pass,[{'id': 1801}],139393,"[{'y': 53, 'x': 35}, {'y': 19, 'x': 75}]",2057954,Pass,16521,1H,4.487814,83,258612106
2,1,Air duel,"[{'id': 703}, {'id': 1801}]",103668,"[{'y': 81, 'x': 25}, {'y': 83, 'x': 37}]",2057954,Duel,14358,1H,5.937411,10,258612077
3,1,Air duel,"[{'id': 701}, {'id': 1802}]",122940,"[{'y': 19, 'x': 75}, {'y': 17, 'x': 63}]",2057954,Duel,16521,1H,6.406961,10,258612112
4,8,Simple pass,[{'id': 1801}],122847,"[{'y': 17, 'x': 63}, {'y': 15, 'x': 71}]",2057954,Pass,16521,1H,8.562167,85,258612110


In [508]:
# Build the match-level feature frame: one row per match, to which the
# per-team features are attached in the following cells.

df = matches[['wyId']].rename(columns={'wyId': 'matchID'})

# Add team1 (home side) and team2 (away side).
# Each 'teamsData' value is a dict of per-team records; the team ids are
# resolved through each record's 'side' field rather than through the dict's
# ordering, so a record is never labelled "away" merely because it is not
# the first one. A match without both a 'home' and an 'away' record raises
# KeyError instead of being silently mislabelled.
home_l = []
away_l = []

for match in matches['teamsData']:
    side2team_id = {team['side']: team['teamId'] for team in match.values()}
    home_l.append(side2team_id['home'])
    away_l.append(side2team_id['away'])

df['team1_ID'] = home_l
df['team2_ID'] = away_l

df.head()

Unnamed: 0,matchID,team1_ID,team2_ID
0,2058017,4418,9598
1,2058016,5629,2413
2,2058015,9598,2413
3,2058014,4418,5629
4,2058012,14358,9598


In [509]:
# Sanity check: (number of rows, number of columns) of the feature frame.
df.shape

(64, 3)

In [510]:
# Adapt the 'tags' column so it is possible to tell whether an event was
# successful or not ('id' = 1801 or 1802) with a plain `id in tags` test.
def converting_tags(aux):
    """
    Flatten one event's tag list into a list of tag values.

    Parameters
    ----------
    aux : iterable
        the raw tags of one event, e.g. [{'id': 703}, {'id': 1801}].
        Items that are not dicts (for instance ids produced by a previous
        run of this function) are kept unchanged, so applying the function
        twice gives the same result as applying it once.

    Returns
    -------
    list
        the first value of every non-empty tag dict, in the original order.
        Always a list (possibly empty), so callers can safely evaluate
        `some_id in result`; empty tag dicts are skipped instead of turning
        the whole result into None.
    """
    tags = []
    for tag in aux:
        if not isinstance(tag, dict):
            # Already flattened (the cell was re-run): keep the value as is.
            tags.append(tag)
        elif tag:
            tags.append(next(iter(tag.values())))
    return tags

# Overwrite the 'tags' column with the result of converting_tags for each event.
events['tags'] = events['tags'].map(converting_tags)

  new = pd.Series()


In [511]:
# GK_SAVES

# Count, for every (match, team), the events whose eventName is
# 'Save attempt' and whose tags contain 1801 (successful).
is_successful_save = (events['eventName'] == 'Save attempt') & (events['tags'].apply(lambda x: 1801 in x))
gk_saves = (
    events[is_successful_save]
    .groupby(['matchId', 'teamId'])['eventId']
    .count()
    .reset_index(name='gk_saves')
)

# Join the feature onto df, once for team1 and once for team2.
# Any columns left by a previous run of this cell are dropped first, so
# re-running the cell does not create duplicated / mis-suffixed columns.
df = df.drop(columns=['gk_saves_T1', 'gk_saves_T2'], errors='ignore')
for team_col, feature_col in (('team1_ID', 'gk_saves_T1'), ('team2_ID', 'gk_saves_T2')):
    df = df.merge(
        gk_saves.rename(columns={'matchId': 'matchID', 'teamId': team_col, 'gk_saves': feature_col}),
        how='left',
        on=['matchID', team_col],
    )

# A (match, team) pair without any matching event means a count of zero.
df = df.fillna(0)
df

Unnamed: 0,matchID,team1_ID,team2_ID,gk_saves_T1,gk_saves_T2
0,2058017,4418,9598,1.0,3.0
1,2058016,5629,2413,6.0,2.0
2,2058015,9598,2413,1.0,5.0
3,2058014,4418,5629,2.0,4.0
4,2058012,14358,9598,2.0,6.0
5,2058013,7047,2413,0.0,3.0
6,2058011,6380,5629,2.0,8.0
7,2058010,15670,4418,0.0,3.0
8,2058009,12430,2413,2.0,4.0
9,2058008,7047,6697,4.0,2.0


In [512]:
# RED_CARDS

# Count, for every (match, team), the events whose tags contain 1701
# (red card) or 1703 (second yellow card). No event-type filter is applied:
# any event carrying one of these tags is counted.
has_red_card = events['tags'].apply(lambda x: (1701 in x) or (1703 in x))
red_card = (
    events[has_red_card]
    .groupby(['matchId', 'teamId'])['eventId']
    .count()
    .reset_index(name='red_card')
)

# Join the feature onto df, once for team1 and once for team2.
# Any columns left by a previous run of this cell are dropped first, so
# re-running the cell does not create duplicated / mis-suffixed columns.
df = df.drop(columns=['red_card_T1', 'red_card_T2'], errors='ignore')
for team_col, feature_col in (('team1_ID', 'red_card_T1'), ('team2_ID', 'red_card_T2')):
    df = df.merge(
        red_card.rename(columns={'matchId': 'matchID', 'teamId': team_col, 'red_card': feature_col}),
        how='left',
        on=['matchID', team_col],
    )

# A (match, team) pair without any matching event means a count of zero.
df = df.fillna(0)
df


Unnamed: 0,matchID,team1_ID,team2_ID,gk_saves_T1,gk_saves_T2,red_card_T1,red_card_T2
0,2058017,4418,9598,1.0,3.0,0.0,0.0
1,2058016,5629,2413,6.0,2.0,0.0,0.0
2,2058015,9598,2413,1.0,5.0,0.0,0.0
3,2058014,4418,5629,2.0,4.0,0.0,0.0
4,2058012,14358,9598,2.0,6.0,0.0,0.0
5,2058013,7047,2413,0.0,3.0,0.0,0.0
6,2058011,6380,5629,2.0,8.0,0.0,0.0
7,2058010,15670,4418,0.0,3.0,0.0,0.0
8,2058009,12430,2413,2.0,4.0,0.0,0.0
9,2058008,7047,6697,4.0,2.0,0.0,1.0


In [513]:
# YELLOW_CARDS

# Count, for every (match, team), the events whose tags contain 1702
# (yellow card) or 1703 (second yellow card). No event-type filter is
# applied: any event carrying one of these tags is counted.
has_yellow_card = events['tags'].apply(lambda x: (1702 in x) or (1703 in x))
yellow_card = (
    events[has_yellow_card]
    .groupby(['matchId', 'teamId'])['eventId']
    .count()
    .reset_index(name='yellow_card')
)

# Join the feature onto df, once for team1 and once for team2.
# Any columns left by a previous run of this cell are dropped first, so
# re-running the cell does not create duplicated / mis-suffixed columns.
df = df.drop(columns=['yellow_card_T1', 'yellow_card_T2'], errors='ignore')
for team_col, feature_col in (('team1_ID', 'yellow_card_T1'), ('team2_ID', 'yellow_card_T2')):
    df = df.merge(
        yellow_card.rename(columns={'matchId': 'matchID', 'teamId': team_col, 'yellow_card': feature_col}),
        how='left',
        on=['matchID', team_col],
    )

# A (match, team) pair without any matching event means a count of zero.
df = df.fillna(0)
df

Unnamed: 0,matchID,team1_ID,team2_ID,gk_saves_T1,gk_saves_T2,red_card_T1,red_card_T2,yellow_card_T1,yellow_card_T2
0,2058017,4418,9598,1.0,3.0,0.0,0.0,2.0,1.0
1,2058016,5629,2413,6.0,2.0,0.0,0.0,1.0,2.0
2,2058015,9598,2413,1.0,5.0,0.0,0.0,2.0,1.0
3,2058014,4418,5629,2.0,4.0,0.0,0.0,2.0,3.0
4,2058012,14358,9598,2.0,6.0,0.0,0.0,1.0,4.0
5,2058013,7047,2413,0.0,3.0,0.0,0.0,2.0,1.0
6,2058011,6380,5629,2.0,8.0,0.0,0.0,2.0,2.0
7,2058010,15670,4418,0.0,3.0,0.0,0.0,2.0,2.0
8,2058009,12430,2413,2.0,4.0,0.0,0.0,6.0,2.0
9,2058008,7047,6697,4.0,2.0,0.0,1.0,1.0,2.0


In [514]:
# ASSISTS

# Count, for every (match, team), the events whose tags contain 301.
# NOTE(review): 301 is presumably the "assist" tag of the data provider --
# confirm against the tag reference.
has_assist = events['tags'].apply(lambda x: (301 in x))
assists = (
    events[has_assist]
    .groupby(['matchId', 'teamId'])['eventId']
    .count()
    .reset_index(name='assists')
)

# Join the feature onto df, once for team1 and once for team2.
# Any columns left by a previous run of this cell are dropped first, so
# re-running the cell does not create duplicated / mis-suffixed columns.
df = df.drop(columns=['assists_T1', 'assists_T2'], errors='ignore')
for team_col, feature_col in (('team1_ID', 'assists_T1'), ('team2_ID', 'assists_T2')):
    df = df.merge(
        assists.rename(columns={'matchId': 'matchID', 'teamId': team_col, 'assists': feature_col}),
        how='left',
        on=['matchID', team_col],
    )

# A (match, team) pair without any matching event means a count of zero.
df = df.fillna(0)
df

Unnamed: 0,matchID,team1_ID,team2_ID,gk_saves_T1,gk_saves_T2,red_card_T1,red_card_T2,yellow_card_T1,yellow_card_T2,assists_T1,assists_T2
0,2058017,4418,9598,1.0,3.0,0.0,0.0,2.0,1.0,0.0,1.0
1,2058016,5629,2413,6.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0
2,2058015,9598,2413,1.0,5.0,0.0,0.0,2.0,1.0,2.0,0.0
3,2058014,4418,5629,2.0,4.0,0.0,0.0,2.0,3.0,1.0,0.0
4,2058012,14358,9598,2.0,6.0,0.0,0.0,1.0,4.0,1.0,2.0
5,2058013,7047,2413,0.0,3.0,0.0,0.0,2.0,1.0,0.0,2.0
6,2058011,6380,5629,2.0,8.0,0.0,0.0,2.0,2.0,1.0,1.0
7,2058010,15670,4418,0.0,3.0,0.0,0.0,2.0,2.0,0.0,2.0
8,2058009,12430,2413,2.0,4.0,0.0,0.0,6.0,2.0,1.0,0.0
9,2058008,7047,6697,4.0,2.0,0.0,1.0,1.0,2.0,1.0,0.0


In [515]:
# SHOTS (all attempts)

# Count, for every (match, team), the events whose subEventName is
# 'Shot', 'Free kick shot' or 'Penalty'.
is_shot = events['subEventName'].isin(['Shot', 'Free kick shot', 'Penalty'])
shots = (
    events[is_shot]
    .groupby(['matchId', 'teamId'])['eventId']
    .count()
    .reset_index(name='shots')
)

# Join the feature onto df, once for team1 and once for team2.
# Any columns left by a previous run of this cell are dropped first, so
# re-running the cell does not create duplicated / mis-suffixed columns.
df = df.drop(columns=['shots_T1', 'shots_T2'], errors='ignore')
for team_col, feature_col in (('team1_ID', 'shots_T1'), ('team2_ID', 'shots_T2')):
    df = df.merge(
        shots.rename(columns={'matchId': 'matchID', 'teamId': team_col, 'shots': feature_col}),
        how='left',
        on=['matchID', team_col],
    )

# A (match, team) pair without any matching event means a count of zero.
df = df.fillna(0)
df


Unnamed: 0,matchID,team1_ID,team2_ID,gk_saves_T1,gk_saves_T2,red_card_T1,red_card_T2,yellow_card_T1,yellow_card_T2,assists_T1,assists_T2,shots_T1,shots_T2
0,2058017,4418,9598,1.0,3.0,0.0,0.0,2.0,1.0,0.0,1.0,8,14
1,2058016,5629,2413,6.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0,11,14
2,2058015,9598,2413,1.0,5.0,0.0,0.0,2.0,1.0,2.0,0.0,22,11
3,2058014,4418,5629,2.0,4.0,0.0,0.0,2.0,3.0,1.0,0.0,18,9
4,2058012,14358,9598,2.0,6.0,0.0,0.0,1.0,4.0,1.0,2.0,16,22
5,2058013,7047,2413,0.0,3.0,0.0,0.0,2.0,1.0,0.0,2.0,7,11
6,2058011,6380,5629,2.0,8.0,0.0,0.0,2.0,2.0,1.0,1.0,25,8
7,2058010,15670,4418,0.0,3.0,0.0,0.0,2.0,2.0,0.0,2.0,9,9
8,2058009,12430,2413,2.0,4.0,0.0,0.0,6.0,2.0,1.0,0.0,18,20
9,2058008,7047,6697,4.0,2.0,0.0,1.0,1.0,2.0,1.0,0.0,12,17


In [516]:
# SHOTS ON TARGET

# Count, for every (match, team), the events whose subEventName is
# 'Shot', 'Free kick shot' or 'Penalty' and whose tags contain 1801 (accurate).
is_accurate_shot = (events['subEventName'].isin(['Shot', 'Free kick shot', 'Penalty'])) & (events['tags'].apply(lambda x: (1801 in x)))
shots_on_target = (
    events[is_accurate_shot]
    .groupby(['matchId', 'teamId'])['eventId']
    .count()
    .reset_index(name='shots_on_target')
)

# Join the feature onto df, once for team1 and once for team2.
# Any columns left by a previous run of this cell are dropped first, so
# re-running the cell does not create duplicated / mis-suffixed columns.
df = df.drop(columns=['shots_on_target_T1', 'shots_on_target_T2'], errors='ignore')
for team_col, feature_col in (('team1_ID', 'shots_on_target_T1'), ('team2_ID', 'shots_on_target_T2')):
    df = df.merge(
        shots_on_target.rename(columns={'matchId': 'matchID', 'teamId': team_col, 'shots_on_target': feature_col}),
        how='left',
        on=['matchID', team_col],
    )

# A (match, team) pair without any matching event means a count of zero.
df = df.fillna(0)
df

Unnamed: 0,matchID,team1_ID,team2_ID,gk_saves_T1,gk_saves_T2,red_card_T1,red_card_T2,yellow_card_T1,yellow_card_T2,assists_T1,assists_T2,shots_T1,shots_T2,shots_on_target_T1,shots_on_target_T2
0,2058017,4418,9598,1.0,3.0,0.0,0.0,2.0,1.0,0.0,1.0,8,14,6.0,3.0
1,2058016,5629,2413,6.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0,11,14,4.0,5.0
2,2058015,9598,2413,1.0,5.0,0.0,0.0,2.0,1.0,2.0,0.0,22,11,7.0,2.0
3,2058014,4418,5629,2.0,4.0,0.0,0.0,2.0,3.0,1.0,0.0,18,9,5.0,2.0
4,2058012,14358,9598,2.0,6.0,0.0,0.0,1.0,4.0,1.0,2.0,16,22,9.0,7.0
5,2058013,7047,2413,0.0,3.0,0.0,0.0,2.0,1.0,0.0,2.0,7,11,3.0,2.0
6,2058011,6380,5629,2.0,8.0,0.0,0.0,2.0,2.0,1.0,1.0,25,8,9.0,3.0
7,2058010,15670,4418,0.0,3.0,0.0,0.0,2.0,2.0,0.0,2.0,9,9,3.0,2.0
8,2058009,12430,2413,2.0,4.0,0.0,0.0,6.0,2.0,1.0,0.0,18,20,7.0,6.0
9,2058008,7047,6697,4.0,2.0,0.0,1.0,1.0,2.0,1.0,0.0,12,17,3.0,4.0


In [519]:
# GOALS

# Count, for every (match, team), the events whose tags contain 101 and
# whose eventName is not 'Save attempt'.
# NOTE(review): 101 is presumably the "goal" tag; 'Save attempt' events are
# excluded, presumably because the keeper's event carries the same tag when a
# goal is conceded -- confirm against the tag reference.
# NOTE(review): events of every matchPeriod are counted and own goals are not
# credited to the opposing team, so these totals may differ from the official
# score -- confirm whether that is intended.
is_goal = (events['eventName'] != 'Save attempt') & (events['tags'].apply(lambda x: (101 in x)))
goals = (
    events[is_goal]
    .groupby(['matchId', 'teamId'])['eventId']
    .count()
    .reset_index(name='goals')
)

# Join the feature onto df, once for team1 and once for team2.
# Any columns left by a previous run of this cell are dropped first, so
# re-running the cell does not create duplicated / mis-suffixed columns.
df = df.drop(columns=['goals_T1', 'goals_T2'], errors='ignore')
for team_col, feature_col in (('team1_ID', 'goals_T1'), ('team2_ID', 'goals_T2')):
    df = df.merge(
        goals.rename(columns={'matchId': 'matchID', 'teamId': team_col, 'goals': feature_col}),
        how='left',
        on=['matchID', team_col],
    )

# A (match, team) pair without any matching event means a count of zero.
df = df.fillna(0)
df

Unnamed: 0,matchID,team1_ID,team2_ID,gk_saves_T1,gk_saves_T2,red_card_T1,red_card_T2,yellow_card_T1,yellow_card_T2,assists_T1,assists_T2,shots_T1,shots_T2,shots_on_target_T1,shots_on_target_T2,goals_T1,goals_T2
0,2058017,4418,9598,1.0,3.0,0.0,0.0,2.0,1.0,0.0,1.0,8,14,6.0,3.0,3.0,2.0
1,2058016,5629,2413,6.0,2.0,0.0,0.0,1.0,2.0,2.0,0.0,11,14,4.0,5.0,2.0,0.0
2,2058015,9598,2413,1.0,5.0,0.0,0.0,2.0,1.0,2.0,0.0,22,11,7.0,2.0,2.0,1.0
3,2058014,4418,5629,2.0,4.0,0.0,0.0,2.0,3.0,1.0,0.0,18,9,5.0,2.0,1.0,0.0
4,2058012,14358,9598,2.0,6.0,0.0,0.0,1.0,4.0,1.0,2.0,16,22,9.0,7.0,5.0,6.0
5,2058013,7047,2413,0.0,3.0,0.0,0.0,2.0,1.0,0.0,2.0,7,11,3.0,2.0,0.0,2.0
6,2058011,6380,5629,2.0,8.0,0.0,0.0,2.0,2.0,1.0,1.0,25,8,9.0,3.0,1.0,1.0
7,2058010,15670,4418,0.0,3.0,0.0,0.0,2.0,2.0,0.0,2.0,9,9,3.0,2.0,0.0,2.0
8,2058009,12430,2413,2.0,4.0,0.0,0.0,6.0,2.0,1.0,0.0,18,20,7.0,6.0,4.0,5.0
9,2058008,7047,6697,4.0,2.0,0.0,1.0,1.0,2.0,1.0,0.0,12,17,3.0,4.0,1.0,0.0


In [None]:
#POSSE DE BOLA

ball_poss = 