In [127]:
import json
import os
from collections import defaultdict

import networkx as nx
import numpy as np
import pandas as pd


# Disable truncation when DataFrames are displayed: show every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [285]:
# Load the World Cup events and matches, plus the teams file, as DataFrames.
# Forward slashes are used as path separators: they work on every OS, whereas a
# backslash separator is Windows-only and '\e' / '\m' are invalid escape
# sequences inside a regular string literal.
events = pd.read_json('events/events_World_Cup.json')
matches = pd.read_json('matches/matches_World_Cup.json')
teams = pd.read_json('teams.json')

In [123]:
def load_public_dataset(tournament='World_Cup', data_folder='.'):
    """
    Load the json files with the matches, events, players and teams

    Parameters
    ----------
    tournament : str, optional
        the tournament to load; it is substituted into the file names
        ``events/events_<tournament>.json`` and ``matches/matches_<tournament>.json``.

    data_folder : str, optional
        the path to the folder where json files are stored. Defaults to the
        current working directory.

    Returns
    -------
    tuple
        a tuple of four defaultdicts, in this order:
        match id -> match, match id -> list of events of that match,
        player id -> player, team id -> team

    """
    def read_json(*path_parts):
        # One place for the open/parse boilerplate; the encoding is explicit so
        # the result does not depend on the platform's default locale.
        with open(os.path.join(data_folder, *path_parts), encoding='utf-8') as json_data:
            return json.load(json_data)

    # loading the matches and events data
    events = read_json('events', 'events_%s.json' % tournament)
    matches = read_json('matches', 'matches_%s.json' % tournament)

    # group the events by the match they belong to (file order is preserved)
    match_id2events = defaultdict(list)
    for event in events:
        match_id2events[event['matchId']].append(event)

    match_id2match = defaultdict(dict)
    for match in matches:
        match_id2match[match['wyId']] = match

    # loading the players data
    player_id2player = defaultdict(dict)
    for player in read_json('players.json'):
        player_id2player[player['wyId']] = player

    # loading the teams data
    team_id2team = defaultdict(dict)
    for team in read_json('teams.json'):
        team_id2team[team['wyId']] = team

    return match_id2match, match_id2events, player_id2player, team_id2team

In [128]:
# Build the id-keyed lookup dictionaries (matches, events per match, players, teams)
# using the loader's default tournament.
match_id2match, match_id2events, player_id2player, team_id2team = load_public_dataset()

In [96]:
# Preview the first rows of the events table.
# NOTE(review): the stored output of this cell already contains an 'ev_success'
# column, which is only created in a later cell -- the output comes from
# out-of-order execution and will differ after Restart & Run All.
events.head()

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,ev_success
0,8,Simple pass,[{'id': 1801}],122671,"[{'y': 50, 'x': 50}, {'y': 53, 'x': 35}]",2057954,Pass,16521,1H,1.656214,85,258612104,1.0
1,8,High pass,[{'id': 1801}],139393,"[{'y': 53, 'x': 35}, {'y': 19, 'x': 75}]",2057954,Pass,16521,1H,4.487814,83,258612106,1.0
2,1,Air duel,"[{'id': 703}, {'id': 1801}]",103668,"[{'y': 81, 'x': 25}, {'y': 83, 'x': 37}]",2057954,Duel,14358,1H,5.937411,10,258612077,1.0
3,1,Air duel,"[{'id': 701}, {'id': 1802}]",122940,"[{'y': 19, 'x': 75}, {'y': 17, 'x': 63}]",2057954,Duel,16521,1H,6.406961,10,258612112,0.0
4,8,Simple pass,[{'id': 1801}],122847,"[{'y': 17, 'x': 63}, {'y': 15, 'x': 71}]",2057954,Pass,16521,1H,8.562167,85,258612110,1.0


In [297]:
# Build the per-match feature frame, the same way as was done for the network
# features frame: one row per match, keyed by the match id.
df = matches[['wyId']].rename(columns={'wyId': 'matchID'})

# Add the home team (team1) and the away team (team2) of every match.
home_l = []
away_l = []

for match in matches['teamsData']:
    sides = list(match.values())
    first, second = sides[0], sides[1]

    # The first entry is not guaranteed to be the home side, so check it and
    # swap the pair when needed.
    home, away = (first, second) if first['side'] == 'home' else (second, first)
    home_l.append(home['teamId'])
    away_l.append(away['teamId'])

df['team1_ID'] = home_l
df['team2_ID'] = away_l

df.head()

Unnamed: 0,matchID,team1_ID,team2_ID
0,2058017,4418,9598
1,2058016,5629,2413
2,2058015,9598,2413
3,2058014,4418,5629
4,2058012,14358,9598


In [287]:
# Sanity check: (number of matches, number of columns) of the feature frame.
df.shape

(64, 3)

In [288]:
# Adapt the 'tags' column so that the outcome of an event can be read from it:
# the outcome is encoded by the tag 'id' 1801 or 1802.
def converting_tags(aux):
    """
    Return the first value of the last tag of an event.

    Parameters
    ----------
    aux : list of dict
        the tags of one event, e.g. ``[{'id': 703}, {'id': 1801}]``.

    Returns
    -------
    object or None
        the first value stored in the last tag, or None when the tag list is
        empty or the last tag holds no value.
    """
    # No placeholder pd.Series() is created here: it was never used and only
    # produced a pandas warning on every call.
    try:
        last_tag = aux[-1]
    except IndexError:
        # the event has no tags at all
        return None

    # the default keeps the "empty tag -> None" behaviour
    return next(iter(last_tag.values()), None)

# Take the id of each event's last tag, then recode the two outcome ids
# (1801 -> 1, 1802 -> 0); any other id is left unchanged.
last_tag_ids = events['tags'].map(converting_tags)
events['ev_success'] = last_tag_ids.replace([1801, 1802], [1, 0])

  new = pd.Series()


In [289]:
# Count, for every (match, team) pair, the 'Save attempt' sub-events that were
# successful (tag 1801, i.e. ev_success == 1).
# Each comparison is evaluated on its own before combining them: `&` binds
# tighter than `==`, so `(A) & events['ev_success'] == 1` would be parsed as
# `((A) & events['ev_success']) == 1` instead of "A and ev_success == 1".
is_save_attempt = events['subEventName'] == 'Save attempt'
is_successful = events['ev_success'] == 1

a = (
    events[is_save_attempt & is_successful]
    .groupby(['matchId', 'teamId'])['eventId']
    .count()
    .reset_index(name='gk_saves')
)
a.head()

Unnamed: 0,matchId,teamId,gk_saves
0,2057954,16521,1
1,2057955,15670,3
2,2057955,16129,1
3,2057956,16129,1
4,2057957,15670,3


In [298]:
# Attach the goalkeeper-save counts of team1 and team2 to every match.
# Maps the team-id column of `df` to the name of the save column created for it.
SAVE_COLUMNS = {'team1_ID': 'gk_saves_T1', 'team2_ID': 'gk_saves_T2'}

# Drop the result of a previous run first, so that re-executing this cell does
# not end up with duplicated gk_saves_* columns.
df = df.drop(columns=list(SAVE_COLUMNS.values()), errors='ignore')

for team_col, saves_col in SAVE_COLUMNS.items():
    # Rename the keys and the value column up front: the merge then needs no
    # right-hand key columns to drop and no implicit _x/_y suffixes to rename.
    team_saves = a.rename(columns={'matchId': 'matchID', 'teamId': team_col, 'gk_saves': saves_col})
    df = df.merge(team_saves, how='left', on=['matchID', team_col])

# A (match, team) pair that is absent from `a` had no counted saves.
df = df.fillna(0)

# Preview only: display.max_rows is unlimited, so a bare `df` would print every row.
df.head(10)

Unnamed: 0,matchID,team1_ID,team2_ID,gk_saves_T1,gk_saves_T2
0,2058017,4418,9598,0.0,2.0
1,2058016,5629,2413,5.0,0.0
2,2058015,9598,2413,0.0,3.0
3,2058014,4418,5629,1.0,1.0
4,2058012,14358,9598,1.0,5.0
5,2058013,7047,2413,0.0,0.0
6,2058011,6380,5629,1.0,5.0
7,2058010,15670,4418,0.0,1.0
8,2058009,12430,2413,1.0,3.0
9,2058008,7047,6697,3.0,0.0
