In [1]:
# Import necessary packages
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the competition index from the open-data folder and keep only the
# rows whose competition_gender is 'male' (men's football).
competitions = (
    pd.read_json('../open-data/data/competitions.json')
    .loc[lambda comps: comps.competition_gender == 'male']
)

## Code for getting matches data

In [4]:
path = '../open-data/data/'  # PATH TO THE DATA FOLDER IN OPEN-DATA

# Load one matches file per (competition, season) pair listed in
# `competitions`, tagging every loaded row with the ids it was read for.
dfs = [
    pd.read_json(f"{path}matches/{competition_id}/{season_id}.json")
    .assign(competition_id=competition_id, season_id=season_id)
    for competition_id, season_id in zip(competitions['competition_id'],
                                         competitions['season_id'])
]

# Stack all per-season frames into one table and make the match id an integer
matches = pd.concat(dfs)
matches['match_id'] = matches['match_id'].astype(int)

# Replace the nested team records with the plain team names
matches['home_team'] = matches['home_team'].map(lambda team: team['home_team_name'])
matches['away_team'] = matches['away_team'].map(lambda team: team['away_team_name'])

# Optional: persist the matches table
# matches.reset_index(drop=True).to_feather('matches.feather')

In [5]:
# Load the events file of every match in `matches`.
# Each loaded frame is tagged with the match, competition and season ids
# of the match it belongs to, so they survive the concatenation below.
event = [
    pd.read_json(f"{path}events/{match_id}.json")
    .assign(match_id=match_id, competition_id=competition_id, season_id=season_id)
    for match_id, competition_id, season_id in tqdm(
        zip(matches['match_id'], matches['competition_id'], matches['season_id']),
        total=len(matches), desc="Processing Rows", ncols=100,
    )
]

# One table for all matches; the 'index' column is exposed as 'shot_id'
events = pd.concat(event).rename(columns={'index': 'shot_id'})

Processing Rows: 100%|██████████████████████████████████████████| 1604/1604 [39:17<00:00,  1.47s/it]


In [6]:
# Keep only the shot events (rows with a non-null 'shot' value).
# The filter runs first so the flattening below only touches shot rows, and
# `subset` is passed as a list, which every pandas version accepts.
events = events.dropna(subset=['shot']).copy()

# Flatten the nested possession_team record to its 'name' entry.
# The isinstance guard leaves already-flattened (or missing) values as they
# are, so re-running this cell is a no-op instead of a TypeError.
events['possession_team'] = events['possession_team'].map(
    lambda team: team['name'] if isinstance(team, dict) else team
)

# Attributes retained for the shots table
shot_columns = [
    'shot_id', 'match_id', 'competition_id', 'season_id', 'possession_team',
    'player', 'timestamp', 'period', 'minute', 'second', 'position',
    'location', 'shot',
]

# `events` is already restricted to shots, so no second dropna is needed;
# copy() keeps `shots` independent of `events`.
shots = events[shot_columns].copy()

In [None]:
# Persist both tables to disk.
# Feather is preferred over CSV for efficiency; the CSV variants are kept
# below for reference.

# events.reset_index(drop=True).to_csv('events.csv')
# shots.to_csv('shots_data.csv')
for table, filename in [(events, 'events.feather'), (shots, 'shots_data.feather')]:
    # Each table is written with a fresh 0..n-1 index
    table.reset_index(drop=True).to_feather(filename)