## Import libraries

In [1]:
import pandas as pd
import numpy as np
import json
import glob

## Load files (meta data for every match)

### Load all the meta data from every match as a dictionary object

In [2]:
# Define the wildcard pattern to match the file names
pattern = '../data/*_SecondSpectrum_meta.json'
# Use glob to find all files that match the pattern
file_list = glob.glob(pattern)

In [3]:
# Create an empty dictionary to store the JSON data
data_dict = {}
# Loop through the files and store the data in the dictionary with indexes
for i, file_path in enumerate(file_list):
    with open(file_path, 'r') as f:
        data = json.load(f)
        data_dict[i] = data

### Convert the json file into dataframe

In [4]:
meta_df = pd.DataFrame.from_dict(data_dict, orient='index')

### Transform the columns related to dates in a interpretable format

In [5]:
# Combine day, month, and year columns to create a date column
meta_df['Date'] = pd.to_datetime(meta_df[['day', 'month', 'year']])

# Convert the timestamp column to datetime
meta_df['Datetime'] = pd.to_datetime(meta_df['startTime'], unit='ms')

### Define the table named "Match Review" 

In [6]:
match_review_df = \
meta_df[[
    'optaId',
    'Datetime',
    'Date',
    'description',
    'pitchLength',
    'pitchWidth',
    'homeScore', 
    'awayScore',
    'ssiId',
    'homeSsiId',
    'homeOptaId',
    'homeOptaUuid',
    'awaySsiId',
    'awayOptaId',
    'awayOptaUuid'
]]\
.sort_values(by='Datetime')

## Create tables based on every match review

In [8]:
# Load the profile attributes for every player in every match home/away
homePlayers_list = []
awayPlayers_list = []
j = 0
players_match_dict = {}
for match_dict in data_dict.values():
    homePlayers = match_dict['homePlayers']
    for i in range(len(homePlayers)):
        homePlayers[i]['teamOptaId'] = match_dict['homeOptaId']
        homePlayers[i]['teamOptaUuid'] = match_dict['homeOptaUuid']
        homePlayers[i]['teamSsiId'] = match_dict['homeSsiId']
        homePlayers[i]['opta_game_id'] = match_dict['optaId']
        homePlayers[i]['ss_game_id'] = match_dict['ssiId']
        players_match_dict[j] = homePlayers[i]
        j += 1
    awayPlayers = match_dict['awayPlayers']
    for i in range(len(awayPlayers)):
        awayPlayers[i]['teamOptaId'] = match_dict['awayOptaId']
        awayPlayers[i]['teamOptaUuid'] = match_dict['awayOptaUuid']
        awayPlayers[i]['teamSsiId'] = match_dict['awaySsiId']
        awayPlayers[i]['opta_game_id'] = match_dict['optaId']
        awayPlayers[i]['ss_game_id'] = match_dict['ssiId']
        players_match_dict[j] = awayPlayers[i]
        j += 1
        
    homePlayers_list.append(homePlayers)
    awayPlayers_list.append(awayPlayers)

In [9]:
players_match_df = pd.DataFrame.from_dict(players_match_dict, orient='index')

### Table named "Player" 

In [10]:
players_df = \
players_match_df[[
    # 'number',
    'name',
    'ssiId',
    'teamSsiId',
    'optaId',
    'optaUuid'
]].drop_duplicates(subset=['ssiId'],keep='last')

### Table named "Team" 

In [11]:
teams_df = \
players_match_df[[
    'teamOptaId',
    'teamOptaUuid',
    'teamSsiId'
]].drop_duplicates()

### Table named "Players_Match_Meta" 

In [12]:
players_match_meta_df = \
players_match_df[[
    'position',
    'ssiId',
    'teamSsiId',
    'opta_game_id',
    'ss_game_id'
]]

## Event tracking for every player and the ball

In [14]:
import re
import xml.etree.ElementTree as ET

In [15]:
def transform_tracking_data(xml_file):
    # Get the opta id extracted from file     
    game_opta_id = re.findall(r'\d+',xml_file.split('/')[2].split('_')[0])[0]
    
    # Load XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    tracking_players_dict = {}
    tracking_ball_dict = {}
    i = 0
    j = 0
    # Loop over periods
    for period in root.findall('period'):
        period_num = period.get('number')

        # Loop over frames
        for frame in period.findall('frame'):
            time = frame.get('time')
            possession = frame.get('possession')
            wallclock = frame.get('wall_clock')
            live = frame.get('live')

            # Loop over players in the frame
            for player in frame.findall('player'):
                player_id = player.get('id')
                player_num = player.get('num')
                loc = player.get('loc')
                loc_list = eval(loc)
                loc_x = loc_list[0]
                loc_y = loc_list[1]
                loc_z = loc_list[2]
                spd = player.get('spd')
                dist = player.get('dist')
                opta_id = player.get('opta_id')
                tracking_players_dict[i] = {
                    'game_id' : game_opta_id,
                    'period' : period_num,
                    'time' : time,
                    'wallclock':wallclock,
                    'live':live,
                    'possession' : possession,
                    'player_id' : player_id,
                    'player_name': player_num,
                    # 'loc' : loc_list,
                    'loc_x':loc_x,
                    'loc_y':loc_y,
                    'loc_z':loc_z,
                    'spd' : spd,
                    'dist' : dist,
                    'opta_id' : opta_id
                }
                i+=1

            for ball in frame.findall('ball'):
                loc = ball.get('loc')
                loc_list = eval(loc)
                loc_x = loc_list[0]
                loc_y = loc_list[1]
                loc_z = loc_list[2]
                spd = ball.get('spd')
                dist = ball.get('dist') 
                tracking_ball_dict[j] = {
                    'game_id' : game_opta_id,
                    'period' : period_num,
                    'time' : time,
                    'wallclock':wallclock,
                    'live':live,
                    'possession' : possession,
                    # 'loc' : loc_list,
                    'loc_x':loc_x,
                    'loc_y':loc_y,
                    'loc_z':loc_z,
                    'spd' : spd,
                    'dist' : dist
                }
                j+=1
    tracking_players_df = pd.DataFrame.from_dict(tracking_players_dict, orient='index')  
    tracking_ball_df = pd.DataFrame.from_dict(tracking_ball_dict, orient='index')   
    return({
        'tracking_players':tracking_players_dict,
        'tracking_ball':tracking_ball_dict
    })

In [16]:
# Define the wildcard pattern to match the file names
xml_pattern = '../data/*_SecondSpectrum_tracking-produced.xml'
# Use glob to find all files that match the pattern
xml_file_list = glob.glob(xml_pattern)

In [17]:
tracking_data_dict = {}

In [18]:
# Load and extract data from the xml files
for i,xml_file in enumerate(xml_file_list):
    if i not in tracking_data_dict.keys():
        tracking_data = transform_tracking_data(xml_file)
        tracking_data_dict[i] = tracking_data
        print(f'Xml file {xml_file} completed')

Xml file ../data/g2312183_SecondSpectrum_tracking-produced.xml completed
Xml file ../data/g2312166_SecondSpectrum_tracking-produced.xml completed
Xml file ../data/g2312152_SecondSpectrum_tracking-produced.xml completed
Xml file ../data/g2312135_SecondSpectrum_tracking-produced.xml completed
Xml file ../data/g2312213_SecondSpectrum_tracking-produced.xml completed


In [37]:
tracking_data_dict_str = {str(key):value for key,value in tracking_data_dict.items()}

In [52]:
# Extract the tracking of the game into a list of dataframes per game
players_list_df = [pd.DataFrame.from_dict(tracking_data_dict[i]['tracking_players'],orient='index') for i,data_dict in enumerate(tracking_data_dict)]
ball_list_df = [pd.DataFrame.from_dict(tracking_data_dict[i]['tracking_ball'],orient='index') for i,data_dict in enumerate(tracking_data_dict)]

IOStream.flush timed out


In [59]:
# Transform a list of dataframes extraced for each game to an overall dataframe regarding each player and ball tracking
tracking_players_df = pd.concat(players_list_df)
tracking_ball_df = pd.concat(ball_list_df)

## Physical summary for the team and player

In [72]:
def transform_physical_summary(csv_file):
    game_opta_id = re.findall(r'\d+',csv_file.split('/')[2].split('_')[0])[0]

    data = []
    with open(csv_file, "r") as f:
        lines = f.readlines()
        rows = []
        for line in lines:
            if line.strip() == "":
                # Empty line indicates a new data source
                if rows:
                    # Create a new dataframe and append it to the list
                    df = pd.DataFrame(rows[1:], columns=rows[0])
                    data.append(df)
                    rows = []
            else:
                rows.append([cell.strip() for cell in line.split(",")])

        # Create the last dataframe and append it to the list
        if rows:
            df = pd.DataFrame(rows[1:], columns=rows[0])
            data.append(df)
            
    ### Summary of events in a match (game_id)
    df_event_summary = \
    pd.DataFrame(
        data = [record[1:] for record in data[1].values],
        columns= ['Event'] + list(data[1].columns[2:])
    )
    df_event_summary['Game_id'] = game_opta_id
    
    ### Summary of events for a every player(player_id) in a match(game_id)
    # Clean the name of players
    data[2]['Player'] = data[2]['Player'].str.replace('"','') 
    data[3]['Player'] = data[3]['Player'].str.replace('"','') 
    # Clean the column names
    data[2].columns = [col.replace('"','') for col in list(data[2].columns)]
    data[3].columns = [col.replace('"','') for col in list(data[3].columns)]
    # Determine the Home and Away teams     
    data[2]['Stadium'] = 'Home'
    data[3]['Stadium'] = 'Away'

    players_physical_summary_df = pd.concat([data[2],data[3]])
    players_physical_summary_df['Game ID'] = game_opta_id
    
    return ({
        'players_physical_summary':players_physical_summary_df,
        'overall_physical_summary':df_event_summary
    })

### Load all the meta data from every match as a dictionary object

In [67]:
# Define the wildcard pattern to match the file names
pattern = '../data/*_SecondSpectrum_physical-summary.csv'
# Use glob to find all files that match the pattern
file_list = glob.glob(pattern)

In [69]:
file_list

['../data/g2312135_SecondSpectrum_physical-summary.csv',
 '../data/g2312183_SecondSpectrum_physical-summary.csv',
 '../data/g2312166_SecondSpectrum_physical-summary.csv',
 '../data/g2312152_SecondSpectrum_physical-summary.csv',
 '../data/g2312213_SecondSpectrum_physical-summary.csv']

In [73]:
physical_summary = [transform_physical_summary(csv_file) for csv_file in file_list]

In [82]:
players_physical_summary_df = pd.concat([summary['players_physical_summary'] for summary in physical_summary]).reset_index(drop=True)
overall_physical_summary_df = pd.concat([summary['overall_physical_summary'] for summary in physical_summary]).reset_index(drop=True)

## Physical Splits for Teams and Players

### Load all the requires csv files 

In [91]:
# Define the wildcard pattern to match the file names
pattern = '../data/*_SecondSpectrum_physical-splits.csv'
# Use glob to find all files that match the pattern
file_list = glob.glob(pattern)

### Load physical splits details for each team in aggregate level per 5 minutes

In [92]:
def load_physical_df(df,physical_attrs):
    result_df = df.copy()
    for values in physical_attrs:
        result_df[values[0].replace('"','')] = values[1:]
    return result_df
    return result_df[result_df['Minute Splits']!=0]

In [98]:
def get_players_physical_splits(team_players,team_code,team_name,minutes_col,minutes_val,half_time,game_opta_id):    
    player_df_list = []
    for player in team_players:
        # get the profile attributes of a player         
        player_attr = player[0][0].split("(")
        player_name = player_attr[0].replace('"','').strip()
        try:
            player_id =  re.findall(r'\d+',player_attr[1])[0]
        except:
            player_id = None
        home_players_attrs = player[1:]
        home_players_attrs_df = pd.DataFrame()
        
        player_df = load_physical_df(home_players_attrs_df,home_players_attrs)
        player_df[minutes_col] = minutes_val
        player_df['player_name'] = player_name
        player_df['player_id'] = player_id
        player_df['game_opta_id'] = game_opta_id
        player_df['team_id'] = team_code
        player_df['team_name'] = team_name
        player_df['half'] = half_time
        player_df_list.append(player_df[player_df['Minute Splits']!=0])
    return(pd.concat(player_df_list))

In [102]:
def get_team_physical_splits(result_data,select_team=0,game_opta_id=None):
    team_splits = np.array([1 if 'Minute Splits' in record[0][0] else 0 for record in result_data])
    team_split_index = np.where(team_splits==1)[0][select_team]
    team_split = np.where(team_splits==1)[0]
    
    minutes_split_col = result_data[team_split_index][0][0].replace('"','')
    minutes_split_val = np.array([int(num.replace('','0')) if num=='' else int(num) for num in result_data[team_split_index][0][1:]])
    
    # Identidy the values for the first and second half
    idx = np.where(minutes_split_val == 0)[0][0]
    # create a new array of the same shape as arr with all values set to 0
    half = np.zeros(minutes_split_val.shape, dtype=int)
    # set values before 0 to 1, and values after 0 to 2
    half[:idx] = 1
    half[idx+1:] = 2
    
    team_attr = result_data[team_split_index][1][0].split('(')
    team_attr_name = team_attr[0].strip().replace('"','')
    try:
        team_attr_code = re.findall(r'\d+',team_attr[1])[0]
    except:
        team_attr_code = None
    
    physical_attrs = result_data[team_split_index][2:]
    physical_attrs_df = pd.DataFrame()
    physical_attrs_df[minutes_split_col] = minutes_split_val
    physical_attrs_df['team_id'] = team_attr_code
    physical_attrs_df['team_name'] = team_attr_name
    physical_attrs_df['half'] = half
    physical_attrs_df['game_opta_id'] = game_opta_id

    # get the overall physical attribute of a team
    team_physical_attrs_df = load_physical_df(physical_attrs_df,physical_attrs)
    team_physical_attrs_df = team_physical_attrs_df[team_physical_attrs_df['Minute Splits']!=0]
    
    # get the attributes of the team's players
    if select_team == 0:
        # import pdb;pdb.set_trace()
        team_players = result_data[team_split[0]+1:team_split[1]]
    else:
        team_players = result_data[team_split[1]+1:]
        
    players_physical_attrs_df = \
    get_players_physical_splits(
        team_players=team_players,
        team_code=team_attr_code,
        team_name=team_attr_name,
        minutes_col=minutes_split_col,
        minutes_val=minutes_split_val,
        half_time = half,
        game_opta_id=game_opta_id
    )
    return (team_physical_attrs_df,players_physical_attrs_df)

In [103]:
def get_physical_dataframe(file_list):
    total_team_physical_attrs_list = []
    total_players_plhysical_attrs_list = []
    for csv_file in file_list:
        game_opta_id = re.findall(r'\d+',csv_file.split('/')[2].split('_')[0])[0]
        # print(game_opta_id)
        result_data = []
        with open(csv_file, "r") as f:
            lines = f.readlines()
            rows = []
            for line in lines:
                if line.strip() == "":
                    # Empty line indicates a new data source
                    if rows:
                        result_data.append(rows)
                        rows = []
                else:
                    rows.append([cell.strip() for cell in line.split(",")])

        team_splits = np.array([1 if 'Minute Splits' in record[0][0] else 0 for record in result_data])
        first_team_physical_attrs_df,first_players_plhysical_attrs_df = get_team_physical_splits(result_data,select_team=0,game_opta_id=game_opta_id)
        second_team_physical_attrs_df,second_players_physical_attrs_df = get_team_physical_splits(result_data,select_team=1,game_opta_id=game_opta_id)

        total_team_physical_attrs_df = pd.concat([first_team_physical_attrs_df,second_team_physical_attrs_df])
        total_players_plhysical_attrs_df = pd.concat([first_players_plhysical_attrs_df,second_players_physical_attrs_df])

        total_team_physical_attrs_list.append(total_team_physical_attrs_df)
        total_players_plhysical_attrs_list.append(total_players_plhysical_attrs_df)
    return(
        {
            'Team':pd.concat(total_team_physical_attrs_list),
            'Players':pd.concat(total_players_plhysical_attrs_list)
        }
    )

In [104]:
physical_data = get_physical_dataframe(file_list)

## Export all the data into several dataframes

In [65]:
# directory of the exported data
directory = '../final_data/'

In [66]:
# match reviews
match_review_df.to_csv(directory+'match_review.csv',index=False)

# meta data for every player in every match
players_match_meta_df.to_csv(directory+'players_match_meta.csv',index=False)

# meta data for every team
teams_df.to_csv(directory+'team.csv',index=False)

# meta data for every player 
players_df.to_csv(directory+'player.csv',index=False)

# tracking of players
tracking_players_df.to_csv(directory+'tracking_player.csv',index=False)

# tracking of ball
tracking_ball_df.to_csv(directory+'tracking_ball.csv',index=False)

In [90]:
# Physical summary of players
players_physical_summary_df.to_csv(directory+'players_physical_summary.csv',index=False)

# Overall Physical summary
overall_physical_summary_df.to_csv(directory+'overall_physical_summary.csv',index=False)

In [105]:
# Physical splits of teams
physical_data['Team'].to_csv(directory+'team_physical_splits.csv',index=False)

# Physical splits of players
physical_data['Players'].to_csv(directory+'player_physical_splits.csv',index=False)