In [28]:
import pandas as pd
import numpy as np
import json

## Features

In [29]:
#open desired match dataset as dataframe
match_id = 3753974
match_file = f"Premier_League_1516/{match_id}.json"
# be explicit about the encoding so team names with non-ASCII characters load the same way on every platform
with open(match_file, 'r', encoding='utf-8') as file:
    data = json.load(file)
df_match = pd.json_normalize(data)
# set_index returns a new frame; the result must be kept, otherwise the call has no effect
df_match = df_match.set_index('index')

#convert event time to seconds since start
df_match['event_time'] = df_match['minute'] * 60 + df_match['second']

#split by first/second half
df_first = df_match[df_match['period']==1]
df_second = df_match[df_match['period']==2]

# boundaries of each half, taken from the first/last event row of that half (positional, so
# they do not depend on the index labels); cast to plain int for use in range()
first_half_end = int(df_first['event_time'].iloc[-1])
second_half_start = int(df_second['event_time'].iloc[0])
second_half_end = int(df_second['event_time'].iloc[-1])


# read in json file for entire competition (all matches)
with open("matches_2_27.json", 'r', encoding='utf-8') as competition_file:
    competition_data = json.load(competition_file)
# one row per match, looked up by match_id
df_competition = (
    pd.json_normalize(competition_data)
    .set_index('match_id')
    .sort_index()
)

#identify home team and away team
home_team = df_competition.loc[match_id,'home_team.home_team_name']
away_team = df_competition.loc[match_id,'away_team.away_team_name']

In [30]:
# Per-second skeleton frames: one 'event_time' row for every second of each half.
# The first half is counted from second 0; the second half from its first recorded event.
first_time_series = pd.Series(range(0, first_half_end + 1), name='event_time').to_frame()
second_time_series = pd.Series(range(second_half_start, second_half_end + 1), name='event_time').to_frame()

In [31]:
# Add possession data
def add_possession_data(df, time_series):
    """Attach running possession totals for both sides to a per-second frame.

    Event ``duration`` values are summed per (second, possession team) and then
    accumulated over the rows of ``df``. The column belonging to the
    module-level ``home_team`` is named ``cumulative_possession_home``; every
    other team's column is named ``cumulative_possession_away``.

    Parameters
    ----------
    df : events of one half; needs 'possession_team.name', 'duration' and
        'event_time' columns.
    time_series : frame with an 'event_time' column to extend.

    Returns
    -------
    A new frame: ``time_series`` left-merged with the running totals and
    forward-filled, so seconds without events repeat the previous value.
    Seconds before the first event stay NaN.
    """
    # duration summed per second, one column per team (0 where a team has no event that second)
    per_second = (
        df[['possession_team.name', 'duration', 'event_time']]
        .groupby(['event_time', 'possession_team.name'])['duration']
        .sum()
        .unstack(fill_value=0)
    )

    # relabel team-name columns as home/away
    per_second.columns = [
        'cumulative_possession_away' if team_name != home_team else 'cumulative_possession_home'
        for team_name in per_second.columns
    ]
    running_total = per_second.cumsum()

    # left merge keeps every second of the half; ffill carries the last known total forward
    merged = time_series.merge(running_total, how='left', on='event_time')
    return merged.ffill()

# extend each half's per-second frame with its own cumulative possession columns
first_time_series, second_time_series = (
    add_possession_data(df_first, first_time_series),
    add_possession_data(df_second, second_time_series),
)


In [32]:
# Add pass data
def add_pass_data(df, time_series):
    """Attach running pass statistics for both sides to a per-second frame.

    For each of the module-level ``home_team`` and ``away_team``, the 'Pass'
    events recorded during that team's possession are accumulated in row order.
    Three columns per side are merged in (``<side>`` is 'home' or 'away'):

    * ``avg_pass_length_<side>``   - running mean of 'pass.length'
    * ``avg_pass_duration_<side>`` - running mean of 'duration'
    * ``cumulative_num_passes_<side>`` - number of passes so far

    All values are relative to the start of ``df`` (i.e. of that half).

    Parameters
    ----------
    df : events of one half; needs 'type.name', 'possession_team.name',
        'pass.length', 'duration' and 'event_time' columns.
    time_series : frame with an 'event_time' column to extend.

    Returns
    -------
    A new frame with the six columns added and forward-filled; seconds before
    a side's first pass stay NaN.
    """
    all_passes = df.loc[df['type.name'] == 'Pass']

    for team in (home_team, away_team):
        label = 'away' if team != home_team else 'home'
        length_col = f'avg_pass_length_{label}'
        duration_col = f'avg_pass_duration_{label}'
        count_col = f'cumulative_num_passes_{label}'

        own = all_passes.loc[all_passes['possession_team.name'] == team]

        # running totals after each pass, in row order
        running = pd.DataFrame({
            'event_time': own['event_time'].to_numpy(),
            'total_length': own['pass.length'].cumsum().to_numpy(),
            'total_duration': own['duration'].cumsum().to_numpy(),
            count_col: range(1, len(own) + 1),
        })

        # several passes can share a second: keep the state after the last one
        latest = running.groupby('event_time').last()
        latest[length_col] = latest['total_length'] / latest[count_col]
        latest[duration_col] = latest['total_duration'] / latest[count_col]

        # left merge keeps every second; ffill repeats the last known values
        time_series = time_series.merge(
            latest[[length_col, duration_col, count_col]],
            how='left',
            on='event_time',
        ).ffill()
    return time_series

# extend each half's per-second frame with its own running pass statistics
first_time_series, second_time_series = (
    add_pass_data(df_first, first_time_series),
    add_pass_data(df_second, second_time_series),
)


In [33]:
# Add shots data
def add_shots_data(df, time_series):
    """Attach the running number of shots attempted by each side.

    For each of the module-level ``home_team`` and ``away_team``, 'Shot' events
    recorded during that team's possession are numbered in row order and merged
    in as ``cumulative_shots_attempted_home`` / ``cumulative_shots_attempted_away``.

    Parameters
    ----------
    df : events of one half; needs 'type.name', 'possession_team.name' and
        'event_time' columns.
    time_series : frame with an 'event_time' column to extend.

    Returns
    -------
    A new frame with the two columns added and forward-filled. It has exactly
    one row per row of ``time_series``; seconds before a side's first shot
    stay NaN.
    """
    shots = df[df['type.name'] == 'Shot']
    for team in [home_team, away_team]:
        label = 'home' if team==home_team else 'away'
        column = f'cumulative_shots_attempted_{label}'

        team_shot = shots[shots['possession_team.name'] == team].copy()
        team_shot[column] = range(1, len(team_shot) + 1)

        # Collapse to one row per second before merging: a left merge against several
        # shots sharing the same second would otherwise duplicate that second's row.
        # 'last' keeps the count after the final shot of the second.
        per_second = team_shot.groupby('event_time')[[column]].last()

        time_series = time_series.merge(per_second, how = 'left', on = 'event_time').ffill()
    return time_series

# extend each half's per-second frame with its own running shot counts
first_time_series, second_time_series = (
    add_shots_data(df_first, first_time_series),
    add_shots_data(df_second, second_time_series),
)

In [34]:
# Add location data

def add_location_data (df, time_series, center=(60, 40)):
    """Attach the most recent event location during each side's possession.

    For each of the module-level ``home_team`` and ``away_team``, the last
    non-null 'location' recorded in every second of that team's possession is
    split into ``location_x_<side>`` / ``location_y_<side>`` (``<side>`` is
    'home' or 'away'), merged in and forward-filled.

    Parameters
    ----------
    df : events of one half; needs 'possession_team.name', 'event_time' and
        'location' columns. A location is used only when it is a list; its
        first two entries are taken as x and y.
    time_series : frame with an 'event_time' column to extend.
    center : (x, y) pair used for seconds before the first located event of a
        side. Defaults to (60, 40), the values previously hard-coded here.

    Returns
    -------
    A new frame with four float columns added and no missing values in them.
    A side without any event in ``df`` gets ``center`` for every second.
    """
    # NOTE(review): rows are selected by possession team, so events performed by the
    # opposing side during that possession are included too - confirm that their
    # coordinates are expressed in the same frame of reference.
    for team in [home_team, away_team]:
        label = 'home' if team==home_team else 'away'
        x_col = f'location_x_{label}'
        y_col = f'location_y_{label}'

        df_team = df[df['possession_team.name'] == team]
        # last non-null location per second ('last' skips missing values)
        last_location = df_team.groupby('event_time')['location'].last()

        #separate into x and y; anything that is not a list counts as "no location"
        pairs = [loc[:2] if isinstance(loc, list) else [None, None] for loc in last_location]
        # built column by column so that an empty selection still yields both columns
        df_location = pd.DataFrame(
            {x_col: [pair[0] for pair in pairs], y_col: [pair[1] for pair in pairs]},
            index=last_location.index,
        ).astype(float)

        time_series = time_series.merge(df_location, how='left', on='event_time').ffill()
        #fill initial seconds (before first event with location data) with field center
        time_series[x_col] = time_series[x_col].fillna(center[0])
        time_series[y_col] = time_series[y_col].fillna(center[1])
    return time_series


# extend each half's per-second frame with its own location columns
first_time_series, second_time_series = (
    add_location_data(df_first, first_time_series),
    add_location_data(df_second, second_time_series),
)

In [None]:
# Replace every remaining missing value in both per-second frames with 0 (in place).
for half_series in (first_time_series, second_time_series):
    half_series.fillna(0, inplace=True)

In [36]:
# shots whose recorded outcome is a goal
is_goal = (df_match['type.name'] == 'Shot') & (df_match['shot.outcome.name'] == 'Goal')
goals = df_match[is_goal]

# row selectors: which half the goal fell in, and which side was in possession
in_first_half = goals['period'] == 1
in_second_half = goals['period'] == 2
scored_by_home = goals['possession_team.name'] == home_team
scored_by_away = goals['possession_team.name'] == away_team

# goal seconds per side and half, as numpy arrays in row order
home_goals_first = goals.loc[in_first_half & scored_by_home, 'event_time'].to_numpy()
home_goals_second = goals.loc[in_second_half & scored_by_home, 'event_time'].to_numpy()
away_goals_first = goals.loc[in_first_half & scored_by_away, 'event_time'].to_numpy()
away_goals_second = goals.loc[in_second_half & scored_by_away, 'event_time'].to_numpy()

def get_time_to_goal (time_series, period, team):
    """Add a ``time_to_<team>_goal`` column: seconds until that side's next goal.

    The goal seconds come from the module-level arrays ``home_goals_first``,
    ``away_goals_first``, ``home_goals_second`` and ``away_goals_second``.

    Parameters
    ----------
    time_series : frame with an 'event_time' column; modified in place.
    period : 1 selects the first-half arrays, any other value the second-half ones.
    team : 'home' selects the home arrays, any other value the away ones; the
        value is also used verbatim in the new column's name.

    Returns
    -------
    The same ``time_series`` object. A row at the exact second of a goal gets
    0; rows after the last goal (or all rows when there is none) get NaN.

    Also prints the goal seconds that were used.
    """
    wants_home = team == 'home'
    if period == 1:
        goals = home_goals_first if wants_home else away_goals_first
    else:
        goals = home_goals_second if wants_home else away_goals_second
    print(f'Period {period} {team} goal seconds: {goals}')

    seconds = time_series["event_time"].to_numpy()

    # position of the first goal at or after each second
    # (np.searchsorted expects the goal array to be in ascending order)
    upcoming_pos = np.searchsorted(goals, seconds, side = 'left')
    has_upcoming = upcoming_pos < len(goals)

    # NaN wherever no further goal follows
    upcoming_goal = np.full(seconds.shape, np.nan)
    upcoming_goal[has_upcoming] = goals[upcoming_pos[has_upcoming]]

    time_series[f'time_to_{team}_goal'] = upcoming_goal - seconds
    return time_series

# add both time-to-goal columns to each half; the inner call runs first, so the
# home column (and its printout) comes before the away one
first_time_series = get_time_to_goal(get_time_to_goal(first_time_series, 1, 'home'), 1, 'away')
second_time_series = get_time_to_goal(get_time_to_goal(second_time_series, 2, 'home'), 2, 'away')

Period 1 home goal seconds: []
Period 1 away goal seconds: []
Period 2 home goal seconds: []
Period 2 away goal seconds: [5566]


In [37]:
# preview: first ten seconds of the first half
first_time_series.head(10)

Unnamed: 0,event_time,cumulative_possession_away,cumulative_possession_home,avg_pass_length_home,avg_pass_duration_home,cumulative_num_passes_home,avg_pass_length_away,avg_pass_duration_away,cumulative_num_passes_away,cumulative_shots_attempted_home,cumulative_shots_attempted_away,location_x_home,location_y_home,location_x_away,location_y_away,time_to_home_goal,time_to_away_goal
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.9,60.6,60.0,40.0,,
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.9,60.6,60.0,40.0,,
2,2,0.828777,0.0,0.0,0.0,0.0,2.501999,0.828777,1.0,0.0,0.0,32.9,60.6,61.0,40.1,,
3,3,3.372085,0.0,0.0,0.0,0.0,16.075388,1.686043,2.0,0.0,0.0,32.9,60.6,62.0,43.1,,
4,4,3.372085,0.0,0.0,0.0,0.0,16.075388,1.686043,2.0,0.0,0.0,32.9,60.6,62.0,43.1,,
5,5,3.372085,0.0,0.0,0.0,0.0,16.075388,1.686043,2.0,0.0,0.0,32.9,60.6,62.0,43.1,,
6,6,5.953399,0.0,0.0,0.0,0.0,19.000778,1.738202,3.0,0.0,0.0,32.9,60.6,63.7,13.5,,
7,7,5.953399,0.0,0.0,0.0,0.0,19.000778,1.738202,3.0,0.0,0.0,32.9,60.6,63.7,13.5,,
8,8,7.262287,0.0,0.0,0.0,0.0,17.832221,1.630873,4.0,0.0,0.0,32.9,60.6,38.9,15.1,,
9,9,7.262287,0.0,0.0,0.0,0.0,17.832221,1.630873,4.0,0.0,0.0,32.9,60.6,38.9,15.1,,


In [39]:
# preview: ten-row window of the second half starting at row position 2860
second_time_series.iloc[2860:].head(10)

Unnamed: 0,event_time,cumulative_possession_away,cumulative_possession_home,avg_pass_length_home,avg_pass_duration_home,cumulative_num_passes_home,avg_pass_length_away,avg_pass_duration_away,cumulative_num_passes_away,cumulative_shots_attempted_home,cumulative_shots_attempted_away,location_x_home,location_y_home,location_x_away,location_y_away,time_to_home_goal,time_to_away_goal
2860,5560,873.03831,812.27788,21.139106,1.648989,232.0,19.160553,1.584786,230.0,7.0,12.0,99.9,54.1,12.9,33.8,,6.0
2861,5561,873.03831,812.27788,21.139106,1.648989,232.0,19.160553,1.584786,230.0,7.0,12.0,99.9,54.1,12.9,33.8,,5.0
2862,5562,873.03831,812.27788,21.139106,1.648989,232.0,19.160553,1.584786,230.0,7.0,12.0,99.9,54.1,12.9,33.8,,4.0
2863,5563,873.03831,812.27788,21.139106,1.648989,232.0,19.160553,1.584786,230.0,7.0,12.0,99.9,54.1,12.9,33.8,,3.0
2864,5564,874.635891,812.27788,21.139106,1.648989,232.0,19.245838,1.584841,231.0,7.0,12.0,99.9,54.1,120.0,80.0,,2.0
2865,5565,874.635891,812.27788,21.139106,1.648989,232.0,19.245838,1.584841,231.0,7.0,12.0,99.9,54.1,4.6,38.7,,1.0
2866,5566,875.932733,812.27788,21.139106,1.648989,232.0,19.245838,1.584841,231.0,7.0,13.0,99.9,54.1,104.0,44.1,,0.0
2867,5567,875.932733,812.27788,21.139106,1.648989,232.0,19.245838,1.584841,231.0,7.0,13.0,99.9,54.1,6.8,38.2,,
2868,5568,875.932733,812.27788,21.139106,1.648989,232.0,19.245838,1.584841,231.0,7.0,13.0,99.9,54.1,6.8,38.2,,
2869,5569,875.932733,812.27788,21.139106,1.648989,232.0,19.245838,1.584841,231.0,7.0,13.0,99.9,54.1,6.8,38.2,,


In [40]:
# write one CSV per half, without the positional row index
csv_outputs = {
    f'{match_id}_first_time_series.csv': first_time_series,
    f'{match_id}_second_time_series.csv': second_time_series,
}
for csv_path, half_series in csv_outputs.items():
    half_series.to_csv(csv_path, index=False)