In [None]:
import numpy as np
import pandas as pd
import ast
from numpy.lib.stride_tricks import sliding_window_view

# Turn Raw Events into Processed Events

In [None]:
# Read the raw event export
df = pd.read_csv('data/all_events.csv')

# Fractional-minute timestamp: whole minutes plus seconds expressed as a fraction of a minute
whole_minutes = df['minute'].astype(int)
whole_seconds = df['second'].astype(int)
df['ts_minute'] = whole_minutes + whole_seconds / 60
# Order events by match, then by the derived timestamp, then by the raw timestamp column
df = df.sort_values(by=['match_id', 'ts_minute', 'timestamp'], ascending=True).reset_index(drop=True)

# Season labels: 2021 by default, overridden for match ids at or above each cut-off.
# Cut-offs are applied in ascending order so the highest matching one wins.
df['season'] = 2021
for first_match_id, season in ((3829249, 2022), (3881483, 2023)):
    df.loc[df['match_id'] >= first_match_id, 'season'] = season

# Location labels
# Parse the location strings into Python lists; rows without a location are left untouched
has_location = df['location'].notnull()
df.loc[has_location, 'location'] = df.loc[has_location, 'location'].apply(ast.literal_eval)
# Label events with zones based on locations. 5 horizontal zones and 7 vertical zones.
horizontal_lines = np.linspace(0, 80, 6)  # 5 zones = 6 lines
vertical_lines = np.linspace(0, 120, 8)   # 7 zones = 8 lines


def _zone_index(value, edges):
    """Return the 0-based zone of `value` within `edges`, or None when it lies outside them.

    Zones are half-open [edge_i, edge_{i+1}) except the last one, which also
    includes the final edge so that points exactly on the far boundary are kept.
    """
    # NaN fails both comparisons and is therefore reported as out of bounds.
    if not (edges[0] <= value <= edges[-1]):
        return None
    last_zone = len(edges) - 2
    # np.digitize returns len(edges) for value == edges[-1]; clamp that into the last zone.
    return min(int(np.digitize(value, edges, right=False)) - 1, last_zone)


def assign_location_label(location):
    """
    Map an [x, y, ...] location to a zone label of the form "Zone <row>-<column>".

    Parameters:
      location (list): Coordinates; only the first two entries (x, y) are used.

    Returns:
      str or float: "Zone h-v" where h (1-based) is the zone of y along
      `horizontal_lines` and v (1-based) is the zone of x along `vertical_lines`.
      np.nan when `location` is not a list, has fewer than two entries, or
      falls outside the outermost lines.
    """
    if not isinstance(location, list) or len(location) < 2:
        return np.nan  # Invalid or missing location

    x, y = location[:2]
    horizontal_zone = _zone_index(y, horizontal_lines)
    vertical_zone = _zone_index(x, vertical_lines)
    if horizontal_zone is None or vertical_zone is None:
        return np.nan  # Out of bounds

    # Create labels like "Zone 3-2"
    return f"Zone {horizontal_zone + 1}-{vertical_zone + 1}"
# Label every event with its zone; assign_location_label yields NaN for missing or unusable locations
df['zone'] = df['location'].apply(assign_location_label)

In [None]:
# Save data (uncomment to write data/processed_events.csv, the file the next section reads back in)
# df.to_csv('data/processed_events.csv', index=False)

# Turn Processed Events into Actions

In [None]:
# Reload the processed events from disk and keep only the rows that carry a player id
events = pd.read_csv('data/processed_events.csv')
has_player = events['player_id'].notnull()
actions = events.loc[has_player].reset_index(drop=True)

In [None]:
# Keep only the event types that are treated as actions
action_types = ["Pass", "Carry", "Clearance", "Block", "50/50", "Pressure", "Interception", "Shot", "Foul Committed", "Goal Keeper", "Shield", "Offside", "Own Goal Against"]
is_action = actions['type'].isin(action_types)
actions = actions.loc[is_action].reset_index(drop=True)

# End time: start time plus duration / 60 (same unit as ts_minute)
actions['ts_end'] = actions['ts_minute'] + actions['duration'] / 60

# Defaults for the derived columns before any type-specific value is overlaid
actions['end_location'] = actions['location']
actions['body_part'] = 'Undefined'
actions['outcome'] = 'Undefined'

# For each target column, overlay the listed source columns wherever they are populated.
# Order matters: a later source overwrites an earlier one on rows where both are set.
# The final entry replaces the event type with the card type on fouls that carry a card.
overlay_sources = {
    'end_location': ['pass_end_location', 'carry_end_location', 'goalkeeper_end_location', 'shot_end_location'],
    'body_part': ['pass_body_part', 'shot_body_part', 'clearance_body_part', 'goalkeeper_body_part'],
    'outcome': ['pass_outcome', 'duel_outcome', 'dribble_outcome', 'interception_outcome', 'shot_outcome', 'goalkeeper_outcome'],
    'type': ['foul_committed_card'],
}
for target, sources in overlay_sources.items():
    for source in sources:
        populated = actions[source].notnull()
        actions.loc[populated, target] = actions[source]

# Missing under_pressure means the action was not under pressure
actions['under_pressure'] = actions['under_pressure'].fillna(False)

# Parse the location strings into Python lists
location_cols = ['location', 'end_location']
for col in location_cols:
    has_value = actions[col].notnull()
    actions.loc[has_value, col] = actions.loc[has_value, col].apply(ast.literal_eval)

# Drop actions without a start or end location, then split the first two coordinates into x / y columns
actions = actions.dropna(subset=['location', 'end_location']).reset_index(drop=True)
for col in location_cols:
    expanded = pd.DataFrame(actions[col].tolist(), index=actions.index)
    actions[f'{col}_x'], actions[f'{col}_y'] = expanded[0], expanded[1]

# Narrow down to the feature base and discard rows with any missing value in it
feature_base = ['id', 'match_id', 'play_pattern', 'possession', 'possession_team', 'team', 'player', 'position', 'ts_minute', 'ts_end', 'location_x','location_y', 'end_location_x', 'end_location_y',  'type', 'body_part', 'outcome', 'under_pressure']
actions = actions[feature_base].dropna().reset_index(drop=True)

In [None]:
# Add in temporary goal events: one synthetic 'Goal' row for every shot whose outcome is 'Goal'
scored_shot = (actions['type'] == 'Shot') & (actions['outcome'] == 'Goal')
goal_events = actions.loc[scored_shot].copy()

# The goal row starts 0.1 (in ts_minute units) after the shot ends, has zero length,
# and is placed where the shot ended
goal_time = goal_events['ts_end'] + 0.1
goal_events = goal_events.assign(
    type='Goal',
    ts_minute=goal_time,
    ts_end=goal_time,
    location_x=goal_events['end_location_x'],
    location_y=goal_events['end_location_y'],
)

# Append the goal rows and restore the per-match time ordering
with_goals = pd.concat([actions, goal_events], ignore_index=True)
actions = with_goals.sort_values(by=['match_id', 'ts_minute', 'ts_end']).reset_index(drop=True)

In [None]:
def add_score_columns(df):
    """
    Add running score columns for both teams based on goals and own goals,
    handling multiple matches separately.

    The running totals are cumulative sums taken in row order, so rows must
    already be in chronological order within each match. A row of type 'Goal'
    or 'Own Goal Against' is itself included in the total it reports.

    Matches are processed with an explicit loop rather than
    ``groupby(...).apply(...)``: applying a function that relies on the grouping
    column is deprecated since pandas 2.2, and this form keeps ``match_id`` in
    the result on every pandas version.

    Parameters:
      df (pandas.DataFrame): DataFrame containing match actions; reads the
        'match_id', 'team' and 'type' columns.

    Returns:
    pandas.DataFrame: Copy of the input (same row order and index; rows whose
      match_id is missing are dropped) with added columns:
        - team_score: Running score for the team performing the action
        - opponent_score: Running score for the opposing team

    Raises:
      ValueError: If a match does not contain exactly two distinct teams.
    """
    df = df.copy()

    def process_match(match_df):
        match_df = match_df.copy()
        # Identify the two teams for this match
        teams = match_df['team'].unique()
        if len(teams) != 2:
            raise ValueError(f"Expected exactly 2 teams per match; got {teams}")
        team1, team2 = teams

        by_team1 = match_df['team'] == team1
        by_team2 = match_df['team'] == team2
        is_goal = match_df['type'] == 'Goal'
        is_own_goal = match_df['type'] == 'Own Goal Against'

        # A team's tally grows on its own 'Goal' rows and on the other team's
        # 'Own Goal Against' rows.
        cum_team1 = ((by_team1 & is_goal) | (by_team2 & is_own_goal)).astype(int).cumsum()
        cum_team2 = ((by_team2 & is_goal) | (by_team1 & is_own_goal)).astype(int).cumsum()

        # Assign the running scores from the perspective of the acting team.
        match_df['team_score'] = np.where(by_team1, cum_team1, cum_team2)
        match_df['opponent_score'] = np.where(by_team1, cum_team2, cum_team1)
        return match_df

    # Positional row numbers of every match (rows with a missing match_id are not listed).
    match_rows = list(df.groupby('match_id', sort=False).indices.values())
    if not match_rows:
        # Nothing to score: return an empty frame that still has the expected columns.
        return df.iloc[0:0].assign(team_score=np.array([], dtype='int64'),
                                   opponent_score=np.array([], dtype='int64'))

    processed = pd.concat([process_match(df.iloc[rows]) for rows in match_rows])
    # The concatenation is match by match; put the rows back in their input order.
    input_order = np.argsort(np.concatenate(match_rows), kind='stable')
    return processed.iloc[input_order]

# Attach the running team_score / opponent_score columns, computed separately for each match
actions = add_score_columns(actions)

In [None]:
# Some more feature engineering

# Absolute displacement along each axis, and the score margin when the action starts
actions['distance_x'] = (actions['end_location_x'] - actions['location_x']).abs()
actions['distance_y'] = (actions['end_location_y'] - actions['location_y']).abs()
actions['goal_diff_start'] = actions['team_score'] - actions['opponent_score']

# Capture contribution to score (score/concede), looking one row ahead
next_type = actions['type'].shift(-1)
next_player = actions['player'].shift(-1)
is_scoring_shot = (actions['type'] == 'Shot') & (actions['outcome'] == 'Goal')
# The action right before an 'Own Goal Against' row by the same player
mask_own_goal = (actions['player'] == next_player) & (next_type == 'Own Goal Against')
# A 'Goal Keeper' action that is immediately followed by a 'Goal' row
mask_goalkeeper = (next_type == 'Goal') & (actions['type'] == 'Goal Keeper')

# 0 by default, +1 for scoring shots, -1 for the conceding cases (applied last, so it wins)
actions['goal_contribution'] = 0
actions.loc[is_scoring_shot, 'goal_contribution'] = 1
actions.loc[mask_own_goal | mask_goalkeeper, 'goal_contribution'] = -1

# Distance and opening angle to the goal
def compute_distance_angle(actions, loc_x, loc_y):
    """
    Compute, for every row, the distance to the goal centre and the angle
    subtended by the two goal posts.

    The goal is fixed on the line x = 120 with posts at y = 36 and y = 44
    (centre at y = 40).

    Parameters:
      actions (pandas.DataFrame): Frame holding the coordinate columns
      loc_x (str): Name of the x-coordinate column
      loc_y (str): Name of the y-coordinate column

    Returns:
      tuple: (distance, angle) where distance is the Euclidean distance to
      (120, 40) and angle is the absolute angle in radians between the
      directions to the two posts, folded into [0, pi].
    """
    goal_x = 120
    left_post_y, right_post_y, center_y = 44, 36, 40

    xs = actions[loc_x]
    ys = actions[loc_y]

    # Straight-line distance to the middle of the goal
    distance = np.sqrt((goal_x - xs)**2 + (center_y - ys)**2)

    # Bearing to each post, then the absolute difference between the two bearings
    bearing_left = np.arctan2(left_post_y - ys, goal_x - xs)
    bearing_right = np.arctan2(right_post_y - ys, goal_x - xs)
    spread = np.abs(bearing_left - bearing_right)

    # A difference above pi means the bearings wrapped around; use the complementary angle
    angle = np.where(spread > np.pi, 2*np.pi - spread, spread)
    return distance, angle

# Goal distance/angle measured from where each action starts ('') and where it ends ('_end')
goal_geometry_inputs = {
    '': ('location_x', 'location_y'),
    '_end': ('end_location_x', 'end_location_y'),
}
for suffix, (x_col, y_col) in goal_geometry_inputs.items():
    distance, angle = compute_distance_angle(actions, x_col, y_col)
    actions[f'distance_goal{suffix}'] = distance
    actions[f'angle_goal{suffix}'] = angle

In [None]:
# Drop the 'Goal' and 'Own Goal Against' rows: they mark events, not actions
is_goal_marker = actions['type'].isin(['Goal', 'Own Goal Against'])
actions = actions.loc[~is_goal_marker].reset_index(drop=True)

In [None]:
def add_historical_features(df, r):
    """
    Add columns containing values from earlier actions within the same match.

    For every column except 'match_id' and 'id', and for every lag i in
    1 .. r-1, a column named '<feature>_prev_<i>' is appended holding the value
    of that feature i rows earlier in the same match. Positions with no such
    earlier row are filled with the string 'None'.

    Parameters:
      df (pandas.DataFrame): DataFrame containing match actions; must have a
        'match_id' column. Rows are assumed to be in the desired order already.
      r (int): Exclusive upper bound on the lag, i.e. r - 1 lagged columns are
        produced per feature. With r <= 1 no columns are added.

    Returns:
      pandas.DataFrame: Copy of the input with the historical columns appended,
      grouped feature by feature (all lags of the first feature, then all lags
      of the second, ...).
    """
    result = df.copy()
    features = [column for column in df.columns if column not in ('match_id', 'id')]
    lags = range(1, r)

    # Nothing to add; also avoids calling pd.concat on an empty list, which raises.
    if not features or len(lags) == 0:
        return result

    # Build the per-match grouping once and shift every feature together for each lag.
    per_match = result.groupby('match_id')[features]
    shifted_by_lag = {lag: per_match.shift(lag) for lag in lags}

    # Lay the new columns out feature-major.
    lagged_columns = [
        shifted_by_lag[lag][feature].rename(f'{feature}_prev_{lag}')
        for feature in features
        for lag in lags
    ]
    history = pd.concat(lagged_columns, axis=1).fillna('None')

    return pd.concat([result, history], axis=1)

# Append lagged copies of every feature column; with r=15 the function emits lags 1 through 14
actions = add_historical_features(actions, 15)

In [None]:
def compute_rolling_any(indicator_array, window):
    """
    For each index i, report whether any of the next `window` elements
    (positions i+1 .. i+window, truncated at the end of the array) is True.

    Implemented with a prefix sum, so the cost is linear in the array length
    regardless of `window`, and empty input or a window longer than the array
    need no special casing.

    Parameters:
      indicator_array (array-like of bool): Indicator values in row order
      window (int): Number of following elements to inspect

    Returns:
      numpy.ndarray: Boolean array with the same length as `indicator_array`.
      The last element is always False because nothing follows it.
    """
    flags = np.asarray(indicator_array, dtype=bool)
    n = len(flags)

    # true_before[j] = number of True values in flags[:j]
    true_before = np.concatenate(([0], np.cumsum(flags)))

    # Look-ahead slice for index i is flags[i+1 : i+1+window], clipped to the array end.
    start = np.arange(1, n + 1)
    stop = np.minimum(start + window, n)

    return (true_before[stop] - true_before[start]) > 0

def add_outcome_labels(df, k):
    """
    For each horizon i in 1 to k, add columns indicating:
      - team_scores_in_<i>: whether the acting team scores within the next i rows,
      - team_concedes_in_<i>: whether the acting team concedes within the next i rows.
    Additionally, add a categorical match_outcome column (win/loss/draw) from the
    acting team's point of view.

    A goal for a team is a row where:
      - that team acts and goal_contribution == 1, OR
      - the opposing team acts and goal_contribution == -1.
    A team concedes exactly when the opposing team gets a goal by that rule, so
    the concede flags reuse the other team's goal indicator.

    match_outcome is derived from team_score / opponent_score on the LAST row of
    each match, so rows must be in chronological order within a match.

    Matches are processed with an explicit loop rather than
    ``groupby(...).apply(...)``: applying a function that relies on the grouping
    column is deprecated since pandas 2.2, and this form keeps ``match_id`` in
    the result on every pandas version.

    Parameters:
      df (pandas.DataFrame): Match actions; reads 'match_id', 'team',
        'goal_contribution', 'team_score' and 'opponent_score'.
      k (int): Largest horizon (number of following rows) to label.

    Returns:
      pandas.DataFrame: Copy of the input (same row order and index; rows whose
      match_id is missing are dropped) with the label columns appended.

    Raises:
      ValueError: If a match does not contain exactly two distinct teams.
    """
    df = df.copy()
    outcome_categories = ['win', 'loss', 'draw']

    def process_group(group):
        group = group.copy()
        teams = group['team'].unique()
        if len(teams) != 2:
            raise ValueError(f"Expected exactly 2 teams per match; got {teams}")
        team1, team2 = teams

        by_team1 = (group['team'] == team1).values
        by_team2 = (group['team'] == team2).values
        scored = (group['goal_contribution'] == 1).values
        conceded = (group['goal_contribution'] == -1).values

        # Rows on which each team's tally goes up.
        goal_for_team1 = (by_team1 & scored) | (by_team2 & conceded)
        goal_for_team2 = (by_team2 & scored) | (by_team1 & conceded)

        # For each horizon i, look ahead i rows for a goal by either side.
        for i in range(1, k + 1):
            team1_goal_ahead = compute_rolling_any(goal_for_team1, i)
            team2_goal_ahead = compute_rolling_any(goal_for_team2, i)
            group[f'team_scores_in_{i}'] = np.where(by_team1, team1_goal_ahead, team2_goal_ahead)
            group[f'team_concedes_in_{i}'] = np.where(by_team1, team2_goal_ahead, team1_goal_ahead)

        # Overall match outcome based on the scores recorded on the final row.
        final_row = group.iloc[-1]
        final_team = final_row['team']
        final_team_score = final_row['team_score']
        final_opponent_score = final_row['opponent_score']
        outcomes = np.where(
            group['team'] == final_team,
            np.where(final_team_score > final_opponent_score, 'win',
                     np.where(final_team_score < final_opponent_score, 'loss', 'draw')),
            np.where(final_team_score < final_opponent_score, 'win',
                     np.where(final_team_score > final_opponent_score, 'loss', 'draw'))
        )
        group['match_outcome'] = pd.Categorical(
            outcomes,
            categories=outcome_categories,
            ordered=False
        )
        return group

    # Positional row numbers of every match (rows with a missing match_id are not listed).
    match_rows = list(df.groupby('match_id', sort=False).indices.values())
    if not match_rows:
        # Nothing to label: return an empty frame that still has the expected columns.
        empty = df.iloc[0:0].copy()
        for i in range(1, k + 1):
            empty[f'team_scores_in_{i}'] = np.array([], dtype=bool)
            empty[f'team_concedes_in_{i}'] = np.array([], dtype=bool)
        empty['match_outcome'] = pd.Categorical([], categories=outcome_categories, ordered=False)
        return empty

    labelled = pd.concat([process_group(df.iloc[rows]) for rows in match_rows])
    # The concatenation is match by match; put the rows back in their input order.
    input_order = np.argsort(np.concatenate(match_rows), kind='stable')
    return labelled.iloc[input_order]

# Label every action with score/concede flags for horizons 1..15 plus the match outcome
actions = add_outcome_labels(actions, 15)
# Preview the first rows of the labelled frame
actions.head(5)

In [None]:
### Add in possession labels

possession_keys = ['match_id', 'possession', 'team']

# possession_scores: did any action of this team in this possession score a goal?
# Clipping at 0 turns the -1 (conceding) contributions into 0 so they cannot affect the max.
possession_scores = (
    actions.assign(possession_scores=actions['goal_contribution'].clip(lower=0))
    .groupby(possession_keys)['possession_scores']
    .max()
    .reset_index()
)
actions = actions.merge(possession_scores, on=possession_keys, how='left')

# First identify all possessions with goals and which team scored
scoring_possessions = (
    actions.loc[actions['goal_contribution'] == 1, possession_keys]
    .drop_duplicates()
    .rename(columns={'team': 'scoring_team'})
)
possession_teams = actions[possession_keys].drop_duplicates()

# Merge this with scoring possessions to find cases where another team scored
merged = possession_teams.merge(
    scoring_possessions,
    on=['match_id', 'possession'],
    how='left'
)

# Keep only (possession, team) pairs where a goal was scored by a different team.
# Duplicates are dropped after selecting the key columns so the merge below can
# never multiply action rows.
opponent_scored = merged['scoring_team'].notna() & (merged['team'] != merged['scoring_team'])
conceded_goals = (
    merged.loc[opponent_scored, possession_keys]
    .drop_duplicates()
    .assign(possession_concedes=True)
)
actions = actions.merge(
    conceded_goals,
    on=possession_keys,
    how='left'
)

# Formatting: both possession labels end up as real boolean columns.
# After the left merge possession_concedes is True where matched and missing otherwise,
# so "is present" is the flag itself; this avoids relying on fillna to downcast an
# object column, which pandas has deprecated.
actions['possession_concedes'] = actions['possession_concedes'].notna()
actions['possession_scores'] = actions['possession_scores'].astype(bool)

In [None]:
# Columns that must stay as they are (identifiers, categories, free text)
base_non_numeric = {'id', 'match_id', 'play_pattern', 'possession', 'possession_team', 'team', 'position', 'player', 'type', 'outcome', 'body_part', 'match_outcome'}
# Their historical versions: any column whose name contains '<feature>_prev' for one of the above
historical_non_numeric = {
    col
    for col in actions.columns
    if any(f'{feature}_prev' in col for feature in base_non_numeric)
}
non_numeric = base_non_numeric | historical_non_numeric


def _coerce_if_numeric(column):
    """Return non-numeric columns untouched; convert the rest, turning unparseable values into NaN."""
    if column.name in non_numeric:
        return column
    return pd.to_numeric(column, errors='coerce')


# Convert all intended numeric columns at once
actions = actions.apply(_coerce_if_numeric)
# Drop rows with any NaN (including values that failed the conversion) and reset the index
actions = actions.dropna().reset_index(drop=True)

In [None]:
# Goal difference after each action = goal difference at its start + its goal contribution.
# First for every historical lag ...
lagged_start_cols = [col for col in actions.columns if 'goal_diff_start_prev' in col]
for start_col in lagged_start_cols:
    lag = start_col.rsplit('_', 1)[-1]  # trailing lag number of '<feature>_prev_<lag>'
    contribution_col = f'goal_contribution_prev_{lag}'
    actions[f'goal_diff_end_prev_{lag}'] = actions[start_col] + actions[contribution_col]
# ... then for the current action
actions['goal_diff_end'] = actions['goal_diff_start'] + actions['goal_contribution']

In [None]:
# actions.to_csv('data/processed_actions.csv', index=False)