In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
import math
import json
import h5py
from tqdm import tqdm
import os
'''
from google.colab import drive
if not os.path.ismount('/content/drive'):
    drive.mount('/content/drive')
'''

Mounted at /content/drive


In [None]:
# Pitch coordinates of the point that startpolar/endpolar measure distance and angle to.
goalpost_x, goalpost_y = 120, 40
# Number of preceding actions bundled with each action when game states are built.
nb_prev_actions = 3
# Look-ahead window (in actions) for labels; same value as create_labels' default `nr_actions`.
nr_actions = 10

# 2. SPADL feature engineering
def action_type_onehot(df):
    """One-hot encode ``df['action_type']`` into ``action_type_<value>`` columns."""
    action_types = df['action_type']
    return pd.get_dummies(action_types, prefix='action_type')
def result_onehot(df):
    """One-hot encode ``df['result']`` into ``result_<value>`` columns."""
    results = df['result']
    return pd.get_dummies(results, prefix='result')
def action_type_result_onehot(df):
    """One-hot encode the ``<action_type>_<result>`` combination of every row."""
    combined = df['action_type'] + "_" + df['result']
    return pd.get_dummies(combined, prefix='action_result')
def bodypart_onehot(df):
    """One-hot encode ``df['bodypart']`` into ``bodypart_<value>`` columns."""
    bodyparts = df['bodypart']
    return pd.get_dummies(bodyparts, prefix='bodypart')
def startlocation(df):
    """Return an independent copy of the ``start_x`` / ``start_y`` columns."""
    coordinate_columns = ['start_x', 'start_y']
    return df[coordinate_columns].copy()
def endlocation(df):
    """Return an independent copy of the ``end_x`` / ``end_y`` columns."""
    coordinate_columns = ['end_x', 'end_y']
    return df[coordinate_columns].copy()
def movement(df):
    """Euclidean length of each row's (``dx``, ``dy``) displacement.

    Returns a one-column DataFrame named ``movement``.
    """
    squared_length = df['dx']**2 + df['dy']**2
    return pd.DataFrame({'movement': np.sqrt(squared_length)})
def startpolar(df):
    """Distance and angle (in degrees) from each action's start point to the goal.

    The goal position is read from the module-level ``goalpost_x`` / ``goalpost_y``.
    """
    to_goal_x = goalpost_x - df['start_x']
    to_goal_y = goalpost_y - df['start_y']
    polar = {
        'start_dist_to_goal': np.sqrt(to_goal_x**2 + to_goal_y**2),
        'start_angle_to_goal': np.degrees(np.arctan2(to_goal_y, to_goal_x)),
    }
    return pd.DataFrame(polar)
def endpolar(df):
    """Distance and angle (in degrees) from each action's end point to the goal.

    The goal position is read from the module-level ``goalpost_x`` / ``goalpost_y``.
    """
    to_goal_x = goalpost_x - df['end_x']
    to_goal_y = goalpost_y - df['end_y']
    polar = {
        'end_dist_to_goal': np.sqrt(to_goal_x**2 + to_goal_y**2),
        'end_angle_to_goal': np.degrees(np.arctan2(to_goal_y, to_goal_x)),
    }
    return pd.DataFrame(polar)
def time(df):
    """Time features: period id, seconds since the period's first row, raw seconds.

    ``time_seconds`` is ``seconds`` minus the smallest ``seconds`` value seen in
    the same ``period_id``; ``time_seconds_overall`` is ``seconds`` unchanged.
    """
    elapsed_overall = df['seconds']
    first_second_of_period = df.groupby('period_id')['seconds'].transform('min')
    columns = {
        'period_id': df['period_id'],
        'time_seconds': elapsed_overall - first_second_of_period,
        'time_seconds_overall': elapsed_overall,
    }
    return pd.DataFrame(columns)
def player_possession_time_filled(df):
    """Per-row time since the same player's previous successful dribble/take-on.

    Walks the rows in order, remembering for every ``player_id`` the ``seconds``
    of their latest successful ``dribble`` / ``take_on``:

    * successful dribble / take-on: the value is the time since that player's
      remembered action (0 if none is remembered), then the memory is updated;
    * successful interception / tackle, or a failed take-on: the player's
      memory is cleared and the value is 0;
    * every other row: 0.

    Rows are addressed by position, so the function works for any index on
    ``df`` (the previous version used the index label as a list position and
    failed or wrote the wrong slot unless the index was exactly 0..n-1).

    Returns a one-column DataFrame ``player_possession_time`` with a fresh
    0..n-1 index.
    """
    possession_time = [0] * len(df)
    last_possession_time = {}

    rows = zip(df['player_id'], df['action_type'], df['result'], df['seconds'])
    for pos, (player, action, result, curr_time) in enumerate(rows):
        succeeded = result == 'success'

        if action in ('dribble', 'take_on') and succeeded:
            last_time = last_possession_time.get(player)
            if last_time is not None:
                possession_time[pos] = curr_time - last_time
            last_possession_time[player] = curr_time

        elif (action in ('interception', 'tackle') and succeeded) or \
             (action == 'take_on' and result == 'fail'):
            # Forget the player; the slot already holds its initial 0.
            last_possession_time.pop(player, None)

    return pd.DataFrame({'player_possession_time': possession_time})

def onball_contribution(df):
    """Select ``onball_contribution_score`` as a one-column DataFrame."""
    score_column = "onball_contribution_score"
    return df[[score_column]]

def offball_contribution(df):
    """Select ``offball_contribution_score`` as a one-column DataFrame."""
    score_column = "offball_contribution_score"
    return df[[score_column]]

In [None]:
def feature_column_names(fs, df, nb_prev_actions=3, gamestates=None):
    """List the feature names produced by ``fs`` for every step of a game state.

    Every function in ``fs`` is called as
    ``f(df, gamestates, nb_prev_actions=nb_prev_actions)`` and the column names
    of its result are emitted once per step: first prefixed with
    ``prev_<nb_prev_actions>_`` down to ``prev_1_``, then without a prefix.

    Parameters
    ----------
    fs : iterable of callables
        Feature functions accepting ``(df, gamestates, nb_prev_actions=...)``.
        NOTE(review): of the functions visible in this notebook only
        ``goalscore`` has that signature - confirm which list is meant to be
        passed here.
    df : DataFrame
        Forwarded as the first argument of every feature function.
    nb_prev_actions : int
        Number of ``prev_<k>_`` prefixed copies to emit.
    gamestates : optional
        Forwarded as the second argument of every feature function. When left
        as ``None`` the notebook-level variable ``gamestates`` is used, which
        is what this function previously did implicitly.

    Returns
    -------
    list of str

    Raises
    ------
    NameError
        If ``gamestates`` is not passed and no global of that name exists.
    """
    if gamestates is None and fs:
        try:
            gamestates = globals()['gamestates']
        except KeyError:
            raise NameError("name 'gamestates' is not defined") from None

    # The produced columns do not depend on the step, so run each function once.
    per_function_cols = [
        f(df, gamestates, nb_prev_actions=nb_prev_actions).columns.tolist()
        for f in fs
    ]

    feature_names = []
    for i in range(nb_prev_actions + 1):
        prefix = f"prev_{nb_prev_actions - i}_" if i < nb_prev_actions else ""
        for cols in per_function_cols:
            feature_names.extend(prefix + col for col in cols)
    return feature_names

def create_gamestates(df, nb_prev_actions=3):
    """Put each row side by side with its ``nb_prev_actions`` preceding rows.

    For every lag ``k`` from ``nb_prev_actions`` down to 1 a copy of ``df``
    shifted by ``k`` rows is added with columns renamed ``prev_<k>_<col>``;
    the unshifted frame comes last. Rows containing any missing value are
    dropped and the index is reset.
    """
    lagged_frames = []
    for lag in range(nb_prev_actions, 0, -1):
        lagged = df.shift(lag)
        lagged.columns = [f"prev_{lag}_{col}" for col in df.columns]
        lagged_frames.append(lagged)

    side_by_side = pd.concat(lagged_frames + [df], axis=1)
    return side_by_side.dropna().reset_index(drop=True)

def create_labels(df, nr_actions=10):
    """Build per-action labels from the actions that follow within the same game.

    Each game (``game_id``) is ordered by its ``index`` column. A "goal" is a
    row whose ``action_type`` is a shot type and whose ``result`` is
    ``"success"``. For every action:

    * ``scores``         - 1 if the acting team has a goal among the next
                           ``nr_actions`` rows (the action itself excluded);
    * ``concedes``       - 1 if another team has a goal in that same window;
    * ``goal_from_shot`` - 1 if the action itself is a goal;
    * ``event_uuid``     - the action's ``event_uuid``.

    Returns a DataFrame with those four columns, games in ``groupby`` order.
    """
    goal_actions = ['shot', 'shot_freekick', 'shot_penalty']
    scores, concedes, goal_from_shot, event_uuids = [], [], [], []

    # Processed by game
    for _, game in df.groupby("game_id"):
        game = game.sort_values("index").reset_index(drop=True)
        teams = game["team_id"]
        is_goal = game["action_type"].isin(goal_actions) & (game["result"] == "success")

        for pos in range(len(game)):
            own_team = teams.iloc[pos]
            window = slice(pos + 1, pos + 1 + nr_actions)
            teams_ahead = teams.iloc[window]
            goals_ahead = is_goal.iloc[window]

            scores.append(int(((teams_ahead == own_team) & goals_ahead).any()))
            concedes.append(int(((teams_ahead != own_team) & goals_ahead).any()))
            goal_from_shot.append(int(is_goal.iloc[pos]))
            event_uuids.append(game["event_uuid"].iloc[pos])

    return pd.DataFrame({
        "scores": scores,
        "concedes": concedes,
        "goal_from_shot": goal_from_shot,
        "event_uuid": event_uuids,
    })

# Apply after applying Gamestates
def get_home_team_id(df_game):
    """Return the ``team_id`` of the single home team in one game's actions.

    Rows flagged ``is_home_team == True`` are selected and their distinct
    ``team_id`` values collected. The id is read from ``team_id`` (not
    ``team``) so the result can be compared with the ``team_id`` values that
    ``play_left_to_right`` checks it against.

    Raises
    ------
    ValueError
        If the flagged rows contain zero or more than one distinct ``team_id``.
    """
    home_rows = df_game[df_game["is_home_team"] == True]
    home_team_ids = home_rows["team_id"].unique()

    if len(home_team_ids) != 1:
        raise ValueError(
            f"Expected exactly one home team id, found {len(home_team_ids)}: {home_team_ids}"
        )

    return home_team_ids[0]

def play_left_to_right(gamestates, home_team_id, pitch_length=120, pitch_width=80):
    """Mirror the coordinates of game states whose last action is not by the home team.

    Parameters
    ----------
    gamestates : list of DataFrame
        One frame per game state; the last row is treated as the current action.
    home_team_id
        Value compared with the last row's ``team_id``.
    pitch_length, pitch_width : number
        Extents used for mirroring: ``x -> pitch_length - x`` and
        ``y -> pitch_width - y``. The defaults are the values that used to be
        hard-coded here.

    Returns
    -------
    list of DataFrame
        Same length and order as ``gamestates``. Empty frames are passed
        through as-is; every other entry is a copy, so the inputs are never
        modified. In mirrored states ``start_x``/``end_x``/``start_y``/``end_y``
        of all rows are flipped.
    """
    ltr_gamestates = []

    for gs in gamestates:
        if gs.empty:
            ltr_gamestates.append(gs)
            continue

        gs_copy = gs.copy()

        # The state's orientation is decided by its most recent action.
        if gs.iloc[-1]['team_id'] != home_team_id:
            for x_col in ("start_x", "end_x"):
                gs_copy[x_col] = pitch_length - gs_copy[x_col]
            for y_col in ("start_y", "end_y"):
                gs_copy[y_col] = pitch_width - gs_copy[y_col]

        ltr_gamestates.append(gs_copy)

    return ltr_gamestates

def goalscore(df, gamestates, nb_prev_actions=None):
    """Running score before the current action of every game state.

    A goal is a row of ``df`` whose ``action_type`` is a shot type and whose
    ``result`` is ``'success'``. For each game state the last row is the
    current action; goals of the same ``game_id`` with a smaller ``index`` are
    counted for the acting team and for everyone else.

    The goal rows are extracted from ``df`` once instead of re-filtering the
    whole frame for every game state.

    Parameters
    ----------
    df : DataFrame
        All actions; needs ``game_id``, ``index``, ``team_id``, ``action_type``
        and ``result``.
    gamestates : list of DataFrame
        Empty frames yield a 0/0 score.
    nb_prev_actions : ignored
        Accepted only so the signature matches the other feature functions.

    Returns
    -------
    DataFrame with ``goalscore_team``, ``goalscore_opponent``, ``goalscore_diff``.
    """
    GOAL_TYPES = {'shot', 'shot_freekick', 'shot_penalty'}
    out_team, out_opp = [], []
    goals = None  # built on first use so an all-empty input never touches `df`

    for gs in gamestates:
        if gs.empty:
            out_team.append(0)
            out_opp.append(0)
            continue

        if goals is None:
            is_goal = df['action_type'].isin(GOAL_TYPES) & (df['result'] == 'success')
            goals = df.loc[is_goal, ['game_id', 'index', 'team_id']]

        a0 = gs.iloc[-1]
        team_id = a0['team_id']
        earlier_goals = goals[(goals['game_id'] == a0['game_id']) &
                              (goals['index'] < a0['index'])]

        out_team.append((earlier_goals['team_id'] == team_id).sum())
        out_opp.append((earlier_goals['team_id'] != team_id).sum())

    return pd.DataFrame({
        'goalscore_team': out_team,
        'goalscore_opponent': out_opp,
        'goalscore_diff': np.array(out_team) - np.array(out_opp)
    })


def space_delta(gamestates, nb_prev_actions=None):
    """Displacement between the current action and each of its predecessors.

    For every game state and every lag ``i`` in ``1..nb_prev_actions`` the
    start location of the last row (``a0``) is compared with the start
    location of the row ``i`` positions before it:

    * ``dx_a0<i>``  - difference in ``start_x``;
    * ``dy_a0<i>``  - difference in ``start_y``;
    * ``mov_a0<i>`` - Euclidean length of that difference.

    Lags reaching past the beginning of a game state yield NaN. An empty game
    state yields NaN for every lag instead of raising on ``gs.iloc[-1]``, in
    line with how ``team`` and ``time_delta`` treat empty states.

    ``nb_prev_actions`` must be an integer; the ``None`` default is not usable.
    """
    lags = range(1, nb_prev_actions + 1)

    features = {}
    for i in lags:
        features[f'dx_a0{i}'] = []
        features[f'dy_a0{i}'] = []
        features[f'mov_a0{i}'] = []

    for gs in gamestates:
        # No current action exists in an empty state.
        a0 = gs.iloc[-1] if len(gs) > 0 else None

        for i in lags:
            if a0 is not None and len(gs) >= i + 1:
                ai = gs.iloc[-1 - i]
                dx = a0['start_x'] - ai['start_x']
                dy = a0['start_y'] - ai['start_y']
                mov = np.sqrt(dx ** 2 + dy ** 2)
            else:
                dx, dy, mov = np.nan, np.nan, np.nan

            features[f'dx_a0{i}'].append(dx)
            features[f'dy_a0{i}'].append(dy)
            features[f'mov_a0{i}'].append(mov)

    return pd.DataFrame(features)
def speed(gamestates, nb_prev_actions=None):
    """Average velocity between each predecessor and the current action.

    For every game state and every lag ``i`` in ``1..nb_prev_actions`` the
    difference in start location between the last row (``a0``) and the row
    ``i`` positions before it is divided by their difference in ``seconds``:

    * ``speedx_a0<i>`` / ``speedy_a0<i>`` - per-axis components;
    * ``speed_a0<i>``                     - magnitude of the two components.

    NaN is produced when the lag reaches past the beginning of the game
    state or when the time difference is not positive. An empty game state
    yields NaN for every lag instead of raising on ``gs.iloc[-1]``, in line
    with how ``team`` and ``time_delta`` treat empty states.

    ``nb_prev_actions`` must be an integer; the ``None`` default is not usable.
    """
    lags = range(1, nb_prev_actions + 1)

    features = {}
    for i in lags:
        features[f'speedx_a0{i}'] = []
        features[f'speedy_a0{i}'] = []
        features[f'speed_a0{i}'] = []

    for gs in gamestates:
        # No current action exists in an empty state.
        a0 = gs.iloc[-1] if len(gs) > 0 else None

        for i in lags:
            speedx, speedy, speed_abs = np.nan, np.nan, np.nan

            if a0 is not None and len(gs) >= i + 1:
                ai = gs.iloc[-1 - i]
                dt = a0['seconds'] - ai['seconds']
                # Simultaneous or out-of-order timestamps keep the NaN defaults.
                if dt > 0:
                    speedx = (a0['start_x'] - ai['start_x']) / dt
                    speedy = (a0['start_y'] - ai['start_y']) / dt
                    speed_abs = np.sqrt(speedx ** 2 + speedy ** 2)

            features[f'speedx_a0{i}'].append(speedx)
            features[f'speedy_a0{i}'].append(speedy)
            features[f'speed_a0{i}'].append(speed_abs)

    return pd.DataFrame(features)
def team(gamestates, nb_prev_actions=None):
    """Flag whether each preceding action was made by the current action's team.

    Column ``team_a0<i>`` holds, per game state, whether the row ``i``
    positions before the last one has the same ``team_id`` as the last row.
    Game states that are empty or shorter than ``nb_prev_actions + 1`` rows
    get NaN in every column.
    """
    lags = range(1, nb_prev_actions + 1)
    features = {f'team_a0{lag}': [] for lag in lags}

    for state in gamestates:
        if not state.empty and len(state) >= nb_prev_actions + 1:
            current_team = state.iloc[-1]['team_id']
            for lag in lags:
                earlier_team = state.iloc[-1 - lag]['team_id']
                features[f'team_a0{lag}'].append(current_team == earlier_team)
        else:
            for lag in lags:
                features[f'team_a0{lag}'].append(np.nan)

    return pd.DataFrame(features)
def time_delta(gamestates, nb_prev_actions=None):
    """Seconds elapsed between each preceding action and the current one.

    Column ``time_delta_<i>`` holds, per game state, the ``seconds`` of the
    last row minus the ``seconds`` of the row ``i`` positions before it.
    Game states that are empty or shorter than ``nb_prev_actions + 1`` rows
    get NaN in every column.
    """
    lags = range(1, nb_prev_actions + 1)
    features = {f'time_delta_{lag}': [] for lag in lags}

    for state in gamestates:
        if not state.empty and len(state) >= nb_prev_actions + 1:
            current_seconds = state.iloc[-1]['seconds']
            for lag in lags:
                earlier_seconds = state.iloc[-1 - lag]['seconds']
                features[f'time_delta_{lag}'].append(current_seconds - earlier_seconds)
        else:
            for lag in lags:
                features[f'time_delta_{lag}'].append(np.nan)

    return pd.DataFrame(features)

# Feature functions with signature f(df): each takes the actions DataFrame and
# returns a DataFrame of feature columns for those rows.
fs_df = [
    startlocation,
    endlocation,
    movement,
    startpolar,
    endpolar,
    action_type_onehot,
    result_onehot,
    action_type_result_onehot,
    bodypart_onehot,
    time,
    player_possession_time_filled,
    onball_contribution,
    offball_contribution
]

# Feature functions with signature f(gamestates, nb_prev_actions=...): each takes
# the list of per-action game-state frames and returns one feature row per state.
fs_gamestate = [
    space_delta,
    speed,
    team,
    time_delta
]


In [None]:

# --- Paths -------------------------------------------------------------------
input_path = '/content/drive/MyDrive/Data/Processed/euro2020_spadl_enriched.h5'
vaep_features_h5 = '/content/drive/MyDrive/Data/Processed/Compute_Features_and_Labels/euro2020_vaep_features.h5'
vaep_labels_h5 = '/content/drive/MyDrive/Data/Processed/Compute_Features_and_Labels/euro2020_vaep_labels.h5'
feature_names_txt = '/content/drive/MyDrive/Data/Processed/Compute_Features_and_Labels/euro2020_feature_names.txt'

# Column-name prefixes produced by the pd.get_dummies-based feature functions.
onehot_prefixes = ('action_type_', 'result_', 'action_result_', 'bodypart_')

all_features = []
all_labels = []

# --- Per-game feature / label computation --------------------------------------
with pd.HDFStore(input_path, mode='r') as store:
    game_keys = [k for k in store.keys() if k.startswith('/actions/')]
    game_ids = [int(k.split('/')[-1]) for k in game_keys]

    for game_id in tqdm(game_ids, desc="Processing games"):
        df_game = store[f"/actions/{game_id}"].copy().reset_index(drop=True)

        home_team_id = df_game[df_game['is_home_team'] == True]['team_id'].iloc[0]

        # Gamestate generation and left-right normalization:
        # one window of nb_prev_actions + 1 consecutive rows per action.
        gamestates = [
            df_game.iloc[i - nb_prev_actions:i + 1].reset_index(drop=True)
            for i in range(nb_prev_actions, len(df_game))
        ]
        gamestates_ltr = play_left_to_right(gamestates, home_team_id)

        # Actions that have a full history; row k here lines up with gamestates[k].
        df_current = df_game.iloc[nb_prev_actions:].reset_index(drop=True)

        # Features
        # NOTE(review): the fs_df features are computed from the raw coordinates
        # in df_game, while the fs_gamestate features use the left-to-right
        # normalized game states - confirm this mix is intended.
        X_df = pd.concat([f(df_current) for f in fs_df], axis=1)
        X_gs = pd.concat(
            [f(gamestates_ltr, nb_prev_actions=nb_prev_actions) for f in fs_gamestate], axis=1
        )
        X_goalscore = goalscore(df_game, gamestates_ltr, nb_prev_actions=nb_prev_actions)
        X = pd.concat([X_df, X_gs, X_goalscore], axis=1)

        # Labels (window size taken from the notebook-level constant)
        y = create_labels(df_current, nr_actions=nr_actions)

        all_features.append(X)
        all_labels.append(y)

# --- Combine games -----------------------------------------------------------------
X_all = pd.concat(all_features, axis=0).reset_index(drop=True)
y_all = pd.concat(all_labels, axis=0).reset_index(drop=True)

# One-hot columns are created per game, so a category that never occurs in a
# game is missing from that game's frame and becomes NaN after the concat.
# Such columns used to end up as `object` and were then stringified to
# "True"/"False"/"nan". A missing category means "not this category", so fill
# with False and keep the columns boolean.
onehot_cols = [col for col in X_all.columns if col.startswith(onehot_prefixes)]
for col in onehot_cols:
    X_all[col] = X_all[col].astype('boolean').fillna(False).astype(bool)

# Diagnostic: any column still listed here will be stored as strings below.
print(X_all.dtypes[X_all.dtypes == "object"])

X_all = X_all.apply(lambda col: col.astype(str) if col.dtype == "object" else col)

# --- Save ----------------------------------------------------------------------------
# Save as HDF (overwrite, prevent duplicates)
X_all.to_hdf(vaep_features_h5, key='features', mode='w', format='table')
y_all.to_hdf(vaep_labels_h5, key='labels', mode='w', format='table')

with open(feature_names_txt, 'w') as f:
    for col in X_all.columns.tolist():
        f.write(col + '\n')

print("Save Complete!")


Processing games: 100%|██████████| 51/51 [09:00<00:00, 10.61s/it]


action_type_keeper_claim              object
action_type_keeper_punch              object
result_yellow_card                    object
action_result_foul_yellow_card        object
action_result_keeper_claim_success    object
action_result_keeper_punch_success    object
action_result_shot_success            object
action_result_keeper_claim_fail       object
result_red_card                       object
action_result_foul_red_card           object
dtype: object
Save Complete!
