In [57]:
import json, gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pathlib
from sklearn.cluster import KMeans
from scipy.special import comb
from copy import deepcopy
import random
from tqdm import tqdm
import seaborn as sns



In [58]:

TRACKING_DIR = pathlib.Path("tracking-compressed")
json_gz_paths = sorted(TRACKING_DIR.glob("tracking_*.json.gz"))

n_files = 1 # or set to len(json_gz_paths) to load all

frames = []
players = []
used_match_ids = []  # <- store used match_ids here

for file_idx, json_gz_path in enumerate(json_gz_paths[:n_files]):
    match_id = json_gz_path.stem  # e.g. "tracking_g2444470"
    used_match_ids.append(match_id)  # <- keep track of what's loaded

    records = []

    with gzip.open(json_gz_path, "rt", encoding="utf-8") as f:
        for line in f:
            records.append(json.loads(line))

    for r in records:
        f_data = {
            "match_id": match_id,
            "period": r["period"],
            "frameIdx": r["frameIdx"],
            "gameClock": r["gameClock"],
            "lastTouch_team": r["lastTouch"],
            "ball_x": r["ball"]["xyz"][0],
            "ball_y": r["ball"]["xyz"][1],
            "ball_z": r["ball"]["xyz"][2],
        }
        frames.append(f_data)

        for side in ["homePlayers", "awayPlayers"]:
            for p in r[side]:
                px, py, pz = p["xyz"]
                players.append({
                    "match_id": match_id,
                    "period": r["period"],
                    "frameIdx": r["frameIdx"],
                    "side": "home" if side == "homePlayers" else "away",
                    "playerId": p["playerId"],
                    "optaId": str(p["optaId"]),
                    "number": p["number"],
                    "x": px, "y": py, "z": pz,
                    "speed": p["speed"],
                })

# Convert to DataFrames
frames_df = pd.DataFrame(frames)
players_df = pd.DataFrame(players)
  

In [59]:
def extract_event_metadata(events_path):
    teams = set()
    match_date = None

    with open(events_path, "r", encoding="utf-8-sig") as f:
        for line in f:
            obj = json.loads(line)
            
            # Grab match date
            if match_date is None and "match_date" in obj:
                match_date = obj["match_date"]

            # Grab team names
            if "team" in obj and obj["team"] is not None:
                team_name = obj["team"]["name"]
                teams.add(team_name)
            if "possession_team" in obj and obj["possession_team"] is not None:
                team_name = obj["possession_team"]["name"]
                teams.add(team_name)

    return {
        "events_file": str(events_path),
        "match_date": match_date,
        "team_names": list(teams),
    }

# Assume your event files are in a directory named 'events'
EVENTS_DIR = pathlib.Path("statsbomb_pl_data")
event_files = sorted(EVENTS_DIR.glob("*.json"))

event_metadata_records = []

for event_file in event_files:
    meta = extract_event_metadata(event_file)
    event_metadata_records.append(meta)

event_meta_df = pd.DataFrame(event_metadata_records)

print(event_meta_df.head())

                      events_file  match_date  \
0  statsbomb_pl_data/3837230.json  2022-08-05   
1  statsbomb_pl_data/3837231.json  2022-08-06   
2  statsbomb_pl_data/3837232.json  2022-08-07   
3  statsbomb_pl_data/3837233.json  2022-08-06   
4  statsbomb_pl_data/3837234.json  2022-08-06   

                                team_names  
0                [Crystal Palace, Arsenal]  
1  [Wolverhampton Wanderers, Leeds United]  
2              [Leicester City, Brentford]  
3    [Newcastle United, Nottingham Forest]  
4         [Southampton, Tottenham Hotspur]  


In [60]:

# Collect all metadata files
metadata_dir = pathlib.Path("metadata_SecondSpectrum")
all_metadata_files = list(metadata_dir.glob("*.json"))

tracking_meta_records = []

for path in all_metadata_files:
    with open(path, "r", encoding="utf-8-sig") as f:
        meta = json.load(f)

    # Build match_date from year, month, day
    if all(k in meta for k in ["year", "month", "day"]):
        match_date = f"{meta['year']:04}-{meta['month']:02}-{meta['day']:02}"
    else:
        match_date = None

    # Parse teams from description
    desc = meta.get("description", "")
    home_team, away_team = None, None
    if " - " in desc:
        teams_part = desc.split(":")[0].strip()
        home_team, away_team = teams_part.split(" - ")

    # Get tracking suffix from filename
    if path.stem.startswith("metadata_g"):
        suffix = path.stem.split("_")[1]
    else:
        suffix = path.stem.split("_")[0]

    tracking_meta_records.append({
        "metadata_path": str(path),
        "tracking_suffix": suffix,
        "match_date": match_date,
        "home_team": home_team,
        "away_team": away_team,
    })

tracking_meta_df = pd.DataFrame(tracking_meta_records)

print(tracking_meta_df.head())

                                       metadata_path tracking_suffix  \
0     metadata_SecondSpectrum/metadata_g2444536.json        g2444536   
1  metadata_SecondSpectrum/g2292852_SecondSpectru...        g2292852   
2     metadata_SecondSpectrum/metadata_g2444473.json        g2444473   
3  metadata_SecondSpectrum/g2293073_SecondSpectru...        g2293073   
4  metadata_SecondSpectrum/g2367750_SecondSpectru...        g2367750   

   match_date home_team away_team  
0  2024-10-05       EVE       NEW  
1  2022-08-30       CRY       BRE  
2  2024-08-17       IPS       LIV  
3  2023-03-12       FUL       ARS  
4  2024-01-30       FUL       EVE  


In [63]:
team_code_map = {'FUL': 'Fulham',
                'BRE': 'Brentford',
                'CRY': 'Crystal Palace', 
                'TOT': 'Tottenham Hotspur', 
                'BOU': 'AFC Bournemouth', 
                'SOU': 'Southampton',
                'AVL': 'Aston Villa', 
                'WHU': 'West Ham United', 
                'MUN': 'Manchester United',
                'ARS': 'Arsenal', 
                'LEI': 'Leicester City',
                'NEW': 'Newcastle United',
                'BHA': 'Brighton & Hove Albion',
                'IPS': 'Ipswich Town', 
                'EVE': 'Everton', 
                'LIV': 'Liverpool',
                'LEE': 'Leeds United', 
                'NOT': 'Nottingham Forest',
                'MCI': 'Manchester City', 
                'WOL': 'Wolverhampton Wanderers',
                'SHU': 'Sheffield United',
                'CHE': 'Chelsea', 
                'LUT': 'Luton Town', 
                'BUR': 'Burnley'}

print(" TEAM CODE MAP:")
print(team_code_map)

 TEAM CODE MAP:
{'FUL': 'Fulham', 'BRE': 'Brentford', 'CRY': 'Crystal Palace', 'TOT': 'Tottenham Hotspur', 'BOU': 'AFC Bournemouth', 'SOU': 'Southampton', 'AVL': 'Aston Villa', 'WHU': 'West Ham United', 'MUN': 'Manchester United', 'ARS': 'Arsenal', 'LEI': 'Leicester City', 'NEW': 'Newcastle United', 'BHA': 'Brighton & Hove Albion', 'IPS': 'Ipswich Town', 'EVE': 'Everton', 'LIV': 'Liverpool', 'LEE': 'Leeds United', 'NOT': 'Nottingham Forest', 'MCI': 'Manchester City', 'WOL': 'Wolverhampton Wanderers', 'SHU': 'Sheffield United', 'CHE': 'Chelsea', 'LUT': 'Luton Town', 'BUR': 'Burnley'}


In [64]:
tracking_meta_df["home_team_full"] = tracking_meta_df["home_team"].map(team_code_map)
tracking_meta_df["away_team_full"] = tracking_meta_df["away_team"].map(team_code_map)

tracking_meta_long = pd.concat([
    tracking_meta_df.assign(team_name=tracking_meta_df["home_team_full"]),
    tracking_meta_df.assign(team_name=tracking_meta_df["away_team_full"]),
])

tracking_meta_long["key"] = (
    tracking_meta_long["match_date"].fillna("") + "_" +
    tracking_meta_long["team_name"].fillna("")
)

event_meta_exploded = event_meta_df.explode("team_names")
event_meta_exploded["team_name"] = event_meta_exploded["team_names"].fillna("")

event_meta_exploded["key"] = (
    event_meta_exploded["match_date"].fillna("") + "_" +
    event_meta_exploded["team_name"].fillna("")
)

event_tracking_df = event_meta_exploded.merge(
    tracking_meta_long,
    on="key",
    how="left",
    suffixes=("", "_tracking")
)

print(event_tracking_df.head())

                      events_file  match_date               team_names  \
0  statsbomb_pl_data/3837230.json  2022-08-05           Crystal Palace   
1  statsbomb_pl_data/3837230.json  2022-08-05                  Arsenal   
2  statsbomb_pl_data/3837231.json  2022-08-06  Wolverhampton Wanderers   
3  statsbomb_pl_data/3837231.json  2022-08-06             Leeds United   
4  statsbomb_pl_data/3837232.json  2022-08-07           Leicester City   

                 team_name                                 key  \
0           Crystal Palace           2022-08-05_Crystal Palace   
1                  Arsenal                  2022-08-05_Arsenal   
2  Wolverhampton Wanderers  2022-08-06_Wolverhampton Wanderers   
3             Leeds United             2022-08-06_Leeds United   
4           Leicester City           2022-08-07_Leicester City   

                                       metadata_path tracking_suffix  \
0  metadata_SecondSpectrum/g2292810_SecondSpectru...        g2292810   
1  metadata_Se

In [66]:
event_tracking_df_clean = (
    event_tracking_df
    .dropna(subset=["tracking_suffix"])
    .drop_duplicates(subset=["events_file"])
    .reset_index(drop=True)
)
print(event_tracking_df_clean.head())

                      events_file  match_date               team_names  \
0  statsbomb_pl_data/3837230.json  2022-08-05           Crystal Palace   
1  statsbomb_pl_data/3837231.json  2022-08-06  Wolverhampton Wanderers   
2  statsbomb_pl_data/3837232.json  2022-08-07           Leicester City   
3  statsbomb_pl_data/3837233.json  2022-08-06         Newcastle United   
4  statsbomb_pl_data/3837234.json  2022-08-06              Southampton   

                 team_name                                 key  \
0           Crystal Palace           2022-08-05_Crystal Palace   
1  Wolverhampton Wanderers  2022-08-06_Wolverhampton Wanderers   
2           Leicester City           2022-08-07_Leicester City   
3         Newcastle United         2022-08-06_Newcastle United   
4              Southampton              2022-08-06_Southampton   

                                       metadata_path tracking_suffix  \
0  metadata_SecondSpectrum/g2292810_SecondSpectru...        g2292810   
1  metadata_Se

In [None]:
# Add absolute seconds column
def add_event_timestamps(events_df):
    events_df["minute"] = events_df["minute"].fillna(0).astype(int)
    events_df["second"] = events_df["second"].fillna(0).astype(int)

    if "milliseconds" in events_df.columns:
        events_df["milliseconds"] = events_df["milliseconds"].fillna(0).astype(int)
    else:
        events_df["milliseconds"] = 0

    events_df["seconds_period"] = (
        events_df["minute"] * 60 +
        events_df["second"] +
        events_df["milliseconds"] / 1000
    )

    return events_df

def match_events_to_frames(events_df, frames_df_match):
    """
    Assigns nearest frameIdx to each event.
    """
    period_to_frame_times = {}
    for period, group in frames_df_match.groupby("period"):
        period_to_frame_times[period] = group[["seconds_period", "frameIdx"]].sort_values("seconds_period")

    assigned_frames = []
    for _, e in events_df.iterrows():
        period = e["period"]
        seconds_event = e["seconds_period"]

        if period not in period_to_frame_times:
            assigned_frames.append(None)
            continue

        times = period_to_frame_times[period]["seconds_period"].values
        frames = period_to_frame_times[period]["frameIdx"].values

        # Find closest frame
        idx = np.argmin(np.abs(times - seconds_event))
        assigned_frame = frames[idx]
        assigned_frames.append(assigned_frame)

    events_df["frameIdx"] = assigned_frames
    return events_df


def find_events_during_run(run_df, events_df):
    """
    Given a single run DataFrame and events_df
    returns list of events that overlap this run
    """
    period = run_df["period"].iloc[0]
    start_frame = run_df["frameIdx"].min()
    end_frame = run_df["frameIdx"].max()

    overlapping_events = events_df[
        (events_df["period"] == period) &
        (events_df["frameIdx"] >= start_frame) &
        (events_df["frameIdx"] <= end_frame)
    ]
    return overlapping_events

# for _, row in event_tracking_df_clean.iterrows():
#     events_path = row["events_file"]
#     tracking_suffix = row["tracking_suffix"]
#     tracking_match_id = f"tracking_{tracking_suffix}"

#     # Slice tracking data for this match
#     frames_df_match = frames_df[frames_df["match_id"] == tracking_match_id]
#     players_df_match = players_df[players_df["match_id"] == tracking_match_id]

#     print("=" * 100)
#     print(f" MATCH FOUND:")
#     print(f"→ Events file:   {events_path}")
#     print(f"→ Tracking file: {tracking_match_id}.json.gz")
#     print(f"→ Match Date:    {row['match_date']}")
#     print(f"→ Home Team:     {row['home_team']}")
#     print(f"→ Away Team:     {row['away_team']}")
#     print(f"→ Number of tracking frames: {len(frames_df_match)}")
#     print(f"→ Number of tracking player rows: {len(players_df_match)}")
#     print("=" * 100)

#     # Load events JSON
#     with open(events_path, "r", encoding="utf-8-sig") as f:
#         event_rows = [json.loads(line) for line in f]

#     events_df = pd.DataFrame(event_rows)
#     #print(events_df.head())

#         # Add absolute seconds column
#     events_df = add_event_timestamps(events_df)

#     # Compute seconds for frames
#     PERIOD_DURATION_SEC = 45 * 60
#     frames_df_match["seconds_period"] = PERIOD_DURATION_SEC - frames_df_match["gameClock"]

#     # Assign nearest frameIdx to each event
#     events_df = match_events_to_frames(events_df, frames_df_match)

#     # Find overlapping events for each run in this match
#     runs_in_match = final_runs_df[final_runs_df["match_id"] == tracking_match_id]

#     for run_id, run_df in runs_in_match.groupby("run_id"):
#         overlapping_events = find_events_during_run(run_df, events_df)

#         if not overlapping_events.empty:
#             print(f"Run {run_id} overlaps with {len(overlapping_events)} events:")
#             print(overlapping_events[["type", "minute", "second", "player", "team"]].head())

events_dfs_by_match = {}

for _, row in event_tracking_df_clean.iterrows():
    events_path = row["events_file"]
    tracking_suffix = row["tracking_suffix"]
    tracking_match_id = f"tracking_{tracking_suffix}"

    # Slice tracking data for this match
    frames_df_match = frames_df[frames_df["match_id"] == tracking_match_id]
    players_df_match = players_df[players_df["match_id"] == tracking_match_id]

    print("=" * 100)
    print(f" MATCH FOUND:")
    print(f"→ Events file:   {events_path}")
    print(f"→ Tracking file: {tracking_match_id}.json.gz")
    print(f"→ Match Date:    {row['match_date']}")
    print(f"→ Home Team:     {row['home_team']}")
    print(f"→ Away Team:     {row['away_team']}")
    print(f"→ Number of tracking frames: {len(frames_df_match)}")
    print(f"→ Number of tracking player rows: {len(players_df_match)}")
    print("=" * 100)

    # Load events JSON
    with open(events_path, "r", encoding="utf-8-sig") as f:
        event_rows = [json.loads(line) for line in f]
    events_df = pd.DataFrame(event_rows)
    
    # add timestamps
    events_df = add_event_timestamps(events_df)

    # attach frameIdx
    frames_df_match["seconds_period"] = 45*60 - frames_df_match["gameClock"]
    events_df = match_events_to_frames(events_df, frames_df_match)

    # store this DataFrame for later:
    events_dfs_by_match[tracking_match_id] = events_df


    
    
# REMEMBER TO ADD THIS AFTER FINAL_RUNS_DF IS DEFINED
# for match_id, events_df in events_dfs_by_match.items():
#     runs_in_match = final_runs_df[final_runs_df["match_id"] == match_id]

#     for run_id, run_df in runs_in_match.groupby("run_id"):
#         overlapping_events = find_events_during_run(run_df, events_df)

#         if not overlapping_events.empty:
#             print(f"Run {run_id} overlaps with {len(overlapping_events)} events:")
#             print(overlapping_events[["type", "minute", "second", "player", "team"]].head())

 MATCH FOUND:
→ Events file:   statsbomb_pl_data/3837230.json
→ Tracking file: tracking_g2292810.json.gz
→ Match Date:    2022-08-05
→ Home Team:     CRY
→ Away Team:     ARS
→ Number of tracking frames: 0
→ Number of tracking player rows: 0
 MATCH FOUND:
→ Events file:   statsbomb_pl_data/3837231.json
→ Tracking file: tracking_g2292814.json.gz
→ Match Date:    2022-08-06
→ Home Team:     LEE
→ Away Team:     WOL
→ Number of tracking frames: 0
→ Number of tracking player rows: 0
 MATCH FOUND:
→ Events file:   statsbomb_pl_data/3837232.json
→ Tracking file: tracking_g2292815.json.gz
→ Match Date:    2022-08-07
→ Home Team:     LEI
→ Away Team:     BRE
→ Number of tracking frames: 0
→ Number of tracking player rows: 0
 MATCH FOUND:
→ Events file:   statsbomb_pl_data/3837233.json
→ Tracking file: tracking_g2292816.json.gz
→ Match Date:    2022-08-06
→ Home Team:     NEW
→ Away Team:     NOT
→ Number of tracking frames: 0
→ Number of tracking player rows: 0
 MATCH FOUND:
→ Events file:   s

In [75]:
sample_match_id = list(events_dfs_by_match.keys())[0]

# Grab its DataFrame
events_df = events_dfs_by_match[sample_match_id]

# View basic info
#print(events_df.head(20))
print(events_df.columns)
print(events_df.shape)

Index(['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type',
       'possession', 'possession_team', 'play_pattern', 'obv_for_after',
       'obv_for_before', 'obv_for_net', 'obv_against_after',
       'obv_against_before', 'obv_against_net', 'obv_total_net', 'team',
       'duration', 'tactics', 'load_datetime', 'user', 'match_date',
       'match_id', 'pipeline_run_id', 'extraction_timestamp', 'related_events',
       'player', 'position', 'location', 'pass', 'carry', 'ball_receipt',
       'under_pressure', 'duel', 'counterpress', 'interception', 'out',
       'dribble', 'ball_recovery', 'off_camera', 'shot', 'goalkeeper',
       'clearance', 'block', 'foul_committed', 'foul_won', 'miscontrol',
       'substitution', 'milliseconds', 'seconds_period', 'frameIdx'],
      dtype='object')
(3806, 52)


In [76]:
# Step 1: Extract suffixes like "g2444470" from used tracking files
used_match_suffixes = [match_id.split("_", 1)[1].replace(".json", "") for match_id in used_match_ids]

# Step 2: Gather metadata file paths from both formats
metadata_dir = pathlib.Path("metadata_SecondSpectrum")
all_metadata_files = list(metadata_dir.glob("*.json"))

# Build map: match_suffix (e.g., "g2444470") → metadata_path
metadata_file_map = {}
for path in all_metadata_files:
    filename = path.name
    if filename.startswith("metadata_g") and filename.endswith(".json"):
        suffix = filename.split("_")[1].split(".")[0]  # 'g2444470'
    elif filename.endswith("_SecondSpectrum_Metadata.json"):
        suffix = filename.split("_")[0]  # 'g2444470'
    else:
        continue  # skip non-matching files
    metadata_file_map[suffix] = path

# Build a lookup DataFrame linking tracking suffixes to match date and teams
tracking_meta_records = []
for suffix, metadata_path in metadata_file_map.items():
    with open(metadata_path, "r", encoding="utf-8-sig") as f:
        meta = json.load(f)
    tracking_meta_records.append({
        "match_date": meta.get("matchDate"),
        "home_team": meta.get("homeTeamName"),
        "away_team": meta.get("awayTeamName"),
        "tracking_suffix": suffix,
        "metadata_path": str(metadata_path),
    })


# Step 3: Load metadata and build lookup for used matches
opta_meta_lookup = {}

for suffix in used_match_suffixes:
    metadata_path = metadata_file_map.get(suffix)
    if not metadata_path:
        print(f" No metadata found for match {suffix}")
        continue

    with open(metadata_path, "r", encoding="utf-8-sig") as f:
        meta = json.load(f)

    match_id = f"tracking_{suffix}"  # same format as tracking match_id

    for side, team in [("homePlayers", "home"), ("awayPlayers", "away")]:
        for p in meta.get(side, []):
            key = (match_id, str(p["optaId"]))
            opta_meta_lookup[key] = {
                "player_name": p.get("name"),
                "position": p.get("position"),
                "team_role": team,
            }

print(f" Loaded metadata for {len(opta_meta_lookup)} players.")

meta_df = pd.DataFrame([
    {
        "match_id": match_id,
        "optaId": opta_id,
        "player_name": info["player_name"],
        "position": info["position"],
        "team_role": info["team_role"],
    }
    for (match_id, opta_id), info in opta_meta_lookup.items()
])

players_df["match_id_clean"] = players_df["match_id"].str.replace(".json", "", regex=False)

# Merge using match_id and optaId as keys
players_df = players_df.merge(
    meta_df,
    how="left",
    left_on=["match_id_clean", "optaId"],
    right_on=["match_id", "optaId"]
)

players_df.drop(columns=["match_id_clean", "match_id_y"], inplace=True)
players_df.rename(columns={"match_id_x": "match_id"}, inplace=True)


 Loaded metadata for 40 players.


In [77]:
# Segmenting Runs (across multiple matches)
def segment_runs(players_df, speed_threshold=2.0):
    """
    Segments continuous runs for each player within each match and period
    when speed exceeds a threshold.
    """
    runs = []
    for (match_id, period, playerId), group in players_df.groupby(["match_id", "period", "playerId"]):
        group = group.sort_values("frameIdx")
        current_run = []
        for _, row in group.iterrows():
            if row["speed"] > speed_threshold:
                current_run.append(row)
            elif current_run:
                runs.append(pd.DataFrame(current_run))
                current_run = []
        if current_run:
            runs.append(pd.DataFrame(current_run))
    return runs

def filter_off_ball_runs_with_distance(runs_list, frames_df, players_df, min_distance=3.0):
    """
    Filters runs to keep only those where:
    - The player never touched the ball (not lastTouch)
    - The player is always at least `min_distance` away from the ball
    """
    frame_last_touch = frames_df.set_index(["match_id", "period", "frameIdx"])["lastTouch_team"].to_dict()
    ball_positions = frames_df.set_index(["match_id", "period", "frameIdx"])[["ball_x", "ball_y"]].to_dict("index")
    
    off_ball_runs = []

    for run_df in runs_list:
        player_id = run_df["playerId"].iloc[0]
        match_id = run_df["match_id"].iloc[0]
        period = run_df["period"].iloc[0]
        frame_idxs = run_df["frameIdx"].values

        is_off_ball = True
        for frame_idx in frame_idxs:
            key = (match_id, period, frame_idx)

            # Check lastTouch
            if frame_last_touch.get(key) == player_id:
                is_off_ball = False
                break

            # Check distance from ball
            ball_pos = ball_positions.get(key)
            if ball_pos is None:
                continue  # Skip frames with missing ball info

            player_pos = run_df[run_df["frameIdx"] == frame_idx][["x", "y"]].values
            if player_pos.size == 0:
                continue

            dist = np.linalg.norm(player_pos[0] - np.array([ball_pos["ball_x"], ball_pos["ball_y"]]))
            if dist < min_distance:
                is_off_ball = False
                break

        if is_off_ball:
            off_ball_runs.append(run_df)

    return off_ball_runs

runs_list = segment_runs(players_df)
print(f"Total runs segmented: {len(runs_list)}")

runs_list = filter_off_ball_runs_with_distance(runs_list, frames_df, players_df, min_distance=3.0)
print(f"Total off-ball runs (with min distance): {len(runs_list)}")

# Annotate each run with player metadata
annotated_runs = []

for run_df in runs_list:
    # Make a copy of the run to avoid modifying in-place
    run_df = run_df.copy()

    # Extract metadata from the first row (same for entire run)
    meta_fields = ["playerId", "optaId", "match_id", "player_name", "position", "team_role"]
    for field in meta_fields:
        run_df[field] = run_df.iloc[0][field]

    annotated_runs.append(run_df)

# Assign a unique run_id to each run
for i, run_df in enumerate(annotated_runs):
    run_df["run_id"] = i

# Optional: Combine into one dataframe
all_runs_df = pd.concat(annotated_runs, ignore_index=True)

# Preview
print(all_runs_df[["player_name", "position", "team_role", "x", "y", "speed"]].head())

Total runs segmented: 11272
Total off-ball runs (with min distance): 8881
          player_name position team_role     x     y  speed
0  Gabriel Martinelli       LW      away  2.93 -7.94   2.01
1  Gabriel Martinelli       LW      away  2.85 -7.91   2.07
2  Gabriel Martinelli       LW      away  2.77 -7.89   2.07
3  Gabriel Martinelli       LW      away  2.69 -7.87   2.07
4  Gabriel Martinelli       LW      away  2.61 -7.85   2.08


In [78]:
# Def mirror function
def mirror_group(group):

    y_mean = group["y"].mean()
    if y_mean < 0:
        group["y_mirror"] = -group["y"]
    else:
        group["y_mirror"] = group["y"]
    group["x_mirror"] = group["x"]


    return group

def should_flip_x(team_role, period):
    """
    Returns True if this team in this period attacks right-to-left.
    """
    if period == 1:
        return team_role == "away"
    elif period == 2:
        return team_role == "home"
    else:
        return False  # Just in case

# Apply mirroring per run
all_runs_df = all_runs_df.groupby("run_id", group_keys=False).apply(mirror_group)

  all_runs_df = all_runs_df.groupby("run_id", group_keys=False).apply(mirror_group)


In [79]:
# Step 1: Precompute centroids for all frames and both teams (excluding goalkeeper)
players_df["number"] = players_df["number"].astype(int)

centroid_dict = {}
for (match_id, period, frame_idx), group in players_df.groupby(["match_id", "period", "frameIdx"]):
    for side in ["home", "away"]:
        team_players = group[(group["side"] == side) & (group["number"] != 1)]
        centroid = team_players[["x", "y"]].mean().values if not team_players.empty else np.array([0.0, 0.0])
        centroid_dict[(match_id, period, frame_idx, side)] = centroid

print(f"Centroids computed for {len(centroid_dict)} frame-side combinations.")
print(list(centroid_dict.items())[:5])  # show first few for inspection

frame_last_touch_team = frames_df.set_index(["match_id", "period", "frameIdx"])["lastTouch_team"].to_dict()
player_side_lookup = players_df.set_index(["match_id", "period", "frameIdx", "playerId"])["side"].to_dict()

adjusted_runs_list = []

grouped = all_runs_df.groupby(["match_id", "period", "playerId", "run_id"], group_keys=False)

for _, run_df in grouped:
    run_df = run_df.sort_values("frameIdx")
    match_id = run_df["match_id"].iloc[0]
    period = run_df["period"].iloc[0]
    start_frame = run_df["frameIdx"].iloc[0]

    team_centroid = np.array([0.0, 0.0])  # fallback default

    key = (match_id, period, start_frame)
    possession_side = frame_last_touch_team.get(key)

    team_role = run_df["team_role"].iloc[0]

    if possession_side is None: 
        in_possession = np.nan
        phase_of_play = np.nan
    else: 
        in_possession = (team_role == possession_side)
        phase_of_play = "attack" if in_possession else "defend" 

    run_df["in_possession"] = in_possession
    run_df["phase_of_play"] = phase_of_play

    # if possession_side is not None:
    #     team_centroid = centroid_dict.get((match_id, period, start_frame, possession_side), team_centroid)

    # Always compute centroid for this player's own team. As opposed to above where we used possession side.
    team_centroid = centroid_dict.get(
        (match_id, period, start_frame, team_role),
        np.array([0.0, 0.0])
    )

    #print(team_centroid)

    run_df["x_c"] = run_df["x"] - team_centroid[0]
    run_df["y_c"] = run_df["y"] - team_centroid[1]
    run_df["x_mirror_c"] = run_df["x_mirror"] - team_centroid[0]
    run_df["y_mirror_c"] = run_df["y_mirror"] - team_centroid[1]

    flip_x = should_flip_x(team_role, period)
    if flip_x:
        run_df["x_mirror"] = -run_df["x_mirror"]
        run_df["x_mirror_c"] = -run_df["x_mirror_c"]

    adjusted_runs_list.append(run_df)

final_runs_df = pd.concat(adjusted_runs_list, ignore_index=True)

Centroids computed for 284880 frame-side combinations.
[(('tracking_g2292810.json', 1, 0, 'home'), array([-9.79272727,  8.73818182])), (('tracking_g2292810.json', 1, 0, 'away'), array([9.522, 9.349])), (('tracking_g2292810.json', 1, 1, 'home'), array([-9.78818182,  8.74090909])), (('tracking_g2292810.json', 1, 1, 'away'), array([9.523, 9.355])), (('tracking_g2292810.json', 1, 2, 'home'), array([-9.77909091,  8.74181818]))]


In [65]:
missing = event_tracking_df[event_tracking_df["tracking_suffix"].isna()]
print("🚫 Events without tracking match:")
print(missing[["events_file", "team_name"]])

🚫 Events without tracking match:
Empty DataFrame
Columns: [events_file, team_name]
Index: []


In [54]:
print(tracking_meta_df[["match_date", "home_team", "away_team", "home_team_full", "away_team_full"]])

      match_date home_team away_team     home_team_full    away_team_full
0     2024-10-05       EVE       NEW            Everton  Newcastle United
1     2022-08-30       CRY       BRE     Crystal Palace         Brentford
2     2024-08-17       IPS       LIV       Ipswich Town         Liverpool
3     2023-03-12       FUL       ARS             Fulham           Arsenal
4     2024-01-30       FUL       EVE             Fulham           Everton
...          ...       ...       ...                ...               ...
1135  2025-01-19       NOT       SOU  Nottingham Forest       Southampton
1136  2025-02-22       AVL       CHE        Aston Villa           Chelsea
1137  2023-01-14       BRE       BOU          Brentford   AFC Bournemouth
1138  2025-01-04       SOU       BRE        Southampton         Brentford
1139  2022-08-13       ARS       LEI            Arsenal    Leicester City

[1140 rows x 5 columns]


In [55]:
missing_home = tracking_meta_df.loc[
    tracking_meta_df["home_team"].notna() &
    tracking_meta_df["home_team_full"].isna(),
    "home_team"
].unique()

missing_away = tracking_meta_df.loc[
    tracking_meta_df["away_team"].notna() &
    tracking_meta_df["away_team_full"].isna(),
    "away_team"
].unique()

print("Missing home team codes:", missing_home)
print("Missing away team codes:", missing_away)

Missing home team codes: []
Missing away team codes: []


In [56]:
print(event_meta_exploded["team_name"].unique())
print()
print(print(tracking_meta_long["team_name"].unique()))

print("Number of unique teams in event metadata:", len(event_meta_exploded["team_name"].unique()))
print("Number of unique teams in tracking metadata:", len(tracking_meta_long["team_name"].unique()))

missing_in_tracking = set(event_meta_exploded["team_name"].unique()) - set(tracking_meta_long["team_name"].unique())
print("Teams in events but missing in tracking:", missing_in_tracking)

['Crystal Palace' 'Arsenal' 'Wolverhampton Wanderers' 'Leeds United'
 'Leicester City' 'Brentford' 'Newcastle United' 'Nottingham Forest'
 'Southampton' 'Tottenham Hotspur' 'Everton' 'Chelsea' 'Liverpool'
 'Fulham' 'AFC Bournemouth' 'Aston Villa' 'Manchester City'
 'West Ham United' 'Brighton & Hove Albion' 'Manchester United' 'Burnley'
 'Sheffield United' 'Luton Town' 'Ipswich Town']

['Everton' 'Crystal Palace' 'Ipswich Town' 'Fulham' 'Brentford'
 'AFC Bournemouth' 'Liverpool' 'Sheffield United' 'Southampton'
 'Manchester United' 'Tottenham Hotspur' 'West Ham United' 'Leeds United'
 'Newcastle United' 'Chelsea' 'Aston Villa' 'Wolverhampton Wanderers'
 'Manchester City' 'Nottingham Forest' 'Brighton & Hove Albion' 'Arsenal'
 'Leicester City' 'Luton Town' 'Burnley']
None
Number of unique teams in event metadata: 24
Number of unique teams in tracking metadata: 24
Teams in events but missing in tracking: set()
