In [None]:
import json, gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pathlib
from sklearn.cluster import KMeans
from scipy.special import comb
from copy import deepcopy
import random
from tqdm import tqdm
import seaborn as sns


In [None]:
TRACKING_DIR = pathlib.Path("tracking-compressed")
json_gz_paths = sorted(TRACKING_DIR.glob("tracking_*.json.gz"))

n_files = 1  # or set to len(json_gz_paths) to load all

frames = []          # one record per tracking frame (clock, last touch, ball position)
players = []         # one record per player per tracking frame
used_match_ids = []  # match ids of the files that were actually loaded

for json_gz_path in json_gz_paths[:n_files]:
    # Path.stem only strips the final suffix, so "tracking_g2444470.json.gz"
    # yields "tracking_g2444470.json". Later cells strip ".json" where they
    # need the bare id, so the value is kept as-is here.
    match_id = json_gz_path.stem
    used_match_ids.append(match_id)

    # Stream the file line by line instead of buffering every parsed record
    # first; each line is one JSON document describing one frame.
    with gzip.open(json_gz_path, "rt", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing lines instead of crashing in json.loads

            r = json.loads(line)

            frames.append({
                "match_id": match_id,
                "period": r["period"],
                "frameIdx": r["frameIdx"],
                "gameClock": r["gameClock"],
                "lastTouch_team": r["lastTouch"],
                "ball_x": r["ball"]["xyz"][0],
                "ball_y": r["ball"]["xyz"][1],
                "ball_z": r["ball"]["xyz"][2],
            })

            for side in ["homePlayers", "awayPlayers"]:
                for p in r[side]:
                    px, py, pz = p["xyz"]
                    players.append({
                        "match_id": match_id,
                        "period": r["period"],
                        "frameIdx": r["frameIdx"],
                        "side": "home" if side == "homePlayers" else "away",
                        "playerId": p["playerId"],
                        # stored as str so it can be joined against the metadata lookup
                        "optaId": str(p["optaId"]),
                        "number": p["number"],
                        "x": px, "y": py, "z": pz,
                        "speed": p["speed"],
                    })

# Convert to DataFrames
frames_df = pd.DataFrame(frames)
players_df = pd.DataFrame(players)

In [None]:
# Step 1: Extract suffixes like "g2444470" from used tracking files
used_match_suffixes = [match_id.split("_", 1)[1].replace(".json", "") for match_id in used_match_ids]

# Step 2: Gather metadata file paths from both naming formats
metadata_dir = pathlib.Path("metadata_SecondSpectrum")
all_metadata_files = list(metadata_dir.glob("*.json"))

# Build map: match_suffix (e.g., "g2444470") → metadata_path
metadata_file_map = {}
for path in all_metadata_files:
    filename = path.name
    if filename.startswith("metadata_g") and filename.endswith(".json"):
        suffix = filename.split("_")[1].split(".")[0]  # 'g2444470'
    elif filename.endswith("_SecondSpectrum_Metadata.json"):
        suffix = filename.split("_")[0]  # 'g2444470'
    else:
        continue  # skip non-matching files
    metadata_file_map[suffix] = path

# Step 3: Load metadata and build lookup for used matches
opta_meta_lookup = {}

for suffix in used_match_suffixes:
    metadata_path = metadata_file_map.get(suffix)
    if not metadata_path:
        print(f" No metadata found for match {suffix}")
        continue

    # utf-8-sig transparently drops a leading BOM if the file has one
    with open(metadata_path, "r", encoding="utf-8-sig") as f:
        meta = json.load(f)

    match_id = f"tracking_{suffix}"  # tracking match_id without the ".json" part

    for side, team in [("homePlayers", "home"), ("awayPlayers", "away")]:
        for p in meta.get(side, []):
            key = (match_id, str(p["optaId"]))
            opta_meta_lookup[key] = {
                "player_name": p.get("name"),
                "position": p.get("position"),
                "team_role": team,
            }

print(f" Loaded metadata for {len(opta_meta_lookup)} players.")

# Explicit columns keep the frame mergeable even when no metadata was found
# (an empty list of records would otherwise give a frame with no columns and
# the merge below would fail with a KeyError).
metadata_fields = ["player_name", "position", "team_role"]
meta_df = pd.DataFrame(
    [
        {
            "match_id": match_id,
            "optaId": opta_id,
            "player_name": info["player_name"],
            "position": info["position"],
            "team_role": info["team_role"],
        }
        for (match_id, opta_id), info in opta_meta_lookup.items()
    ],
    columns=["match_id", "optaId"] + metadata_fields,
)

# Left-join the metadata onto every tracking row using (match id without
# ".json", optaId). Dropping any existing metadata columns first makes the
# cell safe to re-run without accumulating *_x / *_y columns.
players_df = (
    players_df
    .drop(columns=metadata_fields, errors="ignore")
    .assign(match_id_clean=lambda df: df["match_id"].str.replace(".json", "", regex=False))
    .merge(
        meta_df.rename(columns={"match_id": "match_id_clean"}),
        how="left",
        on=["match_id_clean", "optaId"],
    )
    .drop(columns="match_id_clean")
)



In [None]:
# Segmenting Runs (across multiple matches)
def segment_runs(players_df, speed_threshold=2.0):
    """
    Segment continuous runs for each player within each match and period.

    A run is a maximal sequence of consecutive frames (ordered by ``frameIdx``)
    in which the player's ``speed`` is strictly greater than ``speed_threshold``.

    Parameters
    ----------
    players_df : pd.DataFrame
        Must contain ``match_id``, ``period``, ``playerId``, ``frameIdx`` and
        ``speed`` columns; all other columns are carried through unchanged.
    speed_threshold : float
        Speed above which a frame counts as running.

    Returns
    -------
    list[pd.DataFrame]
        One frame per run, ordered by (match_id, period, playerId) and then
        chronologically. Each keeps the original row labels, columns and dtypes.
    """
    runs = []
    for _, group in players_df.groupby(["match_id", "period", "playerId"]):
        group = group.sort_values("frameIdx")

        # NaN speeds compare False, so they break a run just like slow frames.
        is_running = (group["speed"] > speed_threshold).to_numpy(dtype=bool)
        if not is_running.any():
            continue

        # A run starts wherever a running frame follows a non-running one
        # (or opens the group); the cumulative sum then labels each run.
        previous_running = np.concatenate(([False], is_running[:-1]))
        run_labels = np.cumsum(is_running & ~previous_running)

        # Slicing by label avoids the per-row iterrows() loop, which is very
        # slow on tracking data and rebuilds every run from row Series.
        running_rows = group[is_running]
        for _, run in running_rows.groupby(run_labels[is_running]):
            runs.append(run.copy())
    return runs

def filter_off_ball_runs_with_distance(runs_list, frames_df, players_df, min_distance=3.0):
    """
    Keep only the runs that stay away from the ball.

    A run is dropped as soon as one of its frames either
    - has a last-touch value equal to the run's ``playerId``, or
    - places the player closer than ``min_distance`` to the ball (x/y plane).

    Frames with no entry in ``frames_df`` are skipped for the distance check.

    Parameters
    ----------
    runs_list : list[pd.DataFrame]
        Runs with ``match_id``, ``period``, ``frameIdx``, ``playerId``, ``x``, ``y``.
    frames_df : pd.DataFrame
        Per-frame data with ``match_id``, ``period``, ``frameIdx``,
        ``lastTouch_team``, ``ball_x`` and ``ball_y``.
    players_df : pd.DataFrame
        Unused; kept so existing callers do not break.
    min_distance : float
        Minimum allowed player-to-ball distance over the whole run.

    Returns
    -------
    list[pd.DataFrame]
        The runs that passed both checks, in input order.
    """
    indexed_frames = frames_df.set_index(["match_id", "period", "frameIdx"])
    frame_last_touch = indexed_frames["lastTouch_team"].to_dict()
    ball_positions = indexed_frames[["ball_x", "ball_y"]].to_dict("index")

    off_ball_runs = []

    for run_df in runs_list:
        if run_df.empty:
            continue  # nothing to evaluate; iloc[0] below would raise

        player_id = run_df["playerId"].iloc[0]
        match_id = run_df["match_id"].iloc[0]
        period = run_df["period"].iloc[0]

        # Walk frame index and position together so each frame costs O(1)
        # instead of re-filtering the whole run for every frame.
        run_frames = zip(
            run_df["frameIdx"].to_numpy(),
            run_df["x"].to_numpy(),
            run_df["y"].to_numpy(),
        )

        is_off_ball = True
        for frame_idx, player_x, player_y in run_frames:
            key = (match_id, period, frame_idx)

            # NOTE(review): the column is named lastTouch_team and is compared
            # with team_role elsewhere in this notebook, so it presumably holds
            # a team side rather than a player id. If so this check never
            # fires. Confirm against the data before relying on it.
            if frame_last_touch.get(key) == player_id:
                is_off_ball = False
                break

            ball_pos = ball_positions.get(key)
            if ball_pos is None:
                continue  # Skip frames with missing ball info

            dist = np.hypot(player_x - ball_pos["ball_x"], player_y - ball_pos["ball_y"])
            if dist < min_distance:
                is_off_ball = False
                break

        if is_off_ball:
            off_ball_runs.append(run_df)

    return off_ball_runs

# Segment every player's tracking rows into runs, then keep the off-ball ones.
runs_list = segment_runs(players_df)
print(f"Total runs segmented: {len(runs_list)}")

runs_list = filter_off_ball_runs_with_distance(runs_list, frames_df, players_df, min_distance=3.0)
print(f"Total off-ball runs (with min distance): {len(runs_list)}")

# Per-run constants: every row of a run gets the value found on its first row.
meta_fields = ["playerId", "optaId", "match_id", "player_name", "position", "team_role"]

annotated_runs = []
for i, source_run in enumerate(runs_list):
    # Work on a copy so the frames held in runs_list are left untouched
    run_df = source_run.copy()

    first_row = run_df.iloc[0]
    for field in meta_fields:
        run_df[field] = first_row[field]

    # Position in runs_list doubles as the unique run identifier
    run_df["run_id"] = i
    annotated_runs.append(run_df)

# Stack all annotated runs into a single long frame
all_runs_df = pd.concat(annotated_runs, ignore_index=True)

# Preview
print(all_runs_df[["player_name", "position", "team_role", "x", "y", "speed"]].head())

In [None]:
# Def mirror function
def mirror_group(group):
    """
    Add ``y_mirror`` and ``x_mirror`` columns to one run (modified in place).

    If the run's mean ``y`` is negative, ``y_mirror`` is ``-y``; otherwise it
    is ``y`` unchanged. ``x_mirror`` is always a copy of ``x``.
    Returns the same ``group`` object.
    """
    lies_on_negative_side = group["y"].mean() < 0
    group["y_mirror"] = -group["y"] if lies_on_negative_side else group["y"]
    group["x_mirror"] = group["x"]
    return group

def should_flip_x(team_role, period):
    """
    Tell whether a team's x coordinates should be flipped in a given period.

    True for the away team in period 1 and for the home team in period 2;
    False for every other combination, including any other period value.
    """
    if period == 1:
        flipped_role = "away"
    elif period == 2:
        flipped_role = "home"
    else:
        # Periods other than 1 and 2 are never flipped
        return False
    return team_role == flipped_role

# Apply mirroring per run.
# Same rule as mirror_group (negate y when the run's mean y is negative,
# x copied as-is), computed with a grouped transform instead of
# groupby(...).apply(...). Applying a function to groups that include the
# grouping column is deprecated in recent pandas, and once the grouping column
# is excluded the result would lose "run_id", which later cells group by.
_run_mean_y = all_runs_df.groupby("run_id")["y"].transform("mean")
all_runs_df = all_runs_df.assign(
    # where() keeps y when the condition holds and uses -y otherwise;
    # a NaN mean is not "< 0", so such runs are left unmirrored.
    y_mirror=all_runs_df["y"].where(~(_run_mean_y < 0), -all_runs_df["y"]),
    x_mirror=all_runs_df["x"],
)

In [None]:
# THIS IS FOR CENTROID CALCULATION FOR PERSON PERFORMING RUN 

# # Precompute centroids for all frames and both teams
# centroid_dict = {}  # (match_id, period, frameIdx, team_side) -> centroid np.array

# for (match_id, period, frame_idx), group in players_df.groupby(["match_id", "period", "frameIdx"]):
#     for side in ["home", "away"]:
#         team_players = group[(group["side"] == side) & (group["number"] != 1)]
#         if not team_players.empty:
#             centroid = team_players[["x", "y"]].mean().values
#         else:
#             centroid = np.array([0.0, 0.0])
#         centroid_dict[(match_id, period, frame_idx, side)] = centroid

# # Precompute playerId → side map for each (match_id, period, frameIdx)
# player_side_lookup = players_df.set_index(["match_id", "period", "frameIdx", "playerId"])["side"].to_dict()

# # Convert frames_df for fast access to lastTouch per frame
# frame_last_touch = frames_df.set_index(["match_id", "period", "frameIdx"])["lastTouch"].to_dict()

# adjusted_runs_list = []

# # Now adjust each run
# for run_df in runs_list:
#     run_df = run_df.sort_values("frameIdx")
#     match_id = run_df["match_id"].iloc[0]
#     period = run_df["period"].iloc[0]
#     start_frame = run_df["frameIdx"].iloc[0]

#     team_centroid = np.array([0.0, 0.0])  # fallback

#     key = (match_id, period, start_frame)
#     first_player = run_df["playerId"].iloc[0]
#     side = player_side_lookup.get((match_id, period, start_frame, first_player))

#     if side is not None:
#         team_centroid = centroid_dict.get((match_id, period, start_frame, side), team_centroid)

#     run_df["x_c"] = run_df["x"] - team_centroid[0]
#     run_df["y_c"] = run_df["y"] - team_centroid[1]
#     run_df["x_mirror_c"] = run_df["x_mirror"] - team_centroid[0]
#     run_df["y_mirror_c"] = run_df["y_mirror"] - team_centroid[1]

#     adjusted_runs_list.append(run_df)

# THIS IS FOR CENTROID CALCULATION ACCORDING TO WHOMEVER IS IN POSSESSION

# # Precompute centroids for all frames and both teams
# centroid_dict = {}  # (match_id, period, frameIdx, team_side) -> centroid np.array

# for (match_id, period, frame_idx), group in players_df.groupby(["match_id", "period", "frameIdx"]):
#     for side in ["home", "away"]:
#         team_players = group[(group["side"] == side) & (group["number"] != 1)]
#         if not team_players.empty:
#             centroid = team_players[["x", "y"]].mean().values
#         else:
#             centroid = np.array([0.0, 0.0])
#         centroid_dict[(match_id, period, frame_idx, side)] = centroid

# # Precompute playerId → side map for each (match_id, period, frameIdx)
# player_side_lookup = players_df.set_index(["match_id", "period", "frameIdx", "playerId"])["side"].to_dict()

# # Convert frames_df for fast access to lastTouch per frame
# frame_last_touch = frames_df.set_index(["match_id", "period", "frameIdx"])["lastTouch"].to_dict()

# adjusted_runs_list = []

# grouped = all_runs_df.groupby(["match_id", "period", "playerId", "run_id"])  # assuming you added a unique run_id

# for _, run_df in grouped:
#     run_df = run_df.sort_values("frameIdx")
#     match_id = run_df["match_id"].iloc[0]
#     match_id = match_id.replace(".json", "")
#     period = run_df["period"].iloc[0]
#     start_frame = run_df["frameIdx"].iloc[0]

#     team_centroid = np.array([0.0, 0.0])  # fallback if no possession info


#     key = (match_id, period, start_frame)
#     if key not in centroid_dict:
#         print("Missing key:", key)
#     last_touch_player = frame_last_touch.get(key)

#     if last_touch_player is not None:
#         possession_side = player_side_lookup.get((match_id, period, start_frame, last_touch_player))
#         if possession_side is not None:
#             team_centroid = centroid_dict.get((match_id, period, start_frame, possession_side), team_centroid)
    
#     print(team_centroid)

#     run_df["x_c"] = run_df["x"] - team_centroid[0]
#     run_df["y_c"] = run_df["y"] - team_centroid[1]
#     run_df["x_mirror_c"] = run_df["x_mirror"] - team_centroid[0]
#     run_df["y_mirror_c"] = run_df["y_mirror"] - team_centroid[1]

#     adjusted_runs_list.append(run_df)

# # Combine into a final DataFrame
# final_runs_df = pd.concat(adjusted_runs_list, ignore_index=True)


In [None]:
# Precompute centroids for all frames and both teams (excluding goalkeeper)
# centroid_dict = {}
# for (match_id, period, frame_idx), group in players_df.groupby(["match_id", "period", "frameIdx"]):
#     for side in ["home", "away"]:
#         team_players = group[(group["side"] == side) & (group["number"] != 1)]
#         centroid = team_players[["x", "y"]].mean().values if not team_players.empty else np.array([0.0, 0.0])
#         centroid_dict[(match_id, period, frame_idx, side)] = centroid

# Step 1: Precompute centroids for all frames and both teams (excluding goalkeeper)
players_df["number"] = players_df["number"].astype(int)

_frame_keys = ["match_id", "period", "frameIdx"]

# Shirt number 1 is treated as the goalkeeper and left out of the centroid.
_outfield_players = players_df[players_df["number"] != 1]

# One grouped mean replaces a Python loop over every frame (with two boolean
# masks per frame), which is the slow path on full-match tracking data.
_centroid_table = _outfield_players.groupby(_frame_keys + ["side"])[["x", "y"]].mean()

# (match_id, period, frameIdx, side) -> np.array([x_mean, y_mean])
centroid_dict = dict(zip(_centroid_table.index, _centroid_table.to_numpy()))

# Keep the original contract: every frame has an entry for both sides, with
# (0, 0) standing in when a side has no outfield players in that frame.
for _frame_key in players_df[_frame_keys].drop_duplicates().itertuples(index=False, name=None):
    for side in ["home", "away"]:
        _key = _frame_key + (side,)
        if _key not in centroid_dict:
            centroid_dict[_key] = np.array([0.0, 0.0])

print(f"Centroids computed for {len(centroid_dict)} frame-side combinations.")
print(list(centroid_dict.items())[:5])  # show first few for inspection

#print(centroid_dict)







In [None]:
# Dictionary lookups keyed by frame:
#   frame_last_touch_team: (match_id, period, frameIdx)           -> lastTouch_team value
#   player_side_lookup:    (match_id, period, frameIdx, playerId) -> "home" / "away"
_frame_index_cols = ["match_id", "period", "frameIdx"]

_frames_by_key = frames_df.set_index(_frame_index_cols)
frame_last_touch_team = _frames_by_key["lastTouch_team"].to_dict()

_players_by_key = players_df.set_index(_frame_index_cols + ["playerId"])
player_side_lookup = _players_by_key["side"].to_dict()

# # Pick a few random keys
# sample_keys = list(frame_last_touch.keys())[:5]

# for key in sample_keys:
#     player_id = frame_last_touch[key]
#     print(f"Frame {key} → Last touch by Player ID {player_id}")

#     # Optional: Look up player info
#     player_info = players_df[
#         (players_df["match_id"] == key[0]) &
#         (players_df["period"] == key[1]) &
#         (players_df["frameIdx"] == key[2]) &
#         (players_df["playerId"] == player_id)
#     ]
#     print(player_info[["playerId", "optaId", "player_name", "side"]].drop_duplicates())
#     print("---")

In [None]:
# Centre every run on its own team's centroid (taken at the run's first frame),
# tag it with the phase of play, and orient x so teams share one attacking direction.
adjusted_runs_list = []

grouped = all_runs_df.groupby(["match_id", "period", "playerId", "run_id"], group_keys=False)

for _, run_df in grouped:
    run_df = run_df.sort_values("frameIdx")
    match_id = run_df["match_id"].iloc[0]
    period = run_df["period"].iloc[0]
    start_frame = run_df["frameIdx"].iloc[0]

    key = (match_id, period, start_frame)
    possession_side = frame_last_touch_team.get(key)

    team_role = run_df["team_role"].iloc[0]
    if pd.isna(team_role):
        # Metadata is left-joined and may be missing; the tracking-derived
        # "side" column is populated for every row, so fall back to it rather
        # than silently using a (0, 0) centroid and never flipping x.
        team_role = run_df["side"].iloc[0]

    # pd.isna covers both a missing frame (None) and a NaN last-touch value;
    # either way possession is unknown rather than "defend".
    if pd.isna(possession_side):
        in_possession = np.nan
        phase_of_play = np.nan
    else:
        in_possession = (team_role == possession_side)
        phase_of_play = "attack" if in_possession else "defend"

    run_df["in_possession"] = in_possession
    run_df["phase_of_play"] = phase_of_play

    # Always use the centroid of the runner's own team (not the possession side).
    team_centroid = centroid_dict.get(
        (match_id, period, start_frame, team_role),
        np.array([0.0, 0.0])
    )

    # y_mirror was produced by negating y when the run's mean y is negative.
    # The reference centroid has to be reflected the same way, otherwise the
    # mirrored offset is (-y - cy) instead of the reflected offset -(y - cy).
    y_sign = -1.0 if run_df["y"].mean() < 0 else 1.0

    run_df["x_c"] = run_df["x"] - team_centroid[0]
    run_df["y_c"] = run_df["y"] - team_centroid[1]
    run_df["x_mirror_c"] = run_df["x_mirror"] - team_centroid[0]
    run_df["y_mirror_c"] = run_df["y_mirror"] - y_sign * team_centroid[1]

    # Negating after centring reflects the position and the centroid together,
    # so the centred x stays consistent with the flipped absolute x.
    flip_x = should_flip_x(team_role, period)
    if flip_x:
        run_df["x_mirror"] = -run_df["x_mirror"]
        run_df["x_mirror_c"] = -run_df["x_mirror_c"]

    adjusted_runs_list.append(run_df)

final_runs_df = pd.concat(adjusted_runs_list, ignore_index=True)

In [None]:
# Overlaid histograms of the raw, centred and mirrored coordinates, one figure
# per axis. Each entry: (title, x-axis label, [(column, alpha), ...] in draw order).
# NOTE(review): "y" is drawn fully opaque (alpha=1.0) unlike every other series,
# which hides whatever was drawn before it. Confirm this is intended.
_hist_figures = [
    (
        "Distribution of x_mirror_c, x, x_c, and x_mirror",
        "Position (x-axis)",
        [("x_mirror_c", 0.4), ("x", 0.4), ("x_c", 0.4), ("x_mirror", 0.4)],
    ),
    (
        "Distribution of y_mirror_c, y, y_c, and y_mirror",
        "Position (y-axis)",
        [("y_mirror_c", 0.4), ("y", 1.0), ("y_c", 0.4), ("y_mirror", 0.4)],
    ),
]

for _title, _xlabel, _series in _hist_figures:
    for _column, _alpha in _series:
        plt.hist(final_runs_df[_column], bins=500, alpha=_alpha, label=_column)
    plt.title(_title)
    plt.xlabel(_xlabel)
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

In [None]:
# Zone grid: three equal bands along each axis of a 105 x 68 pitch centred on (0, 0)
x_edges = np.linspace(-52.5, 52.5, 4)   # length split into thirds
y_edges = np.linspace(-34, 34, 4)       # width split into thirds

def get_zone(x, y, x_edges, y_edges):
    """
    Map a point to a 1-based zone index on the grid defined by the edges.

    Zones are numbered row by row: the x band varies fastest and the y band
    selects the row. Points outside the edges are clamped to the nearest band.
    """
    def _band(value, edges):
        # digitize gives 1 for the first band; shift to 0-based, then clamp
        # so out-of-range values land in the first or last band.
        last_band = len(edges) - 2
        raw_band = np.digitize([value], edges)[0] - 1
        return min(max(raw_band, 0), last_band)

    bands_per_row = len(x_edges) - 1
    column = _band(x, x_edges)
    row = _band(y, y_edges)
    return row * bands_per_row + column + 1

# zone_records = []

# for run_id, run_df in final_runs_df.groupby("run_id"):
#     start_x = run_df["x_mirror_c"].iloc[0]
#     start_y = run_df["y_mirror_c"].iloc[0]

#     end_x = run_df["x_mirror_c"].iloc[-1]
#     end_y = run_df["y_mirror_c"].iloc[-1]

#     start_zone = get_zone(start_x, start_y, x_edges, y_edges)
#     end_zone = get_zone(end_x, end_y, x_edges, y_edges)

#     phase_of_play = run_df["phase_of_play"].iloc[0]
#     in_possession = run_df["in_possession"].iloc[0]
#     team_role = run_df["team_role"].iloc[0]
#     position = run_df["position"].iloc[0]

#     zone_records.append({
#         "run_id": run_id,
#         "start_zone": start_zone,
#         "end_zone": end_zone,
#         "phase_of_play": phase_of_play,
#         "in_possession": in_possession,
#         "team_role": team_role,
#         "position": position
#     })

# zones_df = pd.DataFrame(zone_records)
# print(zones_df.head())

# One record per run: start/end zone in mirrored-and-centred coordinates and in
# absolute pitch coordinates, plus the run's constant context columns.
zone_records = []

for run_id, run_df in final_runs_df.groupby("run_id"):

    def _zone_at(position, x_col, y_col):
        """Zone of the run's row at integer position `position` (0 = start, -1 = end)."""
        return get_zone(run_df[x_col].iloc[position], run_df[y_col].iloc[position], x_edges, y_edges)

    zone_records.append({
        "run_id": run_id,
        # Mirrored & centered positions
        "start_zone": _zone_at(0, "x_mirror_c", "y_mirror_c"),
        "end_zone": _zone_at(-1, "x_mirror_c", "y_mirror_c"),
        # Absolute pitch positions
        "start_zone_absolute": _zone_at(0, "x", "y"),
        "end_zone_absolute": _zone_at(-1, "x", "y"),
        # Context values are taken from the run's first row
        "phase_of_play": run_df["phase_of_play"].iloc[0],
        "in_possession": run_df["in_possession"].iloc[0],
        "team_role": run_df["team_role"].iloc[0],
        "position": run_df["position"].iloc[0],
    })

zones_df = pd.DataFrame(zone_records)
print(zones_df.head())

In [None]:
# #  Adjust mirrored coordinates relative to possession-side team centroid
# adjusted_runs_list = []

# for run_id, run_df in all_runs_df.groupby("run_id"):
#     run_df = run_df.copy()
#     run_df = run_df.sort_values("frameIdx")

#     match_id = run_df["match_id"].iloc[0]
#     period = run_df["period"].iloc[0]
#     start_frame = run_df["frameIdx"].iloc[0]

#     team_centroid = np.array([0.0, 0.0])  # fallback
#     key = (match_id, period, start_frame)
#     last_touch_player = frame_last_touch.get(key)

#     if last_touch_player is not None:
#         side_key = (match_id, period, start_frame, last_touch_player)
#         possession_side = player_side_lookup.get(side_key)
#         if possession_side is not None:
#             team_centroid = centroid_dict.get((match_id, period, start_frame, possession_side), team_centroid)

#     run_df["x_c"] = run_df["x"] - team_centroid[0]
#     run_df["y_c"] = run_df["y"] - team_centroid[1]
#     run_df["x_mirror_c"] = run_df["x_mirror"] - team_centroid[0]
#     run_df["y_mirror_c"] = run_df["y_mirror"] - team_centroid[1]

#     adjusted_runs_list.append(run_df)

# # Step 4: Combine into final DataFrame
# final_runs_df = pd.concat(adjusted_runs_list, ignore_index=True)

In [None]:
# Equation (5): Bézier (Bernstein) basis function
def bernstein_poly(p, P, t):
    """Value of the p-th Bernstein basis polynomial of degree P - 1 at t."""
    degree = P - 1
    binomial = comb(degree, p)
    return binomial * (t**p) * ((1 - t)**(degree - p))

# Equation (7): Design matrix for Bézier fitting
def bezier_design_matrix(num_points, num_control_points):
    """
    Basis values at `num_points` evenly spaced parameters in [0, 1].

    Returns an array of shape [num_points, num_control_points]; column p holds
    the p-th basis function evaluated at every parameter value.
    """
    parameters = np.linspace(0, 1, num_points)
    basis_columns = []
    for p in range(num_control_points):
        basis_columns.append(bernstein_poly(p, num_control_points, parameters))
    return np.stack(basis_columns, axis=1)

# Equation (6): Fit Bézier curve via least squares
def fit_bezier_curve(coords, num_control_points):
    """
    Least-squares Bézier control points for a sequence of (x, y) points.

    coords: shape [N, 2]; the i-th point is paired with the i-th of N evenly
    spaced curve parameters.
    returns: control_points of shape [P, 2]
    """
    design = bezier_design_matrix(coords.shape[0], num_control_points)  # [N, P]

    # x and y are independent least-squares problems sharing one design matrix
    per_axis_solutions = [
        np.linalg.lstsq(design, coords[:, axis], rcond=None)[0]
        for axis in (0, 1)
    ]
    return np.stack(per_axis_solutions, axis=1)  # [P, 2]

# Equation (4): Evaluate Bézier curve from its control points
def evaluate_bezier_curve(control_points, num_points=50):
    """
    Sample `num_points` points along the Bézier curve; returns [num_points, 2].
    """
    n_control = control_points.shape[0]
    basis = bezier_design_matrix(num_points, n_control)  # [num_points, P]
    return basis @ control_points

In [None]:
def resample_coords(coords, num_points=50):
    """
    Resample a trajectory to `num_points` points evenly spaced along its length.

    Parameters
    ----------
    coords : array-like, shape [N, D]
        Ordered positions of one trajectory.
    num_points : int
        Number of output points.

    Returns
    -------
    np.ndarray of shape [num_points, D], or None
        None is returned for an empty trajectory or one whose path length is
        not finite (e.g. NaN coordinates); callers treat None as "skip this run".
        A single-point or zero-length trajectory is returned as that point
        repeated `num_points` times.
    """
    coords = np.asarray(coords, dtype=float)
    if len(coords) == 0:
        return None
    if len(coords) < 2:
        return np.tile(coords[0], (num_points, 1))  # Edge case

    # Cumulative path length at every input point, starting from 0
    step_lengths = np.linalg.norm(np.diff(coords, axis=0), axis=1)
    distances = np.concatenate(([0.0], np.cumsum(step_lengths)))
    total_length = distances[-1]
    if not np.isfinite(total_length):
        return None
    if total_length == 0:
        return np.tile(coords[0], (num_points, 1))

    normalized_dist = distances / total_length
    uniform_dist = np.linspace(0, 1, num_points)

    # np.interp returns the sample value directly when the query hits a knot,
    # so repeated positions (duplicate knots) cannot produce the 0/0 = NaN
    # that a slope-based linear interpolator gives at such points.
    return np.column_stack([
        np.interp(uniform_dist, normalized_dist, coords[:, axis])
        for axis in range(coords.shape[1])
    ])

def compute_l1_distance(traj, bezier_curve):
    """
    Mean absolute coordinate difference between two sampled curves.

    traj, bezier_curve: arrays of the same shape [num_points, 2]. The average
    runs over every point and both coordinates.
    """
    residuals = traj - bezier_curve
    return np.mean(np.abs(residuals))

In [None]:
max_iterations = 10
tolerance = 1e-3  # Minimum improvement in objective to continue
num_points = 50
num_control_points = 4
k_clusters = 70

_trajectory_cols = ["x_mirror_c", "y_mirror_c"]
_run_meta_fields = ["playerId", "player_name", "position", "team_role", "match_id"]

# Initialize cluster centers (Bézier curves) from k randomly chosen runs
random.seed(42)
initial_centroids = random.sample(adjusted_runs_list, k_clusters)

# NOTE(review): the initial curves are fitted to the raw per-frame coordinates,
# while assignment and update both work on length-resampled trajectories.
# Kept as-is to leave results unchanged; consider resampling here as well.
cluster_control_points = np.array([
    fit_bezier_curve(run[_trajectory_cols].values, num_control_points)
    for run in initial_centroids
])
previous_objective = float('inf')

# Resample every run once. The trajectories do not change between iterations,
# so redoing this inside the loop (and again in the update step) is wasted work.
_run_metadata = []
_run_trajectories = []
for run_id, run_df in final_runs_df.groupby("run_id"):
    resampled_coords = resample_coords(run_df[_trajectory_cols].values, num_points=num_points)
    if resampled_coords is None:
        continue  # skip bad run
    _run_trajectories.append(resampled_coords)
    _run_metadata.append(
        {"run_id": run_id, **{field: run_df[field].iloc[0] for field in _run_meta_fields}}
    )
_run_trajectories = np.stack(_run_trajectories, axis=0)  # [n_runs, num_points, 2]

for it in range(max_iterations):
    # Assignment step: evaluate each cluster curve once per iteration,
    # then compare every run against all curves in one vectorised call.
    cluster_curves = np.stack([
        evaluate_bezier_curve(control_pts, num_points=num_points)
        for control_pts in cluster_control_points
    ])  # [k_clusters, num_points, 2]

    assignments = []
    objective_distances = []

    for run_meta, trajectory in zip(_run_metadata, _run_trajectories):
        # Mean absolute difference to every cluster curve (same metric as compute_l1_distance)
        distances = np.abs(cluster_curves - trajectory).mean(axis=(1, 2))

        if np.isnan(distances).all():
            # No comparable cluster: same sentinel values the scalar loop produced
            assigned_cluster, min_dist = -1, float("inf")
        else:
            # nanargmin returns the first minimum, matching a strict "<" scan
            assigned_cluster = int(np.nanargmin(distances))
            min_dist = distances[assigned_cluster]

        # Save metadata + cluster assignment
        assignments.append({
            "run_id": run_meta["run_id"],
            "assigned_cluster": assigned_cluster,
            "min_distance": min_dist,
            **{field: run_meta[field] for field in _run_meta_fields},
        })
        objective_distances.append(min_dist)

    # Built once per iteration rather than once per run (which was quadratic)
    assignments_df = pd.DataFrame(assignments)

    objective = np.mean(objective_distances)
    print(f"Iteration {it}: Mean objective = {objective:.4f}")

    # Check for convergence
    improvement = previous_objective - objective
    if improvement < tolerance:
        print(f"Converged (Δ={improvement:.6f}) at iteration {it}")
        break
    previous_objective = objective

    # Update step: refit each cluster's curve to the mean of its member trajectories
    cluster_labels = assignments_df["assigned_cluster"].to_numpy()
    new_cluster_control_points = []

    for cluster_idx in range(k_clusters):
        member_trajectories = _run_trajectories[cluster_labels == cluster_idx]
        if len(member_trajectories) == 0:
            # Empty cluster: keep its previous curve
            new_cluster_control_points.append(cluster_control_points[cluster_idx])
            continue

        mean_coords = member_trajectories.mean(axis=0)
        new_control_pts = fit_bezier_curve(mean_coords, num_control_points)
        new_cluster_control_points.append(new_control_pts)

    cluster_control_points = np.array(new_cluster_control_points)



In [None]:
# Cluster assignment per run, joined with the run's zone information
assignments_df = pd.DataFrame(assignments)
assignments_zones = assignments_df.merge(zones_df, on="run_id", how="left")

# Columns straight after the merge (overlapping names carry _x / _y suffixes)
print(assignments_zones.columns)

# Both inputs carry position and team_role: keep the assignment-side copy
# (suffix _x) under its plain name and discard the zone-side copy (suffix _y).
assignments_zones = (
    assignments_zones
    .drop(columns=["position_y", "team_role_y"], errors="ignore")
    .rename(columns={"position_x": "position", "team_role_x": "team_role"})
)

# Number of runs per (cluster, position), largest counts first within each cluster
_runs_per_cluster_position = (
    assignments_zones
    .groupby(["assigned_cluster", "position"])
    .size()
    .reset_index(name="num_runs")
)
position_detail_counts = _runs_per_cluster_position.sort_values(
    ["assigned_cluster", "num_runs"], ascending=[True, False]
)

print(position_detail_counts.head(20))

# Wide view: one row per cluster, one column per position, 0 where no runs exist
position_pivot = position_detail_counts.pivot_table(
    index="assigned_cluster",
    columns="position",
    values="num_runs",
    fill_value=0,
).reset_index()

print(position_pivot.head())


# plt.figure(figsize=(12,6))
# sns.barplot(
#     data=position_detail_counts,
#     x="assigned_cluster",
#     y="num_runs",
#     hue="position"
# )
# plt.title("Run Counts per Cluster by Specific Position")
# plt.show()



# Filtering for specific positions: 

Filter to keep only the runs made by left-backs (LB):

In [None]:
# Rows of assignments_zones whose position is exactly "LB"
_is_left_back = assignments_zones["position"] == "LB"
lb_runs = assignments_zones[_is_left_back]

print(lb_runs.head())

To list only the clusters that contain at least one LB run:

In [None]:
# Distinct cluster ids that appear among the LB runs, in order of first appearance
_lb_cluster_column = lb_runs["assigned_cluster"]
lb_clusters = _lb_cluster_column.unique()
print("Clusters with LB runs:", lb_clusters)

In [None]:
# One row per run: the first non-null value of every column within the run
runs_meta_df = final_runs_df.groupby("run_id", as_index=False).first()

# Fields present on both sides of the merge; the assignment-side copy (_x) is
# kept under the plain name and the run-side copy (_y) is discarded.
_shared_fields = ["player_name", "position", "team_role", "match_id", "playerId"]

merged_df = (
    assignments_df
    .merge(runs_meta_df, on="run_id", how="left")
    .drop(columns=[f"{field}_y" for field in _shared_fields], errors="ignore")
    .rename(columns={f"{field}_x": field for field in _shared_fields})
)

# Number of runs per (cluster, position), largest counts first within each cluster
_cluster_position_sizes = merged_df.groupby(["assigned_cluster", "position"]).size()
position_counts = (
    _cluster_position_sizes
    .reset_index(name="num_runs")
    .sort_values(["assigned_cluster", "num_runs"], ascending=[True, False])
)

In [None]:
# Distinct non-null position labels present in the merged runs
_known_positions = merged_df["position"].dropna()
unique_positions = _known_positions.unique()
print("Unique positions found:", unique_positions)
print("Total unique positions:", len(unique_positions))

In [None]:
# High-level bucket → the fine-grained position labels it contains.
# Goalkeepers are grouped with substitutes: treated as non-field players for running.
_positions_by_bucket = {
    "sub": ["GK", "SUB"],
    "defender": ["CB", "RCB", "LCB", "RB", "LB", "RWB", "LWB"],
    "midfielder": ["CDM", "RDM", "LDM", "CM", "RCM", "LCM", "CAM", "RM", "LM"],
    "attacker": ["LW", "RW", "ST", "CF", "RF", "LF"],
}

# Inverted view used for lookups: fine-grained position → bucket
position_bucket_map = {
    position: bucket
    for bucket, positions in _positions_by_bucket.items()
    for position in positions
}

def _bucket_for(pos):
    """Bucket of a position label; labels not in the map (incl. missing) give "unknown"."""
    return position_bucket_map.get(pos, "unknown")

# Map positions to buckets
merged_df["position_bucket"] = merged_df["position"].map(_bucket_for)

print(merged_df[["assigned_cluster", "position", "position_bucket"]].head(1000))


In [None]:
# Number of runs per (cluster, position bucket), largest counts first within each cluster
_cluster_bucket_sizes = merged_df.groupby(["assigned_cluster", "position_bucket"]).size()
bucket_counts = (
    _cluster_bucket_sizes
    .reset_index(name="num_runs")
    .sort_values(["assigned_cluster", "num_runs"], ascending=[True, False])
)

print(bucket_counts.head(20))

# Wide view: one row per cluster, one column per bucket, 0 where no runs exist
bucket_pivot = bucket_counts.pivot_table(
    index="assigned_cluster",
    columns="position_bucket",
    values="num_runs",
    fill_value=0,
).reset_index()

print(bucket_pivot.head())

In [None]:
def plot_all_clusters_grid(cluster_control_points, final_runs_df, assignments, samples_per_cluster=50, rows=10, cols=7, seed=None):
    """
    Draw one small panel per cluster: the cluster's Bézier centre curve in
    black, overlaid with a random sample of the runs assigned to that cluster.

    Parameters
    ----------
    cluster_control_points : sequence
        One entry per cluster; each entry is passed to ``evaluate_bezier_curve``.
    final_runs_df : pandas.DataFrame
        Needs the columns ``run_id``, ``x_mirror_c`` and ``y_mirror_c``.
    assignments : anything accepted by ``pd.DataFrame``
        Once converted it needs the columns ``run_id`` and ``assigned_cluster``.
    samples_per_cluster : int
        Upper bound on the number of runs drawn in each panel.
    rows, cols : int
        Requested grid shape. ``rows`` is increased automatically when
        ``rows * cols`` panels would not be enough for every cluster.
    seed : int or None
        When given, run sampling uses a private ``random.Random(seed)`` so the
        figure is reproducible; when None the global ``random`` state is used.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    import random

    num_clusters = len(cluster_control_points)

    # Grow the grid rather than fail with an IndexError when there are more
    # clusters than requested panels (ceil division without importing math).
    if cols > 0:
        rows = max(rows, -(-num_clusters // cols))

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 grid.
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 3, rows * 3), squeeze=False)
    axes = axes.flatten()

    rng = random if seed is None else random.Random(seed)

    # Convert assignments to a DataFrame for easy filtering
    assignments_df = pd.DataFrame(assignments)

    # Index the runs once (run_id -> row positions) instead of scanning the
    # whole frame with a boolean mask for every sampled run.
    run_row_positions = final_runs_df.groupby("run_id", sort=False).indices
    mirrored_xy = final_runs_df[["x_mirror_c", "y_mirror_c"]].to_numpy()

    for cluster_idx in range(num_clusters):
        ax = axes[cluster_idx]

        # Cluster centre curve; presumably an array of (x, y) rows — it is
        # indexed as [:, 0] / [:, 1] below.
        bezier_curve = evaluate_bezier_curve(cluster_control_points[cluster_idx], num_points=50)
        ax.plot(bezier_curve[:, 0], bezier_curve[:, 1], 'k-', linewidth=2, label="Cluster")

        # Runs assigned to this cluster
        run_ids_in_cluster = assignments_df.loc[
            assignments_df["assigned_cluster"] == cluster_idx, "run_id"
        ].tolist()

        if run_ids_in_cluster:
            sampled_run_ids = rng.sample(
                run_ids_in_cluster,
                min(samples_per_cluster, len(run_ids_in_cluster))
            )

            for run_id in sampled_run_ids:
                row_positions = run_row_positions.get(run_id)
                if row_positions is None:
                    # Assigned run with no rows in final_runs_df: nothing to draw.
                    continue
                coords = mirrored_xy[row_positions]
                ax.plot(coords[:, 0], coords[:, 1], alpha=0.5)

        # n is the full cluster size, not the number of sampled runs drawn.
        ax.set_title(f"Cluster {cluster_idx}\n(n={len(run_ids_in_cluster)})", fontsize=9)
        ax.axis("equal")
        ax.axis("off")

    # Hide any unused subplots
    for unused_ax in axes[num_clusters:]:
        unused_ax.axis("off")

    plt.tight_layout()
    plt.show()

plot_all_clusters_grid(cluster_control_points, final_runs_df, assignments)

In [None]:
def draw_pitch(ax, pitch_length=105, pitch_width=68):
    """
    Draws a football pitch centered around (0, 0) on the given matplotlib axis.

    Draws the outer boundary, the halfway line, both penalty areas and the
    centre circle, then fixes the axis limits to the pitch, forces an equal
    aspect ratio and hides the axis frame.

    Parameters
    ----------
    ax : matplotlib axis
        Axis to draw on; it is modified in place.
    pitch_length, pitch_width : float
        Pitch dimensions. x spans ±pitch_length/2 and y spans ±pitch_width/2.
    """
    half_length = pitch_length / 2
    half_width = pitch_width / 2

    # Penalty area: 16.5 m deep and 40.32 m wide (16.5 m either side of a
    # 7.32 m goal). With the origin at the pitch centre it spans y = ±20.16.
    # The previous ±13.84 is the box's offset from the touchline on a 68 m
    # pitch (34 - 20.16), which only applies to corner-origin coordinates and
    # drew the box far too narrow here.
    penalty_depth = 16.5
    penalty_half_width = 40.32 / 2

    # Outer boundary & centre line
    ax.plot([-half_length, -half_length, half_length, half_length, -half_length],
            [-half_width, half_width, half_width, -half_width, -half_width], color="black")
    ax.plot([0, 0], [-half_width, half_width], color="black")

    # Left (side = -1) and right (side = +1) penalty areas
    for side in (-1, 1):
        goal_line_x = side * half_length
        box_front_x = side * (half_length - penalty_depth)
        ax.plot([box_front_x, box_front_x],
                [-penalty_half_width, penalty_half_width], color="black")
        ax.plot([goal_line_x, box_front_x],
                [-penalty_half_width, -penalty_half_width], color="black")
        ax.plot([goal_line_x, box_front_x],
                [penalty_half_width, penalty_half_width], color="black")

    # Center circle
    circle = plt.Circle((0, 0), 9.15, color="black", fill=False)
    ax.add_patch(circle)

    ax.set_xlim(-half_length, half_length)
    ax.set_ylim(-half_width, half_width)
    ax.set_aspect("equal")
    ax.axis("off")

# def plot_all_cluster_trajectories_on_pitch(
#     final_runs_df,
#     assignments,
#     cluster_control_points,
#     num_control_points=4,
#     max_runs_per_cluster=30
# ):
#     num_clusters = len(cluster_control_points)
#     fig, axes = plt.subplots(7, 10, figsize=(30, 20))  # adjust grid for k clusters
#     axes = axes.flatten()

#     assignments_df = pd.DataFrame(assignments)

#     for cluster_idx in range(num_clusters):
#         ax = axes[cluster_idx]
#         draw_pitch(ax)

#         # Find all run_ids assigned to this cluster
#         cluster_run_ids = assignments_df.loc[
#             assignments_df["assigned_cluster"] == cluster_idx, "run_id"
#         ].tolist()

#         if len(cluster_run_ids) > max_runs_per_cluster:
#             cluster_run_ids = random.sample(cluster_run_ids, max_runs_per_cluster)

#         for run_id in cluster_run_ids:
#             run_df = final_runs_df[final_runs_df["run_id"] == run_id]
            
#             coords = run_df[["x_mirror_c", "y_mirror_c"]].values
#             if coords.shape[0] < 2:
#                 continue

#             resampled = resample_coords(coords, num_points=50)
#             control_pts = fit_bezier_curve(resampled, num_control_points)

#             # Recover start location to shift back to pitch space
#             start_pos = run_df[["x_mirror_c", "y_mirror_c"]].values[0]
#             shifted_ctrl_pts = control_pts - control_pts[0] + start_pos
#             bezier_curve = evaluate_bezier_curve(shifted_ctrl_pts, num_points=50)

#             ax.plot(bezier_curve[:, 0], bezier_curve[:, 1], alpha=0.2, color="blue")

#         ax.set_title(f"Cluster {cluster_idx}", fontsize=8)

#     # Hide unused plots
#     for i in range(num_clusters, len(axes)):
#         axes[i].axis("off")

#     plt.tight_layout()
#     plt.suptitle("All Bézier Run Trajectories per Cluster (On-Pitch View)", fontsize=16, y=1.02)
#     plt.show()

# plot_all_cluster_trajectories_on_pitch(
#     final_runs_df,
#     assignments,
#     cluster_control_points,
#     num_control_points=4,
#     max_runs_per_cluster=100
# )

In [None]:
# def plot_all_cluster_trajectories_on_pitch(
#     final_runs_df,
#     assignments,
#     cluster_control_points,
#     bucket_pivot,
#     num_control_points=4,
#     max_runs_per_cluster=30,
#     plot_absolute_positions = True
# ):
#     num_clusters = len(cluster_control_points)
#     fig, axes = plt.subplots(7, 10, figsize=(30, 20))  # adjust grid for k clusters
#     axes = axes.flatten()

#     assignments_df = pd.DataFrame(assignments)

#     # Set index for quick lookups
#     bucket_pivot = bucket_pivot.set_index("assigned_cluster")

#     for cluster_idx in range(num_clusters):
#         ax = axes[cluster_idx]
#         draw_pitch(ax)

#         # Plot runs
#         cluster_run_ids = assignments_df.loc[
#             assignments_df["assigned_cluster"] == cluster_idx, "run_id"
#         ].tolist()

#         if len(cluster_run_ids) > max_runs_per_cluster:
#             cluster_run_ids = random.sample(cluster_run_ids, max_runs_per_cluster)

#         for run_id in cluster_run_ids:
#             run_df = final_runs_df[final_runs_df["run_id"] == run_id]

#             coords = run_df[["x_mirror_c", "y_mirror_c"]].values
#             if coords.shape[0] < 2:
#                 continue

#             resampled = resample_coords(coords, num_points=50)
#             control_pts = fit_bezier_curve(resampled, num_control_points)

#             if plot_absolute_positions:
#                 # Recover original start location
#                 start_pos = run_df[["x", "y"]].values[0]
#                 shifted_ctrl_pts = control_pts - control_pts[0] + start_pos
#                 bezier_curve = evaluate_bezier_curve(
#                     shifted_ctrl_pts, num_points=50
#                 )
#             else:
#                 # Keep in mirrored + centroid-relative space
#                 bezier_curve = evaluate_bezier_curve(
#                     control_pts, num_points=50
#                 )

#             ax.plot(bezier_curve[:, 0], bezier_curve[:, 1], alpha=0.2, color="blue")

#             # # Recover start location to shift back to pitch space
#             # start_pos = run_df[["x", "y"]].values[0]
#             # shifted_ctrl_pts = control_pts - control_pts[0] + start_pos
#             # bezier_curve = evaluate_bezier_curve(shifted_ctrl_pts, num_points=50)

#             # ax.plot(bezier_curve[:, 0], bezier_curve[:, 1], alpha=0.2, color="blue")

#         # Add text box with position bucket counts
#         if cluster_idx in bucket_pivot.index:
#             row = bucket_pivot.loc[cluster_idx]
#             text_lines = []
#             for bucket in ["attacker", "midfielder", "defender", "sub", "unknown"]:
#                 if bucket in row and row[bucket] > 0:
#                     text_lines.append(f"{bucket}: {int(row[bucket])}")
#             text = "\n".join(text_lines)
#             ax.text(
#                 0.5, 1.05, text,
#                 transform=ax.transAxes,
#                 ha="center", va="bottom",
#                 fontsize=8,
#                 bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8)
#             )

#         ax.set_title(f"Cluster {cluster_idx}", fontsize=8)

#     for i in range(num_clusters, len(axes)):
#         axes[i].axis("off")

#     plt.tight_layout()
#     plt.suptitle("All Bézier Run Trajectories per Cluster (On-Pitch View)", fontsize=16, y=1.02)
#     plt.show()

# plot_all_cluster_trajectories_on_pitch(
#     final_runs_df,
#     assignments,
#     cluster_control_points,
#     bucket_pivot=bucket_pivot,
#     num_control_points=4,
#     max_runs_per_cluster=100,
#     plot_absolute_positions=False
# )

In [None]:
# def plot_all_cluster_trajectories_on_pitch(
#     final_runs_df,
#     assignments_zones,
#     cluster_control_points,
#     bucket_pivot,
#     num_control_points=4,
#     max_runs_per_cluster=30,
#     plot_absolute_positions=True,
#     start_zones=None,
#     end_zones=None,
#     phases_of_play=None,
#     positions=None
# ):
#     import matplotlib.pyplot as plt
#     import random

#     num_clusters = len(cluster_control_points)
#     fig, axes = plt.subplots(7, 10, figsize=(30, 20))  # adjust grid as needed
#     axes = axes.flatten()

#     # --- FILTERING ---
#     filtered_assignments = assignments_zones.copy()

#     if start_zones is not None:
#         filtered_assignments = filtered_assignments[
#             filtered_assignments["start_zone"].isin(start_zones)
#         ]

#     if end_zones is not None:
#         filtered_assignments = filtered_assignments[
#             filtered_assignments["end_zone"].isin(end_zones)
#         ]

#     if phases_of_play is not None:
#         filtered_assignments = filtered_assignments[
#             filtered_assignments["phase_of_play"].isin(phases_of_play)
#         ]

#     if positions is not None:
#         filtered_assignments = filtered_assignments[
#             filtered_assignments["position"].isin(positions)
#         ]

#     filtered_run_ids = filtered_assignments["run_id"].unique()

#     # Set index for quick lookups
#     bucket_pivot = bucket_pivot.set_index("assigned_cluster")

#     for cluster_idx in range(num_clusters):
#         ax = axes[cluster_idx]
#         draw_pitch(ax)

#         # Find runs assigned to this cluster
#         cluster_run_ids = assignments_zones.loc[
#             assignments_zones["assigned_cluster"] == cluster_idx, "run_id"
#         ].tolist()

#         # Apply filters
#         cluster_run_ids = [
#             rid for rid in cluster_run_ids if rid in filtered_run_ids
#         ]

#         if len(cluster_run_ids) > max_runs_per_cluster:
#             cluster_run_ids = random.sample(cluster_run_ids, max_runs_per_cluster)

#         for run_id in cluster_run_ids:
#             run_df = final_runs_df[final_runs_df["run_id"] == run_id]

#             coords = run_df[["x_mirror_c", "y_mirror_c"]].values
#             if coords.shape[0] < 2:
#                 continue

#             resampled = resample_coords(coords, num_points=50)
#             control_pts = fit_bezier_curve(resampled, num_control_points)

#             if plot_absolute_positions:
#                 start_pos = run_df[["x", "y"]].values[0]
#                 shifted_ctrl_pts = control_pts - control_pts[0] + start_pos
#                 bezier_curve = evaluate_bezier_curve(
#                     shifted_ctrl_pts, num_points=50
#                 )
#             else:
#                 bezier_curve = evaluate_bezier_curve(
#                     control_pts, num_points=50
#                 )

#             ax.plot(bezier_curve[:, 0], bezier_curve[:, 1], alpha=0.2, color="blue")

#         # Add text box with position bucket counts
#         if cluster_idx in bucket_pivot.index:
#             row = bucket_pivot.loc[cluster_idx]
#             text_lines = []
#             for bucket in ["attacker", "midfielder", "defender", "sub", "unknown"]:
#                 if bucket in row and row[bucket] > 0:
#                     text_lines.append(f"{bucket}: {int(row[bucket])}")
#             text = "\n".join(text_lines)
#             ax.text(
#                 0.5, 1.05, text,
#                 transform=ax.transAxes,
#                 ha="center", va="bottom",
#                 fontsize=8,
#                 bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8)
#             )

#         ax.set_title(f"Cluster {cluster_idx}", fontsize=8)

#     for i in range(num_clusters, len(axes)):
#         axes[i].axis("off")

#     plt.tight_layout()
#     plt.suptitle("All Bézier Run Trajectories per Cluster (On-Pitch View)", fontsize=16, y=1.02)
#     plt.show()

In [None]:
def plot_all_cluster_trajectories_on_pitch(
    final_runs_df,
    assignments_zones,
    cluster_control_points,
    bucket_pivot,
    num_control_points=4,
    max_runs_per_cluster=30,
    plot_absolute_positions=True,
    start_zones=None,
    end_zones=None,
    phases_of_play=None,
    positions=None,
    use_absolute_zones=False,
    start_zones_absolute=None,
    end_zones_absolute=None,
    n_rows=7,
    n_cols=10,
    seed=None,
):
    """
    Draw one pitch per cluster showing the Bézier curves fitted to (a sample
    of) the runs assigned to that cluster, optionally restricted by filters.

    Parameters
    ----------
    final_runs_df : pandas.DataFrame
        Needs ``run_id``, ``x_mirror_c`` and ``y_mirror_c``; additionally
        ``x`` and ``y`` when ``plot_absolute_positions`` is True.
    assignments_zones : pandas.DataFrame
        Needs ``run_id`` and ``assigned_cluster``, plus whichever of
        ``start_zone``, ``end_zone``, ``start_zone_absolute``,
        ``end_zone_absolute``, ``phase_of_play`` and ``position`` the active
        filters refer to.
    cluster_control_points : sequence
        Only its length is used here (the number of clusters / panels).
    bucket_pivot : pandas.DataFrame
        Per-cluster position-bucket counts, either with an
        ``assigned_cluster`` column or already indexed by cluster. The counts
        are shown as given and are NOT recomputed for the active filters.
    num_control_points : int
        Passed to ``fit_bezier_curve``.
    max_runs_per_cluster : int
        Clusters with more matching runs are randomly down-sampled to this size.
    plot_absolute_positions : bool
        True: shift each fitted curve so its first control point sits on the
        run's first ``(x, y)`` sample. False: plot the curve as fitted to the
        ``x_mirror_c`` / ``y_mirror_c`` coordinates.
    start_zones, end_zones : iterable or None
        Allowed values of ``start_zone`` / ``end_zone``; only applied when
        ``use_absolute_zones`` is False.
    start_zones_absolute, end_zones_absolute : iterable or None
        Allowed values of ``start_zone_absolute`` / ``end_zone_absolute``;
        only applied when ``use_absolute_zones`` is True.
    phases_of_play, positions : iterable or None
        Allowed values of ``phase_of_play`` / ``position``.
    n_rows, n_cols : int
        Requested panel grid. ``n_rows`` grows automatically when the grid
        would be too small for the number of clusters.
    seed : int or None
        When given, down-sampling uses a private ``random.Random(seed)``;
        when None the global ``random`` state is used.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    import random

    num_clusters = len(cluster_control_points)

    # Grow the grid instead of raising IndexError for more clusters than panels
    # (ceil division without importing math). Defaults reproduce the original
    # 7 x 10 grid at figsize (30, 20).
    if n_cols > 0:
        n_rows = max(n_rows, -(-num_clusters // n_cols))
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(n_cols * 3, n_rows * 20 / 7), squeeze=False
    )
    axes = axes.flatten()

    rng = random if seed is None else random.Random(seed)

    # --- FILTERING ---
    # Exactly one pair of zone filters is active, chosen by use_absolute_zones.
    if use_absolute_zones:
        zone_filters = [
            ("start_zone_absolute", start_zones_absolute),
            ("end_zone_absolute", end_zones_absolute),
        ]
    else:
        zone_filters = [
            ("start_zone", start_zones),
            ("end_zone", end_zones),
        ]
    column_filters = zone_filters + [
        ("phase_of_play", phases_of_play),
        ("position", positions),
    ]

    filtered_assignments = assignments_zones
    for column, allowed_values in column_filters:
        if allowed_values is not None:
            filtered_assignments = filtered_assignments[
                filtered_assignments[column].isin(allowed_values)
            ]

    # A set gives O(1) membership tests; `in` on a numpy array is a linear scan.
    filtered_run_ids = set(filtered_assignments["run_id"].tolist())

    # Accept the pivot both with an "assigned_cluster" column and pre-indexed.
    if "assigned_cluster" in bucket_pivot.columns:
        bucket_pivot = bucket_pivot.set_index("assigned_cluster")

    # Index the runs once (run_id -> row positions) instead of boolean-masking
    # the whole frame for every plotted run.
    run_row_positions = final_runs_df.groupby("run_id", sort=False).indices
    mirrored_xy = final_runs_df[["x_mirror_c", "y_mirror_c"]].to_numpy()
    absolute_xy = (
        final_runs_df[["x", "y"]].to_numpy() if plot_absolute_positions else None
    )

    for cluster_idx in range(num_clusters):
        ax = axes[cluster_idx]
        draw_pitch(ax)

        cluster_run_ids = [
            rid
            for rid in assignments_zones.loc[
                assignments_zones["assigned_cluster"] == cluster_idx, "run_id"
            ].tolist()
            if rid in filtered_run_ids
        ]

        if len(cluster_run_ids) > max_runs_per_cluster:
            cluster_run_ids = rng.sample(cluster_run_ids, max_runs_per_cluster)

        for run_id in cluster_run_ids:
            row_positions = run_row_positions.get(run_id)
            # A curve needs at least two samples.
            if row_positions is None or len(row_positions) < 2:
                continue

            coords = mirrored_xy[row_positions]
            resampled = resample_coords(coords, num_points=50)
            control_pts = fit_bezier_curve(resampled, num_control_points)

            if plot_absolute_positions:
                # Translate the curve so it starts at the run's first (x, y) sample.
                start_pos = absolute_xy[row_positions[0]]
                control_pts = control_pts - control_pts[0] + start_pos

            bezier_curve = evaluate_bezier_curve(control_pts, num_points=50)
            ax.plot(bezier_curve[:, 0], bezier_curve[:, 1], alpha=0.2, color="blue")

        if cluster_idx in bucket_pivot.index:
            row = bucket_pivot.loc[cluster_idx]
            text_lines = [
                f"{bucket}: {int(row[bucket])}"
                for bucket in ["attacker", "midfielder", "defender", "sub", "unknown"]
                if bucket in row and row[bucket] > 0
            ]
            # Skip the box entirely when there is nothing to report.
            if text_lines:
                ax.text(
                    0.5, 1.05, "\n".join(text_lines),
                    transform=ax.transAxes,
                    ha="center", va="bottom",
                    fontsize=8,
                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8)
                )

        ax.set_title(f"Cluster {cluster_idx}", fontsize=8)

    # Hide panels that have no cluster
    for unused_ax in axes[num_clusters:]:
        unused_ax.axis("off")

    plt.tight_layout()
    plt.suptitle("All Bézier Run Trajectories per Cluster (On-Pitch View)", fontsize=16, y=1.02)
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

# Boundaries of the 3x3 zone grid laid over the full pitch
# (4 edges per axis -> 3 bins per axis).
x_edges = np.linspace(-52.5, 52.5, 4)
y_edges = np.linspace(-34, 34, 4)

fig, ax = plt.subplots(figsize=(10, 6))

# Pitch markings go underneath the zone overlay
draw_pitch(ax)

# Zones are numbered from 1, row by row: lowest-y row first, and within a row
# from the lowest-x column to the highest.
zone_idx = 1
for y_bottom, y_top in zip(y_edges[:-1], y_edges[1:]):
    for x_left, x_right in zip(x_edges[:-1], x_edges[1:]):
        # Shaded rectangle covering the zone
        ax.add_patch(
            patches.Rectangle(
                (x_left, y_bottom),
                x_right - x_left,
                y_top - y_bottom,
                linewidth=1,
                edgecolor='black',
                facecolor='lightgrey',
                alpha=0.2,
            )
        )

        # Zone number in the middle of the rectangle
        ax.text(
            (x_left + x_right) / 2,
            (y_bottom + y_top) / 2,
            str(zone_idx),
            ha='center',
            va='center',
            fontsize=14,
            fontweight='bold',
            color='black',
        )

        zone_idx += 1

ax.set_title("Zone Legend", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# On-pitch view restricted to runs that both start and end in absolute zones 8 or 9.
plot_all_cluster_trajectories_on_pitch(
    final_runs_df,
    assignments_zones,
    cluster_control_points,
    bucket_pivot,
    # Curve fitting and sampling
    num_control_points=4,
    max_runs_per_cluster=100,
    plot_absolute_positions=True,
    # Attribute filters (None = no filtering on that attribute)
    phases_of_play=None,
    positions=None,
    # Zone filters: with use_absolute_zones=True only the *_absolute lists are
    # applied; start_zones / end_zones are ignored. To filter on those instead,
    # set use_absolute_zones=False and pass lists such as [1, 2, 3].
    use_absolute_zones=True,
    start_zones_absolute=[8, 9],
    end_zones_absolute=[8, 9],
    start_zones=None,
    end_zones=None,
)