# UEFA Champions League 2021-22 — Player Clustering (Field Players only)
Pipeline: cleaning → correlation pruning → scaling → K-Means → interpretation

In [None]:

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

from ucl_clustering_pipeline import cluster_players_pipeline, plot_metric_curve

# Reuse an already-merged dataframe `df` when one exists in the current namespace
# (e.g. left over from an earlier notebook cell); otherwise rebuild it from the CSVs.
# `globals().get` probes for the name without raising and without overwriting it.
# The previous version assigned `df = None` before probing, which discarded any
# existing `df` and forced the CSV fallback on every run.
df = globals().get("df")

if df is None:
    # Fallback: build df from the per-category CSVs expected under `data_root`.
    data_root = Path("../data")
    csvs = {
        "attack_ds": data_root / "attacking.csv",
        "attempts_ds": data_root / "attempts.csv",
        "defend_ds": data_root / "defending.csv",
        "disciplinary_ds": data_root / "disciplinary.csv",
        "distribution_ds": data_root / "distribution.csv",
        "goalkeeping_ds": data_root / "goalkeeping.csv",
        "goals_ds": data_root / "goals.csv",
        "key_ds": data_root / "key_stats.csv",
        "passing_ds": data_root / "passing.csv",
        "misc_ds": data_root / "miscellaneous.csv",
    }
    # Report per-file presence before failing so the missing file(s) are visible.
    exists = {name: path.exists() for name, path in csvs.items()}
    print("CSV presence:", exists)

    if not all(exists.values()):
        raise FileNotFoundError("One or more CSVs not found. Please adjust `data_root` or ensure files are present.")

    # Each table keeps its own module-level name so later cells can inspect it.
    attack_ds = pd.read_csv(csvs["attack_ds"])
    attempts_ds = pd.read_csv(csvs["attempts_ds"])
    defend_ds = pd.read_csv(csvs["defend_ds"])
    disciplinary_ds = pd.read_csv(csvs["disciplinary_ds"])
    distribution_ds = pd.read_csv(csvs["distribution_ds"])
    goalkeeping_ds = pd.read_csv(csvs["goalkeeping_ds"])
    goals_ds = pd.read_csv(csvs["goals_ds"])
    key_ds = pd.read_csv(csvs["key_ds"])
    passing_ds = pd.read_csv(csvs["passing_ds"])
    misc_ds = pd.read_csv(csvs["misc_ds"])

    merge_keys = ['serial', 'player_name', 'club', 'position', 'match_played']
    # NOTE(review): an outer join on all five keys assumes every key (including
    # `serial`) carries the same value for a given player in every CSV. If any
    # key differs between files, that player is split across several rows —
    # confirm against the data.
    # NOTE(review): non-key columns shared by two tables get pandas' default
    # `_x` / `_y` suffixes — verify that no stat column is duplicated this way.
    df = attack_ds
    for _other in (
        attempts_ds,
        defend_ds,
        disciplinary_ds,
        distribution_ds,
        goalkeeping_ds,
        goals_ds,
        key_ds,
        passing_ds,
        misc_ds,
    ):
        # Same left-to-right merge order as a chained `.merge(...)` expression.
        df = df.merge(_other, on=merge_keys, how='outer')

# `df` is always set here: either it was reused or it was just built (a missing
# CSV raises above), so no `None` guard is needed.
print("Merged shape:", df.shape)
print("Columns (sample):", list(df.columns)[:12])


## Run pipeline (Field players only)

In [None]:

# Arguments for the clustering run, collected in one place so they are easy to tweak.
# NOTE(review): the meaning of each option is defined by `cluster_players_pipeline`
# (imported from ucl_clustering_pipeline) — confirm there before changing values.
pipeline_args = {
    "df": df,
    "id_cols": ["player_name", "club", "position"],
    "position_col": "position",
    "correlation_threshold": 0.90,
    "k_min": 2,
    "k_max": 10,
    # typically not informative post-merge
    "drop_cols": ["match_played"],
}
results = cluster_players_pipeline(**pipeline_args)

# Report the selected k, then plot the silhouette scores across k.
print("Best k (silhouette):", results["best_k"])
plot_metric_curve(results["silhouette_scores"], title="Silhouette Score vs k")

# Preview the labelled players.
clusters_df = results["clusters_df"]
print("Clusters head:", clusters_df.head(), sep="\n")

# Persist the labelled players next to the notebook and report the absolute path.
out_csv = Path("players_clusters.csv")
clusters_df.to_csv(out_csv, index=False)
print(f"Saved clusters to: {out_csv.resolve()}")

# List the (feature, value) pairs reported for each cluster centroid.
print("\nTop features driving each cluster (z-scores):")
for cluster_id, top_feats in results["centroid_top_features"].items():
    print(f"Cluster {cluster_id}:")
    for feat_name, score in top_feats:
        print(f"  {feat_name:<30} {score:>8.3f}")


## (Optional) Inspect remaining features

In [None]:

# Features the pipeline reports under "kept_features"; show the count and the first 30.
kept = results["kept_features"]
n_kept = len(kept)
print(f"{n_kept} features kept after correlation pruning.")
print(kept[:30])


## Goalkeepers set (for a separate analysis later)

In [None]:

# Goalkeeper rows returned by the pipeline under "goalkeepers_df"; report how many there are.
gk_df = results["goalkeepers_df"]
gk_row_count = gk_df.shape[0]
print("Goalkeepers rows:", gk_row_count)
# The same pipeline can be run on this subset by passing df=gk_df.
# NOTE(review): the run above is labelled "field players only" — confirm the
# pipeline does not filter goalkeepers out before reusing it on gk_df.
