In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np

# Read the combined midfielder export into a DataFrame
df = pd.read_csv('pl_midfielders_master_combined.csv')

# Header repair: any column whose label contains "Unnamed" takes its name
# from the first data row instead; that row is then removed from the frame.
actual_names = df.iloc[0].values
df.columns = [
    row_value if "Unnamed" in label else label
    for label, row_value in zip(df.columns, actual_names)
]
df = df.iloc[1:].reset_index(drop=True)

df.head()

In [None]:
# Counting stats that are turned into per-90 rates below
p90_cols = ['npxG', 'xAG', 'KP', 'PPA', 'Tkl+Int', 'Blocks', 'Clr', 'PrgDist', '1/3', 'PrgP']
# Every column that has to be numeric: playing time plus the counting stats
metrics = ['90s Played'] + p90_cols

# Coerce to numbers; unparseable or missing entries end up as 0
df[metrics] = df[metrics].apply(pd.to_numeric, errors='coerce').fillna(0)

# "Kaizen" filter: keep core players only (>= 1000 minutes, expressed in 90s)
min_nineties = 1000 / 90
core_df = df.loc[df['90s Played'] >= min_nineties].copy()

# Normalization: one `<stat>_p90` column per counting stat
nineties = core_df['90s Played']
core_df = core_df.assign(
    **{f'{stat}_p90': core_df[stat] / nineties for stat in p90_cols}
)

print(f"Dataset ready. Analyzing {len(core_df)} unique player-seasons.")

In [None]:
scaler = MinMaxScaler()

# Per-90 columns feeding each composite index
off_cols = ['npxG_p90', 'xAG_p90', 'KP_p90', 'PPA_p90']
def_cols = ['Tkl+Int_p90', 'Blocks_p90', 'Clr_p90']
prg_cols = ['PrgDist_p90', '1/3_p90', 'PrgP_p90']

# Each index = row-wise mean of the min-max scaled member columns, times 100.
# The scaler is refit for every group, in this order.
for index_name, member_cols in (
    ('Offense_Index', off_cols),
    ('Defense_Index', def_cols),
    ('Progression_Index', prg_cols),
):
    scaled = scaler.fit_transform(core_df[member_cols])
    core_df[index_name] = scaled.mean(axis=1) * 100

In [None]:
# Percentile rank of every index across all rows of core_df
percentile_sources = {
    'Off_Pct': 'Offense_Index',
    'Def_Pct': 'Defense_Index',
    'Prg_Pct': 'Progression_Index',
}
for pct_col, index_col in percentile_sources.items():
    core_df[pct_col] = core_df[index_col].rank(pct=True)

def label_archetype(row):
    """Map one row's percentile ranks to an archetype label.

    `row` must provide 'Off_Pct', 'Def_Pct' and 'Prg_Pct'. Rules are checked
    in order and the first match wins; rows matching none of them are
    labelled "System Rotational".
    """
    off = row['Off_Pct']
    dfn = row['Def_Pct']
    prg = row['Prg_Pct']

    # Strong attack + strong progression
    if off > 0.7 and prg > 0.7:
        return "Creative Engine"

    # Strong defenders split on progression, then on low attacking output
    if dfn > 0.7:
        if prg > 0.7:
            return "Deep-Lying Progressor"
        if off < 0.4:
            return "Midfield Anchor"

    # Above the 0.4 mark on all three axes
    if all(value > 0.4 for value in (off, dfn, prg)):
        return "Modern Box-to-Box"

    return "System Rotational"

# Label every row of core_df by passing it through label_archetype
archetype_labels = core_df.apply(label_archetype, axis=1)
core_df['Archetype'] = archetype_labels

In [None]:
# Era Comparison (2017 vs 2025)
s_start, s_end = '2017-2018', '2024-2025'

print(f"{'Metric':<20} | {'Start Mean':<12} | {'End Mean':<12} | {'P-Value'}")
print("-" * 65)

# Slice the two seasons once instead of once per metric
start_rows = core_df.loc[core_df['Season'] == s_start]
end_rows = core_df.loc[core_df['Season'] == s_end]

# One table row per index: both season means and the independent two-sample
# t-test p-value (scipy defaults)
for metric in ('Offense_Index', 'Defense_Index', 'Progression_Index'):
    m1, m2 = start_rows[metric], end_rows[metric]
    p_val = stats.ttest_ind(m1, m2).pvalue
    print(f"{metric:<20} | {m1.mean():<12.2f} | {m2.mean():<12.2f} | {p_val:.4f}")

In [None]:
def find_functional_replacements(target_player, target_season, top_n=6):
    """Return the player-seasons whose index profile is closest to a target.

    Closeness is the Euclidean distance between the target's
    (Offense_Index, Defense_Index, Progression_Index) vector and that of
    every row in the module-level `core_df`.

    Parameters
    ----------
    target_player : value compared for equality against core_df['Player'].
    target_season : value compared for equality against core_df['Season'].
    top_n : number of rows to return. The target row itself is part of
        core_df and therefore appears in the result with distance 0.

    Returns
    -------
    DataFrame with columns Player, Season, Squad and Similarity_Distance,
    sorted by ascending distance; or the string "Player not found." when no
    row matches the player/season pair.
    """
    index_cols = ['Offense_Index', 'Defense_Index', 'Progression_Index']

    is_target = (core_df['Player'] == target_player) & (core_df['Season'] == target_season)
    if not is_target.any():
        return "Player not found."

    # More than one row can match the same player and season. Use the first
    # match as a flat 1-D vector: a (k, 3) array with k > 1 cannot be
    # broadcast against the frame and would make the subtraction raise.
    target_vector = core_df.loc[is_target, index_cols].iloc[0].to_numpy(dtype=float)

    # Euclidean distance from every row to the target
    distances = np.sqrt(((core_df[index_cols] - target_vector) ** 2).sum(axis=1))

    # assign() returns a new frame, so core_df itself is left unmodified
    return (
        core_df.assign(Similarity_Distance=distances)
        .sort_values('Similarity_Distance')
        .head(top_n)[['Player', 'Season', 'Squad', 'Similarity_Distance']]
    )

# Example lookup: closest index profiles to Kevin De Bruyne's 2019-2020 row
find_functional_replacements(target_player='Kevin De Bruyne', target_season='2019-2020')

In [None]:
# Plot 1: Longitudinal Index Trends (mean of each index per season)
index_cols = ['Offense_Index', 'Defense_Index', 'Progression_Index']
trends = core_df.groupby('Season')[index_cols].mean()
trends.plot.line(marker='o', figsize=(12, 5), title="Tactical Index Evolution")
plt.show()

# Plot 2: Archetype Count Evolution (rows per season and archetype, 0 where absent)
archetype_trends = (
    core_df
    .groupby(['Season', 'Archetype'])
    .size()
    .unstack(fill_value=0)
)
archetype_ax = archetype_trends.plot.line(
    marker='o', figsize=(12, 6), title="Archetype Population Trends"
)
# Anchor the legend past the right edge of the axes
archetype_ax.legend(bbox_to_anchor=(1.05, 1))
plt.show()

In [None]:
def plot_radar(p1_name, s1, p2_name, s2):
    """Draw a radar chart comparing two player-seasons on seven per-90 metrics.

    The metrics are min-max scaled to 0-100 over all rows of the module-level
    `core_df`, so both players are drawn against the same axes.

    Parameters
    ----------
    p1_name, p2_name : patterns matched against core_df['Player'] with
        `Series.str.contains`; the first matching row for the season is used.
    s1, s2 : season values compared for equality against core_df['Season'].

    Raises
    ------
    IndexError
        If no row matches one of the player/season pairs.
    """
    radar_cols = ['npxG_p90', 'xAG_p90', 'KP_p90', 'PPA_p90', 'Tkl+Int_p90', 'PrgP_p90', '1/3_p90']
    radar_labels = ['npxG', 'xAG', 'KP', 'PPA', 'Tkl+Int', 'Prg Passes', 'Final 1/3 Passes']

    # Scale for radar; only the columns needed here are copied
    radar_df = core_df[['Player', 'Season'] + radar_cols].copy()
    radar_df[radar_cols] = MinMaxScaler().fit_transform(radar_df[radar_cols]) * 100

    def pick_row(name, season):
        # na=False: rows with a missing Player value never match
        mask = radar_df['Player'].str.contains(name, na=False) & (radar_df['Season'] == season)
        if not mask.any():
            # Same exception type as an out-of-bounds .iloc[0], but with a
            # message that says which lookup failed
            raise IndexError(f"No player matching {name!r} found for season {season!r}.")
        return radar_df[mask].iloc[0]

    def closed_values(row):
        # Repeat the first value so the plotted polygon closes on itself
        values = row[radar_cols].values.tolist()
        return values + values[:1]

    p1 = pick_row(p1_name, s1)
    p2 = pick_row(p2_name, s2)

    # One spoke per metric, with the first angle repeated to close the shape
    angles = np.linspace(0, 2 * np.pi, len(radar_labels), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    for row, name, season, colour in ((p1, p1_name, s1, 'red'), (p2, p2_name, s2, 'blue')):
        values = closed_values(row)
        ax.plot(angles, values, color=colour, label=f'{name} ({season})')
        ax.fill(angles, values, color=colour, alpha=0.1)

    ax.set_thetagrids(np.degrees(angles[:-1]), radar_labels)
    ax.set_title("Player Specification Comparison", y=1.1)
    ax.legend(loc='upper right')
    plt.show()

# Example comparison: Xhaka (2017-2018) against Tielemans (2024-2025)
plot_radar(p1_name='Xhaka', s1='2017-2018', p2_name='Tielemans', s2='2024-2025')