# Player Similarity Analysis
## Finding Similar Players Using Machine Learning
### Cosine Similarity on Position-Specific Feature Vectors

This notebook explains the player similarity algorithm and demonstrates how to find statistically similar players.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import sys
sys.path.append('../src')
from analysis import get_data

# Load the player table through the project's analysis helper (imported from ../src above).
# NOTE(review): assumed to be a pandas DataFrame with one row per player; later cells
# read its 'player', 'pos', 'squad' and 'comp' columns. Confirm against get_data().
df = get_data()

def get_pos(p):
    """Collapse a raw position label into 'GK', 'DF', 'MF' or 'FW'.

    The label is upper-cased and searched for the codes in the fixed order
    GK, DF, MF, FW; the first code found as a substring wins, so a combined
    label such as 'FW,MF' maps to 'MF'. Missing labels, and labels that
    contain none of the codes, give None.
    """
    if pd.isna(p):
        return None
    label = p.upper()
    for code in ('GK', 'DF', 'MF', 'FW'):
        if code in label:
            return code
    return None

# Tag every row with its coarse position group, derived from the raw 'pos' label.
df['main_pos'] = [get_pos(raw_label) for raw_label in df['pos']]
print(f"Loaded {len(df)} players")

## 1. What is Cosine Similarity?

Cosine similarity is the cosine of the angle between two vectors in multi-dimensional space:

$$\cos(\theta) = \frac{A \cdot B}{||A|| \times ||B||}$$

- **1.0** = Identical direction (most similar)
- **0.0** = Perpendicular (no similarity)
- **-1.0** = Opposite direction

In [None]:
# Toy illustration: each player is a single row of (goals, assists, xG)
player_a = np.array([[10, 5, 8]])  # goal-heavy profile
player_b = np.array([[12, 6, 9]])  # almost the same mix as A, slightly larger totals
player_c = np.array([[2, 15, 3]])  # assist-heavy profile (playmaker)

# Two single-row inputs give a 1x1 similarity matrix; pull out the scalar
sim_ab = cosine_similarity(player_a, player_b)[0, 0]
sim_ac = cosine_similarity(player_a, player_c)[0, 0]

print(f"Similarity A-B (similar strikers): {sim_ab:.3f}")
print(f"Similarity A-C (striker vs playmaker): {sim_ac:.3f}")

## 2. Position-Based Metrics

Different positions are compared using different stat profiles.
The metric sets now include Understat metrics (xGChain, xGBuildup) for richer comparison.

In [None]:
# Position-specific metrics (updated for dual-source data)
# Each list names the DataFrame columns that form the feature vector for one
# position group. Code that consumes these lists filters them against the columns
# actually present, so a name listed here need not exist in the loaded data.
# NOTE(review): 'xgchain' / 'xgbuildup' are the Understat metrics; the 'us_' prefix
# presumably marks further Understat-sourced columns. Confirm against get_data().

# Forwards: scoring, shooting and chance-creation columns
FW_METRICS = ['gls', 'ast', 'g_a', 'xg', 'xag', 'npxg', 'g_pk', 'sh', 'sot',
              'kp', 'ppa', 'touches', 'carries', 'prgr', 'finishing_alpha',
              'xgchain', 'xgbuildup', 'us_shots', 'us_key_passes']

# Midfielders: attacking output plus tackling/interception and progression columns
MF_METRICS = ['gls', 'ast', 'g_a', 'xg', 'xag', 'npxg', 'tkl', 'tklw',
              'int', 'prgp', 'prgc', 'kp', 'ppa', 'touches', 'carries', 'prgr',
              'crdy', 'recov', 'playmaking_alpha', 'xgchain', 'xgbuildup']

# Defenders: defensive actions and build-up columns, no direct scoring columns
DF_METRICS = ['tkl', 'tklw', 'int', 'clr', 'err', 'prgp', 'prgc',
              'touches', 'carries', 'crdy', 'recov', 'xgbuildup']

# Goalkeepers: a small keeper-only set
GK_METRICS = ['ga', 'saves', 'cs', 'ga90']

# Coverage check: how many of each group's metrics exist as columns in df
metric_sets = {"FW": FW_METRICS, "MF": MF_METRICS, "DF": DF_METRICS, "GK": GK_METRICS}
for name, metrics in metric_sets.items():
    found = sum(1 for m in metrics if m in df.columns)
    print(f"{name}: {found}/{len(metrics)} metrics available")

## 3. Data Preprocessing

Steps:
1. Filter by position (compare apples to apples)
2. Handle missing values (filled with 0)
3. Standardize features (z-score normalization)

In [None]:
# Example: Forwards
# Keep only forwards so every comparison stays within one position group.
forwards = df[df['main_pos'] == 'FW'].copy()
# Use only the forward metrics that actually exist as columns in this dataset.
available_metrics = [m for m in FW_METRICS if m in forwards.columns]
print(f"Forwards: {len(forwards)} players, {len(available_metrics)} metrics")

# Missing stats are treated as 0 so every player has a complete feature vector.
metrics_df = forwards[available_metrics].fillna(0)

# Before standardization
# Summarise only the sample columns that survived the availability filter;
# indexing the hard-coded trio directly raises KeyError if one is absent.
sample_cols = [c for c in ('gls', 'xg', 'ast') if c in metrics_df.columns]
print("\nBefore Standardization (sample):")
if sample_cols:
    print(metrics_df[sample_cols].describe().loc[['mean', 'std']].round(2))
else:
    print("(none of gls / xg / ast are available in this dataset)")

In [None]:
# Apply StandardScaler
# fit_transform fits the scaler on the forwards' metrics and returns the scaled array
scaler = StandardScaler()
metrics_scaled = scaler.fit_transform(metrics_df)

# Wrap the scaled array in a labelled frame so describe() can report per-metric stats
scaled_summary = pd.DataFrame(metrics_scaled, columns=available_metrics).describe()
print("After Standardization:")
print(scaled_summary.loc[['mean', 'std']].round(2))

## 4. Calculate Similarity Matrix

In [None]:
# Pairwise cosine similarity between all rows of the scaled forward metrics
sim_matrix = cosine_similarity(metrics_scaled)
print(f"Similarity Matrix Shape: {sim_matrix.shape}")

# Show sample
# Label the top-left 5x5 corner of the matrix with the matching player names
sample_players = forwards['player'].head(5)
sim_sample = pd.DataFrame(
    sim_matrix[:5, :5],
    index=sample_players,
    columns=sample_players,
)
sim_sample.round(3)

## 5. Find Similar Players Function

In [None]:
def find_similar_players(df, player_name, top_n=10):
    """Rank the players whose stat profile is closest to a named player.

    The first row whose 'player' value contains ``player_name``
    (case-insensitive) is the target. It is compared only against rows that
    share its ``main_pos`` group, using that group's metric list, scaled with
    StandardScaler and scored by cosine similarity.

    Args:
        df: Player table with 'player', 'pos', 'main_pos', 'squad' and 'comp'
            columns plus whichever metric columns are available.
        player_name: Full or partial player name to search for.
        top_n: Maximum number of similar players to return.

    Returns:
        ``(results, player)`` on success: ``results`` is a list of dicts with
        keys 'player', 'squad', 'comp' and 'similarity' (a percentage rounded
        to one decimal, best match first) and ``player`` is the target's row.
        ``(None, message)`` when the comparison cannot be carried out.
    """
    name_hits = (
        df['player'].str.lower()
        .str.contains(player_name.lower(), na=False)
        .to_numpy(dtype=bool)
    )
    if not name_hits.any():
        return None, f"Player '{player_name}' not found"

    # Track the target by row offset rather than index label, so duplicate
    # index labels in df cannot make the lookup ambiguous.
    target_pos = int(np.flatnonzero(name_hits)[0])
    player = df.iloc[target_pos]
    pos = get_pos(player['pos'])

    metric_map = {'FW': FW_METRICS, 'MF': MF_METRICS, 'DF': DF_METRICS, 'GK': GK_METRICS}
    if pos not in metric_map:
        # Without a recognised group there is nobody to compare against; report
        # it instead of fitting a scaler on an empty frame.
        return None, f"Player '{player['player']}' has no recognised position"

    group_hits = (df['main_pos'] == pos).to_numpy(dtype=bool)
    if not group_hits[target_pos]:
        return None, f"Player '{player['player']}' is not in the '{pos}' group"

    group_positions = np.flatnonzero(group_hits)  # ascending row offsets of the group
    pos_df = df.iloc[group_positions]
    # Offset of the target inside pos_df (group_positions is sorted and contains it).
    player_idx = int(np.searchsorted(group_positions, target_pos))

    available = [m for m in metric_map[pos] if m in pos_df.columns]
    if not available:
        return None, f"No '{pos}' metrics available in the data"

    data = pos_df[available].fillna(0)
    scaled = StandardScaler().fit_transform(data)

    # Only the target's row of the similarity matrix is needed, so compare that
    # one row against the group instead of building the full N x N matrix.
    scores = cosine_similarity(scaled[player_idx:player_idx + 1], scaled)[0]

    # Stable descending order; the target is removed by position because it is
    # not guaranteed to rank first (e.g. a tie with a statistical duplicate).
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != player_idx][:max(top_n, 0)]

    results = []
    for idx in ranked:
        p = pos_df.iloc[idx]
        results.append({
            'player': p['player'], 'squad': p['squad'], 'comp': p['comp'],
            'similarity': round(float(scores[idx]) * 100, 1)
        })

    return results, player

# Test
results, original = find_similar_players(df, 'Haaland')
if results is None:
    # On failure the second element is an explanatory message, not a player row;
    # show it rather than printing nothing.
    print(original)
else:
    print(f"Players similar to {original['player']} ({original['squad']}):")
    for i, r in enumerate(results, 1):
        print(f"  {i}. {r['player']:<25} {r['squad']:<20} {r['similarity']}%")

## 6. Radar Chart Comparison

In [None]:
def _first_player_match(df, name):
    """Return the first row of df whose 'player' value contains name (case-insensitive)."""
    hits = df[df['player'].str.contains(name, case=False, na=False)]
    if hits.empty:
        # Same exception type that .iloc[0] on an empty frame raises, but with a
        # message that says which lookup failed.
        raise IndexError(f"Player '{name}' not found")
    return hits.iloc[0]


def radar_compare(df, player1_name, player2_name, metrics=None):
    """Draw an overlaid radar chart comparing two players.

    Each requested metric that exists as a column in ``df`` becomes one spoke.
    A player's value is shown as a percentage of the largest absolute value in
    that column (with the divisor floored at 1); missing values are drawn as 0.

    Args:
        df: Player table with a 'player' column and the metric columns.
        player1_name: Full or partial name of the first player.
        player2_name: Full or partial name of the second player.
        metrics: Column names to plot; defaults to a six-metric attacking set.

    Raises:
        IndexError: If either name matches no row in ``df``.
    """
    if metrics is None:
        metrics = ['gls', 'ast', 'xg', 'xag', 'finishing_alpha', 'playmaking_alpha']

    p1 = _first_player_match(df, player1_name)
    p2 = _first_player_match(df, player2_name)

    available = [m for m in metrics if m in df.columns]
    # Per-metric divisor: largest magnitude in the column, never below 1, so the
    # division is always safe and small-valued metrics are not inflated.
    max_vals = {m: max(abs(df[m].max()), abs(df[m].min()), 1) for m in available}

    # One angle per metric; repeat the first so the outline closes.
    angles = np.linspace(0, 2 * np.pi, len(available), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

    for player, color in [(p1, '#3498db'), (p2, '#e74c3c')]:
        vals = [(player[m] / max_vals[m] * 100) if pd.notna(player.get(m)) else 0 for m in available]
        vals += vals[:1]
        ax.plot(angles, vals, 'o-', linewidth=2, label=player['player'], color=color)
        ax.fill(angles, vals, alpha=0.15, color=color)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(available, fontsize=10)
    ax.set_title(f'{p1["player"]} vs {p2["player"]}', fontsize=14, pad=20)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
    plt.tight_layout()
    plt.show()

# Example
# Overlay two players on the default metric set
radar_compare(df, player1_name='Haaland', player2_name='Salah')

## 7. More Examples

In [None]:
# Midfield comparison
for name in ['De Bruyne', 'Pedri', 'Bellingham']:
    results, original = find_similar_players(df, name, top_n=5)
    if results is None:
        # Lookup failed: the second element is the explanatory message, so show
        # it instead of silently skipping the player.
        print(f"\n{original}")
        continue
    print(f"\n{original['player']} ({original['squad']}, {original['pos']}):")
    for i, r in enumerate(results, 1):
        print(f"  {i}. {r['player']:<25} {r['similarity']}%")

## Key Insights

1. **Cosine Similarity**: Measures angle between player stat vectors (scale-independent)
2. **Position Filtering**: Only compares players in similar roles
3. **Dual-Source Metrics**: xGChain and xGBuildup add involvement context beyond goals/assists
4. **Standardization**: Essential for fair comparison across different statistical scales
5. **Use Cases**: Scouting, transfer targets, replacement analysis, tactical planning