# 06 — ML: Predict Market Value
**Requires:** Run `01_load_and_filter.ipynb` first.

Train and compare regression models to predict the current market value of active Greek players using career stats and player attributes.

In [None]:
import os
import pickle
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

plt.style.use('seaborn-v0_8-whitegrid')

# Relative locations of the artefacts read and written by this notebook.
OUTPUTS_PATH = "../outputs/"
FIGURES_PATH = "../outputs/figures/"

# Later cells save PNGs into FIGURES_PATH. Create the folder up front so a
# fresh checkout (where it may not exist yet) does not fail at savefig time.
os.makedirs(FIGURES_PATH, exist_ok=True)

# Input tables; the notebook header says 01_load_and_filter.ipynb must be run
# first, so these files are presumably produced there.
greek_active      = pd.read_parquet(OUTPUTS_PATH + "greek_active.parquet")
greek_appearances = pd.read_parquet(OUTPUTS_PATH + "greek_appearances.parquet")

print("greek_active:     ", greek_active.shape)
print("greek_appearances:", greek_appearances.shape)

## 6.1 Build Career Stats from Appearances

In [None]:
# Collapse the appearance rows into a single career summary row per player.
career_stats = (
    greek_appearances
    .groupby("player_id")
    .agg(
        total_games=("game_id", "count"),
        total_goals=("goals", "sum"),
        total_assists=("assists", "sum"),
        total_yellows=("yellow_cards", "sum"),
        total_reds=("red_cards", "sum"),
        total_minutes=("minutes_played", "sum"),
    )
    .reset_index()
)

# Number of 90-minute blocks played. A zero is swapped for NaN so that the
# rates below come out as NaN rather than dividing by zero.
nineties_played = (career_stats["total_minutes"] / 90).replace(0, np.nan)

career_stats["goals_per_90"]   = career_stats["total_goals"].div(nineties_played)
career_stats["assists_per_90"] = career_stats["total_assists"].div(nineties_played)
career_stats["yellows_per_90"] = career_stats["total_yellows"].div(nineties_played)

career_stats.head()

## 6.2 Join Career Stats with Player Attributes

In [None]:
# Left join: every row of greek_active is kept; players without a matching
# career_stats row end up with NaN in the career columns.
df = pd.merge(greek_active, career_stats, how="left", on="player_id")

print("Shape after join:", df.shape)

# Small preview of identity, target and a few of the joined career columns.
preview_cols = [
    "name", "position", "age", "market_value_in_eur",
    "total_games", "goals_per_90", "assists_per_90",
]
df[preview_cols].head()

## 6.3 Prepare Features

In [None]:
# Integer-encode the two categorical attributes; missing values get their own
# "Unknown" category.
# NOTE(review): LabelEncoder assigns arbitrary integer codes, which a linear
# model will read as an ordering — consider one-hot encoding if Linear
# Regression is meant to be a serious candidate.
le_position = LabelEncoder()
le_foot     = LabelEncoder()

df["position_enc"] = le_position.fit_transform(df["position"].fillna("Unknown"))
df["foot_enc"]     = le_foot.fit_transform(df["foot"].fillna("Unknown"))

# age² — because age effect on value is non-linear (peak ~22-26, then drops)
df["age_squared"] = df["age"] ** 2

FEATURES = [
    "age", "age_squared", "height_in_cm", "position_enc", "foot_enc",
    "total_games", "total_goals", "total_assists",
    "total_minutes", "total_yellows", "total_reds",
    "goals_per_90", "assists_per_90", "yellows_per_90",
]
TARGET = "market_value_in_eur"

# Keep only rows with a known target. The explicit .copy() makes df_ml an
# independent frame, so the column assignments below (and in later cells) are
# well-defined instead of writing into a derived slice of df.
df_ml = df[FEATURES + [TARGET, "name"]].dropna(subset=[TARGET]).copy()

# Fill remaining NaN in features with the column median.
# NOTE(review): the medians are computed over all rows before the train/test
# split, so test rows influence the imputation — confirm this is acceptable.
df_ml[FEATURES] = df_ml[FEATURES].fillna(df_ml[FEATURES].median())

print(f"Rows for ML: {len(df_ml)}")
print(f"Missing values: {df_ml[FEATURES].isna().sum().sum()}")

In [None]:
X = df_ml[FEATURES]

# Log transform target — fixes skewed distribution (found in EDA 2.4)
y_log = np.log1p(df_ml[TARGET])
y_eur = df_ml[TARGET]  # keep original for final MAE in €

# Split features, log target and € target in ONE call: all arrays passed to
# train_test_split are partitioned with the same indices, so the € values are
# aligned with the test rows by construction rather than by repeating the
# same seed in a second call.
X_train, X_test, y_train, y_test_log, _, y_test_eur = train_test_split(
    X, y_log, y_eur, test_size=0.2, random_state=42
)

# Cheap guard: the € target must describe exactly the rows in the test set.
assert y_test_eur.index.equals(X_test.index)
assert y_test_eur.index.equals(y_test_log.index)

print(f"Train: {len(X_train)} | Test: {len(X_test)}")
print(f"Target (log scale) — mean: {y_train.mean():.2f}, std: {y_train.std():.2f}")

## 6.4 Train & Compare Models

In [None]:
models = {
    "Linear Regression":  LinearRegression(),
    "Random Forest":      RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting":  GradientBoostingRegressor(n_estimators=100, random_state=42),
}

results = {}
for name, model in models.items():
    # 5-fold cross-validated R² on the TRAINING split only (log scale).
    # This score drives model selection, so the held-out test set is never
    # used to pick the winner and its metrics stay an honest estimate.
    # cross_val_score fits clones, so `model` itself is untouched here.
    cv_r2 = cross_val_score(model, X_train, y_train, cv=5, scoring="r2").mean()

    model.fit(X_train, y_train)

    preds_log = model.predict(X_test)
    preds_eur = np.expm1(preds_log)       # convert back from log to €

    mae = mean_absolute_error(y_test_eur, preds_eur)
    r2  = r2_score(y_test_log, preds_log) # R² on log scale

    results[name] = {
        "MAE (€)": mae,
        "R²": r2,
        "CV R²": cv_r2,
        "model": model,
        "preds_eur": preds_eur,
    }
    print(f"{name:22s} → MAE: €{mae:>10,.0f}  |  R²: {r2:.3f}  |  CV R²: {cv_r2:.3f}")

# Choose the winner by cross-validated score, not by test-set score.
best_name  = max(results, key=lambda k: results[k]["CV R²"])
best_model = results[best_name]["model"]
best_preds = results[best_name]["preds_eur"]
print(f"\nBest model: {best_name}")

## 6.5 Predicted vs Actual

In [None]:
# Scatter of actual vs predicted test-set values, both shown in € millions.
fig, ax = plt.subplots(figsize=(7, 7))

actual_millions    = y_test_eur / 1e6
predicted_millions = best_preds / 1e6
ax.scatter(actual_millions, predicted_millions, alpha=0.5, color="steelblue")

# Diagonal reference line: points on it are predicted exactly right.
max_val = max(y_test_eur.max(), best_preds.max()) / 1e6
ax.plot([0, max_val], [0, max_val], "r--", label="Perfect prediction")

ax.set_xlabel("Actual Value (€M)")
ax.set_ylabel("Predicted Value (€M)")
ax.set_title(f"Predicted vs Actual — {best_name}")
ax.legend()

fig.tight_layout()
fig.savefig(FIGURES_PATH + "06_predicted_vs_actual.png", dpi=150)
plt.show()

## 6.6 Feature Importance

In [None]:
# Only estimators exposing `feature_importances_` can be plotted here;
# anything else just gets a short message.
if not hasattr(best_model, "feature_importances_"):
    print("Feature importances not available for this model type.")
else:
    importances = pd.Series(best_model.feature_importances_, index=FEATURES)
    importances = importances.sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(10, 5))
    importances.plot(kind="bar", ax=ax, color="steelblue", edgecolor="black")
    ax.set_title(f"Feature Importance — {best_name}")
    ax.set_ylabel("Importance")
    plt.xticks(rotation=45)
    fig.tight_layout()
    fig.savefig(FIGURES_PATH + "06_feature_importance.png", dpi=150)
    plt.show()

    print("\nTop 5 features:")
    print(importances.head())

## 6.7 Most Undervalued & Overvalued Players

In [None]:
# Predict on full dataset and convert back to €.
# NOTE(review): this includes the rows the model was trained on, so residuals
# for those players are likely smaller than they would be out of sample.
df_ml["predicted_value"] = np.expm1(best_model.predict(df_ml[FEATURES]))

# residual = predicted - actual:
#   positive -> model values the player ABOVE the market (undervalued)
#   negative -> model values the player BELOW the market (overvalued)
df_ml["residual"]        = df_ml["predicted_value"] - df_ml[TARGET]
df_ml["residual_M"]      = df_ml["residual"] / 1e6

report_cols    = ["name", TARGET, "predicted_value", "residual_M"]
report_renames = {TARGET: "actual_eur", "predicted_value": "predicted_eur"}

# Largest positive residuals: predicted value exceeds the actual market value.
print("=== Most Undervalued (model thinks they're worth more) ===")
print(df_ml.nlargest(5, "residual")[report_cols]
      .rename(columns=report_renames)
      .to_string(index=False))

# Most negative residuals: predicted value is below the actual market value.
print("\n=== Most Overvalued (model thinks they're worth less) ===")
print(df_ml.nsmallest(5, "residual")[report_cols]
      .rename(columns=report_renames)
      .to_string(index=False))

## 6.8 Save Model

In [None]:
# Persist the selected estimator. The models/ sub-folder is created on demand
# because nothing earlier in this notebook guarantees that it exists, and
# open(..., "wb") does not create missing parent directories.
# NOTE(review): only the estimator is pickled; the fitted le_position / le_foot
# encoders are not saved — confirm whether consumers of the file need them.
models_dir = os.path.join(OUTPUTS_PATH, "models")
os.makedirs(models_dir, exist_ok=True)

with open(os.path.join(models_dir, "best_model.pkl"), "wb") as f:
    pickle.dump(best_model, f)

print(f"Saved {best_name} to outputs/models/best_model.pkl")