# ML Model Comparison: Detecting Elite Players

This notebook demonstrates how to use our 10 foundational ML model scripts.
We attempt to predict whether a player is an "Elite" player (top 10% by wins) based on their PvP and Objective metrics.

- **Goal:** Compare 5 classic vs. 5 advanced models.
- **Setup:** Fixed seed (808), 60/20/20 train/validation/test split.

In [None]:
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns

# --- Path Setup ---
# Make the 'ml' package importable whether the kernel was started from the
# project root or from its 'notebooks' subfolder.
current_dir = os.getcwd()

# When launched from 'notebooks', the project root is the parent directory;
# otherwise the working directory itself is treated as the root.
launched_from_notebooks = os.path.basename(current_dir) == 'notebooks'
project_root = os.path.dirname(current_dir) if launched_from_notebooks else current_dir

# Register the root only once so re-running this cell does not add duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

# --- Imports ---
# Now we import using the package structure
from ml.utils import get_splitter, evaluate_classifier, SEED
from ml.visuals import set_style, save_plot, get_palette

# Import Models
from ml.logistic_regression import get_model as get_logreg
from ml.knn import get_model as get_knn
from ml.decision_tree import get_model as get_dt
from ml.random_forest import get_model as get_rf
from ml.svm import get_model as get_svm

from ml.xgboost_model import get_model as get_xgb
from ml.lightgbm_model import get_model as get_lgbm
from ml.gradient_boosting import get_model as get_gbm
from ml.neural_network import get_model as get_nn
from ml.adaboost import get_model as get_ada

# Apply Theme
# set_style is imported from ml.visuals; presumably it configures the shared
# plotting look used by the figures below — confirm in ml/visuals.py.
set_style()

## 1. Load Data

In [None]:
# Label definition: a player is "Elite" when their value of the success
# metric is at or above the 90th percentile of that metric.
target_metric = 'wins_matches'

# Load Golden Dataset - path is anchored at project_root so the cell works
# from either the repo root or the 'notebooks' folder.
data_path = os.path.join(project_root, 'data', 'processed', 'golden_dataset.parquet')
df = pd.read_parquet(data_path)

# Binary target stored on the frame itself (1 = elite, 0 = not elite).
threshold = df[target_metric].quantile(0.90)
df['is_elite'] = df[target_metric].ge(threshold).astype(int)

# Quick sanity summary: size, class balance, and the columns we can draw on.
print(f"Dataset Shape: {df.shape}")
print(f"Elite Players: {df['is_elite'].sum()} ({df['is_elite'].mean():.1%})")
print(f"Features Available: {list(df.columns)}")

## 2. Preprocessing

In [None]:
# Drop Non-Numeric / ID columns, the Target source columns, and the label itself to avoid leakage.
# We want to predict "Match Winning Ability" using ONLY PvP and Objective stats.
# So we drop all match-related columns (wins, losses, matches, etc.) as they are direct leaks.
drop_cols = [
    'player_id',
    'wins_matches', 'losses_matches', 'matches_matches', 'wl_ratio_matches', 'ties_matches',  # Target Leakage
    'position_matches', 'position_pvp', 'position_objectives',  # Rank Leakage
    # The label itself: 'is_elite' is stored on df as an int column, so the
    # numeric filter below would otherwise keep it as a feature that is
    # identical to the target.
    'is_elite',
]

# Robust column dropping (only drop what exists)
existing_drop_cols = [c for c in drop_cols if c in df.columns]
X_df = df.drop(columns=existing_drop_cols)

# Filter for numeric
X_df = X_df.select_dtypes(include=[np.number])

# Fill NaNs (important for LogReg/NeuralNet)
X_df = X_df.fillna(0)

# Guard against the label sneaking back into the feature matrix.
assert 'is_elite' not in X_df.columns, "Label column 'is_elite' leaked into the feature matrix"

# Setup Target
y = df['is_elite']

# Combine for splitter: it receives one frame plus the name of the target column.
full_data = X_df.copy()
full_data['target'] = y

print(f"Training with {X_df.shape[1]} Features: {list(X_df.columns)}")

# get_splitter (ml.utils) returns train / validation / test features followed by
# the matching labels, in the order unpacked here.
X_train, X_val, X_test, y_train, y_val, y_test = get_splitter(full_data, 'target')

## 3. Training Loop

In [None]:
# One estimator per model script, keyed by display name. A factory may hand
# back None, in which case the loop below skips that entry.
models = {
    "Logistic Regression": get_logreg(),
    "KNN": get_knn(),
    "Decision Tree": get_dt(),
    "Random Forest": get_rf(),
    "SVM": get_svm(),
    "XGBoost": get_xgb(),
    "LightGBM (HistGB)": get_lgbm(),
    "Gradient Boosting": get_gbm(),
    "Neural Network": get_nn(),
    "AdaBoost": get_ada(),
}

# Collected validation metrics, one entry per successfully trained model.
results = []

for name, model in models.items():
    # Guard clause: nothing to train for this entry.
    if model is None:
        print(f"Skipping {name} (Not available)")
        continue

    print(f"Training {name}...")
    try:
        model.fit(X_train, y_train)

        # Models are compared on the validation split; class probabilities
        # are forwarded only when the estimator exposes them (for ROC AUC).
        preds = model.predict(X_val)
        probs = model.predict_proba(X_val) if hasattr(model, "predict_proba") else None

        results.append(evaluate_classifier(y_val, preds, probs, model_name=name))

    except Exception as e:
        # Best-effort comparison: report the failure and move on to the next model.
        print(f"Error training {name}: {e}")

## 4. Results Comparison

In [None]:
# The training loop reports failures and continues, so `results` may be empty.
# Fail with a clear message instead of an opaque KeyError from set_index("Model").
if not results:
    raise RuntimeError(
        "No model produced validation metrics; check the training cell output for errors."
    )

# One row per model, ranked by validation F1 (best first).
results_df = (
    pd.DataFrame(results)
    .set_index("Model")
    .sort_values("F1 Score", ascending=False)
)

# Display Table (bare last expression -> rich table rendering in Jupyter)
results_df

In [None]:
# Visualization: validation F1 per model, bars in the ranked order of results_df.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=results_df.index, y=results_df['F1 Score'], palette=get_palette(), ax=ax)

# Titles, labels, and a fixed 0-1 y-range.
ax.set_title("Model Comparison - F1 Score (Elite Player Detection)")
ax.set_xlabel("")
ax.set_ylabel("F1 Score")
ax.set_ylim(0, 1.0)

# Model names are long; tilt them so they do not overlap.
plt.xticks(rotation=45, ha='right')

# Save Plot (before plt.show(), while the figure is still current)
save_path = os.path.join(project_root, 'plots', 'model_evaluation')
save_plot('model_f1_comparison', folder=save_path)

plt.show()

## 5. Test Best Model
We select the best model by validation F1 score and evaluate it on the held-out test set.

In [None]:
# results_df is sorted by validation F1 (descending), so the first index entry is the winner.
best_model_name = results_df.index[0]
print(f"Best Model: {best_model_name}")

# Reuse the estimator that was already fitted on the training split.
best_model = models[best_model_name]

# Predict on Test
test_preds = best_model.predict(X_test)

# Forward class probabilities when the estimator supports them, exactly as the
# validation loop does, so probability-based metrics (e.g. ROC AUC) are
# reported for the test set as well and stay comparable with validation.
test_probs = best_model.predict_proba(X_test) if hasattr(best_model, "predict_proba") else None

test_metrics = evaluate_classifier(
    y_test, test_preds, test_probs, model_name=best_model_name + " (Test)"
)

# Single-row table of the final held-out metrics.
pd.DataFrame([test_metrics])