# 03 - Upset Explorer

Analyze where underdog wins cluster and how the upset rate changes across match contexts: Elo gap, match stage, venue, and toss decision.

In [None]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Locate the project root so that `src` can be imported: when the notebook is run
# from a directory named "notebooks", the root is that directory's parent;
# otherwise the current working directory itself is used.
_cwd = Path.cwd()
_cwd_resolved = _cwd.resolve()
if _cwd.name == "notebooks":
    PROJECT_ROOT = _cwd_resolved.parent
else:
    PROJECT_ROOT = _cwd_resolved

# Prepend the root to sys.path once; re-running the cell does not add duplicates.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from src.data_prep import load_matches, build_team1_win_target, assign_favorite_underdog_from_elo
from src.viz import upset_rate_by_bucket

In [None]:
# Build the analysis frame by chaining the src.data_prep helpers, innermost first:
# load_matches -> build_team1_win_target -> assign_favorite_underdog_from_elo.
# The `is_upset` column read below is expected to exist after these steps
# (presumably added by assign_favorite_underdog_from_elo; confirm in src.data_prep).
df = assign_favorite_underdog_from_elo(
    build_team1_win_target(load_matches())
)

# Quick sanity summary: row count and the share of rows flagged as upsets.
print(f"Rows: {len(df):,}")
print(f"Overall upset rate: {df['is_upset'].mean():.2%}")

In [None]:
# Summarize the upset rate by Elo-difference bucket. The score column, the upset
# flag column and the number of bins (10) are passed to the src.viz helper.
bucket_table = upset_rate_by_bucket(
    df,
    score_col="elo_diff",
    upset_col="is_upset",
    bins=10,
)
bucket_table

In [None]:
# Mean of `is_upset` for each (match_stage, venue) pair, as a long-format table.
stage_venue = (
    df.groupby(["match_stage", "venue"], observed=True)
    .agg(upset_rate=("is_upset", "mean"))
    .reset_index()
)

# Restrict the heatmap to the 12 venues with the most rows in df so it stays readable.
top_venues = list(df["venue"].value_counts().head(12).index)
in_top_venues = stage_venue["venue"].isin(top_venues)

# Wide layout for plotting: one row per stage, one column per venue.
heat = stage_venue.loc[in_top_venues].pivot(
    index="match_stage", columns="venue", values="upset_rate"
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(heat, cmap="viridis", annot=False, ax=ax)
ax.set_title("Upset Rate by Match Stage and Venue (Top Venues)")
fig.tight_layout()
plt.show()

In [None]:
# Mean of `is_upset` for each (match_stage, toss_decision) pair.
toss_stage = (
    df.groupby(["match_stage", "toss_decision"], observed=True)
    .agg(upset_rate=("is_upset", "mean"))
    .reset_index()
)

# Display the first 30 rows, ordered by stage and, within each stage,
# from the highest upset rate to the lowest.
toss_stage.sort_values(
    by=["match_stage", "upset_rate"],
    ascending=[True, False],
).head(30)

## SHAP Global Feature Importance

Train a compact logistic-regression baseline to predict whether team 1 wins, then compute global SHAP feature importance for that model.

In [None]:
# Fit the compact model used for the global SHAP explanation, built with the
# pipeline helpers from src. The SHAP cell that follows falls back to a printed
# message if SHAP is unavailable.
# Requires `df` from the previous cells.
from sklearn.metrics import roc_auc_score

from src.data_prep import time_based_split
from src.features import build_pre_match_feature_frame
from src.models import train_logistic_baseline

# Split the matches with the src helper (presumably chronological, going by its
# name; confirm in src.data_prep). The test part is not used in this cell.
train_df, valid_df, test_df = time_based_split(df)

# Build (features, target) pairs for the train and validation parts.
(X_train, y_train), (X_valid, y_valid) = (
    build_pre_match_feature_frame(part) for part in (train_df, valid_df)
)

# Compact logistic model for explainability
shap_model = train_logistic_baseline(X_train, y_train)

# Second predict_proba column, i.e. the probability of the model's second class.
prob_valid = shap_model.predict_proba(X_valid)[:, 1]
valid_auc = round(roc_auc_score(y_valid, prob_valid), 4)
print("Logistic baseline trained for SHAP (validation ROC-AUC):", valid_auc)

In [None]:
# Global SHAP explanation of the logistic pipeline fitted in the previous cell
# (reads `shap_model` and `X_train`). SHAP is treated as an optional dependency:
# a missing package or a failing explanation prints a message instead of stopping
# the notebook, and the two cases are reported separately so that the
# "pip install" hint only appears when SHAP really is missing.
try:
    import shap
except ImportError as e:
    print(f"SHAP not available in this environment: {e}")
    print("Install with: pip install shap")
else:
    try:
        import numpy as np

        preprocess = shap_model.named_steps["preprocess"]
        clf = shap_model.named_steps["clf"]
        X_transformed = preprocess.transform(X_train)
        feature_names = preprocess.get_feature_names_out().tolist()

        # Use shape[0] rather than len(): the transformer output may be a scipy
        # sparse matrix (e.g. if it one-hot encodes), and len() raises TypeError there.
        n_rows = X_transformed.shape[0]

        # Fixed seed so the sampled rows are the same on every run. The background
        # sample is drawn first and the evaluation sample second.
        rng = np.random.default_rng(42)
        sample_size = min(200, n_rows)  # background rows for the explainer
        idx = rng.choice(n_rows, size=sample_size, replace=False)
        eval_size = min(500, n_rows)  # rows whose SHAP values are plotted
        eval_idx = rng.choice(n_rows, size=eval_size, replace=False)

        X_bg = X_transformed[idx]
        X_eval = X_transformed[eval_idx]
        if hasattr(X_transformed, "toarray"):
            # Sparse output: densify only the small samples so both the explainer
            # and the summary plot receive plain arrays.
            X_bg, X_eval = X_bg.toarray(), X_eval.toarray()

        explainer = shap.LinearExplainer(clf, X_bg, feature_names=feature_names)
        shap_vals = explainer.shap_values(X_eval)
        if isinstance(shap_vals, list):
            shap_vals = shap_vals[1]  # positive class for binary

        fig, ax = plt.subplots(figsize=(10, 8))
        # plot_size=None keeps the figure size chosen above; the default "auto"
        # would resize the current figure and silently discard figsize.
        shap.summary_plot(
            shap_vals,
            X_eval,
            feature_names=feature_names,
            plot_type="bar",
            show=False,
            max_display=15,
            plot_size=None,
        )
        plt.title("SHAP Global Feature Importance (Team1 Win Model)")
        plt.tight_layout()
        plt.show()
    except Exception as e:
        # Best-effort cell: report the real cause (SHAP is installed at this point)
        # rather than suggesting an install that would not help.
        print(f"SHAP explanation failed in this environment: {type(e).__name__}: {e}")