#### Imports

In [47]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from scipy.stats import spearmanr
import shap

In [48]:
df_clean = pd.read_parquet("../data/features.parquet")
modelling_df = df_clean.copy()

In [49]:
# Separate copies for each model
df_enet = df_clean.copy()
df_xgb = df_clean.copy()

In [50]:
# Common drops for both
common_drop = [
    "player", "team_id", "pos", "season", "is_mvp",
]

era_sensitive_cols = [
    # Scoring/usage
    "pts_per_g", "fg_per_g", "fga_per_g",
    "fg3_per_g", "fg3a_per_g", "ft_per_g", "fta_per_g",
    "usg_pct",
    # Shooting efficiency
    "fg_pct", "fg2_pct", "fg3_pct", "efg_pct", "ts_pct",
    # Playmaking/defense rates
    "ast_per_g", "stl_per_g", "blk_per_g", "tov_per_g", "pf_per_g",
    # Rebounding rates
    "orb_per_g", "drb_per_g", "trb_per_g",
    # Advanced impact
    "per", "ows", "dws", "ws", "ws_per_48",
    "obpm", "dbpm", "bpm", "vorp",
    # Team context
    "win_loss_pct", "mov", "mov_adj",
    # Shot mix
    "fg3a_per_fga_pct", "fta_per_fga_pct",
]

# Elastic Net: keep z-scores + ranks + engineered, drop raw duplicates
elastic_drop = common_drop + era_sensitive_cols

# Drop season rank columns
rank_cols = ["pts_per_g", "bpm", "vorp", "ws", "ts_pct", "win_loss_pct"]
rank_drop = [f"{c}_rank_season" for c in rank_cols]

enet_drop = set(elastic_drop + rank_drop)
df_enet = df_enet.drop(columns=enet_drop)


# XGBoost: keep almost everything; only drop IDs/target
xgb_drop = common_drop
df_xgb = df_xgb.drop(columns=xgb_drop)

In [51]:
# season-based 70/15/15 split
seasons = sorted(modelling_df["season"].unique())

train_seasons = seasons[:29]
val_seasons = seasons[29:35]
test_seasons = seasons[35:]

train_idx = modelling_df["season"].isin(train_seasons)
val_idx = modelling_df["season"].isin(val_seasons)
test_idx = modelling_df["season"].isin(test_seasons)

#### Season-based train and test split

In [52]:
target = "award_share"

# Elastic Net
X_enet_train = df_enet.loc[train_idx].drop(columns=[target])
y_enet_train = df_enet.loc[train_idx][target]
X_enet_val   = df_enet.loc[val_idx].drop(columns=[target])
y_enet_val   = df_enet.loc[val_idx][target]
X_enet_test  = df_enet.loc[test_idx].drop(columns=[target])
y_enet_test  = df_enet.loc[test_idx][target]

# XGBoost
X_xgb_train = df_xgb.loc[train_idx].drop(columns=[target])
y_xgb_train = df_xgb.loc[train_idx][target]
X_xgb_val   = df_xgb.loc[val_idx].drop(columns=[target])
y_xgb_val   = df_xgb.loc[val_idx][target]
X_xgb_test  = df_xgb.loc[test_idx].drop(columns=[target])
y_xgb_test  = df_xgb.loc[test_idx][target]


In [53]:
#Enet
enet = Pipeline([
    ("scaler", StandardScaler()),
    ("model", ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9], cv=5, random_state=42))
])

enet.fit(X_enet_train, y_enet_train)
enet_pred = enet.predict(X_enet_test)

#XGBoost
dtrain = xgb.DMatrix(X_xgb_train, label=y_xgb_train)
dval = xgb.DMatrix(X_xgb_val, label=y_xgb_val)
dtest = xgb.DMatrix(X_xgb_test)

params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.05,
    "max_depth": 4,
    "min_child_weight": 7,
    "gamma": 0.1,
    "reg_lambda": 2.0,
    "reg_alpha": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "eval_metric": "rmse",
    "seed": 42,
}

xgboost = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=2000,
    evals=[(dval, "val")],
    early_stopping_rounds=50,
    verbose_eval=False,
)

xgb_pred = xgboost.predict(dtest)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

#### Overall Model Evaluation

In [54]:
def eval_reg(y_true, y_pred, label):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{label} -> RMSE: {rmse:.4f} | MAE: {mae:.4f} | R2: {r2:.4f}")

# Elastic Net Test Predictions (baseline)
eval_reg(y_enet_test, enet_pred, "ElasticNet (test)")

# XGBoost Test Predictions (primary)
eval_reg(y_xgb_test, xgb_pred, "XGBoost (test)")

# Train predictions
enet_train_pred = enet.predict(X_enet_train)
xgb_train_pred = xgboost.predict(dtrain)

# Train metrics
eval_reg(y_enet_train, enet_train_pred, "ElasticNet (train)")
eval_reg(y_xgb_train, xgb_train_pred, "XGBoost (train)")


ElasticNet (test) -> RMSE: 0.0487 | MAE: 0.0231 | R2: 0.2305
XGBoost (test) -> RMSE: 0.0289 | MAE: 0.0039 | R2: 0.7277
ElasticNet (train) -> RMSE: 0.0502 | MAE: 0.0203 | R2: 0.3144
XGBoost (train) -> RMSE: 0.0203 | MAE: 0.0029 | R2: 0.8876


#### Evaluating model accuracy on MVP candidates

In [55]:
# Non-zero test cases
nz_mask = y_xgb_test != 0

# XGBoost metrics on non-zero only
eval_reg(y_xgb_test[nz_mask], xgb_pred[nz_mask], "XGBoost (test, non-zero)")

# ElasticNet metrics on non-zero only (if you want baseline comparison)
eval_reg(y_enet_test[y_enet_test != 0], enet_pred[y_enet_test != 0], "ElasticNet (test, non-zero)")


XGBoost (test, non-zero) -> RMSE: 0.1864 | MAE: 0.1293 | R2: 0.6152
ElasticNet (test, non-zero) -> RMSE: 0.2556 | MAE: 0.1905 | R2: 0.2764


In [56]:
test_xgb_df = modelling_df.loc[test_idx].copy()
test_xgb_df["pred_award_share"] = xgb_pred

pred_col = "pred_award_share"
true_col = "award_share"
season_col = "season"

def top1_hit(group):
    true_top = group[true_col].idxmax()
    pred_top = group[pred_col].idxmax()
    return 1 if true_top == pred_top else 0

def season_spearman(group):
    # Spearman correlation between predicted and true ranks within a season
    return spearmanr(group[true_col], group[pred_col], nan_policy="omit").correlation

# Top-1 MVP hit rate
top1_by_season = test_xgb_df.groupby(season_col).apply(top1_hit)
top1_hit_rate = top1_by_season.mean()

# Spearman rank correlation by season
spearman_by_season = test_xgb_df.groupby(season_col).apply(season_spearman)
spearman_mean = spearman_by_season.mean()

print("Top-1 MVP hit rate:", top1_hit_rate)
print("Mean Spearman by season:", spearman_mean)


Top-1 MVP hit rate: 0.8333333333333334
Mean Spearman by season: 0.5225665889899006


  top1_by_season = test_xgb_df.groupby(season_col).apply(top1_hit)
  spearman_by_season = test_xgb_df.groupby(season_col).apply(season_spearman)


# Insight Generation

#### Analysis of failed predicted season

In [57]:
# seasons where the top-1 prediction was wrong
wrong_seasons = top1_by_season[top1_by_season == 0].index.tolist()
print("Wrong seasons:", wrong_seasons)

Wrong seasons: [2017]


In [58]:
season = 2017
season_df = test_xgb_df[test_xgb_df["season"] == season].copy()

# True MVP (highest award_share)
true_mvp = season_df.loc[season_df["award_share"].idxmax()]

# Model's top prediction
pred_mvp = season_df.loc[season_df["pred_award_share"].idxmax()]

print("True MVP:", true_mvp.get("player", true_mvp.name), true_mvp["award_share"])
print("Pred MVP:", pred_mvp.get("player", pred_mvp.name), pred_mvp["pred_award_share"])

# Show top 5 predicted vs top 5 true
print("\nTop 5 predicted:")
print(season_df.sort_values("pred_award_share", ascending=False).head(5)[["player", "award_share", "pred_award_share"]])

print("\nTop 5 true:")
print(season_df.sort_values("award_share", ascending=False).head(5)[["player", "award_share", "pred_award_share"]])


True MVP: russell westbrook 0.879
Pred MVP: james harden 0.52548546

Top 5 predicted:
                  player  award_share  pred_award_share
14614       james harden        0.746          0.525485
14702      kawhi leonard        0.495          0.393002
14539      stephen curry        0.051          0.261718
14899  russell westbrook        0.879          0.237582
14560       kevin durant        0.002          0.142388

Top 5 true:
                  player  award_share  pred_award_share
14899  russell westbrook        0.879          0.237582
14614       james harden        0.746          0.525485
14702      kawhi leonard        0.495          0.393002
14662       lebron james        0.330          0.131395
14866      isaiah thomas        0.080          0.122315


#### Quantifying key drivers of vote share

In [59]:
mask = y_xgb_test > 0
X_nz = X_xgb_test.loc[mask]

explainer = shap.TreeExplainer(xgboost)
shap_vals = explainer.shap_values(X_nz)

shap_importance = pd.Series(
    abs(shap_vals).mean(axis=0), index=X_nz.columns
).sort_values(ascending=False)

shap_importance.head(15)


ws_z_season                 0.048026
vorp_rank_season            0.025913
vorp_z_season               0.025253
ws_rank_season              0.025233
win_loss_pct_rank_season    0.021763
per                         0.010279
win_loss_pct_z_season       0.009549
fg_per_g                    0.005906
tov_per_g_z_season          0.005403
dws_z_season                0.005319
fg_per_g_z_season           0.005034
fg2a_per_g                  0.004968
ws                          0.004792
pts_per_g_rank_season       0.004422
win_loss_pct                0.003820
dtype: float32