#### Imports

In [19]:
import numpy as np
import pandas as pd

In [20]:
# Read the cleaned dataset, then work on an independent copy so the loaded
# frame stays untouched while feature columns are added.
df_clean = pd.read_parquet(path="../data/cleaned.parquet")
fe_df = df_clean.copy(deep=True)

In [21]:
# Key columns; presumably they identify a single player-season row
# (TODO confirm uniqueness against the cleaned data).
KEY = [
    "player",
    "season",
    "age",
]

In [22]:
# Two-way impact balance.
# Each feature is the offensive component minus the defensive one, so a
# positive value marks an offense-heavy profile and a negative value a
# defense-heavy one.
fe_df["two_way_ws_balance"] = fe_df["ows"].sub(fe_df["dws"])
fe_df["two_way_bpm_balance"] = fe_df["obpm"].sub(fe_df["dbpm"])

In [23]:
# Per-season z-score normalisation: choose which columns get standardised.
# Identifier / target columns that must never be normalised.
exclude_cols = {"player", "season", "age", "team_id", "pos", "award_share"}

# Stats to standardise within each season, listed group by group.
era_sensitive_cols = (
    # Scoring/usage
    ["pts_per_g", "fg_per_g", "fga_per_g"]
    + ["fg3_per_g", "fg3a_per_g", "ft_per_g", "fta_per_g"]
    + ["usg_pct"]
    # Shooting efficiency
    + ["fg_pct", "fg2_pct", "fg3_pct", "efg_pct", "ts_pct"]
    # Playmaking/defense rates
    + ["ast_per_g", "stl_per_g", "blk_per_g", "tov_per_g", "pf_per_g"]
    # Rebounding rates
    + ["orb_per_g", "drb_per_g", "trb_per_g"]
    # Advanced impact
    + ["per", "ows", "dws", "ws", "ws_per_48"]
    + ["obpm", "dbpm", "bpm", "vorp"]
    # Team context
    + ["win_loss_pct", "mov", "mov_adj"]
    # Shot mix
    + ["fg3a_per_fga_pct", "fta_per_fga_pct"]
)

# Safety filter: drop anything that is also listed in exclude_cols.
season_z_cols = [name for name in era_sensitive_cols if name not in exclude_cols]

def zscore(s):
    """Standardise ``s`` to zero mean and (approximately) unit spread.

    The mean of ``s`` is subtracted and the result is divided by the
    population standard deviation (``ddof=0``) plus ``1e-9``. The small
    constant keeps the division defined when every value is identical, in
    which case the output is all zeros.

    Parameters
    ----------
    s : pandas object exposing ``mean()`` and ``std(ddof=...)``
        The values to standardise.

    Returns
    -------
    Same kind of object as ``s``, holding the standardised values.
    """
    centered = s - s.mean()
    spread = s.std(ddof=0) + 1e-9
    return centered / spread

# Add one "<column>_z_season" feature per selected stat, standardised within
# each season. The season grouping is built once and reused for every column.
season_groups = fe_df.groupby("season")
for stat in season_z_cols:
    z_name = f"{stat}_z_season"
    fe_df[z_name] = season_groups[stat].transform(zscore)


In [24]:
# Within-season ranks for a handful of headline stats.
# Rank 1 is the highest value in that season; ties share the average rank.
rank_cols = ["pts_per_g", "bpm", "vorp", "ws", "ts_pct", "win_loss_pct"]

by_season = fe_df.groupby("season")
for stat in rank_cols:
    rank_name = f"{stat}_rank_season"
    fe_df[rank_name] = by_season[stat].rank(method="average", ascending=False)

In [25]:
# Usage efficiency: usage rate scaled by true-shooting percentage
# (element-wise product of the two columns).
fe_df["usage_efficiency"] = fe_df["usg_pct"].mul(fe_df["ts_pct"])

In [26]:
# Contextual features (previous MVP wins)
# Winner = player(s) with the highest award_share in each season. The share
# must also be positive: in a season where the best share is 0 (or missing),
# nobody is flagged, instead of every zero-share player counting as a winner.
max_share = fe_df.groupby("season")["award_share"].transform("max")
fe_df["is_mvp"] = (
    fe_df["award_share"].eq(max_share) & max_share.gt(0)
).astype(int)

# prev_mvp_wins = number of MVP seasons a player had strictly BEFORE the
# season of the current row.
#
# The grouping must be per player. Grouping on player+season(+age) isolates
# every player-season in its own group, so a shifted cumulative sum is 0 on
# every row and the feature carries no information.
#
# Steps:
#   1. Collapse to one win flag per (player, season), so several rows for the
#      same player-season cannot double count a single award.
#   2. Accumulate over each player's seasons. groupby sorts its keys, so the
#      seasons are visited in ascending order regardless of row order.
#      NOTE(review): assumes "season" values sort chronologically - confirm.
#   3. Subtract the current season's own flag, leaving only earlier wins.
# NOTE(review): players are matched on the "player" column only; two distinct
# players sharing a name would be merged - confirm names are unique.
season_wins = fe_df.groupby(["player", "season"])["is_mvp"].max()
wins_before = season_wins.groupby(level="player").cumsum() - season_wins

# Map the per-(player, season) counts back onto the rows by position, so the
# result does not depend on fe_df's index and the cell can be re-run safely.
row_keys = pd.MultiIndex.from_frame(fe_df[["player", "season"]])
fe_df["prev_mvp_wins"] = wins_before.reindex(row_keys).to_numpy()

In [27]:
# Persist the engineered feature table as a parquet artifact.
# The DataFrame index is not written to the file.
features_path = "../data/features.parquet"
fe_df.to_parquet(features_path, index=False)