# Capstone Two: Pre‐processing & Training Data Development


In [50]:
# 0) imports

import pandas as pd
import numpy as np

from sklearn.impute           import SimpleImputer
from sklearn.pipeline         import Pipeline
from sklearn.preprocessing    import StandardScaler, OneHotEncoder
from sklearn.compose          import ColumnTransformer
from sklearn.model_selection  import train_test_split


In [51]:
# 1) load the three CSVs
# Read each source table from the working directory in one unpacking step.
adv, raw, sal = (
    pd.read_csv(csv_path)
    for csv_path in (
        "advanced_player_stats_checked.csv",  # advanced metrics
        "nba_player_stats_checked.csv",       # basic stats
        "nba_salary_checked.csv",             # salaries
    )
)




In [52]:
# 2) fix salary table so it matches the others
# The salary file labels the team column "Tm"; rename it to "Team" so the
# merge keys line up, keep only the join keys plus the 2024-25 column, and
# expose that column as "Salary".
sal = (
    sal
    .rename(columns={"Tm": "Team"})
    .loc[:, ["Player", "Team", "2024-25"]]
    .rename(columns={"2024-25": "Salary"})
)

In [53]:
# 3) merge advanced + raw on Player & Team, then add salary
# Both joins are inner joins, so only (Player, Team) pairs present in all
# three tables survive.
# NOTE(review): any non-key column that exists in both `adv` and `raw` comes
# out twice with pandas' default _x/_y suffixes (e.g. Rk_x / Rk_y) -- confirm
# that keeping both copies as features is intended.
df = pd.merge(
    pd.merge(adv, raw, how="inner", on=["Player", "Team"]),
    sal,
    how="inner",
    on=["Player", "Team"],
)
print("after merge, df.shape =", df.shape)

after merge, df.shape = (270, 59)


In [54]:
# 4) drop any columns that are entirely empty (like Awards_x)
# how="all" removes a column only when every one of its values is missing;
# partially filled columns are kept for the imputers downstream.
df = df.dropna(how="all", axis="columns")

In [55]:
# 5) split into features X and target y
# Target: the Salary column. Features: every remaining column except the
# Player column and the target itself.
y = df["Salary"]
X = df.drop(["Player", "Salary"], axis="columns")

In [56]:
# 6) train/test split (25% holdout)
# random_state is pinned so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)
print("X_train.shape:", X_train.shape, "X_test.shape:", X_test.shape)

X_train.shape: (202, 56) X_test.shape: (68, 56)


In [57]:
# 7) identify numeric vs. categorical columns
# Anything with a NumPy numeric dtype is "numeric"; every other column is
# treated as categorical. The two lists together cover all columns of X.
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
categorical_cols = list(X.select_dtypes(exclude=[np.number]).columns)
print("numeric_cols:", numeric_cols)
print("categorical_cols:", categorical_cols)

numeric_cols: ['Rk_x', 'Age_x', 'G_x', 'GS_x', 'MP_x', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Rk_y', 'Age_y', 'G_y', 'GS_y', 'MP_y', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
categorical_cols: ['Team', 'Pos_x', 'Pos_y', 'Awards_y']


In [58]:
# 8) build pipelines
# Numeric branch: replace NaNs with the column median, then standardize.
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: one-hot encode into a dense array, dropping the first
# level of each feature to avoid the dummy-variable trap.
# NOTE(review): with drop="first" and handle_unknown="ignore", a category not
# seen during fit is encoded as all zeros -- the same encoding as the dropped
# first level, so the two cannot be told apart downstream. Confirm that this
# is acceptable for high-cardinality columns.
cat_pipe = Pipeline(
    steps=[
        (
            "ohe",
            OneHotEncoder(
                drop="first",
                handle_unknown="ignore",
                sparse_output=False,
            ),
        ),
    ]
)

In [59]:
# 9) combine into a ColumnTransformer
# Route each column list through its own pipeline; the outputs are
# concatenated in the order listed (numeric block first, then one-hot block).
preprocessor = ColumnTransformer(
    transformers=[
        ("nums", num_pipe, numeric_cols),
        ("cats", cat_pipe, categorical_cols),
    ]
)


In [60]:
# 10) fit on TRAIN only & transform both
# Medians, scaling statistics and one-hot categories are learned from the
# training rows only; the held-out rows are transformed with those fitted
# parameters so nothing from the test set leaks into preprocessing.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

print("processed shapes:", X_train_prep.shape, X_test_prep.shape)

processed shapes: (202, 116) (68, 116)


