In this notebook I’ll build and compare three different regression
 models to predict next-season salary ("2024-25") for NBA players.
 I’ll use:
   1) Linear Regression
   2) Random Forest Regressor (with GridSearchCV)
   3) XGBoost Regressor (with basic hyperparam tuning)
and evaluate them using RMSE on a held-out test set.

In [1]:
# 1) Imports and random seed

!pip install xgboost
from xgboost import XGBRegressor
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model    import LinearRegression
from sklearn.ensemble        import RandomForestRegressor
from catboost                import CatBoostRegressor
from sklearn.metrics         import mean_squared_error

from sklearn.pipeline        import Pipeline
from sklearn.compose         import ColumnTransformer
from sklearn.impute          import SimpleImputer
from sklearn.preprocessing   import StandardScaler, OneHotEncoder

# set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)



In [2]:
# 2) Load & merge data
# --------------------
adv = pd.read_csv("advanced_player_stats_checked.csv")
raw = pd.read_csv("nba_player_stats_checked.csv")
sal = pd.read_csv("nba_salary_checked.csv")

# 2a) Clean salary strings so they become floats
sal["2024-25"] = (
    sal["2024-25"]
      .str.replace(r"[\$,]", "", regex=True)
      .str.strip()
      .astype(float)
)

# now merge on “Player”
df = (
    adv
      .merge(raw, on="Player", how="inner", suffixes=("_adv","_raw"))
      .merge(sal, on="Player", how="inner")
)

print("Merged DF shape:", df.shape)
df.head()


Merged DF shape: (811, 68)


Unnamed: 0,Rk_adv,Player,Age_adv,Team_adv,Pos_adv,G_adv,GS_adv,MP_adv,PER,TS%,...,Awards_raw,Rk,Tm,2024-25,2025-26,2026-27,2027-28,2028-29,2029-30,Guaranteed
0,1,Mikal Bridges,28,NYK,SF,82,82,3036,14.0,0.585,...,,71,NYK,23300000.0,"$24,900,000",,,,,"$48,200,000"
1,2,Josh Hart,29,NYK,SG,77,77,2897,16.5,0.611,...,,95,NYK,18144000.0,"$19,472,240","$20,923,760","$22,375,280",,,"$58,540,000"
2,3,Anthony Edwards,23,MIN,SG,79,79,2871,20.1,0.595,...,"MVP-7,CPOY-8,AS,NBA2",21,MIN,42176400.0,"$45,550,512","$48,924,624","$52,298,736","$55,672,848",,"$244,623,120"
3,4,Devin Booker,28,PHO,SG,75,75,2795,19.3,0.589,...,"AS,NBA3",7,PHO,49205800.0,"$53,142,264","$57,078,728","$61,015,192",,,"$220,441,984"
4,5,James Harden,35,LAC,PG,79,79,2789,20.0,0.582,...,,44,LAC,33653846.0,"$36,346,154",,,,,"$33,653,846"


In [3]:
# 3) Define features (X) and target (y)
y = df["2024-25"]  # what I want to predict

# drop identifier & future‐salary cols
drop_cols = [
    "Player", "Rk_adv","Rk_raw","Rk","Tm",
    "2025-26","2026-27","2027-28",
    "2028-29","2029-30","Guaranteed"
]
X = df.drop(columns=drop_cols + ["2024-25"])
print("X shape:", X.shape)
X.head()

X shape: (811, 56)


Unnamed: 0,Age_adv,Team_adv,Pos_adv,G_adv,GS_adv,MP_adv,PER,TS%,3PAr,FTr,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards_raw
0,28,NYK,SF,82,82,3036,14.0,0.585,0.391,0.1,...,0.9,3.8,4.7,3.8,1.0,0.4,2.1,1.5,20.3,
1,29,NYK,SG,77,77,2897,16.5,0.611,0.327,0.266,...,1.7,7.3,9.0,4.4,1.0,0.3,1.6,2.3,10.1,
2,23,MIN,SG,79,79,2871,20.1,0.595,0.503,0.308,...,0.7,4.9,5.6,5.3,1.3,0.5,3.1,1.8,26.6,"MVP-7,CPOY-8,AS,NBA2"
3,28,PHO,SG,75,75,2795,19.3,0.589,0.388,0.34,...,0.8,3.7,4.5,6.9,0.9,0.4,2.6,3.0,27.1,"AS,NBA3"
4,35,LAC,PG,79,79,2789,20.0,0.582,0.516,0.446,...,0.5,4.9,5.4,8.9,1.1,0.8,2.7,1.9,17.4,


In [4]:
# 4) Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)
print("Train:", X_train.shape)
print(" Test:", X_test.shape)

Train: (608, 56)
 Test: (203, 56)


In [5]:
# 5) Identify numeric vs categorical
numeric_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
print("Numeric:", numeric_cols)
print("Categorical:", categorical_cols)

Numeric: ['Age_adv', 'G_adv', 'GS_adv', 'MP_adv', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Awards_adv', 'Age_raw', 'G_raw', 'GS_raw', 'MP_raw', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
Categorical: ['Team_adv', 'Pos_adv', 'Team_raw', 'Pos_raw', 'Awards_raw']


In [6]:
# 6) Build preprocessing pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# numeric: median impute → standardize
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler())
])

# categorical: one-hot encode, unknown→all zeros
cat_pipe = Pipeline([
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# combine them
preprocessor = ColumnTransformer([
    ("num", num_pipe, numeric_cols),
    ("cat", cat_pipe, categorical_cols)
])

In [7]:
# 7) Linear Regression baseline
lin_pipe = Pipeline([
    ("prep", preprocessor),
    ("model", LinearRegression())
])
lin_pipe.fit(X_train, y_train)
pred_lin = lin_pipe.predict(X_test)
rmse_lin = np.sqrt(mean_squared_error(y_test, pred_lin))
print(f"Linear Regression RMSE: ${rmse_lin:,.0f}")


Linear Regression RMSE: $20,109,315,997,133,996




In [8]:
# 8) Random Forest with GridSearchCV
rf_pipe = Pipeline([
    ("prep", preprocessor),
    ("model", RandomForestRegressor(random_state=RANDOM_STATE))
])
rf_params = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth":    [5, 10, None]
}
grid_rf = GridSearchCV(
    rf_pipe, rf_params,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
print("RF best params:", grid_rf.best_params_)
pred_rf = best_rf.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
print(f"Random Forest RMSE: ${rmse_rf:,.0f}")



RF best params: {'model__max_depth': None, 'model__n_estimators': 200}
Random Forest RMSE: $5,395,378




In [9]:
# 9) XGBoost with GridSearchCV
xgb_pipe = Pipeline([
    ("prep", preprocessor),
    ("model", XGBRegressor(random_state=RANDOM_STATE, verbosity=0))
])
xgb_params = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth":    [3, 6, 10],
    "model__learning_rate": [0.01, 0.1, 0.2]
}
grid_xgb = GridSearchCV(
    xgb_pipe, xgb_params,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)
grid_xgb.fit(X_train, y_train)
best_xgb = grid_xgb.best_estimator_
print("XGB best params:", grid_xgb.best_params_)
pred_xgb = best_xgb.predict(X_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test, pred_xgb))
print(f"XGBoost RMSE: ${rmse_xgb:,.0f}")



XGB best params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}
XGBoost RMSE: $5,109,531




In [10]:
# 10) Compare all three
results = pd.DataFrame({
    "Model":    ["LinearRegression", "RandomForest", "XGBoost"],
    "Test RMSE": [rmse_lin, rmse_rf, rmse_xgb]
})
print("\nModel comparison:")
print(results)


Model comparison:
              Model     Test RMSE
0  LinearRegression  2.010932e+16
1      RandomForest  5.395378e+06
2           XGBoost  5.109531e+06


# 11) My conclusion  
Based on the lowest RMSE (≈ \$5.1 million), I will select **XGBoost** as my final model.  

## 11) Save full model metrics to CSV


In [11]:
# 11) Save full model metrics to report/model_metrics.csv
from sklearn.metrics    import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd, os

# If you already have pred_lin, pred_rf, pred_xgb and y_test:
metrics = [
    {
      "model": "Linear Regression",
      "RMSE":  mean_squared_error(y_test, pred_lin, squared=False),
      "MAE":   mean_absolute_error(y_test, pred_lin),
      "R2":    r2_score(y_test, pred_lin)
    },
    {
      "model": "Random Forest",
      "RMSE":  mean_squared_error(y_test, pred_rf, squared=False),
      "MAE":   mean_absolute_error(y_test, pred_rf),
      "R2":    r2_score(y_test, pred_rf)
    },
    {
      "model": "XGBoost",
      "RMSE":  mean_squared_error(y_test, pred_xgb, squared=False),
      "MAE":   mean_absolute_error(y_test, pred_xgb),
      "R2":    r2_score(y_test, pred_xgb)
    }
]

df_metrics = pd.DataFrame(metrics)

# Make the report folder if it doesn't exist
os.makedirs('report', exist_ok=True)

# Write out the CSV
df_metrics.to_csv('report/model_metrics.csv', index=False)

# Display to confirm
df_metrics




Unnamed: 0,model,RMSE,MAE,R2
0,Linear Regression,2.010932e+16,3992034000000000.0,-3.497084e+18
1,Random Forest,5395378.0,3149207.0,0.7482582
2,XGBoost,5109531.0,3014224.0,0.7742262
