<a href="https://colab.research.google.com/github/grmanjar-cmyk/mlb_wrc_plus_model/blob/main/WRC_ML_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
upload = files.upload()

Saving fangraphs_merged_data_2015_2025.csv to fangraphs_merged_data_2015_2025.csv


In [5]:
import pandas as pd
df = pd.read_csv('fangraphs_merged_data_2015_2025.csv')
df.head(5)

Unnamed: 0,Season,Name,Team,G,PA,HR,R,RBI,SB,BB%,...,EV,EV90,maxEV,LA,Barrels,Barrel%,HardHit,HardHit%,xBA,xSLG
0,2024,Aaron Judge,NYY,158,704,58,122,144,10,0.18892,...,96.230589,,117.5,18.894602,105,0.268542,239,0.611253,0.31037,0.724406
1,2022,Aaron Judge,NYY,157,696,62,133,131,16,0.159483,...,95.777696,,118.4,14.874687,106,0.262376,247,0.611386,0.305257,0.705921
2,2024,Bobby Witt Jr.,KCR,161,709,32,125,109,31,0.080395,...,92.732712,,116.9,15.118959,77,0.143123,260,0.483271,0.31459,0.5763
3,2018,Mookie Betts,BOS,136,614,32,129,80,30,0.131922,...,92.27837,,110.6,18.483721,61,0.140553,218,0.502304,0.309364,0.607269
4,2025,Aaron Judge,NYY,152,679,53,137,114,12,0.182622,...,95.421966,,118.1,19.062016,96,0.247423,226,0.582474,0.315281,0.735174


In [6]:
import pandas as pd
import numpy as np

# 1. Define the Target and Features
TARGET_COL = 'wRC+'

In [7]:
FEATURE_COLS = [
    'BB%', 'K%', 'ISO','Hard%', 'Barrel%', 'HardHit%', 'GB/FB', 'LD%', 'IFH%', 'HR/FB', 'Pull%', 'Cent%', 'Oppo%', 'GB%', 'FB%',
    'xBA', 'xSLG', 'EV', 'maxEV', 'LA', 'Barrel%', 'HardHit%']

In [9]:
# [8] Step 1: Data Preparation and Cleaning

# 1. Use a copy to avoid a 'SettingWithCopyWarning' later on
df_cleaned_temp = df.copy()


In [10]:
# Define the list of all columns that need to be clean and numeric
all_relevant_cols = [TARGET_COL] + FEATURE_COLS

# 2. Clean and Convert Data Types
# Convert all feature and target columns to numeric, coercing errors to NaN
for col in all_relevant_cols:
    # pd.to_numeric with errors='coerce' turns any non-numeric value into NaN
    df_cleaned_temp[col] = pd.to_numeric(df_cleaned_temp[col], errors='coerce')

In [12]:
# 3. Handle Missing Values (NaNs)
# Drop any row where the target or a key feature is missing.
df_clean = df_cleaned_temp.dropna(subset=all_relevant_cols)

print(f"Original Data size: {len(df)}")
print(f"Data size after NaN removal: {len(df_clean)}")

Original Data size: 1520
Data size after NaN removal: 1520


In [13]:
# [9] Step 2: Time-Based Train/Validation Split

# Define the split year (Train <= 2023, Validate > 2023)
SPLIT_YEAR = 2023

In [14]:
# Create the training set (2015 - 2023)
train_mask = df_clean['Season'] <= SPLIT_YEAR
X_train = df_clean[train_mask][FEATURE_COLS]
y_train = df_clean[train_mask][TARGET_COL]

In [15]:
# Create the validation set (2024 - 2025)
val_mask = df_clean['Season'] > SPLIT_YEAR
X_val = df_clean[val_mask][FEATURE_COLS]
y_val = df_clean[val_mask][TARGET_COL]

In [16]:
print("--- Train/Validation Split ---")
print(f"Training Player-Seasons (2015-{SPLIT_YEAR}): {len(X_train)}")
print(f"Validation Player-Seasons ({SPLIT_YEAR+1}-2025): {len(X_val)}")

--- Train/Validation Split ---
Training Player-Seasons (2015-2023): 1246
Validation Player-Seasons (2024-2025): 274


In [20]:
# [10] Step 3: Model Training (Random Forest)
from sklearn.ensemble import RandomForestRegressor

# 1. Initialize the Random Forest Regressor
# n_jobs=-1 uses all CPU cores for faster training
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42, # for reproducibility
    n_jobs=-1,
    max_depth=15,    # Limit depth to prevent severe overfitting
    min_samples_leaf=5 # Minimum samples per leaf node
)
print("--- Training Model ---")
# 2. Train the model
rf_model.fit(X_train, y_train)
print("Training complete.")

--- Training Model ---
Training complete.


In [22]:
# [11] Step 4: Model Evaluation and Interpretation
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 1. Make predictions on the validation set
y_pred = rf_model.predict(X_val)

In [24]:
# 2. Calculate Evaluation Metrics
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print("--- Model Evaluation (Validation Set) ---")
print(f"Mean Absolute Error (MAE): {mae:.2f} wRC+ points")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} wRC+ points")

--- Model Evaluation (Validation Set) ---
Mean Absolute Error (MAE): 8.52 wRC+ points
Root Mean Squared Error (RMSE): 10.88 wRC+ points


In [25]:
# 3. Interpret Feature Importance (Key benefit of Random Forest)
feature_importances = pd.Series(
    rf_model.feature_importances_,
    index=FEATURE_COLS
).sort_values(ascending=False)

print("\n--- Top 5 Feature Importances ---")
print(feature_importances.head(5))


--- Top 5 Feature Importances ---
xSLG    0.428895
ISO     0.241725
xBA     0.114602
BB%     0.072672
K%      0.022495
dtype: float64


In [26]:
# Optional: Add the prediction to the validation dataframe for analysis
df_val_results = df_clean[val_mask][['Season', 'Name', 'Team', TARGET_COL]].copy()
df_val_results['wRC+_Predicted'] = y_pred
df_val_results['Error'] = np.abs(df_val_results[TARGET_COL] - df_val_results['wRC+_Predicted'])
print("\nTop 5 Largest Prediction Errors (Absolute Difference)")
print(df_val_results.sort_values('Error', ascending=False).head(5))


Top 5 Largest Prediction Errors (Absolute Difference)
      Season               Name Team        wRC+  wRC+_Predicted      Error
0       2024        Aaron Judge  NYY  219.783572      181.371007  38.412565
1360    2025      Brenton Doyle  COL   65.190356       96.263480  31.073124
1345    2025     Salvador Perez  KCR   95.448119      123.999392  28.551273
181     2024       Brent Rooker  OAK  163.631416      135.097635  28.533781
1138    2025  Michael Harris II  ATL   82.851702      108.902131  26.050428


In [30]:
# 1. Calculate the mean of the target variable (wRC+) across ALL clean data
# We use all clean data to represent the overall population mean.
mean_wrc_plus = df_clean[TARGET_COL].mean()

# 2. Retrieve the MAE from the previous evaluation step (Cell 24 output)
# Note: Here can also hardcode the value from output example: (8.52) for demonstration,
# but can also use the 'mae' variable from cell [24].
mae_value = mae

# 3. Calculate the percentage of the error relative to the mean
error_percent = (mae_value / mean_wrc_plus) * 100

print(f"--- Error Context ---")
print(f"Mean wRC+ in the full, cleaned dataset: {mean_wrc_plus:.2f}")
print(f"Mean Absolute Error (MAE): {mae_value:.2f} wRC+ points")
print("-" * 30)
print(f"Your model's average prediction error ({mae_value:.2f})")
print(f"is approximately {error_percent:.1f}% of the average hitter's wRC+.")

--- Error Context ---
Mean wRC+ in the full, cleaned dataset: 112.66
Mean Absolute Error (MAE): 8.52 wRC+ points
------------------------------
Your model's average prediction error (8.52)
is approximately 7.6% of the average hitter's wRC+.
