### Using lightgmb to model wobacon

In [1]:
import pandas as pd
import os
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, PredefinedSplit
from sklearn.metrics import mean_squared_error

In [2]:
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/Coding Projects/woba modeling/data/')
df = pd.read_csv('pitch_cleaned.csv').drop(columns=['Unnamed: 0'])

##### Cleaning for Task

In [3]:
df = df[df['year'] < 2025]

In [4]:
df = df[df['description'] == 'hit_into_play']
df = df[df['attack_direction'].notna()] # mar, but higher substinally higher avg woba on missing. going to adress with a launch angle, launch speed model
df = df[df['launch_speed'].notna()] # mcar 

In [5]:
df['zone'] = df['zone'].astype('category')
df = df.dropna()

In [6]:
X = (df[['zone', 'launch_speed', 'swing_path_tilt', 'attack_angle', 'attack_direction']])
y = df['woba_value']

##### Train Val

no need for a test set as I am purposely holding out 2025 data. I want to test on all 2025 data to compare the predection power of this model to xwobacon

In [7]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=26) 

##### Hyper Parameters Tuning

In [8]:
model = lgb.LGBMRegressor(random_state=26, n_jobs=-1, metric=['mae', 'rmse'])

##### Random Search

In [None]:
rnd_search_params = {
    'boosting_type': ['gbdt'],
    'learning_rate': np.linspace(0.005, 0.2, 10),          
    'num_leaves': np.linspace(2, 100, 10, dtype=int),
    'max_depth': np.linspace(1, 15, 20, dtype=int),  
    'min_data_in_leaf': np.linspace(1, 30, 10, dtype=int),         
    'subsample': np.linspace(0.4, 0.9, 7),               
    'max_bin': np.linspace(200, 600, 10, dtype=int),
    'colsample_bytree': np.linspace(0.6, 1.0, 5),
    'n_estimators': np.linspace(200, 2000, 15, dtype=int),
    'lambda_l2': np.linspace(0.01, 0.5, 15)
}

fit_params = {
    "callbacks": [lgb.early_stopping(stopping_rounds=40, verbose=False)], 
    "eval_set": [(x_val, y_val)],
    "eval_metric": "rmse" 
}

# for early stopping
x_combined = np.concatenate((x_train, x_val), axis=0)
y_combined = np.concatenate((y_train, y_val), axis=0)
split_index = [-1] * len(x_train) + [0] * len(x_val)
pds = PredefinedSplit(test_fold=split_index)

rnd_searcher = RandomizedSearchCV(model, param_distributions=rnd_search_params, cv=pds,
                                n_iter=500, random_state=26, verbose=4, n_jobs=-1 
)

In [120]:
search = rnd_searcher.fit(x_combined, y_combined, **fit_params)
print(search.best_params_)
print(search.best_score_)
print(search.feature_names_in_)

Fitting 1 folds for each of 500 candidates, totalling 500 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 816
[LightGBM] [Info] Number of data points in the train set: 183322, number of used features: 6
[LightGBM] [Info] Start training from score 0.372486
{'subsample': 0.65, 'num_leaves': 12, 'n_estimators': 714, 'min_data_in_leaf': 20, 'max_depth': 5, 'max_bin': 200, 'learning_rate': 0.07, 'lambda_l2': 0.02357142857142857, 'colsample_bytree': 0.9, 'boosting_type': 'gbdt'}
0.23143351520215694
['Column_0' 'Column_1' 'Column_2' 'Column_3' 'Column_4' 'Column_5']


##### Grid Search

In [9]:
grid = {'boosting_type': ['dart', 'gbdt'],
        'subsample': [0.45, 0.5, 0.55], 
        'num_leaves':  [8, 9, 10], 
        'n_estimators': [350, 400, 450], 
        'min_data_in_leaf': [16, 18, 20], 
        'max_depth': [8, 9, 10], 
        'max_bin': [63], 
        'learning_rate': [0.05 ,0.1, 0.15], 
        'lambda_l2': [0.5, 1, 3], 
        'colsample_bytree': [0.75,0.8, 0.85]
        }

# for early stopping
x_combined = np.concatenate((x_train, x_val), axis=0)
y_combined = np.concatenate((y_train, y_val), axis=0)
split_index = [-1] * len(x_train) + [0] * len(x_val)
pds = PredefinedSplit(test_fold=split_index)

fit_params = {
    "callbacks": [lgb.early_stopping(stopping_rounds=40, verbose=False)], 
    "eval_set": [(x_val, y_val)],
    "eval_metric": "rmse" 
}


grid_searcher = GridSearchCV(model, param_grid=grid, cv=pds, verbose=4, n_jobs=-1)

In [None]:
grid_searcher.fit(x_combined, y_combined, **fit_params)
print(grid_searcher.best_params_)
print(grid_searcher.best_score_)
print(grid_searcher.feature_names_in_)

Fitting 1 folds for each of 13122 candidates, totalling 13122 fits


In [163]:
grid_best = {'boosting_type': 'dart','colsample_bytree': 0.8, 'lambda_l2': 0.5, 'learning_rate': 0.1, 'max_bin': 63, 'max_depth': 9, 'min_data_in_leaf': 18, 'n_estimators': 400, 'num_leaves': 9, 'subsample': 0.5, 'early_stopping_rounds': 40}

#### Model Fit

In [159]:
model = model.set_params(**grid_best)
model = model.fit(x_train, y_train, eval_set=[[x_val, y_val]], callbacks=[lgb.early_stopping(stopping_rounds=40, verbose=False)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 266
[LightGBM] [Info] Number of data points in the train set: 160406, number of used features: 5
[LightGBM] [Info] Start training from score 0.372028
Training until validation scores don't improve for 40 rounds


##### Testing on 2025 Data

In [None]:
df_25 = df[df['year'] == 2025]
x_25 = df_25[['zone', 'launch_speed', 'swing_path_tilt', 'attack_angle', 'attack_direction']]
y_25 = df_25['woba_value']

In [None]:
y_pred = model.predict(x_25)
rmse = np.sqrt(mean_squared_error(y_25, y_pred))
results_df = pd.DataFrame({'actual': y_25, 'predicted': y_pred})
results_df = results_df.join(df[['batter', 'year']])
print(f'RMSE: {rmse}')

RMSE: 0.5033683310184024


In [161]:
print(model.feature_importances_)

[258 428 211 341 354]


In [165]:
grouped_results = results_df.groupby(['year', 'batter'])[['actual', 'predicted']].agg(['mean', 'count'])
grouped_results.columns = ['_'.join(col).strip() for col in grouped_results.columns.values]
grouped_results = grouped_results.reset_index()
grouped_rmse = np.sqrt(mean_squared_error(grouped_results['actual_mean'], grouped_results['predicted_mean']))
qualified_results = grouped_results[grouped_results['actual_count'] > 40]
qualified_rmse = np.sqrt(mean_squared_error(qualified_results['actual_mean'], qualified_results['predicted_mean']))
print(f'RMSE for batters with more than 60 plate appearances: {qualified_rmse}')
print(f'Grouped RMSE {grouped_rmse}')

RMSE for batters with more than 60 plate appearances: 0.07396243072837254
Grouped RMSE 0.17253456466657058
