further clean data for ml
split data for training (smaller sample)
train test split
using bayesian search for parameters
then go to grid search
then train the whole model

#### import packages and data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import numpy as np

In [5]:
# import data
dataset = pd.read_csv('/content/drive/MyDrive/colab/cleaned_pitch_2024.csv')

# Convert boolean columns to int (1/0)
dataset['called_strike'] = dataset['called_strike'].astype(int)
dataset['swinging_strike'] = dataset['swinging_strike'].astype(int)
dataset['in_strike_zone'] = dataset['in_strike_zone'].astype(int)

# hyperpara trainset
hyper = dataset.sample(30000, random_state=26)

#### feature and target selection

In [6]:
# features
X = hyper.drop(columns=["Unnamed: 0", "player_name", "pitch_name", "description", "launch_angle", "launch_speed", "sz_top", "sz_bot"]).dropna(axis=0)
y = X['estimated_woba']
X = X.drop(columns=['estimated_woba'])

#### Train Test Split | ML import

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
import bayes_opt
from sklearn.metrics import mean_squared_error


In [2]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.2, random_state=26)

NameError: name 'X' is not defined

### HyperParamater Tuning

In [8]:
def bay_opt(X, y, parameters, n_iter=10, int_points = 20, random_state=26):
    def xgb_evaluate(
        learning_rate,
        max_depth,
        min_child_weight,
        subsample,
        colsample_bytree,
        early_stopping_rounds,
        max_leaves
    ):
        params = {
            'learning_rate': learning_rate,
            'max_depth': int(max_depth),
            'min_child_weight': min_child_weight,
            'subsample': subsample,
            'max_leaves': int(max_leaves),
            'early_stopping_rounds': early_stopping_rounds,
            'colsample_bytree': colsample_bytree,
            'n_estimators': 50000,
            'objective': 'reg:squarederror',
            'n_jobs': -1,
            'random_state': random_state
        }

        model = xgb.XGBRegressor(**params)
        model.fit(X, y, eval_set=[(test_x, test_y)])
        predictions = model.predict(test_x)
        return -mean_squared_error(test_y, predictions)

    optimizer = bayes_opt.BayesianOptimization(
        f=xgb_evaluate,
        pbounds=parameters,
        random_state=random_state,
    )

    optimizer.maximize(n_iter=n_iter, init_points=int_points)

    return optimizer.max['params']

Bay Opt for hyperparameter tuning

In [10]:
params = {
    'learning_rate': (0.05, 0.3),
    'max_leaves': (1, 40),
    'min_child_weight': (1, 10),
    'subsample': (0.3, 0.9),
    'colsample_bytree': (0.4, 1),
    'early_stopping_rounds': (10, 60),
    'max_depth': (0, 20)
}

best_params = bay_opt(train_x, train_y, parameters=params, int_points=200, n_iter=300)
print(best_params)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[52]	validation_0-rmse:0.24221
[53]	validation_0-rmse:0.24220
[54]	validation_0-rmse:0.24220
[55]	validation_0-rmse:0.24221
[56]	validation_0-rmse:0.24220
| [39m445      [39m | [39m-0.05866 [39m | [39m0.5388   [39m | [39m25.71    [39m | [39m0.2605   [39m | [39m11.27    [39m | [39m1.027    [39m | [39m9.164    [39m | [39m0.495    [39m |
[0]	validation_0-rmse:0.24149
[1]	validation_0-rmse:0.24086
[2]	validation_0-rmse:0.23725
[3]	validation_0-rmse:0.23452
[4]	validation_0-rmse:0.23283
[5]	validation_0-rmse:0.23245
[6]	validation_0-rmse:0.23120
[7]	validation_0-rmse:0.22825
[8]	validation_0-rmse:0.22806
[9]	validation_0-rmse:0.22798
[10]	validation_0-rmse:0.22786
[11]	validation_0-rmse:0.22772
[12]	validation_0-rmse:0.22669
[13]	validation_0-rmse:0.22656
[14]	validation_0-rmse:0.22443
[15]	validation_0-rmse:0.22438
[16]	validation_0-rmse:0.22432
[17]	validation_0-rmse:0.22354
[18]	validation_0-rmse:0.22300
[

Grid Search to Narrow

In [11]:
def grid_search(X, y, params):

    model = xgb.XGBRegressor( n_jobs = -1, n_estimators = 50000)
    grid = GridSearchCV(model, params, cv=3, scoring='neg_mean_squared_error', verbose=1)
    grid.fit(X, y, eval_set=[(test_x, test_y)])

    print("Best parameters found:", grid.best_params_)
    print("Best score found:", grid.best_score_)

    best_estimator = grid.best_estimator_
    feature_importances = best_estimator.feature_importances_

    return best_estimator, feature_importances

Best parameters found: {'colsample_bytree': 0.9, 'early_stopping_rounds': 30, 'learning_rate': 0.2, 'max_depth': 5, 'max_leaves': 30, 'min_child_weight': 6, 'subsample': 1}

{'colsample_bytree': 0.9, 'early_stopping_rounds': 10, 'learning_rate': 0.2, 'max_depth': 5, 'max_leaves': 40, 'min_child_weight': 6, 'subsample': 1}

{'colsample_bytree': 0.9, 'early_stopping_rounds': 5, 'learning_rate': 0.2, 'max_depth': 5, 'max_leaves': 38, 'min_child_weight': 6, 'subsample': 1}

{'colsample_bytree': 0.9, 'early_stopping_rounds': 3, 'learning_rate': 0.2, 'max_depth': 5, 'max_leaves': 39, 'min_child_weight': 6, 'subsample': 1}

In [15]:
grid_para = {'colsample_bytree': [0.9], 'early_stopping_rounds': [10],
             'learning_rate': [0.2], 'max_depth': [5],
             'max_leaves': [39], 'min_child_weight': [6],
             'subsample': [1]}
grid_search(X=train_x, y = train_y, params=grid_para)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[0]	validation_0-rmse:0.23263
[1]	validation_0-rmse:0.22633
[2]	validation_0-rmse:0.22217
[3]	validation_0-rmse:0.21944
[4]	validation_0-rmse:0.21761
[5]	validation_0-rmse:0.21633
[6]	validation_0-rmse:0.21549
[7]	validation_0-rmse:0.21487
[8]	validation_0-rmse:0.21461
[9]	validation_0-rmse:0.21437
[10]	validation_0-rmse:0.21417
[11]	validation_0-rmse:0.21397
[12]	validation_0-rmse:0.21394
[13]	validation_0-rmse:0.21382
[14]	validation_0-rmse:0.21381
[15]	validation_0-rmse:0.21366
[16]	validation_0-rmse:0.21352
[17]	validation_0-rmse:0.21360
[18]	validation_0-rmse:0.21359
[19]	validation_0-rmse:0.21374
[0]	validation_0-rmse:0.23240
[1]	validation_0-rmse:0.22597
[2]	validation_0-rmse:0.22173
[3]	validation_0-rmse:0.21897
[4]	validation_0-rmse:0.21714
[5]	validation_0-rmse:0.21588
[6]	validation_0-rmse:0.21513
[7]	validation_0-rmse:0.21459
[8]	validation_0-rmse:0.21421
[9]	validation_0-rmse:0.21400
[10]	validation_0-rmse:0.21380

(XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, device=None, early_stopping_rounds=3,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=39,
              min_child_weight=6, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=50000, n_jobs=4,
              num_parallel_tree=None, random_state=None, ...),
 array([0.46724677, 0.42210335, 0.00786318, 0.00763952, 0.00903418,
        0.00510905, 0.00634181, 0.00588118, 0.00653447, 0.00524724,
        0.00392789, 0.00549478, 0.00708443, 0.00678737, 0.00926275,
        0.00363444, 0.00299965, 0.01780787], dtype=f

### train on whole dataset

In [18]:
# features
X = dataset.drop(columns=["Unnamed: 0", "player_name", "pitch_name", "description", "launch_angle", "launch_speed", "sz_top", "sz_bot"]).dropna(axis=0)
y = X['estimated_woba']
X = X.drop(columns=['estimated_woba'])

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.1, random_state=26)

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# best para found after grid search
opti_para = {'colsample_bytree': 0.9,
             'learning_rate': 0.2, 'max_depth': 5,
             'max_leaves': 39, 'min_child_weight': 6,
             'subsample': 1}
# model
reg = xgb.XGBRegressor(**opti_para, n_jobs=-1, n_estimators=50000, early_stopping_rounds=3)
reg.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=False)

# features / predictions
feature_importances = reg.feature_importances_
predictions = reg.predict(test_x)

# results and feature importances
print("Mean Squared Error on Test Set:", mean_squared_error(test_y, predictions))
print(reg)
hasattr(train_x, 'columns')
feature_names = train_x.columns
for feature, importance in zip(feature_names, feature_importances):
    print(f"Feature: {feature}, Importance: {importance}")




Mean Squared Error on Test Set: 0.04593725520252476
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.9, device=None, early_stopping_rounds=3,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.2, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=39,
             min_child_weight=6, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=50000, n_jobs=-1,
             num_parallel_tree=None, random_state=None, ...)
Feature: called_strike, Importance: 0.37420138716697693
Feature: swinging_strike, Importance: 0.5960631966590881
Feature: pitch_velo_adj, Importance: 0.0010297887492924929
Feature: horz_position_of_pitch, Importance