further clean data for ml
split data for training (smaller sample)
train test split
using bayesian search for parameters
then go to grid search
then train the whole model

#### import packages and data

In [1]:
import pandas as pd
import numpy as np
import os
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB/pitch_value')

In [None]:
# import data
dataset = pd.read_csv('data/datasets/cleaned_pitch_2024.csv')

# Convert boolean columns to int (1/0)
dataset['called_strike'] = dataset['called_strike'].astype(int)
dataset['swinging_strike'] = dataset['swinging_strike'].astype(int) 
dataset['in_strike_zone'] = dataset['in_strike_zone'].astype(int)

# hyperpara trainset
hyper = dataset.sample(30000, random_state=26)

#### feature and target selection

In [None]:
# features
X = hyper.drop(columns=["Unnamed: 0", "player_name", "pitch_name", "description", "launch_angle", "launch_speed", "sz_top", "sz_bot"]).dropna(axis=0)
y = X['estimated_woba']
X = X.drop(columns=['estimated_woba'])

#### Train Test Split | ML import

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
import bayes_opt
from sklearn.metrics import mean_squared_error


In [None]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.2, random_state=26)

### HyperParamater Tuning

In [None]:
def bay_opt(X, y, parameters, n_iter=10, int_points = 20, random_state=26):
    def xgb_evaluate(
        learning_rate,
        max_depth,
        min_child_weight,
        subsample,
        colsample_bytree,
        early_stopping_rounds,
        max_leaves
    ):
        params = {
            'learning_rate': learning_rate,
            'max_depth': int(max_depth),
            'min_child_weight': min_child_weight,
            'subsample': subsample,
            'max_leaves': int(max_leaves),
            'early_stopping_rounds': early_stopping_rounds,
            'colsample_bytree': colsample_bytree,
            'n_estimators': 50000,
            'objective': 'reg:squarederror', 
            'n_jobs': -1,
            'random_state': random_state
        }

        model = xgb.XGBRegressor(**params)
        model.fit(X, y, eval_set=[(test_x, test_y)])
        predictions = model.predict(test_x)
        return -mean_squared_error(test_y, predictions) 

    optimizer = bayes_opt.BayesianOptimization(
        f=xgb_evaluate,
        pbounds=parameters,
        random_state=random_state,
    )

    optimizer.maximize(n_iter=n_iter, init_points=int_points)

    return optimizer.max['params']

Bay Opt for hyperparameter tuning

In [None]:
params = {
    'learning_rate': (0.05, 0.3),
    'max_leaves': (1, 40),
    'min_child_weight': (1, 10),
    'subsample': (0.3, 0.9),
    'colsample_bytree': (0.4, 1),
    'early_stopping_rounds': (10, 60),
    'max_depth': (0, 20)
}

best_params = bay_opt(train_x, train_y, parameters=params, int_points=100, n_iter=250)
print(best_params)

|   iter    |  target   | colsam... | early_... | learni... | max_depth | max_le... | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
[0]	validation_0-rmse:0.24057
[1]	validation_0-rmse:0.22970
[2]	validation_0-rmse:0.22737
[3]	validation_0-rmse:0.22389
[4]	validation_0-rmse:0.22252
[5]	validation_0-rmse:0.22044
[6]	validation_0-rmse:0.21724


[7]	validation_0-rmse:0.21664
[8]	validation_0-rmse:0.21651
[9]	validation_0-rmse:0.21582
[10]	validation_0-rmse:0.21585
[11]	validation_0-rmse:0.21551
[12]	validation_0-rmse:0.21544
[13]	validation_0-rmse:0.21503
[14]	validation_0-rmse:0.21498
[15]	validation_0-rmse:0.21461
[16]	validation_0-rmse:0.21454
[17]	validation_0-rmse:0.21452
[18]	validation_0-rmse:0.21477
[19]	validation_0-rmse:0.21479
[20]	validation_0-rmse:0.21473
[21]	validation_0-rmse:0.21484
[22]	validation_0-rmse:0.21487
[23]	validation_0-rmse:0.21503
[24]	validation_0-rmse:0.21527
[25]	validation_0-rmse:0.21527
[26]	validation_0-rmse:0.21538
[27]	validation_0-rmse:0.21534
[28]	validation_0-rmse:0.21542
[29]	validation_0-rmse:0.21548
[30]	validation_0-rmse:0.21577
[31]	validation_0-rmse:0.21603
[32]	validation_0-rmse:0.21612
[33]	validation_0-rmse:0.21637
[34]	validation_0-rmse:0.21647
[35]	validation_0-rmse:0.21643
[36]	validation_0-rmse:0.21656
[37]	validation_0-rmse:0.21679
[38]	validation_0-rmse:0.21678
[39]	valida

Grid Search to Narrow

In [None]:
def grid_search(X, y, params):
    
    model = xgb.XGBRegressor( n_jobs = 4, n_estimators = 50000)
    grid = GridSearchCV(model, params, cv=3, scoring='neg_mean_squared_error', verbose=1)
    grid.fit(X, y, eval_set=[(test_x, test_y)])
    
    print("Best parameters found:", grid.best_params_)
    print("Best score found:", grid.best_score_)

    best_estimator = grid.best_estimator_
    feature_importances = best_estimator.feature_importances_

    return best_estimator, feature_importances

In [None]:
grid_para = {'colsample_bytree': [0.7, 0.75, 0.8], 'early_stopping_rounds': [20,30, 40], 
             'learning_rate': [0.1, 0.15, 0.2], 'max_depth': [0, 5, 10], 
             'max_leaves': [10, 20, 30], 'min_child_weight': [2, 3, 4], 
             'subsample': [0.8, 0.9, 1]}
grid_search(X=train_x, y = train_y, params=grid_para)

### train on whole dataset

In [None]:
# features
X = dataset.drop(columns=["Unnamed: 0", "player_name", "pitch_name", "description", "launch_angle", "launch_speed", "sz_top", "sz_bot"]).dropna(axis=0)
y = X['estimated_woba']
X = X.drop(columns=['estimated_woba'])

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.1, random_state=26)

In [None]:
from sklearn.model_selection import cross_val_score

# best para found after grid search
opti_para = {}

# model
reg = xgb.XGBRegressor(**opti_para, n_jobs = -1, n_estimators = 50000)
reg.fit(X, y, eval_set=[(test_x, test_y)])

# features / predictions
feature_importances = reg.feature_importances_
predictions = reg.predict(test_x)

# cross validation
cv_scores = cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error')

# print
print("Mean Cross-Validation Score:", cv_scores.mean())
print("Standard Deviation of Cross-Validation Score:", cv_scores.std())
print("Mean Squared Error on Test Set:", -mean_squared_error(test_y, predictions))
print(reg)
print(feature_importances)
