#### import packages and data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [8]:
# import data
dataset = pd.read_csv('/content/drive/MyDrive/colab/cleaned_war_pitch.csv')

# Convert boolean columns to int (1/0)
dataset['called_strike'] = dataset['called_strike'].astype(int)
dataset['swinging_strike'] = dataset['swinging_strike'].astype(int)
dataset['in_strike_zone'] = dataset['in_strike_zone'].astype(int)

# hyperpara trainset
hyper = dataset.sample(300000, random_state=26)

#### feature and target selection

In [22]:
# features
hyper = hyper.dropna(axis=0) # remove nas
X = hyper.drop(columns=["Unnamed: 0", "Name", "pitch_name", "description", "launch_angle", "launch_speed", "sz_top", "sz_bot", "estimated_woba"])
y = hyper['WAR']
X = X.drop(columns=['WAR'])


#### Train Test Split | ML import

In [23]:
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
import bayes_opt
from sklearn.metrics import mean_squared_error

In [24]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.2, random_state=26)

### HyperParamater Tuning

In [25]:
def bay_opt(X, y, parameters, n_iter=10, int_points = 20, random_state=26):
    def xgb_evaluate(
        learning_rate,
        max_depth,
        min_child_weight,
        subsample,
        colsample_bytree,
        early_stopping_rounds,
        max_leaves
    ):
        params = {
            'learning_rate': learning_rate,
            'max_depth': int(max_depth),
            'min_child_weight': min_child_weight,
            'subsample': subsample,
            'max_leaves': int(max_leaves),
            'early_stopping_rounds': early_stopping_rounds,
            'colsample_bytree': colsample_bytree,
            'n_estimators': 50000,
            'objective': 'reg:squarederror',
            'n_jobs': -1,
            'random_state': random_state
        }

        model = xgb.XGBRegressor(**params)
        model.fit(X, y, eval_set=[(test_x, test_y)])
        predictions = model.predict(test_x)
        return -mean_squared_error(test_y, predictions)

    optimizer = bayes_opt.BayesianOptimization(
        f=xgb_evaluate,
        pbounds=parameters,
        random_state=random_state,
    )

    optimizer.maximize(n_iter=n_iter, init_points=int_points)

    return optimizer.max['params']

Bay Opt for hyperparameter tuning

In [None]:
params = {
    'learning_rate': (0.05, 0.3),
    'max_leaves': (1, 40),
    'min_child_weight': (1, 10),
    'subsample': (0.3, 0.9),
    'colsample_bytree': (0.4, 1),
    'early_stopping_rounds': (10, 60),
    'max_depth': (0, 20)
}

best_params = bay_opt(train_x, train_y, parameters=params, int_points=200, n_iter=300)
print(best_params)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[3494]	validation_0-rmse:1.06661
[3495]	validation_0-rmse:1.06656
[3496]	validation_0-rmse:1.06650
[3497]	validation_0-rmse:1.06650
[3498]	validation_0-rmse:1.06652
[3499]	validation_0-rmse:1.06650
[3500]	validation_0-rmse:1.06652
[3501]	validation_0-rmse:1.06651
[3502]	validation_0-rmse:1.06648
[3503]	validation_0-rmse:1.06642
[3504]	validation_0-rmse:1.06638
[3505]	validation_0-rmse:1.06638
[3506]	validation_0-rmse:1.06640
[3507]	validation_0-rmse:1.06641
[3508]	validation_0-rmse:1.06639
[3509]	validation_0-rmse:1.06638
[3510]	validation_0-rmse:1.06636
[3511]	validation_0-rmse:1.06633
[3512]	validation_0-rmse:1.06632
[3513]	validation_0-rmse:1.06631
[3514]	validation_0-rmse:1.06630
[3515]	validation_0-rmse:1.06627
[3516]	validation_0-rmse:1.06627
[3517]	validation_0-rmse:1.06624
[3518]	validation_0-rmse:1.06623
[3519]	validation_0-rmse:1.06622
[3520]	validation_0-rmse:1.06623
[3521]	validation_0-rmse:1.06622
[3522]	vali

Grid Search to Narrow

In [None]:
def grid_search(X, y, params):

    model = xgb.XGBRegressor( n_jobs = -1, n_estimators = 50000)
    grid = GridSearchCV(model, params, cv=3, scoring='neg_mean_squared_error', verbose=1)
    grid.fit(X, y, eval_set=[(test_x, test_y)])

    print("Best parameters found:", grid.best_params_)
    print("Best score found:", grid.best_score_)

    best_estimator = grid.best_estimator_
    feature_importances = best_estimator.feature_importances_

    return best_estimator, feature_importances

found para from grid search, slowly converged

In [None]:
grid_para = {}
grid_search(X=train_x, y = train_y, params=grid_para)