### training of model based on optimized hyperparamters

In [8]:
# import packages and dataset
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import os

os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB/pitch_value')
dataset = pd.read_csv('./data/datasets/cleaned_war_pitch.csv')

In [10]:
# features
X = dataset.drop(columns=["Unnamed: 0", "Name", "pitch_name", "description", "launch_angle", "launch_speed", "sz_top", "sz_bot", "WAR"]).dropna(axis=0)
y = X['estimated_woba']
X = X.drop(columns=['estimated_woba'])

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.1, random_state=26)

In [None]:
# best para found after grid search
opti_para = {'colsample_bytree': 0.9,
             'learning_rate': 0.2, 'max_depth': 5,
             'max_leaves': 39, 'min_child_weight': 6,
             'subsample': 1}
# model
reg = xgb.XGBRegressor(**opti_para, n_jobs=-1, n_estimators=50000, early_stopping_rounds=3)
reg.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=False)

# features / predictions
feature_importances = reg.feature_importances_
predictions = reg.predict(test_x)
print(predictions)

r2 = r2_score(test_y, predictions)
print("R-squared on Test Set:", r2)

# results and feature importances
print("Mean Squared Error on Test Set:", mean_squared_error(test_y, predictions))
print(reg)
hasattr(train_x, 'columns')
feature_names = train_x.columns
for feature, importance in zip(feature_names, feature_importances):
    print(f"Feature: {feature}, Importance: {importance}")


[-0.10586551  0.17400624  0.15825532 ...  0.17718679 -0.11977647
  0.11714765]
R-squared on Test Set: 0.2210806586877433
Mean Squared Error on Test Set: 0.045955857648654605
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.9, device=None, early_stopping_rounds=3,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.2, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=39,
             min_child_weight=6, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=50000, n_jobs=-1,
             num_parallel_tree=None, random_state=None, ...)
Feature: called_strike, Importance: 0.452169269323349
Feature: swinging_strike, Importance: 0