### Model

In [5]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import os 
import pandas as pd

os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB/pitch_value')
dataset = pd.read_csv('./data/datasets/yearly/yearly_player_average.csv')

In [7]:
# features
X = dataset.drop(columns=[ "Name", "launch_angle", "launch_speed", "sz_top", "sz_bot","estimated_woba"]).dropna(axis=0)
y = X['WAR']
X = X.drop(columns=['WAR'])

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.1, random_state=26)

In [8]:
# best para found after grid search
opti_para = {'colsample_bytree': 0.9,
             'learning_rate': 0.05, 'max_depth': 40,
             'max_leaves': 39, 'min_child_weight': 6,
             'subsample': 1}
# model
reg = xgb.XGBRegressor(**opti_para, n_jobs=-1, n_estimators=50000, early_stopping_rounds=3)
reg.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=False)

# features / predictions
feature_importances = reg.feature_importances_
predictions = reg.predict(test_x)



# Attach predictions to true values and player names
results_df = test_x.copy()
results_df['True_WAR'] = test_y.values
results_df['Predicted_WAR'] = predictions
results_df['Name'] = dataset.loc[test_x.index, 'Name'].values

print(results_df[['Name', 'True_WAR', 'Predicted_WAR']])

r2 = r2_score(test_y, predictions)
print("R-squared on Test Set:", r2)

# results and feature importances
print("Mean Squared Error on Test Set:", mean_squared_error(test_y, predictions))
print(reg)
hasattr(train_x, 'columns')
feature_names = train_x.columns
# Create a dataframe for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

print(feature_importance_df)

                  Name  True_WAR  Predicted_WAR
681   toussaint, touki -0.146697      -0.017541
441    mcarthur, james  0.102310       0.778534
248     gordon, tanner -0.243583      -0.120517
502     nittoli, vinny  0.078904       0.029006
69     blackburn, paul  0.449186       0.146967
..                 ...       ...            ...
94       brown, hunter  3.128726       2.283175
757  yastrzemski, mike -0.045487      -0.007902
595  sandoval, patrick  1.246824       0.738500
417       maeda, kenta -0.048807       0.775979
167        darvish, yu  1.071558       0.857380

[77 rows x 3 columns]
R-squared on Test Set: 0.7322505554958834
Mean Squared Error on Test Set: 0.36904851840223113
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.9, device=None, early_stopping_rounds=3,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=