### Model

In [7]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import os 
import pandas as pd

os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB/pitch_value')
dataset = pd.read_csv('./data/datasets/cleaned_war_pitch.csv')

In [10]:
# features
X = dataset.drop(columns=["Unnamed: 0", "Unnamed: 0.1", "Name", "pitch_name", "description", "launch_angle", "launch_speed", "sz_top", "sz_bot","estimated_woba"]).dropna(axis=0)
y = X['WAR']
X = X.drop(columns=['WAR'])

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=.1, random_state=26)

In [11]:
# best para found after grid search
opti_para = {'colsample_bytree': 0.9,
             'learning_rate': 0.2, 'max_depth': 20,
             'max_leaves': 39, 'min_child_weight': 6,
             'subsample': 1}
# model
reg = xgb.XGBRegressor(**opti_para, n_jobs=-1, n_estimators=50000, early_stopping_rounds=3)
reg.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=False)

# features / predictions
feature_importances = reg.feature_importances_
predictions = reg.predict(test_x)
# Attach predictions to true values and player names
results_df = test_x.copy()
results_df['True_WAR'] = test_y.values
results_df['Predicted_WAR'] = predictions
results_df['Name'] = dataset.loc[test_x.index, 'Name'].values

print(results_df[['Name', 'True_WAR', 'Predicted_WAR']])

r2 = r2_score(test_y, predictions)
print("R-squared on Test Set:", r2)

# results and feature importances
print("Mean Squared Error on Test Set:", mean_squared_error(test_y, predictions))
print(reg)
hasattr(train_x, 'columns')
feature_names = train_x.columns
# Create a dataframe for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

print(feature_importance_df)

                       Name  True_WAR  Predicted_WAR
259513       harrison, kyle  0.801637       0.785467
655072       gausman, kevin  2.860714       1.314264
204225        brieske, beau  0.719378       0.850329
529547     crawford, kutter  1.858968       1.563137
208627          mears, nick  0.772697       0.511024
...                     ...       ...            ...
271057     alexander, tyler -0.569328       0.633645
481607          gray, sonny  3.837520       4.025503
673560  yamamoto, yoshinobu  2.822026       2.755807
575505        barlow, scott  0.038422       1.866762
196039    kochanowicz, jack  0.612831       0.161843

[64828 rows x 3 columns]
R-squared on Test Set: 0.6238331898250111
Mean Squared Error on Test Set: 0.7509932658257675
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.9, device=None, early_stopping_rounds=3,
             enable_categorical=False, eval_metric=N