### Model

In [11]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import os 
import pandas as pd

# Project root directory. Set the PITCH_VALUE_ROOT environment variable to run
# the notebook from another checkout or machine; the original local path is
# kept as the fallback so existing setups behave exactly as before.
project_root = os.environ.get(
    'PITCH_VALUE_ROOT',
    'C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB/pitch_value',
)

# The working directory is switched because the dataset path below is relative.
os.chdir(project_root)

# Load the CSV that the following cells use as `dataset`.
dataset = pd.read_csv('./data/datasets/yearly/yearly_player_average.csv')

In [None]:
# features
# Columns removed before modelling; "Name" is looked up again from `dataset`
# by row index when results are reported.
excluded_columns = ["Name", "launch_angle", "launch_speed", "sz_top", "sz_bot"]

# Keep only rows with no missing value in any remaining column.
complete_rows = dataset.drop(columns=excluded_columns).dropna(axis=0)

# Target is WAR; every other remaining column is a feature.
y = complete_rows['WAR']
X = complete_rows.drop(columns=['WAR'])

# Hold out 10% of the rows for testing, with a fixed seed for a repeatable split.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=.1,
    random_state=26,
)

In [13]:
# best para found after grid search
opti_para = {'colsample_bytree': 1,
             'learning_rate': 0.2, 'max_depth': 12,
             'max_leaves': 5, 'min_child_weight': 6,
             'subsample': 0.6}

# model
# NOTE(review): eval_set is given the hold-out split. No early_stopping_rounds
# is set in this cell, so it appears to be used for evaluation logging only;
# if early stopping is ever enabled, switch to a separate validation split so
# the test set stays untouched.
reg = xgb.XGBRegressor(**opti_para, n_jobs=-1, n_estimators=55)
reg.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=False)

# features / predictions
feature_importances = reg.feature_importances_
predictions = reg.predict(test_x)

# Attach predictions to true values and player names.
# test_x keeps the row labels of `dataset`, so the "Name" column that was
# dropped from the feature matrix can be recovered by index.
results_df = test_x.copy()
results_df['True_WAR'] = test_y.values
results_df['Predicted_WAR'] = predictions
results_df['Name'] = dataset.loc[test_x.index, 'Name'].values

print(results_df[['Name', 'True_WAR', 'Predicted_WAR']])

# Hold-out metrics
r2 = r2_score(test_y, predictions)
print("R-squared on Test Set:", r2)
print("Mean Squared Error on Test Set:", mean_squared_error(test_y, predictions))

# Fitted model configuration
print(reg)

# Feature importances, one row per training column (same order as train_x).
feature_names = train_x.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

print(feature_importance_df)

                    Name  True_WAR  Predicted_WAR
420     mercado, michael -0.742375      -0.116191
608        swanson, erik -0.665117      -0.197235
665    whitlock, garrett  0.416752       0.617049
633          vesia, alex  1.072765       1.164011
358           lee, dylan  0.777845       1.182667
..                   ...       ...            ...
271        hicks, jordan  0.472937       1.400937
63      birdsong, hayden  0.100516       0.489009
521    rodriguez, carlos -0.024470      -0.304334
27   arrighetti, spencer  1.623255       2.767948
278        hoffman, jeff  2.042234       1.220502

[70 rows x 3 columns]
R-squared on Test Set: 0.8360260223579288
Mean Squared Error on Test Set: 0.22903751132913297
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None, colsample_bytree=1,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=None, 

In [15]:
from sklearn.model_selection import cross_val_score, cross_validate
import numpy as np

# Perform 10-fold cross-validation once and score every fold with both
# metrics. Calling cross_val_score separately per metric would refit the
# model on the same folds a second time for no additional information.
cv_results = cross_validate(
    reg,
    X,
    y,
    cv=10,
    scoring={'r2': 'r2', 'neg_mse': 'neg_mean_squared_error'},
)

# Per-fold R-squared; cross_validate exposes each scorer as 'test_<name>'.
cv_scores = cv_results['test_r2']

# Print the cross-validation scores
print("Cross-validation R-squared scores:", cv_scores)
print("Mean cross-validation R-squared score:", np.mean(cv_scores))
print("Standard deviation of cross-validation R-squared scores:", np.std(cv_scores))

# scikit-learn reports MSE negated so that larger is always better.
cv_mse_scores = -cv_results['test_neg_mse']  # Convert back to positive MSE

# Print the cross-validation scores
print("Cross-validation MSE scores:", cv_mse_scores)
print("Mean cross-validation MSE score:", np.mean(cv_mse_scores))
print("Standard deviation of cross-validation MSE scores:", np.std(cv_mse_scores))

Cross-validation R-squared scores: [0.46843109 0.77406979 0.703822   0.83263767 0.78832306 0.58064328
 0.72892975 0.78443528 0.7709707  0.68886623]
Mean cross-validation R-squared score: 0.7121128857832468
Standard deviation of cross-validation R-squared scores: 0.10508314273953138
Cross-validation MSE scores: [0.30810754 0.23084449 0.33210695 0.21347935 0.2461374  0.37026526
 0.28868362 0.28347956 0.30141882 0.42273807]
Mean cross-validation MSE score: 0.29972610514948317
Standard deviation of cross-validation MSE scores: 0.06055589510891567
