### Model

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import os 
import pandas as pd

# Project root that the relative data path below is resolved against.
# Set the PITCH_VALUE_DIR environment variable to run this notebook from a
# different checkout; when it is unset (or empty) the original local path is used,
# so existing behaviour is unchanged.
PROJECT_DIR = os.environ.get('PITCH_VALUE_DIR') or (
    'C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB/pitch_value'
)
os.chdir(PROJECT_DIR)

# Source table for the model; the following cells read its 'Name' and 'WAR' columns.
# NOTE(review): presumably one row per player-season, judging by the file name — confirm.
dataset = pd.read_csv('./data/datasets/yearly/yearly_player_average.csv')

In [3]:
# Build the modelling table: drop the listed columns, then discard every row
# that still contains a missing value. Row labels from `dataset` are preserved,
# so rows can later be matched back to the original frame by index.
EXCLUDED_COLUMNS = ["Name", "launch_angle", "launch_speed", "sz_top", "sz_bot"]
X = dataset.drop(columns=EXCLUDED_COLUMNS)
X = X.dropna(axis=0)

# WAR is the regression target; every remaining column is a predictor.
y = X['WAR']
X = X.drop(columns=['WAR'])

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, random_state=26, test_size=.1
)

In [30]:
# Hyperparameters reported as the best result of a grid search (run elsewhere).
opti_para = {
    'colsample_bytree': 1,
    'learning_rate': 0.2,
    'max_depth': 12,
    'max_leaves': 5,
    'min_child_weight': 6,
    'subsample': 0.6,
}

# Fit the regressor on the training split.
# NOTE(review): the held-out split is also passed as eval_set. No early stopping
# is configured in this call, so it is only used for metric tracking here; if
# early stopping is ever enabled, switch to a separate validation split so the
# test score stays unbiased.
reg = xgb.XGBRegressor(**opti_para, n_jobs=-1, n_estimators=55)
reg.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=False)

# Per-feature importances and predictions for the held-out rows.
feature_importances = reg.feature_importances_
predictions = reg.predict(test_x)

# Attach predictions to true values and player names.
# 'Name' was dropped from the feature table, so it is looked up in the original
# frame through the row labels that the split preserved.
results_df = test_x.copy()
results_df['True_WAR'] = test_y.values
results_df['Predicted_WAR'] = predictions
results_df['Name'] = dataset.loc[test_x.index, 'Name'].values

print(results_df[['Name', 'True_WAR', 'Predicted_WAR']])

# Held-out metrics.
r2 = r2_score(test_y, predictions)
print("R-squared on Test Set:", r2)
print("Mean Squared Error on Test Set:", mean_squared_error(test_y, predictions))
print(reg)

# Pair each training column with its importance score.
# (A stray `hasattr(train_x, 'columns')` expression whose result was discarded
# has been removed; it had no effect.)
feature_names = train_x.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

print(feature_importance_df)

                  Name  True_WAR  Predicted_WAR
681   toussaint, touki -0.146697       0.051244
441    mcarthur, james  0.102310       0.243448
248     gordon, tanner -0.243583      -0.167279
502     nittoli, vinny  0.078904       0.173153
69     blackburn, paul  0.449186      -0.121409
..                 ...       ...            ...
94       brown, hunter  3.128726       2.361960
757  yastrzemski, mike -0.045487      -0.068280
595  sandoval, patrick  1.246824       0.902513
417       maeda, kenta -0.048807       1.381054
167        darvish, yu  1.071558       0.883550

[77 rows x 3 columns]
R-squared on Test Set: 0.8325445274735737
Mean Squared Error on Test Set: 0.23080979364374749
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None, colsample_bytree=1,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
      

In [33]:
from sklearn.model_selection import cross_val_score
import numpy as np

def _report_cv(metric_label, fold_scores):
    """Print the per-fold scores, then their mean and standard deviation."""
    print(f"Cross-validation {metric_label} scores:", fold_scores)
    print(f"Mean cross-validation {metric_label} score:", np.mean(fold_scores))
    print(
        f"Standard deviation of cross-validation {metric_label} scores:",
        np.std(fold_scores),
    )


# R-squared across 18 folds of the full feature table.
# NOTE(review): R-squared uses 18 folds while MSE below uses 5, so the two
# summaries are computed on different partitions of the data.
cv_scores = cross_val_score(reg, X, y, scoring='r2', cv=18)
_report_cv("R-squared", cv_scores)

# MSE across 5 folds. The scorer returns negated MSE, so flip the sign to get
# ordinary positive values.
cv_mse_scores = -cross_val_score(reg, X, y, scoring='neg_mean_squared_error', cv=5)
_report_cv("MSE", cv_mse_scores)

Cross-validation R-squared scores: [0.07959972 0.65794828 0.79461273 0.73921682 0.59843064 0.84942697
 0.66324266 0.69249295 0.8543054  0.6253489  0.62576183 0.73885656
 0.78503122 0.81109192 0.80698846 0.63481022 0.77182449 0.70567669]
Mean cross-validation R-squared score: 0.6908148034659234
Standard deviation of cross-validation R-squared scores: 0.16775494709175903
Cross-validation MSE scores: [0.28257146 0.24566368 0.28311804 0.26569828 0.32090205]
Mean cross-validation MSE score: 0.27959070275404996
Standard deviation of cross-validation MSE scores: 0.024787081593460755
