# Using a model to predict a player's future offensive performance using wRC+

In [44]:
import numpy as np
import pandas as pd

import plotly.express as px

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

import plotly.io as pio
pio.templates.default = "plotly_dark"

In [45]:
# Load the cleaned FanGraphs data, most recent season first.
# Seasons before 2023 are used for training.
data = (
    pd.read_csv('../data/fangraphs_numeric_data_simplified_cleaned')
    .sort_values(by='Season', ascending=False)
)
data

Unnamed: 0,Season,AVG,ISO,BB%,K%,Barrel%,HardHit%,EV,maxEV,LA,GB%,FB%,SwStr%,CStr%,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,wRC+,wRC+_next
1868,2023,0.272,0.169,0.061,0.190,0.093,0.466,90.9,113.5,5.7,0.532,0.310,0.144,0.132,0.395,0.730,0.575,0.856,105.0,78.0
374,2023,0.292,0.208,0.062,0.248,0.110,0.397,89.1,113.3,14.3,0.397,0.391,0.148,0.125,0.389,0.757,0.609,0.814,130.0,98.0
2177,2023,0.222,0.155,0.040,0.252,0.071,0.439,89.8,112.1,16.6,0.391,0.391,0.165,0.142,0.416,0.711,0.525,0.831,77.0,89.0
402,2023,0.268,0.179,0.127,0.234,0.119,0.507,91.3,111.7,12.8,0.411,0.372,0.108,0.186,0.291,0.638,0.682,0.798,122.0,100.0
802,2023,0.236,0.241,0.114,0.319,0.165,0.485,91.0,112.3,22.2,0.288,0.481,0.159,0.131,0.301,0.753,0.498,0.775,118.0,87.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1643,2015,0.262,0.160,0.085,0.230,0.029,0.233,86.8,105.4,14.7,0.390,0.377,0.101,0.169,0.300,0.698,0.584,0.896,105.0,73.0
791,2015,0.304,0.194,0.076,0.181,0.074,0.377,88.1,108.5,5.4,0.500,0.284,0.076,0.203,0.329,0.598,0.727,0.898,136.0,111.0
2215,2015,0.226,0.209,0.089,0.223,0.079,0.374,89.7,111.4,15.5,0.370,0.442,0.110,0.129,0.318,0.752,0.617,0.873,93.0,111.0
799,2015,0.278,0.204,0.062,0.217,0.096,0.456,91.4,112.5,11.4,0.456,0.346,0.112,0.166,0.304,0.695,0.578,0.869,117.0,83.0


## RandomForestRegression Model
- Using a random forest regression model to try to predict future wRC+
- I chose a Random Forest Regressor for the ML model because it can capture non-linear relationships in the data, which we found using a correlation plot in the EDA file.

In [46]:
# List the seasons present in the data before choosing the train/test split
pd.unique(data['Season'])

array([2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015])

In [58]:
def baseball_data_train_and_plot(
    data,
    train_years=range(2015, 2023),
    test_years=(2023,),
    target='wRC+_next',
    random_state=42,
):
    """Fit a random forest on the training seasons, score it on the test seasons, and plot the fit.

    Rows are split by the ``Season`` column. Every column except ``Season``
    and ``target`` is used as a feature. The split sizes and the MSE, RMSE and
    R² on the test seasons are printed, and a scatter plot of predicted vs.
    actual values (with a dashed y = x reference line) is shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Season`` column and the ``target`` column; all
        remaining columns are passed to the model as features.
    train_years : iterable of int, default range(2015, 2023)
        Seasons whose rows are used for fitting.
    test_years : iterable of int, default (2023,)
        Seasons whose rows are held out for evaluation.
    target : str, default 'wRC+_next'
        Name of the column to predict.
    random_state : int, default 42
        Seed passed to ``RandomForestRegressor`` so runs are repeatable.

    Returns
    -------
    None
        Results are printed and plotted only.

    Raises
    ------
    KeyError
        If ``Season`` or ``target`` is not a column of ``data``.
    ValueError
        If the train and test seasons overlap, or if either split is empty.
    """
    # Materialise the year collections once so generators are also accepted.
    train_years = list(train_years)
    test_years = list(test_years)

    missing = [col for col in ('Season', target) if col not in data.columns]
    if missing:
        raise KeyError(f"data is missing required column(s): {missing}")

    # Evaluating on seasons the model was fitted on would inflate the scores.
    shared = sorted(set(train_years) & set(test_years))
    if shared:
        raise ValueError(f"train_years and test_years overlap on seasons {shared}")

    train_data = data[data['Season'].isin(train_years)]
    test_data = data[data['Season'].isin(test_years)]

    # Fail fast with a clear message instead of a scikit-learn error mid-fit.
    if train_data.empty:
        raise ValueError(f"No rows found for training seasons {train_years}")
    if test_data.empty:
        raise ValueError(f"No rows found for test seasons {test_years}")

    non_features = [target, 'Season']
    X_train = train_data.drop(columns=non_features)
    y_train = train_data[target]
    X_test = test_data.drop(columns=non_features)
    y_test = test_data[target]

    print(f"Training data: {X_train.shape}, {y_train.shape}")
    print(f"Testing data: {X_test.shape}, {y_test.shape}")

    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    print(f"MSE: {mse:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}")

    test_pred = pd.DataFrame({"wRC+ Predicted": predictions, "wRC+ Actual": y_test})
    test_pred_plot = px.scatter(
        test_pred,
        x="wRC+ Predicted",
        y="wRC+ Actual",
        title=f"Predicted vs. actual {target} (test seasons: {test_years})",
    )

    # Draw the y = x reference line across the full range of both axes, so it
    # covers every point whether the extreme is a prediction or an actual value.
    lo = test_pred.to_numpy().min()
    hi = test_pred.to_numpy().max()
    test_pred_plot.add_shape(
        type="line",
        x0=lo,
        y0=lo,
        x1=hi,
        y1=hi,
        line=dict(color="red", dash="dash"),
    )

    test_pred_plot.show()


In [59]:
# Baseline run on the cleaned dataset
baseball_data_train_and_plot(data=data)

Training data: (2491, 18), (2491,)
Testing data: (322, 18), (322,)
MSE: 544.42, RMSE: 23.33, R²: 0.19


- With a root mean squared error of 23.33, the model's predictions are typically off by about 23 wRC+ points from the actual wRC+ the following year.
- With an $R^2$ of 0.19, the model explains only about 19% of the variance in next-season wRC+, which is not very accurate.
- The model's predictions are much more conservative (less spread out) than the wRC+ values that actually occur. I would like to see how the results change if we keep the outliers in our data instead of removing them outright, since Random Forests are relatively robust to outliers.

In [43]:
# Load the version of the dataset that keeps the outlier rows
data_with_outliers = pd.read_csv(
    filepath_or_buffer='../data/fangraphs_numeric_data_with_outliers'
)
data_with_outliers

Unnamed: 0,Season,AVG,ISO,BB%,K%,Barrel%,HardHit%,EV,maxEV,LA,GB%,FB%,SwStr%,CStr%,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,wRC+,wRC+_next
0,2017,0.210,0.161,0.074,0.178,0.034,0.310,86.0,106.4,11.5,0.495,0.369,0.075,0.211,0.211,0.622,0.667,0.873,82,105.0
1,2016,0.216,0.082,0.097,0.158,0.028,0.333,86.9,105.5,14.6,0.414,0.364,0.046,0.217,0.179,0.594,0.743,0.920,64,82.0
2,2015,0.238,0.166,0.147,0.175,0.055,0.329,87.7,108.5,9.8,0.437,0.352,0.054,0.231,0.142,0.570,0.682,0.889,116,64.0
3,2015,0.300,0.130,0.044,0.085,0.029,0.255,85.4,110.0,10.6,0.466,0.287,0.071,0.108,0.403,0.757,0.806,0.920,111,41.0
4,2022,0.245,0.143,0.061,0.186,0.094,0.420,88.8,110.8,11.8,0.452,0.358,0.118,0.128,0.379,0.761,0.636,0.882,92,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3197,2017,0.297,0.251,0.122,0.154,0.044,0.288,85.8,108.3,14.8,0.382,0.423,0.060,0.204,0.244,0.604,0.712,0.922,139,81.0
3198,2016,0.252,0.172,0.073,0.165,0.023,0.278,86.2,105.6,15.5,0.394,0.399,0.079,0.179,0.289,0.664,0.651,0.916,90,139.0
3199,2015,0.258,0.201,0.065,0.136,0.018,0.231,85.9,102.8,17.0,0.386,0.422,0.067,0.179,0.315,0.658,0.738,0.925,106,90.0
3200,2023,0.267,0.237,0.087,0.273,0.111,0.407,89.8,108.5,11.8,0.410,0.335,0.161,0.142,0.325,0.735,0.563,0.747,132,82.0


In [60]:
# Same train/evaluate/plot run, this time on the dataset that keeps outliers
baseball_data_train_and_plot(data=data_with_outliers)

Training data: (2844, 18), (2844,)
Testing data: (358, 18), (358,)
MSE: 623.04, RMSE: 24.96, R²: 0.26


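- The model using the dataset with the outliers was not much better: $R^2$ rose to 0.26, but RMSE also rose from 23.33 to 24.96. The two test sets also differ (358 vs. 322 rows), so the scores are not directly comparable.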
- It looks like the model is bad at predicting players with wRC+ that are very low.

## Hyperparameter Tuning
- I am going to see if changing the hyperparameters makes the model any more accurate.