In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

Loading in the Dataset

In [2]:
master = pd.read_csv("master.csv")

master = master.dropna(subset =['AAV'])


Removing columns we don't want and performing train test split

In [13]:
X = master.drop(columns = ['AAV', 'Season', 'fullName', 'Age 7/1/21', 'Age 7/1/22', 'Age 7/1/23', 'Age 7/1/24',
                          'Details', 'Baseball Ops      head / club GM', 'Club Owner', 'Player Agent', 'Option',
                          'Term', 'Guarantee', 'Years', 'New Club', 'Old    Club', 'Qual    Offer', 'nameFirst',
                          'nameLast', 'MLBAMID', 'PlayerId', 'NameASCII', 'Team', "Pos'n", 'XBR', 'Age'])

y = master['AAV']

X.info()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

<class 'pandas.core.frame.DataFrame'>
Index: 584 entries, 0 to 2166
Data columns (total 42 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PA        584 non-null    int64  
 1   BB%       584 non-null    float64
 2   K%        584 non-null    float64
 3   BB/K      584 non-null    float64
 4   AVG       584 non-null    float64
 5   OBP       584 non-null    float64
 6   SLG       584 non-null    float64
 7   OPS       584 non-null    float64
 8   ISO       584 non-null    float64
 9   Spd       584 non-null    float64
 10  BABIP     584 non-null    float64
 11  UBR       584 non-null    float64
 12  wGDP      584 non-null    float64
 13  wSB       584 non-null    float64
 14  wRC       584 non-null    float64
 15  wRAA      584 non-null    float64
 16  wOBA      584 non-null    float64
 17  wRC+      584 non-null    float64
 18  Events    584 non-null    int64  
 19  EV        584 non-null    float64
 20  maxEV     584 non-null    float64
 2

Setting up the preprocessor. Only numeric columns in the models, so just standard scaling needs to be done.

In [4]:
numerical_cols = X.columns.values

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numerical_cols)
    ]
)

Start out with a multiple linear regression, using mean absolute error as the model evaluation metric.

In [5]:
# Linear Regression Pipeline

LR_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

LR_pipeline.fit(X_train, y_train)

FA_pred = LR_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, FA_pred)
print('Linear Regression -- Mean absolute error:', mae)

Linear Regression -- Mean absolute error: 5796331.8740702635



-----------------------------------------------------------

Random Forest



In [6]:
rf_scores = cross_val_score(RandomForestRegressor(), X_train, y_train, cv = 5, scoring = 'neg_mean_absolute_error')
-rf_scores
np.mean(-rf_scores)

np.float64(5615637.100046443)

In [7]:
RF_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random forest', RandomForestRegressor())
])

RF_pipeline.fit(X_train, y_train)

FA_pred = RF_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, FA_pred)
print('Random Forest Regressor -- Mean absolute error:', mae)

Random Forest Regressor -- Mean absolute error: 5671590.856752137



------------------------------------------------------------------

Random Forest Hyperparameter Grid Search:

In [28]:
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestRegressor(), rf_param_grid, scoring = 'neg_mean_absolute_error')
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

New Random Forest Model with tuned hyperparameters:

In [8]:
RF_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random forest', RandomForestRegressor(n_estimators = 200, max_depth = None, min_samples_split = 2))
])

RF_pipeline.fit(X_train, y_train)

FA_pred = RF_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, FA_pred)
print('Random Forest Regressor -- Mean absolute error:', mae)

Random Forest Regressor -- Mean absolute error: 5616345.140512821


Mean absolute error has still been pretty bad for the models, so we're gonna look at the importance of each feature and remove some as necessary

In [18]:
rf = RandomForestRegressor()

rf.fit(X_train, y_train)

feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importances)


     Feature  Importance
15      wRAA    0.138422
17      wRC+    0.074774
20     maxEV    0.072046
22   Barrels    0.055725
39     Soft%    0.054031
7        OPS    0.047747
31     HR/FB    0.046887
9        Spd    0.044243
14       wRC    0.039701
19        EV    0.034163
16      wOBA    0.032706
33      IFH%    0.027054
25  HardHit%    0.021936
11       UBR    0.021627
12      wGDP    0.021415
36     Pull%    0.018380
2         K%    0.015375
41     Hard%    0.015245
0         PA    0.014893
38     Oppo%    0.013608
13       wSB    0.013423
24   HardHit    0.012963
1        BB%    0.012281
5        OBP    0.011910
23   Barrel%    0.011377
3       BB/K    0.011073
27       LD%    0.011041
32       IFH    0.010522
30     IFFB%    0.009823
37     Cent%    0.009727
10     BABIP    0.008820
21        LA    0.008618
6        SLG    0.008153
4        AVG    0.007649
29       FB%    0.007484
18    Events    0.006951
40      Med%    0.006613
26     GB/FB    0.005397
28       GB%    0.005279


From FB% down, these features will be removed and we'll see if model is improved. Also need to modify the preprocessor since our columns have changed a bit.

In [14]:
new_X = X.drop(columns = ['FB%', 'Events', 'Med%', 'GB/FB', 'GB%', 'ISO', 'BUH%', 'BUH'])

new_X.info()

X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size = 0.2)

numerical_cols = new_X.columns.values

new_preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numerical_cols)
    ]
)

<class 'pandas.core.frame.DataFrame'>
Index: 584 entries, 0 to 2166
Data columns (total 34 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PA        584 non-null    int64  
 1   BB%       584 non-null    float64
 2   K%        584 non-null    float64
 3   BB/K      584 non-null    float64
 4   AVG       584 non-null    float64
 5   OBP       584 non-null    float64
 6   SLG       584 non-null    float64
 7   OPS       584 non-null    float64
 8   Spd       584 non-null    float64
 9   BABIP     584 non-null    float64
 10  UBR       584 non-null    float64
 11  wGDP      584 non-null    float64
 12  wSB       584 non-null    float64
 13  wRC       584 non-null    float64
 14  wRAA      584 non-null    float64
 15  wOBA      584 non-null    float64
 16  wRC+      584 non-null    float64
 17  EV        584 non-null    float64
 18  maxEV     584 non-null    float64
 19  LA        584 non-null    float64
 20  Barrels   584 non-null    int64  
 2

In [17]:
RF_pipeline = Pipeline(steps=[
    ('preprocessor', new_preprocessor),
    ('random forest', RandomForestRegressor(n_estimators = 200, max_depth = None, min_samples_split = 2))
])

RF_pipeline.fit(X_train, y_train)

FA_pred = RF_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, FA_pred)
print('Random Forest Regressor -- Mean absolute error:', mae)

Random Forest Regressor -- Mean absolute error: 5585602.536923076


Mean absolute error looks a bit better after removing some features