In [227]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

-------------------------------------------------------
Building the models

Do the following individually:

* Linear Regression
* Decision Tree Regression
* Support Vector Machine Regression
* Random Forest Regression

Perform some ensembles:

* LR, DTR, and SVR
* LR, DTR, SVR, and Random Forest
* Various Combinations



-------------------------------------------------------


Loading in the Dataset

In [197]:
master = pd.read_csv("master.csv")

master = master.dropna(subset =['AAV'])


Removing columns we don't want and performing train test split

In [198]:
X = master.drop(columns = ['AAV', 'Season', 'fullName', 'Age 7/1/21', 'Age 7/1/22', 'Age 7/1/23', 'Age 7/1/24',
                          'Details', 'Baseball Ops      head / club GM', 'Club Owner', 'Player Agent', 'Option',
                          'Term', 'Guarantee', 'Years', 'New Club', 'Old    Club', 'Qual    Offer', 'nameFirst',
                          'nameLast', 'MLBAMID', 'PlayerId', 'NameASCII', 'Team', "Pos'n", 'XBR', 'Age'])

y = master['AAV']

X.info()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

<class 'pandas.core.frame.DataFrame'>
Index: 584 entries, 0 to 2166
Data columns (total 42 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PA        584 non-null    int64  
 1   BB%       584 non-null    float64
 2   K%        584 non-null    float64
 3   BB/K      584 non-null    float64
 4   AVG       584 non-null    float64
 5   OBP       584 non-null    float64
 6   SLG       584 non-null    float64
 7   OPS       584 non-null    float64
 8   ISO       584 non-null    float64
 9   Spd       584 non-null    float64
 10  BABIP     584 non-null    float64
 11  UBR       584 non-null    float64
 12  wGDP      584 non-null    float64
 13  wSB       584 non-null    float64
 14  wRC       584 non-null    float64
 15  wRAA      584 non-null    float64
 16  wOBA      584 non-null    float64
 17  wRC+      584 non-null    float64
 18  Events    584 non-null    int64  
 19  EV        584 non-null    float64
 20  maxEV     584 non-null    float64
 2

Setting up the preprocessor. Only numeric columns in the models, so just standard scaling needs to be done.

In [199]:
numerical_cols = X.columns.values

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numerical_cols)
    ]
)

Start out with a multiple linear regression, using mean absolute error as the model evaluation metric.

In [200]:
# Linear Regression Pipeline

LR_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

LR_pipeline.fit(X_train, y_train)

LR_FA_pred = LR_pipeline.predict(X_test)

LR_MAE = mean_absolute_error(y_test, LR_FA_pred)
print('Linear Regression -- Mean absolute error:', LR_MAE)

Linear Regression -- Mean absolute error: 5731563.16262646


-------------------------------------------------------
Decision Tree Regression

In [201]:
DTR_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor())
])

DTR_pipeline.fit(X_train, y_train)

DTR_FA_pred = DTR_pipeline.predict(X_test)

DTR_MAE = mean_absolute_error(y_test, DTR_FA_pred)
print('Decision Tree Regression -- Mean absolute error:', DTR_MAE)

Decision Tree Regression -- Mean absolute error: 6890782.846153846


Decision Tree Regression Hyperparameter Grid Search:

In [65]:
DTR_param_grid = {
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

DTR_grid_search = GridSearchCV(DecisionTreeRegressor(), DTR_param_grid, scoring = 'neg_mean_absolute_error')
DTR_grid_search.fit(X_train, y_train)
DTR_grid_search.best_params_

{'max_depth': 5, 'min_samples_split': 10, 'splitter': 'random'}

New Decision Tree Regression with tuned hyperparameters:

In [202]:
DTR_pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth = 5, min_samples_split = 10, splitter = 'random'))
])

DTR_pipeline2.fit(X_train, y_train)

DTR_FA_pred2 = DTR_pipeline2.predict(X_test)

DTR_MAE2 = mean_absolute_error(y_test, DTR_FA_pred2)
print('Decision Tree Regression -- Mean absolute error:', DTR_MAE2)

Decision Tree Regression -- Mean absolute error: 5545654.164611248


----------------------------------------

Support Vector Machine Regression

In [205]:
SVR_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

SVR_pipeline.fit(X_train, y_train)

SVR_FA_pred = SVR_pipeline.predict(X_test)

SVR_MAE = mean_absolute_error(y_test, SVR_FA_pred)

print('SVM Regression -- Mean absolute error:', SVR_MAE)

SVM Regression -- Mean absolute error: 8258760.263538601



-----------------------------------------------------------

Random Forest Regression



In [206]:
RF_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random forest', RandomForestRegressor())
])

RF_pipeline.fit(X_train, y_train)

RF_FA_pred = RF_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, RF_FA_pred)
print('Random Forest Regressor -- Mean absolute error:', mae)

Random Forest Regressor -- Mean absolute error: 4967980.809316238





Random Forest Hyperparameter Grid Search:

In [28]:
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestRegressor(), rf_param_grid, scoring = 'neg_mean_absolute_error')
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

New Random Forest Model with tuned hyperparameters:

In [210]:
RF_pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('random forest', RandomForestRegressor(n_estimators = 200, max_depth = None, min_samples_split = 2))
])

RF_pipeline2.fit(X_train, y_train)

RF_FA_pred2 = RF_pipeline2.predict(X_test)

mae = mean_absolute_error(y_test, RF_FA_pred2)
print('Random Forest Regressor -- Mean absolute error:', mae)

Random Forest Regressor -- Mean absolute error: 5000523.2391453


Mean absolute error has still been pretty bad for the models, so we're gonna look at the importance of each feature and remove some as necessary

In [173]:
rf = RandomForestRegressor()

rf.fit(X_train, y_train)

feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importances)


     Feature  Importance
17      wRC+    0.111363
15      wRAA    0.098519
20     maxEV    0.061816
16      wOBA    0.056090
19        EV    0.054484
25  HardHit%    0.048059
0         PA    0.043100
31     HR/FB    0.035559
39     Soft%    0.033277
14       wRC    0.026953
9        Spd    0.026770
22   Barrels    0.025235
33      IFH%    0.025216
7        OPS    0.023952
41     Hard%    0.021784
11       UBR    0.020949
24   HardHit    0.020782
1        BB%    0.020260
12      wGDP    0.020258
23   Barrel%    0.016509
36     Pull%    0.016084
13       wSB    0.014334
2         K%    0.012473
38     Oppo%    0.012383
8        ISO    0.011192
27       LD%    0.011172
10     BABIP    0.011099
5        OBP    0.010487
18    Events    0.010330
32       IFH    0.009880
3       BB/K    0.009769
30     IFFB%    0.009699
6        SLG    0.009467
37     Cent%    0.009464
4        AVG    0.008760
40      Med%    0.008556
28       GB%    0.008484
21        LA    0.007713
26     GB/FB    0.006801


From FB% down, these features will be removed and we'll see if models are improved. Also need to modify the preprocessor since our columns have changed a bit.

In [211]:
new_X = X.drop(columns = ['FB%', 'Events', 'Med%', 'GB/FB', 'GB%', 'ISO', 'BUH%', 'BUH'])

new_X.info()

X_train2, X_test2, y_train2, y_test2 = train_test_split(new_X, y, test_size = 0.2)

numerical_cols = new_X.columns.values

new_preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numerical_cols)
    ]
)

<class 'pandas.core.frame.DataFrame'>
Index: 584 entries, 0 to 2166
Data columns (total 34 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PA        584 non-null    int64  
 1   BB%       584 non-null    float64
 2   K%        584 non-null    float64
 3   BB/K      584 non-null    float64
 4   AVG       584 non-null    float64
 5   OBP       584 non-null    float64
 6   SLG       584 non-null    float64
 7   OPS       584 non-null    float64
 8   Spd       584 non-null    float64
 9   BABIP     584 non-null    float64
 10  UBR       584 non-null    float64
 11  wGDP      584 non-null    float64
 12  wSB       584 non-null    float64
 13  wRC       584 non-null    float64
 14  wRAA      584 non-null    float64
 15  wOBA      584 non-null    float64
 16  wRC+      584 non-null    float64
 17  EV        584 non-null    float64
 18  maxEV     584 non-null    float64
 19  LA        584 non-null    float64
 20  Barrels   584 non-null    int64  
 2

--------------------------------

Linear Regression with removed features:

In [176]:
LR_pipeline3 = Pipeline(steps=[
    ('preprocessor', new_preprocessor),
    ('regressor', LinearRegression())
])

LR_pipeline3.fit(X_train2, y_train2)

LR_FA_pred3 = LR_pipeline3.predict(X_test2)

mae = mean_absolute_error(y_test2, LR_FA_pred3)
print('Linear Regression -- Mean absolute error:', mae)

Linear Regression -- Mean absolute error: 6265482.8036259785


--------------------------------

Decision Tree Regression with removed features:

In [178]:
DTR_pipeline3 = Pipeline(steps=[
    ('preprocessor', new_preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth = 5, min_samples_split = 10, splitter = 'random'))
])

DTR_pipeline3.fit(X_train2, y_train2)

DTR_FA_pred3 = DTR_pipeline3.predict(X_test2)

mae = mean_absolute_error(y_test2, DTR_FA_pred3)
print('Decision Tree Regression -- Mean absolute error:', mae)

Decision Tree Regression -- Mean absolute error: 7029840.910463934


--------------------------------

SVM Regression with removed features:

In [180]:
SVR_pipeline3 = Pipeline(steps=[
    ('preprocessor', new_preprocessor),
    ('regressor', SVR())
])

SVR_pipeline3.fit(X_train2, y_train2)

SVR_FA_pred3 = SVR_pipeline3.predict(X_test2)

SVR_MAE2 = mean_absolute_error(y_test2, SVR_FA_pred3)

print('SVM Regression -- Mean absolute error:', SVR_MAE2)

SVM Regression -- Mean absolute error: 7845474.899472245


--------------------------------

Random Forest Regression with removed features:

In [213]:
RF_pipeline3 = Pipeline(steps=[
    ('preprocessor', new_preprocessor),
    ('random forest', RandomForestRegressor(n_estimators = 200, max_depth = None, min_samples_split = 2))
])

RF_pipeline3.fit(X_train2, y_train2)

RF_FA_pred3 = RF_pipeline3.predict(X_test2)

mae = mean_absolute_error(y_test2, RF_FA_pred3)
print('Random Forest Regressor -- Mean absolute error:', mae)

Random Forest Regressor -- Mean absolute error: 6214224.535683761


Mean absolute error looks a bit better after removing some features for all of the individual models

--------------------------------------

let's do some ensemble methods

In [228]:
# Base models
LR = LinearRegression()
DTR = DecisionTreeRegressor(max_depth = 5, min_samples_split = 10, splitter = 'random')
SVR = SVR()
RF = RandomForestRegressor(n_estimators = 300, max_depth = None, min_samples_split = 2)


In [194]:
# Stacking Regressor
ensemble1 = StackingRegressor(
    estimators=[
        ('LR', LR),
        ('DTR', DTR),
        ('SVR', SVR)
    ],
    final_estimator=LR
)

ensemble1.fit(X_train, y_train)

ensemble_FA_pred = ensemble1.predict(X_test)

ensemble_MAE = mean_absolute_error(y_test, ensemble_FA_pred)
print('Ensemble -- Mean absolute error:', ensemble_MAE)

Ensemble -- Mean absolute error: 5935341.185873699


In [196]:

# Stacking Regressor
ensemble2 = StackingRegressor(
    estimators=[
        ('LR', LR),
        ('DTR', DTR),
        ('SVR', SVR),
        ('RF', RF)
    ],
    final_estimator= LR
)

ensemble2_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ensemble', ensemble2)

])

ensemble2_pipeline.fit(X_train, y_train)

ensemble_FA_pred2 = ensemble2_pipeline.predict(X_test)

ensemble_MAE2 = mean_absolute_error(y_test, ensemble_FA_pred2)
print('Ensemble -- Mean absolute error:', ensemble_MAE2)

Ensemble -- Mean absolute error: 5785603.0027840305


In [230]:
# Stacking Regressor
ensemble5 = StackingRegressor(
    estimators=[
        ('LR', LR),
        ('RF', RF)
    ],
    final_estimator= LR
)

ensemble5_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ensemble', ensemble5)

])

ensemble5_pipeline.fit(X_train, y_train)

ensemble_FA_pred5 = ensemble5_pipeline.predict(X_test)

ensemble_MAE5 = mean_absolute_error(y_test, ensemble_FA_pred5)
print('Ensemble -- Mean absolute error:', ensemble_MAE5)

Ensemble -- Mean absolute error: 5126994.809496413


In [118]:
from sklearn.svm import SVR

Let's try the same ensembles but with voting instead of stacking:

In [186]:

# Voting Regressor
ensemble3 = VotingRegressor(estimators=[
    ('LR', LR),
    ('DTR', DTR),
    ('SVR', SVR)
])

ensemble3_pipeline = Pipeline([
    ('preprocessor', new_preprocessor),
    ('ensemble', ensemble3)

])

ensemble3_pipeline.fit(X_train2, y_train2)

ensemble_FA_pred3 = ensemble3_pipeline.predict(X_test2)

ensemble_MAE3 = mean_absolute_error(y_test2, ensemble_FA_pred3)
print('Ensemble -- Mean absolute error:', ensemble_MAE3)

Ensemble -- Mean absolute error: 6355592.213450073


In [187]:

# Voting Regressor
ensemble4 = VotingRegressor(estimators=[
    ('LR', LR),
    ('DTR', DTR),
    ('SVR', SVR),
    ('RF', RF)
])

ensemble4_pipeline = Pipeline([
    ('preprocessor', new_preprocessor),
    ('ensemble', ensemble4)

])

ensemble4_pipeline.fit(X_train2, y_train2)

ensemble_FA_pred4 = ensemble4_pipeline.predict(X_test2)

ensemble_MAE4 = mean_absolute_error(y_test2, ensemble_FA_pred4)
print('Ensemble -- Mean absolute error:', ensemble_MAE4)

Ensemble -- Mean absolute error: 6277858.943226337


Feeding Models new data (example cases):

In [218]:
ten_players = pd.read_csv('ten_players.csv')

names = ten_players['Name']

ten_players_scaled = preprocessor.transform(ten_players.drop(columns = ['Name']))

ten_players_df = pd.DataFrame(ten_players_scaled, columns = ten_players.drop(columns = ['Name']).columns)

predicted_salary = RF_pipeline.predict(ten_players_df)

ten_players_df['Player'] = names.values

ten_players_df['Predicted Salary'] = predicted_salary

ten_players_df

Unnamed: 0,PA,BB%,K%,BB/K,AVG,OBP,SLG,OPS,Spd,BABIP,...,FB%,Events,Med%,GB/FB,GB%,ISO,BUH%,BUH,Player,Predicted Salary
0,1.63822,2.927791,-0.704122,3.5281,0.914205,2.701617,1.918764,2.388817,1.798709,0.203734,...,1.691068,1.947975,0.831793,0.079548,-1.195396,-1.214535,-0.842337,1.40335,Juan Soto,24093818.3
1,-0.958367,1.078786,0.537878,0.393988,0.442244,1.855958,1.089303,1.470873,1.058841,-0.046111,...,-0.013383,0.405866,1.750162,0.408812,-2.704493,-0.943426,0.026358,0.522893,Joc Pederson,21625688.43
2,1.392332,0.60614,0.864375,-0.14343,-0.443709,-0.097046,0.286753,0.178711,0.553798,0.28828,...,-0.581533,-0.69564,0.852109,-0.270532,-0.948615,-0.137977,0.473009,-0.314472,Willy Adames,18860416.67
3,-0.378069,1.126926,0.070563,0.788561,-0.346748,0.803025,-0.591232,-0.153815,-0.522294,0.059048,...,-0.013383,-0.034736,-0.909293,0.359126,0.957053,0.914348,0.038981,-0.560547,Jesse Winker,16042000.07
4,-2.020607,-0.570291,1.041412,-1.027806,-0.917288,-1.030657,-0.10641,-0.446067,0.325808,-0.985088,...,-0.581533,-0.69564,-0.453555,-0.664138,1.138481,-1.094096,1.177681,-0.349486,Travis d'Arnaud,14045593.07
5,-0.112509,-0.549823,0.152606,-0.671609,-1.06133,-0.947291,-1.189919,-1.222203,-0.872687,-1.528385,...,-0.581533,-0.69564,-0.894299,0.57291,0.769028,-0.795356,1.688363,-0.947569,Ty France,13154332.32
6,-2.315673,-1.271199,-0.937107,-0.788159,0.535743,-0.730851,-1.042854,-1.035774,-1.484696,1.118807,...,-0.581533,-0.69564,-0.905372,0.333482,0.971817,-0.319523,2.873776,-2.210181,Kevin Newman,15458220.84
7,-2.994327,-0.312987,-0.044117,-0.367153,-1.783144,-1.306336,-1.370502,-1.484329,-0.729758,0.04508,...,-0.581533,-0.69564,-0.153874,0.79802,-0.416027,-0.898842,1.423054,-0.666721,Kyle Farmer,12455378.93
8,-3.289394,1.08344,1.449926,-0.112171,-1.961679,-0.435606,-2.710109,-2.170613,-2.211308,-0.368705,...,1.122918,1.782749,-1.282226,-0.459668,2.10696,-1.502324,1.96067,-0.766186,Austin Slater,21681439.47
9,-3.93854,-1.497,2.552692,-1.865079,-4.051675,-4.188735,-3.408303,-4.026036,-2.002832,0.501356,...,0.554768,0.405866,2.290862,-3.000109,-0.768503,-0.542468,2.176171,-1.500109,Austin Hedges,13532779.36
