In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [22]:
master = pd.read_csv("master.csv")
master = master.dropna(subset =['AAV'])

X = master.drop(columns = ['AAV', 'Season', 'fullName', 'Age 7/1/21', 'Age 7/1/22', 'Age 7/1/23', 'Age 7/1/24',
                          'Details', 'Baseball Ops      head / club GM', 'Club Owner', 'Player Agent', 'Option',
                          'Term', 'Guarantee', 'Years', 'New Club', 'Old    Club', 'Qual    Offer', 'nameFirst',
                          'nameLast', 'MLBAMID', 'PlayerId', 'NameASCII', 'Team', "Pos'n", 'XBR', 'Age'])

y = master['AAV']

X.info()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

<class 'pandas.core.frame.DataFrame'>
Index: 584 entries, 0 to 2166
Data columns (total 42 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PA        584 non-null    int64  
 1   BB%       584 non-null    float64
 2   K%        584 non-null    float64
 3   BB/K      584 non-null    float64
 4   AVG       584 non-null    float64
 5   OBP       584 non-null    float64
 6   SLG       584 non-null    float64
 7   OPS       584 non-null    float64
 8   ISO       584 non-null    float64
 9   Spd       584 non-null    float64
 10  BABIP     584 non-null    float64
 11  UBR       584 non-null    float64
 12  wGDP      584 non-null    float64
 13  wSB       584 non-null    float64
 14  wRC       584 non-null    float64
 15  wRAA      584 non-null    float64
 16  wOBA      584 non-null    float64
 17  wRC+      584 non-null    float64
 18  Events    584 non-null    int64  
 19  EV        584 non-null    float64
 20  maxEV     584 non-null    float64
 2

In [23]:
numerical_cols = X.columns.values

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numerical_cols)
    ]
)

In [93]:
# Linear Regression
LR_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('regressor', LinearRegression())
])

# Decision Tree Regression
DTR_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('regressor', DecisionTreeRegressor(criterion = 'absolute_error'))
])

# SVR
SVR_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('regressor', SVR())
])

# Random Forest
RF_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('random forest', RandomForestRegressor())
])



In [94]:
LR_pipeline.fit(X_train, y_train)

DTR_pipeline.fit(X_train, y_train)

SVR_pipeline.fit(X_train, y_train)

RF_pipeline.fit(X_train, y_train)

In [95]:
LR_FA_pred = LR_pipeline.predict(X_test)

DTR_FA_pred = DTR_pipeline.predict(X_test)

SVR_FA_pred = SVR_pipeline.predict(X_test)

RF_FA_pred = RF_pipeline.predict(X_test)

In [97]:
LR_MAE = mean_absolute_error(y_test, LR_FA_pred)

DTR_MAE = mean_absolute_error(y_test, DTR_FA_pred)

SVR_MAE = mean_absolute_error(y_test, SVR_FA_pred)

RF_MAE = mean_absolute_error(y_test, RF_FA_pred)

print('Linear Regression -- Mean absolute error:', LR_MAE)
print('Decision Tree Regression -- Mean absolute error:', DTR_MAE)
print('Support Vector Regression -- Mean absolute error:', SVR_MAE)
print('Random Forest Regression -- Mean absolute error:', RF_MAE)

Linear Regression -- Mean absolute error: 6118864.6721081035
Decision Tree Regression -- Mean absolute error: 8166470.495726496
Support Vector Regression -- Mean absolute error: 7200103.571179798
Random Forest Regression -- Mean absolute error: 5838804.17119658


In [36]:
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'criterion': ['squared_error', 'absolute_error'],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestRegressor(), rf_param_grid, scoring = 'neg_mean_absolute_error')
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'criterion': 'absolute_error',
 'max_depth': 5,
 'min_samples_split': 10,
 'n_estimators': 300}

In [61]:
dtr_param_grid = {
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 20],
    'criterion': ['squared_error', 'absolute_error'],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(DecisionTreeRegressor(), dtr_param_grid, scoring = 'neg_mean_absolute_error')
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'criterion': 'absolute_error',
 'max_depth': 5,
 'min_samples_split': 5,
 'splitter': 'random'}

In [66]:
svr_param_grid = {
    'kernel': ['poly', 'rbf'],
    'C': [1, 2, 3, 4, 5, 6, 10]
}

grid_search = GridSearchCV(SVR(), svr_param_grid, scoring = 'neg_mean_absolute_error')
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'C': 10, 'kernel': 'poly'}

In [69]:
RF_pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('random forest', RandomForestRegressor(n_estimators = 300, max_depth = 5, min_samples_split = 10, criterion = 'absolute_error'))
])

DTR_pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('random forest', DecisionTreeRegressor(splitter = 'random', max_depth = 5, min_samples_split = 5, criterion = 'absolute_error'))
])

SVR_pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', SVR(kernel = 'poly', C = 10))
])


In [70]:
RF_pipeline2.fit(X_train, y_train)
RF_FA_pred2 = RF_pipeline2.predict(X_test)
RF_MAE2 = mean_absolute_error(y_test, RF_FA_pred2)
print('Random Forest Regression (tuned) -- Mean absolute error:', RF_MAE2)

DTR_pipeline2.fit(X_train, y_train)
DTR_FA_pred2 = DTR_pipeline2.predict(X_test)
DTR_MAE2 = mean_absolute_error(y_test, DTR_FA_pred2)
print('Decision Tree Regression (tuned) -- Mean absolute error:', DTR_MAE2)

SVR_pipeline2.fit(X_train, y_train)
SVR_FA_pred2 = SVR_pipeline2.predict(X_test)
SVR_MAE2 = mean_absolute_error(y_test, SVR_FA_pred2)
print('Support Vector Regression (tuned) -- Mean absolute error:', SVR_MAE2)

Random Forest Regression (tuned) -- Mean absolute error: 5985847.365028491
Decision Tree Regression (tuned) -- Mean absolute error: 6600964.564102564
Support Vector Regression (tuned) -- Mean absolute error: 7199783.322270055


In [100]:
LR = LinearRegression()
DTR = DecisionTreeRegressor(splitter = 'random', max_depth = 5, min_samples_split = 5, criterion = 'absolute_error')
SVR = SVR(kernel = 'poly', C = 10)
RF = RandomForestRegressor(n_estimators = 300, max_depth = 5, min_samples_split = 10, criterion = 'absolute_error')



In [101]:
ensemble = StackingRegressor(
    estimators=[
        ('LR', LR),
        ('DTR', DTR),
        ('SVR', SVR),
        ('RF', RF)
    ],
    final_estimator= RF
)

ensemble_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('ensemble', ensemble)

])

ensemble_pipeline.fit(X_train, y_train)

ensemble_pred = ensemble_pipeline.predict(X_test)

ensemble_MAE = mean_absolute_error(y_test, ensemble_pred)


In [102]:
ensemble2 = StackingRegressor(
    estimators=[
        ('LR', LR),
        ('DTR', DTR),
        ('SVR', SVR),
    ],
    final_estimator= LR
)

ensemble2_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('ensemble', ensemble2)

])

ensemble2_pipeline.fit(X_train, y_train)

ensemble2_pred = ensemble2_pipeline.predict(X_test)

ensemble2_MAE = mean_absolute_error(y_test, ensemble2_pred)


In [103]:
ensemble3 = StackingRegressor(
    estimators=[
        ('LR', LR),
        ('RF', RF),
        ('SVR', SVR),
    ],
    final_estimator= RF
)

ensemble3_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('ensemble', ensemble3)

])

ensemble3_pipeline.fit(X_train, y_train)

ensemble3_pred = ensemble3_pipeline.predict(X_test)

ensemble3_MAE = mean_absolute_error(y_test, ensemble3_pred)


In [104]:
ensemble4 = StackingRegressor(
    estimators=[
        ('DTR', LR),
        ('RF', RF),
        ('SVR', SVR),
    ],
    final_estimator= RF
)

ensemble4_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('ensemble', ensemble4)

])

ensemble4_pipeline.fit(X_train, y_train)

ensemble4_pred = ensemble4_pipeline.predict(X_test)

ensemble4_MAE = mean_absolute_error(y_test, ensemble4_pred)


In [105]:
print('Ensemble 1 (LR, DTR, SVR, RF) -- Mean absolute error:', ensemble_MAE)
print('Ensemble 2 (LR, DTR, SVR) -- Mean absolute error:', ensemble2_MAE)
print('Ensemble 3 (LR, SVR, RF) -- Mean absolute error:', ensemble3_MAE)
print('Ensemble 4 (DTR, SVR, RF) -- Mean absolute error:', ensemble4_MAE)

Ensemble 1 (LR, DTR, SVR, RF) -- Mean absolute error: 5614808.314301996
Ensemble 2 (LR, DTR, SVR) -- Mean absolute error: 6083534.819509822
Ensemble 3 (LR, SVR, RF) -- Mean absolute error: 5607273.757991455
Ensemble 4 (DTR, SVR, RF) -- Mean absolute error: 5786278.512022791


In [113]:
models = ['Linear Regression', 'Decision Tree', 'Support Vector', 'Random Forest', 
          'Ensemble (LR, DTR, SVR, RF)', 'Ensemble (LR, DTR, SVR)', 'Ensemble (LR, SVR, RF)', 'Ensemble (DTR, SVR, RF)']

MAEs = [LR_MAE, DTR_MAE, SVR_MAE, RF_MAE, ensemble_MAE, ensemble2_MAE, ensemble3_MAE, ensemble4_MAE]

MAE_df = pd.DataFrame({
    "Model": models,
    "Mean Absolute Error": MAEs
})

MAE_df

Unnamed: 0,Model,Mean Absolute Error
0,Linear Regression,6118865.0
1,Decision Tree,8166470.0
2,Support Vector,7200104.0
3,Random Forest,5838804.0
4,"Ensemble (LR, DTR, SVR, RF)",5614808.0
5,"Ensemble (LR, DTR, SVR)",6083535.0
6,"Ensemble (LR, SVR, RF)",5607274.0
7,"Ensemble (DTR, SVR, RF)",5786279.0


In [112]:
new_data = pd.read_csv("ten_players.csv")

PA = new_data['PA']

names = new_data['Name']

new_data_scaled = preprocessor.transform(new_data.drop(columns = ['Name']))

new_df = pd.DataFrame(new_data_scaled, columns = new_data.drop(columns = ['Name']).columns)

prediction_ensemble1 = ensemble_pipeline.predict(new_df)
prediction_ensemble2 = ensemble2_pipeline.predict(new_df)
prediction_ensemble3 = ensemble3_pipeline.predict(new_df)
prediction_ensemble4 = ensemble4_pipeline.predict(new_df)

prediction_LR = LR_pipeline.predict(new_df)
prediction_DTR = DTR_pipeline2.predict(new_df)
prediction_SVR = SVR_pipeline2.predict(new_df)
prediction_RF = RF_pipeline2.predict(new_df)


new_df['Player'] = names.values
new_df['PA_unscaled'] = PA.values


new_df['Ensemble 1 Prediction'] = prediction_ensemble1
new_df['Ensemble 2 Prediction'] = prediction_ensemble2
new_df['Ensemble 3 Prediction'] = prediction_ensemble3
new_df['Ensemble 4 Prediction'] = prediction_ensemble4
new_df['Linear Regression Prediction'] = prediction_LR
new_df['Decision Tree Prediction'] = prediction_DTR
new_df['Support Vector Prediction'] = prediction_SVR
new_df['Random Forest Preditction'] = prediction_RF


new_df

#new_data

Unnamed: 0,PA,BB%,K%,BB/K,AVG,OBP,SLG,OPS,Spd,BABIP,...,Player,PA_unscaled,Ensemble 1 Prediction,Ensemble 2 Prediction,Ensemble 3 Prediction,Ensemble 4 Prediction,Linear Regression Prediction,Decision Tree Prediction,Support Vector Prediction,Random Forest Preditction
0,1.592988,2.922524,-0.669508,3.279465,0.852458,2.557434,1.889329,2.311572,1.780034,0.17824,...,Juan Soto,713,17701760.0,-3.662127e+20,13020070.0,11809500.0,-4.730005e+20,40000000.0,10245140.0,28738480.0
1,-0.968491,1.076356,0.531401,0.33907,0.401134,1.75172,1.062556,1.41448,1.041233,-0.067953,...,Joc Pederson,449,17607200.0,-2.520402e+20,11377870.0,10629000.0,-3.255354e+20,23333333.0,9028092.0,18912240.0
2,1.350423,0.604435,0.847096,-0.165132,-0.446081,-0.109034,0.262606,0.15167,0.536918,0.26155,...,Willy Adames,688,6969694.0,-8.088403e+19,5544722.0,5876417.0,-1.044699e+20,8000000.0,8917879.0,10967270.0
3,-0.396039,1.124423,0.079547,0.709255,-0.35336,0.748522,-0.612534,-0.173303,-0.537622,0.035669,...,Jesse Winker,508,27608570.0,9.106312e+19,27999930.0,29628570.0,1.176172e+20,8000000.0,8939643.0,20510540.0
4,-2.016369,-0.57019,1.018276,-0.994846,-0.898954,-0.998546,-0.129283,-0.458916,0.309257,-0.993206,...,Travis d'Arnaud,341,10205990.0,-1.3033e+20,9639544.0,7787528.0,-1.683343e+20,8500000.0,8047142.0,20440220.0
5,-0.13407,-0.549754,0.158876,-0.660665,-1.036697,-0.919118,-1.209281,-1.217422,-0.887509,-1.528561,...,Ty France,535,24094720.0,6.284026e+18,26487290.0,29002730.0,8.116453e+18,8500000.0,7370276.0,19818610.0
6,-2.307446,-1.270023,-0.894786,-0.770011,0.490544,-0.712902,-1.062693,-1.035227,-1.498636,1.079937,...,Kevin Newman,311,16381010.0,1.728837e+20,18074610.0,18614080.0,2.232967e+20,8500000.0,8133914.0,10134490.0
7,-2.976923,-0.313282,-0.031339,-0.375026,-1.72695,-1.261204,-1.389278,-1.473593,-0.744786,0.021905,...,Kyle Farmer,242,12211310.0,-3.40269e+19,9991520.0,7995972.0,-4.394917e+19,8500000.0,7188676.0,21989640.0
8,-3.268,1.081003,1.413275,-0.135804,-1.897679,-0.431602,-2.724545,-2.144288,-2.2242,-0.385832,...,Austin Slater,212,27336620.0,9.293026e+18,27956260.0,29047790.0,1.200288e+19,8500000.0,7311856.0,21436850.0
9,-3.90837,-1.495477,2.479557,-1.780367,-3.896291,-4.007452,-3.420476,-3.957564,-2.016024,0.471512,...,Austin Hedges,146,25045860.0,1.490088e+20,24811520.0,28037860.0,1.924598e+20,8500000.0,-1515631.0,19564350.0
