In [187]:
#import standard data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [188]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV

In [189]:
import warnings
warnings.filterwarnings('ignore')

In [190]:
df_players = pd.read_csv('df_players_pos.csv')

In [191]:
#remove players with no pa
df_players_batting = df_players[df_players['total_pa'] > 0]

In [192]:
df_players_batting['age'] = df_players_batting['year'] - df_players_batting['birthYear']
df_players_batting['stolen_bases'] = df_players_batting['batter_stolen_base_2b'] + df_players_batting['batter_stolen_base_3b']

In [193]:
current_batter_features = [
    'player_mlb_id',
    'year',
    'age',
    'years_after_debut',
    'bats',
    'primary_position',
    'stolen_bases',
    'avg',
    'obp',
    'ops',
    'iso',
    'k_rate_batter',
    'bb_rate_batter',
    'batter_avg_exp_ba',
    'total_pa'
]

In [194]:
df_players_batting.replace(np.inf, 1, inplace=True)


In [195]:
df_players_batting = df_players_batting[current_batter_features]

In [196]:
#players with no fielding likely DH
df_players_batting['primary_position'] = df_players_batting['primary_position'].fillna('DH')

In [197]:
#fill with 0 for columns where it makes sense
cols_for_0 = ['avg','obp','ops','iso','k_rate_batter','bb_rate_batter','batter_avg_exp_ba']
df_players_batting[cols_for_0] = df_players_batting[cols_for_0].fillna(0)

In [198]:
#check for nulls
df_players_batting.isna().sum()

player_mlb_id        0
year                 0
age                  0
years_after_debut    0
bats                 0
primary_position     0
stolen_bases         0
avg                  0
obp                  0
ops                  0
iso                  0
k_rate_batter        0
bb_rate_batter       0
batter_avg_exp_ba    0
total_pa             0
dtype: int64

In [199]:
X_2021 = df_players_batting[df_players_batting['year'] == 2021]
X_2022 = df_players_batting[df_players_batting['year'] == 2022]
X_2023 = df_players_batting[df_players_batting['year'] == 2023][['player_mlb_id','total_pa','age','years_after_debut','bats']]
X_all = X_2021.merge(X_2022, on='player_mlb_id', how='outer', suffixes=("_2ya","_1ya")).drop(columns=['year_2ya','year_1ya'])
X_all = X_all.merge(X_2023, on='player_mlb_id',how='outer').drop(columns='player_mlb_id')

In [200]:
#fill in empty columns for batting hand
X_all['bats'] = X_all['bats'].fillna(X_all['bats_1ya']).fillna(X_all['bats_2ya'])


In [201]:
#fill in empty positions
X_all['primary_position_2ya'] = X_all['primary_position_2ya'].fillna(X_all['primary_position_1ya']).fillna('DH')
X_all['primary_position_1ya'] = X_all['primary_position_1ya'].fillna(X_all['primary_position_2ya'])

In [202]:
#fill in empty ages/years after debut for players
X_all['age'] = X_all['age'].fillna(X_all['age_1ya'] + 1).fillna(X_all['age_2ya'] + 2)
X_all['years_after_debut'] = X_all['years_after_debut'].fillna(X_all['years_after_debut_1ya'] + 1).fillna(X_all['years_after_debut_2ya'] + 2)

X_all = X_all.drop(columns=['age_1ya','age_2ya','years_after_debut_1ya','years_after_debut_2ya','bats_1ya','bats_2ya'])

In [203]:
#make remaining columns 0
X_all = X_all.fillna(0)

In [204]:
y = X_all['total_pa']
X = X_all.drop(columns='total_pa')

In [205]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)

In [206]:
# Scale features
numeric_cols = X_train.select_dtypes(include=np.number).columns

scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [207]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)


In [208]:
X_train.isna().sum()

stolen_bases_2ya           0
avg_2ya                    0
obp_2ya                    0
ops_2ya                    0
iso_2ya                    0
k_rate_batter_2ya          0
bb_rate_batter_2ya         0
batter_avg_exp_ba_2ya      0
total_pa_2ya               0
stolen_bases_1ya           0
avg_1ya                    0
obp_1ya                    0
ops_1ya                    0
iso_1ya                    0
k_rate_batter_1ya          0
bb_rate_batter_1ya         0
batter_avg_exp_ba_1ya      0
total_pa_1ya               0
age                        0
years_after_debut          0
primary_position_2ya_2b    0
primary_position_2ya_3b    0
primary_position_2ya_DH    0
primary_position_2ya_c     0
primary_position_2ya_cf    0
primary_position_2ya_lf    0
primary_position_2ya_p     0
primary_position_2ya_rf    0
primary_position_2ya_ss    0
primary_position_1ya_2b    0
primary_position_1ya_3b    0
primary_position_1ya_DH    0
primary_position_1ya_c     0
primary_position_1ya_cf    0
primary_positi

In [209]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge

# Ensure X_train and X_test have the same columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Linear Regression Model
lr = LinearRegression()

# Fit model
lr.fit(X_train, y_train)

# Predict
# prediction = lr.predict(X_test)

# Actual values
actual = y_test

# Train and test scores
train_score_lr = lr.score(X_train, y_train)
test_score_lr = lr.score(X_test, y_test)

print("The train score for lr model is {}".format(train_score_lr))
print("The test score for lr model is {}".format(test_score_lr))

# Ridge Regression Model
ridgeReg = Ridge(alpha=10)

ridgeReg.fit(X_train, y_train)

# Train and test scores for ridge regression
train_score_ridge = ridgeReg.score(X_train, y_train)
test_score_ridge = ridgeReg.score(X_test, y_test)

print("\nRidge Model............................................\n")
print("The train score for ridge model is {}".format(train_score_ridge))
print("The test score for ridge model is {}".format(test_score_ridge))

# Predictions
lr_preds = lr.predict(X_test)
ridge_preds = ridgeReg.predict(X_test)

# RMSE calculations
rmse_lr = np.sqrt(mean_squared_error(y_test, lr_preds))
rmse_ridge = np.sqrt(mean_squared_error(y_test, ridge_preds))

print("\nRMSE for Linear Regression: {}".format(rmse_lr))
print("RMSE for Ridge Regression: {}".format(rmse_ridge))

The train score for lr model is 0.6848337608871804
The test score for lr model is 0.6146397822277827

Ridge Model............................................

The train score for ridge model is 0.6819026452476264
The test score for ridge model is 0.6230827124549445

RMSE for Linear Regression: 134.95563630471614
RMSE for Ridge Regression: 133.46906447662775


In [210]:
X_all['total_pa'].describe()

count    1196.000000
mean      152.811037
std       212.557866
min         0.000000
25%         0.000000
50%        16.000000
75%       286.250000
max       753.000000
Name: total_pa, dtype: float64

In [211]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize Lasso model
lasso = Lasso()

# Define the hyperparameter grid to search over
param_grid = {'alpha': np.logspace(-6, 6, 13)}  # Search over a range of alpha values from 1e-6 to 1e6

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get the best alpha
best_alpha = grid_search.best_params_['alpha']
print(f"Best alpha from GridSearchCV: {best_alpha}")

# Get the best model
best_lasso_model = grid_search.best_estimator_

# Evaluate the model
y_train_pred = best_lasso_model.predict(X_train)
y_test_pred = best_lasso_model.predict(X_test)

# RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"RMSE (Train) for best alpha: {rmse_train}")
print(f"RMSE (Test) for best alpha: {rmse_test}")

Best alpha from GridSearchCV: 1.0
RMSE (Train) for best alpha: 119.68167929055457
RMSE (Test) for best alpha: 133.96876157298098


In [212]:
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print(r2_train)
print(r2_test)

0.6755255350500049
0.6202551351743416


In [213]:
# Feature importance: Get the coefficients
feature_importance = best_lasso_model.coef_

# Create a DataFrame to display feature importance with their corresponding names (if available)
import pandas as pd
importance_df = pd.DataFrame({
    'Feature': X_train.columns,  # Use your feature names here
    'Importance': feature_importance
})

# Sort by importance (absolute value)
importance_df['Abs_Importance'] = importance_df['Importance'].abs()
importance_df = importance_df.sort_values(by='Abs_Importance', ascending=False)

print("\nFeature Importances:")
print(importance_df[['Feature', 'Importance']])


Feature Importances:
                    Feature  Importance
17             total_pa_1ya  148.941669
18                      age  -33.190742
31  primary_position_1ya_DH   28.641245
21  primary_position_2ya_3b   15.956617
13                  iso_1ya   15.434339
37  primary_position_1ya_ss   15.414686
20  primary_position_2ya_2b  -14.864724
8              total_pa_2ya   13.512791
5         k_rate_batter_2ya  -12.661623
14        k_rate_batter_1ya  -12.314175
22  primary_position_2ya_DH    8.089812
7     batter_avg_exp_ba_2ya   -6.602766
26   primary_position_2ya_p   -6.442417
38                   bats_L    5.684901
33  primary_position_1ya_cf    4.634793
39                   bats_R   -4.404518
2                   obp_2ya   -4.014662
34  primary_position_1ya_lf    3.120468
0          stolen_bases_2ya   -1.557266
1                   avg_2ya    1.496035
15       bb_rate_batter_1ya    1.481400
6        bb_rate_batter_2ya    1.461775
12                  ops_1ya    1.303220
19        years_af

In [214]:
#issue: can't be negative. fix later

results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_test_pred
})

# Display the first few rows of the results
print(results_df.head(50))

      Actual   Predicted
694    187.0  170.967304
1133   180.0  377.270769
1131     0.0    3.603653
567    589.0  435.385693
976    690.0  549.477177
357      0.0  -43.405182
913    345.0  182.353180
1153   255.0  143.320083
969      0.0   98.103608
839     46.0  145.916909
1070    65.0  143.519436
983    277.0  129.661868
0       60.0  206.309226
1058    71.0   99.466666
398    141.0  129.661868
384    524.0  470.548225
113     90.0  176.127245
114    250.0  404.200506
424    169.0  109.683826
522    610.0  549.107906
139    337.0  302.981364
326      4.0  105.279308
741    292.0  296.614945
862     96.0  238.904787
505     40.0  129.661868
836    123.0  247.705515
493      0.0   59.646449
52     330.0  357.193856
207      0.0   47.180752
479    717.0  567.358601
872    401.0  229.820695
833    732.0  461.853371
171      0.0   45.282624
244      0.0  -11.326164
416    617.0  538.657000
869      0.0   33.882363
811    355.0  254.462644
831     41.0  105.279308
958    314.0  328.886900


In [215]:
np.max(y_test, axis=0)

732.0

Try a Random Forest Regression

In [217]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

warnings.filterwarnings('ignore')

In [None]:
X_init = X_all.drop(columns='total_pa')
y = X_all['total_pa']

label_encoder = LabelEncoder()
x_categorical = X_init.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = X_init.select_dtypes(exclude=['object']).values
x = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [257]:
regressor = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)

regressor.fit(x, y)

In [259]:
from sklearn.metrics import mean_squared_error, r2_score

oob_score = regressor.oob_score_
print(f'Out-of-Bag Score: {oob_score}')

predictions = regressor.predict(x)

mse = mean_squared_error(y, predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y, predictions)
print(f'R-squared: {r2}')

Out-of-Bag Score: 0.5119043001123835
Mean Squared Error: 4231.70776192979
R-squared: 0.9062600794475405


In [271]:
np.sqrt(mse)

65.05157770515477

In [273]:
#seems to work a lot better, maybe due to non-linear columns