In [51]:
#import standard data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV

In [107]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides library diagnostics (for example
# pandas chained-assignment or scikit-learn convergence warnings) — consider
# filtering only the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [109]:
# Load the player table from a CSV located in the current working directory.
# NOTE(review): presumably one row per player and season — TODO confirm.
df_players = pd.read_csv('df_players_pos.csv')

In [110]:
# Keep only rows with at least one plate appearance. Rows whose total_pa is
# NaN are dropped as well, because NaN > 0 evaluates to False.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not write into a filtered slice of df_players.
df_players_batting = df_players[df_players['total_pa'] > 0].copy()

In [113]:
# Derive two columns:
#   age          - season year minus birth year
#   stolen_bases - steals of second plus steals of third
# assign() returns a new frame instead of setting columns on a filtered slice,
# which avoids the chained-assignment pattern.
df_players_batting = df_players_batting.assign(
    age=df_players_batting['year'] - df_players_batting['birthYear'],
    stolen_bases=(
        df_players_batting['batter_stolen_base_2b']
        + df_players_batting['batter_stolen_base_3b']
    ),
)

In [114]:
# Columns kept for the modelling table; df_players_batting is later restricted
# to exactly this list.
current_batter_features = [
    'player_mlb_id',  # key used to join a player's seasons together
    'year',  # season; used to split the table into per-year frames
    'age',  # derived above as year - birthYear
    'years_after_debut',
    'bats',
    'primary_position',
    'stolen_bases',  # derived above from the 2b and 3b stolen-base columns
    'avg',
    'obp',
    'ops',
    'iso',
    'k_rate_batter',
    'bb_rate_batter',
    'batter_avg_exp_ba',
    'total_pa'  # plate appearances; the final-season value is the prediction target
]

In [117]:
# Replace +inf with 1 everywhere in the frame. np.inf does not match -inf, so
# negative infinities (if any) are left as they are.
# The result is reassigned rather than using inplace=True, which avoids
# mutating a filtered frame in place and keeps the cell chain-friendly.
# NOTE(review): presumably the infinities come from zero-denominator rate
# stats — confirm that 1 is the intended replacement value.
df_players_batting = df_players_batting.replace(np.inf, 1)


In [119]:
# Restrict the table to the modelling columns. .copy() yields an independent
# frame so the fill operations in the following cells do not assign into a
# column slice of the wider table.
df_players_batting = df_players_batting[current_batter_features].copy()

In [121]:
# Players with no fielding position recorded are likely designated hitters,
# so a missing primary_position is labelled 'DH'.
# fillna with a {column: value} mapping returns a new frame, so nothing is
# assigned into a column of a sliced frame.
df_players_batting = df_players_batting.fillna({'primary_position': 'DH'})

In [123]:
# Stat columns where a missing value is filled with 0.
cols_for_0 = ['avg','obp','ops','iso','k_rate_batter','bb_rate_batter','batter_avg_exp_ba']
# A {column: 0} mapping limits the fill to these columns and returns a new
# frame instead of assigning into a multi-column slice.
df_players_batting = df_players_batting.fillna({stat: 0 for stat in cols_for_0})

In [125]:
# Sanity check: count the remaining NaN values per column after the fills above.
df_players_batting.isna().sum()

player_mlb_id        0
year                 0
age                  0
years_after_debut    0
bats                 0
primary_position     0
stolen_bases         0
avg                  0
obp                  0
ops                  0
iso                  0
k_rate_batter        0
bb_rate_batter       0
batter_avg_exp_ba    0
total_pa             0
dtype: int64

In [127]:
# Split the table by season. 2021 and 2022 supply the lagged features
# ("_2ya" / "_1ya"); 2023 supplies the target (total_pa) plus the player
# attributes kept un-suffixed.
season = df_players_batting['year']
target_cols = ['player_mlb_id','total_pa','age','years_after_debut','bats']

X_2021 = df_players_batting.loc[season == 2021]
X_2022 = df_players_batting.loc[season == 2022]
X_2023 = df_players_batting.loc[season == 2023, target_cols]

# Outer joins keep every player that has a row in any of the three seasons;
# seasons without a row show up as NaN.
X_all = pd.merge(
    X_2021,
    X_2022,
    on='player_mlb_id',
    how='outer',
    suffixes=("_2ya","_1ya"),
)
X_all = X_all.drop(columns=['year_2ya','year_1ya'])

X_all = pd.merge(X_all, X_2023, on='player_mlb_id', how='outer')
# The id was only needed as a join key; it is not a model feature.
X_all = X_all.drop(columns='player_mlb_id')

In [129]:
# Batting hand: take the 2023 value, fall back to the 2022 value, then 2021.
bats_filled = X_all['bats'].fillna(X_all['bats_1ya'])
bats_filled = bats_filled.fillna(X_all['bats_2ya'])
X_all['bats'] = bats_filled


In [131]:
# Where the 2023 value is missing, project it forward from an earlier season:
# last year's value + 1, otherwise the value from two years ago + 2.
for stat in ('age', 'years_after_debut'):
    from_last_year = X_all[f'{stat}_1ya'] + 1
    from_two_years_ago = X_all[f'{stat}_2ya'] + 2
    X_all[stat] = X_all[stat].fillna(from_last_year).fillna(from_two_years_ago)

# The lagged copies have served their purpose as fallbacks; drop them.
X_all = X_all.drop(
    columns=[
        'age_1ya',
        'age_2ya',
        'years_after_debut_1ya',
        'years_after_debut_2ya',
        'bats_1ya',
        'bats_2ya',
    ]
)

In [133]:
# Fill every remaining NaN in the table with 0. After the outer merges this
# covers the stats of seasons a player has no row for, the target total_pa of
# players without a 2023 row, and also the string-valued primary_position_*
# columns, which end up holding the integer 0 for those players.
# NOTE(review): the original note here read "make sure to adjust for age";
# nothing in this cell adjusts for age — treat it as an open TODO.
X_all = X_all.fillna(0)

In [135]:
# Feature matrix: everything except the 2023 plate-appearance count.
X = X_all.drop(columns=['total_pa'])
# Target vector: 2023 plate appearances.
y = X_all.loc[:, 'total_pa']

In [137]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)

In [139]:
# Scale features
# Only numeric columns are standardised; the string-valued columns are left
# for the one-hot encoding step.
numeric_cols = X_train.select_dtypes(include=np.number).columns

# Work on explicit copies so the scaled values are written into frames this
# cell owns, not into selections taken from X by train_test_split.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same mean/std to
# the test rows so no test-set statistics leak into the transform.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [141]:
# One-hot encode the categorical columns. The training frame decides which
# indicator columns exist and which level of each feature is the dropped
# baseline.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test frame WITHOUT drop_first and then project it onto the
# training columns. Calling get_dummies(..., drop_first=True) separately on
# each frame can drop a different baseline level when the test split lacks a
# level that the training split has, which silently mis-encodes test rows.
# Reindexing guarantees identical columns in identical order: levels unseen in
# training are discarded, and indicators absent from the test split are 0.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)


In [143]:
# Sanity check: count NaN values per column of the encoded training matrix.
X_train.isna().sum()

stolen_bases_2ya           0
avg_2ya                    0
obp_2ya                    0
ops_2ya                    0
iso_2ya                    0
k_rate_batter_2ya          0
bb_rate_batter_2ya         0
batter_avg_exp_ba_2ya      0
total_pa_2ya               0
stolen_bases_1ya           0
avg_1ya                    0
obp_1ya                    0
ops_1ya                    0
iso_1ya                    0
k_rate_batter_1ya          0
bb_rate_batter_1ya         0
batter_avg_exp_ba_1ya      0
total_pa_1ya               0
age                        0
years_after_debut          0
primary_position_2ya_1b    0
primary_position_2ya_2b    0
primary_position_2ya_3b    0
primary_position_2ya_DH    0
primary_position_2ya_c     0
primary_position_2ya_cf    0
primary_position_2ya_lf    0
primary_position_2ya_p     0
primary_position_2ya_rf    0
primary_position_2ya_ss    0
primary_position_1ya_1b    0
primary_position_1ya_2b    0
primary_position_1ya_3b    0
primary_position_1ya_DH    0
primary_positi

In [145]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge

# Give the test matrix exactly the training columns: join='left' keeps
# X_train's columns, adds any that X_test lacks (filled with 0) and drops
# columns that exist only in X_test.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Fit both models up front; fit() returns the estimator itself.
lr = LinearRegression().fit(X_train, y_train)
ridgeReg = Ridge(alpha=10).fit(X_train, y_train)

# Actual values
actual = y_test

# score() is R^2 for these regressors.
train_score_lr = lr.score(X_train, y_train)
test_score_lr = lr.score(X_test, y_test)
train_score_ridge = ridgeReg.score(X_train, y_train)
test_score_ridge = ridgeReg.score(X_test, y_test)

# Held-out predictions and their root-mean-squared error.
lr_preds = lr.predict(X_test)
ridge_preds = ridgeReg.predict(X_test)
rmse_lr = np.sqrt(mean_squared_error(y_test, lr_preds))
rmse_ridge = np.sqrt(mean_squared_error(y_test, ridge_preds))

# Report: linear regression scores, ridge scores, then both RMSE values.
print("The train score for lr model is {}".format(train_score_lr))
print("The test score for lr model is {}".format(test_score_lr))

print("\nRidge Model............................................\n")
print("The train score for ridge model is {}".format(train_score_ridge))
print("The test score for ridge model is {}".format(test_score_ridge))

print("\nRMSE for Linear Regression: {}".format(rmse_lr))
print("RMSE for Ridge Regression: {}".format(rmse_ridge))

The train score for lr model is 0.690928668563695
The test score for lr model is 0.6172525551816038

Ridge Model............................................

The train score for ridge model is 0.6851706054089861
The test score for ridge model is 0.6214803487031546

RMSE for Linear Regression: 134.4973532057988
RMSE for Ridge Regression: 133.75246773407426


In [147]:
# Summary statistics of the target, for putting the RMSE values into context.
X_all['total_pa'].describe()

count    1196.000000
mean      152.811037
std       212.557866
min         0.000000
25%         0.000000
50%        16.000000
75%       286.250000
max       753.000000
Name: total_pa, dtype: float64

In [149]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

# Candidate regularisation strengths: 13 log-spaced values from 1e-6 to 1e6.
param_grid = {'alpha': np.logspace(-6, 6, 13)}

# Base estimator handed to the search.
lasso = Lasso()

# 5-fold cross-validated search, ranking candidates by (negated) RMSE.
# fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
).fit(X_train, y_train)

# Winning alpha
best_alpha = grid_search.best_params_['alpha']
print(f"Best alpha from GridSearchCV: {best_alpha}")

# Best estimator found by the search
best_lasso_model = grid_search.best_estimator_

# Predictions on both splits, then RMSE for each.
y_train_pred = best_lasso_model.predict(X_train)
y_test_pred = best_lasso_model.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"RMSE (Train) for best alpha: {rmse_train}")
print(f"RMSE (Test) for best alpha: {rmse_test}")

Best alpha from GridSearchCV: 1.0
RMSE (Train) for best alpha: 119.37949091179561
RMSE (Test) for best alpha: 133.91821550808936


In [150]:
# R^2 of the selected Lasso model on the training and test splits.
r2_train, r2_test = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)

print(r2_train, r2_test, sep='\n')

0.6771620198510533
0.6205416346137107


In [153]:
# Coefficients of the selected Lasso model, used here as feature importances.
feature_importance = best_lasso_model.coef_

import pandas as pd

# One row per model column, ranked by the magnitude of its coefficient.
importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
    .assign(Abs_Importance=lambda frame: frame['Importance'].abs())
    .sort_values(by='Abs_Importance', ascending=False)
)

print("\nFeature Importances:")
print(importance_df[['Feature', 'Importance']])


Feature Importances:
                    Feature  Importance
17             total_pa_1ya  151.818405
30  primary_position_1ya_1b  -53.311356
18                      age  -32.978477
27   primary_position_2ya_p  -19.774583
8              total_pa_2ya   15.813065
14        k_rate_batter_1ya  -15.392034
31  primary_position_1ya_2b  -15.260779
13                  iso_1ya   15.131035
5         k_rate_batter_2ya  -14.976076
21  primary_position_2ya_2b   -9.299564
7     batter_avg_exp_ba_2ya   -8.831076
40                   bats_L    8.305268
2                   obp_2ya   -6.823384
34   primary_position_1ya_c   -6.758282
41                   bats_R   -2.686728
6        bb_rate_batter_2ya    2.191959
1                   avg_2ya    1.859122
0          stolen_bases_2ya   -1.289722
15       bb_rate_batter_1ya    0.960434
22  primary_position_2ya_3b    0.894498
28  primary_position_2ya_rf    0.699935
39  primary_position_1ya_ss    0.055284
12                  ops_1ya    0.000000
3                 

In [155]:
# Side-by-side table of the true test targets and the Lasso predictions,
# indexed like y_test.
results_df = y_test.to_frame('Actual').assign(Predicted=y_test_pred)

# Show up to the first 50 rows.
print(results_df.head(50))

      Actual   Predicted
694    187.0  180.369434
1133   180.0  397.304889
1131     0.0    0.566161
567    589.0  446.657674
976    690.0  567.767036
357      0.0  -46.103296
913    345.0  180.866052
1153   255.0  145.396189
969      0.0  107.272084
839     46.0  131.446844
1070    65.0  134.731814
983    277.0  115.295760
0       60.0  219.133575
1058    71.0  116.338243
398    141.0  115.295760
384    524.0  466.625869
113     90.0  157.066332
114    250.0  410.680919
424    169.0   93.755861
522    610.0  551.175498
139    337.0  282.384073
326      4.0   91.069133
741    292.0  299.863654
862     96.0  248.559545
505     40.0  115.295760
836    123.0  250.525325
493      0.0   66.479574
52     330.0  364.731038
207      0.0   51.523409
479    717.0  576.756427
872    401.0  223.854555
833    732.0  473.802109
171      0.0   40.392729
244      0.0  -12.425725
416    617.0  501.711323
869      0.0   33.129505
811    355.0  262.585587
831     41.0   91.069133
958    314.0  339.119747


In [157]:
# Spot-check one row of the merged table (index label 1130): lagged stats,
# target and player attributes.
X_all.loc[1130]

primary_position_2ya           cf
stolen_bases_2ya              0.0
avg_2ya                  0.280528
obp_2ya                  0.331288
ops_2ya                  0.684424
iso_2ya                  0.072607
k_rate_batter_2ya         0.19802
bb_rate_batter_2ya       0.069307
batter_avg_exp_ba_2ya    0.310109
total_pa_2ya                331.0
primary_position_1ya           cf
stolen_bases_1ya              0.0
avg_1ya                  0.297082
obp_1ya                  0.348894
ops_1ya                  0.728205
iso_1ya                  0.082228
k_rate_batter_1ya        0.153846
bb_rate_batter_1ya       0.068966
batter_avg_exp_ba_1ya    0.326715
total_pa_1ya                409.0
total_pa                     80.0
age                          29.0
years_after_debut             4.0
bats                            R
Name: 1130, dtype: object