In [641]:
#import standard data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [642]:
df_players = pd.read_csv('df_players.csv')

In [643]:
#make only for batting
df_players_batting = df_players[df_players['total_pa'] > 0]

In [644]:
#selected features
current_batter_features = [
    'player_mlb_id',
    'year',
    'age',
    'years_after_28',
    'bats',
    'primary_position',
    'ops',
    'k_rate_batter',
    'bb_rate_batter',
    'batter_avg_exp_ba',
    'babip_batter',
    'fly_balls_ratio_batter',
    'ground_balls_ratio_batter',
    'line_drives_ratio_batter',
    'popups_ratio_batter',
    'hr_fb_pct_batter',
    'avg_lineup_position',
    'batter_avg_xwoba',
    'batter_avg_woba',
    'total_pa'
]

df_players_batting = df_players_batting[current_batter_features]

In [645]:
df_players_batting.head()

Unnamed: 0,player_mlb_id,year,age,years_after_28,bats,primary_position,ops,k_rate_batter,bb_rate_batter,batter_avg_exp_ba,babip_batter,fly_balls_ratio_batter,ground_balls_ratio_batter,line_drives_ratio_batter,popups_ratio_batter,hr_fb_pct_batter,avg_lineup_position,batter_avg_xwoba,batter_avg_woba,total_pa
1,33438ca2fdf93ff73b2e1bcc5da3058a04cf1661,2021,26,0,R,p,0.666667,0.333333,0.0,0.2025,0.5,,0.666667,,,,10.5,0.186,0.3,3
4,80f3a94c7bf1ed6397017744dbb75ff931902e2d,2021,34,6,R,1b,0.802607,0.217656,0.08828,0.343302,0.292804,0.188455,0.348048,0.159593,0.039049,0.27027,3.368421,0.40555,0.365902,657
6,b4246bd7233f08e749626eb0b7647d295f2e37b7,2021,24,0,R,rf,0.965172,0.23743,0.131285,0.403417,0.310881,0.218954,0.22549,0.205882,0.058824,0.358209,1.231707,0.519829,0.418627,358
8,81272c934b63edda7b2c6ca9f268f5a594ce5131,2021,26,0,R,ss,0.817233,0.281588,0.101083,0.33943,0.33123,0.247485,0.255533,0.148893,0.036217,0.203252,4.678832,0.408804,0.366065,554
13,c296dc19142d27a8a0d4a738256f7423945d04b6,2021,33,5,L,1b,0.425214,0.230769,0.076923,0.184407,0.222222,0.222222,0.388889,0.083333,0.055556,0.0,9.090909,0.209815,0.224359,39


In [646]:
#deal with n/a values here and in functions

#set infinite ratios to 1
df_players_batting.replace(np.inf, 1, inplace=True)

In [647]:
#get leaguewide averages for stats
numeric_cols = df_players_batting.select_dtypes(include=np.number).columns.tolist()

stat_averages_league = df_players_batting.groupby('year')[numeric_cols].apply(
    lambda group: group.apply(
        lambda col: np.average(col.dropna(), weights=group.loc[col.dropna().index, 'total_pa']) 
        if col.name not in ['total_pa'] and group.loc[col.dropna().index, 'total_pa'].sum() > 0
        else col.mean(),  # Use col.mean() for total_pa column
        axis=0
    )
)

In [648]:
#get averages by position
numeric_cols = df_players_batting.select_dtypes(include=np.number).columns.tolist()
numeric_cols += ['primary_position']

df_batting_num_cols = df_players_batting[numeric_cols]

#get means
stat_averages_position = df_batting_num_cols.groupby(['year', 'primary_position']).apply(
    lambda group: group.apply(
        lambda col: np.average(col.dropna(), weights=group.loc[col.dropna().index, 'total_pa']) 
        if col.name != 'total_pa' and group.loc[col.dropna().index, 'total_pa'].sum() > 0
        else col.mean(),  # Use col.mean() for total_pa column
        axis=0
    )
).fillna(0).reset_index()

In [649]:
#replace stats for players with unknown positions with league averages
common_cols = stat_averages_league.columns.intersection(stat_averages_position.columns.drop('primary_position'))
for year in stat_averages_position['year'].unique():
    mask = (stat_averages_position['year'] == year) & (stat_averages_position['primary_position'] == 'Unknown')
    stat_averages_position.loc[mask, common_cols] = stat_averages_league.loc[
        stat_averages_league['year'] == year, common_cols
    ].values

In [650]:
#check for nas
df_players_batting.isna().sum()

player_mlb_id                  0
year                           0
age                            0
years_after_28                 0
bats                           0
primary_position               0
ops                           12
k_rate_batter                  0
bb_rate_batter                 0
batter_avg_exp_ba            105
babip_batter                 104
fly_balls_ratio_batter       308
ground_balls_ratio_batter    143
line_drives_ratio_batter     286
popups_ratio_batter          454
hr_fb_pct_batter             308
avg_lineup_position            0
batter_avg_xwoba             105
batter_avg_woba                4
total_pa                       0
dtype: int64

In [651]:
#these all make sense to set to 0
df_players_batting = df_players_batting.fillna(0)

In [652]:
#put all of player's stats into one row
df_2021 = df_players_batting[df_players_batting['year'] == 2021]
df_2022 = df_players_batting[df_players_batting['year'] == 2022]
df_2023 = df_players_batting[df_players_batting['year'] == 2023]
df_all = df_2021.merge(df_2022, on='player_mlb_id', how='outer', suffixes=("_2ya","_1ya")).drop(columns=['year_2ya','year_1ya'])
df_all = df_all.merge(df_2023, on='player_mlb_id',how='outer')

In [653]:
#remove pitchers due to universal DH adoption (except if they got over 200 PAs in a season (Shohei))
df_all = df_all[((df_all['primary_position_1ya'] != 'p') & (df_all['primary_position_2ya'] != 'p')) | (df_all['total_pa_1ya'] > 200) | (df_all['total_pa_2ya'] > 200)]

In [654]:
#check nulls
df_all.isna().sum()

player_mlb_id                      0
age_2ya                          268
years_after_28_2ya               268
bats_2ya                         268
primary_position_2ya             268
ops_2ya                          268
k_rate_batter_2ya                268
bb_rate_batter_2ya               268
batter_avg_exp_ba_2ya            268
babip_batter_2ya                 268
fly_balls_ratio_batter_2ya       268
ground_balls_ratio_batter_2ya    268
line_drives_ratio_batter_2ya     268
popups_ratio_batter_2ya          268
hr_fb_pct_batter_2ya             268
avg_lineup_position_2ya          268
batter_avg_xwoba_2ya             268
batter_avg_woba_2ya              268
total_pa_2ya                     268
age_1ya                          216
years_after_28_1ya               216
bats_1ya                         216
primary_position_1ya             216
ops_1ya                          216
k_rate_batter_1ya                216
bb_rate_batter_1ya               216
batter_avg_exp_ba_1ya            216
b

In [655]:
#fill the non-numerical nulls
df_all['age'] = df_all['age'].fillna(df_all['age_1ya']+1).fillna(df_all['age_2ya'] + 2)  # Fill missing age and adjust

df_all['years_after_28'] = df_all['years_after_28'].fillna(df_all['years_after_28_1ya']+1).fillna(df_all['years_after_28_2ya'] + 2)  # Fill missing years_after_28 and adjust

df_all['primary_position'] = df_all['primary_position'].fillna(df_all['primary_position_1ya']).fillna(df_all['primary_position_2ya']).fillna('Unknown')

df_all['bats'] = df_all['bats'].fillna(df_all['bats_1ya']).fillna(df_all['bats_2ya'])

df_all = df_all.drop(columns=['age_1ya','age_2ya','years_after_28_1ya','years_after_28_2ya','bats_1ya','bats_2ya','primary_position_2ya','primary_position_1ya'])

#everything else can be filled with 0
df_all = df_all.fillna(0)

In [656]:
df_all.columns

Index(['player_mlb_id', 'ops_2ya', 'k_rate_batter_2ya', 'bb_rate_batter_2ya',
       'batter_avg_exp_ba_2ya', 'babip_batter_2ya',
       'fly_balls_ratio_batter_2ya', 'ground_balls_ratio_batter_2ya',
       'line_drives_ratio_batter_2ya', 'popups_ratio_batter_2ya',
       'hr_fb_pct_batter_2ya', 'avg_lineup_position_2ya',
       'batter_avg_xwoba_2ya', 'batter_avg_woba_2ya', 'total_pa_2ya',
       'ops_1ya', 'k_rate_batter_1ya', 'bb_rate_batter_1ya',
       'batter_avg_exp_ba_1ya', 'babip_batter_1ya',
       'fly_balls_ratio_batter_1ya', 'ground_balls_ratio_batter_1ya',
       'line_drives_ratio_batter_1ya', 'popups_ratio_batter_1ya',
       'hr_fb_pct_batter_1ya', 'avg_lineup_position_1ya',
       'batter_avg_xwoba_1ya', 'batter_avg_woba_1ya', 'total_pa_1ya', 'year',
       'age', 'years_after_28', 'bats', 'primary_position', 'ops',
       'k_rate_batter', 'bb_rate_batter', 'batter_avg_exp_ba', 'babip_batter',
       'fly_balls_ratio_batter', 'ground_balls_ratio_batter',
       'line_

In [657]:
#get columns to be averaged
cols_0ya = ['ops', 'k_rate_batter', 
 'bb_rate_batter', 'batter_avg_exp_ba', 'babip_batter', 'batter_avg_xwoba', 'batter_avg_woba',
 'fly_balls_ratio_batter', 'ground_balls_ratio_batter', 
 'line_drives_ratio_batter', 'popups_ratio_batter', 
 'hr_fb_pct_batter', 'avg_lineup_position']


cols_1ya = ['ops_1ya', 'k_rate_batter_1ya', 
            'bb_rate_batter_1ya', 'batter_avg_exp_ba_1ya', 'babip_batter_1ya', 'batter_avg_xwoba_1ya', 'batter_avg_woba_1ya',
            'fly_balls_ratio_batter_1ya', 'ground_balls_ratio_batter_1ya', 
            'line_drives_ratio_batter_1ya', 'popups_ratio_batter_1ya', 
            'hr_fb_pct_batter_1ya', 'avg_lineup_position_1ya']

cols_2ya = ['ops_2ya', 'k_rate_batter_2ya', 
            'bb_rate_batter_2ya', 'batter_avg_exp_ba_2ya', 'babip_batter_2ya', 'batter_avg_xwoba_2ya', 'batter_avg_woba_2ya',
            'fly_balls_ratio_batter_2ya', 'ground_balls_ratio_batter_2ya', 
            'line_drives_ratio_batter_2ya', 'popups_ratio_batter_2ya', 
            'hr_fb_pct_batter_2ya', 'avg_lineup_position_2ya']

pa_0ya = 'total_pa'
pa_1ya = 'total_pa_1ya'
pa_2ya = 'total_pa_2ya'

# Calculate the weighted averages and replace the original columns
for col_0ya, col_1ya, col_2ya in zip(cols_0ya, cols_1ya, cols_2ya):
    new_col_name = col_0ya + '_'
    df_all[new_col_name] = (
        (5 * df_all[col_0ya] * df_all[pa_0ya] + 3 * df_all[col_1ya] * df_all[pa_1ya] + 2 * df_all[col_2ya] * df_all[pa_2ya]) /
        (5 * df_all[pa_0ya] + 3 * df_all[pa_1ya] + 2 * df_all[pa_2ya])
    )

# Drop the original _1ya and _2ya columns
df_all_weighed = df_all.drop(columns=cols_1ya + cols_2ya + cols_0ya)

#do the same for plate appearances
df_all_weighed['total_pa_'] = (5*df_all_weighed['total_pa'] + 3*df_all_weighed['total_pa_1ya'] + 2*df_all_weighed['total_pa_2ya']) / 10

df_all_weighed.drop(columns=['total_pa_1ya','total_pa_2ya','total_pa'],inplace=True)

In [658]:
df_all_weighed.head()

Unnamed: 0,player_mlb_id,year,age,years_after_28,bats,primary_position,ops_,k_rate_batter_,bb_rate_batter_,batter_avg_exp_ba_,babip_batter_,batter_avg_xwoba_,batter_avg_woba_,fly_balls_ratio_batter_,ground_balls_ratio_batter_,line_drives_ratio_batter_,popups_ratio_batter_,hr_fb_pct_batter_,avg_lineup_position_,total_pa_
0,0014c193005b425aaad55358686fb0dd1a4a0755,2023.0,25.0,0.0,R,lf,0.57395,0.294187,0.048636,0.229937,0.328573,0.242284,0.271485,0.12193,0.39964,0.103205,0.066655,0.029279,9.109847,84.3
1,003af1e4636109b822c9acfa703cb517c46d89fc,2023.0,28.0,0.0,R,lf,0.671603,0.168193,0.049126,0.314174,0.268205,0.340024,0.309119,0.181395,0.389899,0.191811,0.061879,0.163482,6.589376,120.1
2,0043ac96d4fde6fcfd5a841b8d902661e69a6009,2023.0,23.0,0.0,R,c,0.629208,0.258741,0.108392,0.342575,0.363905,0.34672,0.306144,0.077909,0.365878,0.150888,0.071992,0.0,7.783883,28.6
3,0084c15ee1d82fb5b793e1ff130f46651dd13e17,2023.0,24.0,0.0,R,lf,0.490906,0.322404,0.071038,0.296672,0.236534,0.350587,0.253552,0.120492,0.359251,0.105855,0.067564,0.117096,8.70765,36.6
4,00e99b6a19161174f9f6520a80783c70236d8403,2023.0,28.0,0.0,L,cf,0.788638,0.157009,0.08223,0.281089,0.290464,0.290408,0.356902,0.225806,0.336502,0.178758,0.082798,0.159902,3.74398,362.4


In [659]:
df_all_weighed.select_dtypes(include=np.number).corr()

Unnamed: 0,year,age,years_after_28,ops_,k_rate_batter_,bb_rate_batter_,batter_avg_exp_ba_,babip_batter_,batter_avg_xwoba_,batter_avg_woba_,fly_balls_ratio_batter_,ground_balls_ratio_batter_,line_drives_ratio_batter_,popups_ratio_batter_,hr_fb_pct_batter_,avg_lineup_position_,total_pa_
year,1.0,-0.296978,-0.338792,0.357484,-0.177269,0.097112,0.321373,0.266459,0.344055,0.347915,0.134194,-0.019503,0.179279,0.028358,0.242034,-0.329469,0.492418
age,-0.296978,1.0,0.9222,-0.02839,-0.087731,0.015867,-0.068385,-0.095028,-0.057431,-0.030609,0.061315,0.034649,-0.010398,0.092986,-0.047517,-0.039072,0.101182
years_after_28,-0.338792,0.9222,1.0,-0.04421,-0.08586,0.004485,-0.058959,-0.094714,-0.061582,-0.044787,0.033571,0.058282,-0.000391,0.060886,-0.058775,-0.025247,0.019336
ops_,0.357484,-0.02839,-0.04421,1.0,-0.486735,0.262288,0.683071,0.781627,0.67584,0.982589,0.250744,-0.018666,0.554933,0.116978,0.588744,-0.52075,0.502787
k_rate_batter_,-0.177269,-0.087731,-0.08586,-0.486735,1.0,-0.10623,-0.325069,-0.314164,-0.190959,-0.511824,-0.290461,-0.570406,-0.525398,-0.203197,-0.028232,0.328311,-0.336848
bb_rate_batter_,0.097112,0.015867,0.004485,0.262288,-0.10623,1.0,0.202809,0.073142,0.215753,0.334938,0.021457,-0.088067,-0.012604,0.129608,0.143736,-0.204395,0.147431
batter_avg_exp_ba_,0.321373,-0.068385,-0.058959,0.683071,-0.325069,0.202809,1.0,0.609183,0.935581,0.686374,0.094961,-0.031678,0.523043,-0.080365,0.451078,-0.427601,0.359989
babip_batter_,0.266459,-0.095028,-0.094714,0.781627,-0.314164,0.073142,0.609183,1.0,0.489247,0.770836,0.060697,0.011384,0.49325,-0.018814,0.234528,-0.302397,0.269947
batter_avg_xwoba_,0.344055,-0.057431,-0.061582,0.67584,-0.190959,0.215753,0.935581,0.489247,1.0,0.662534,0.193157,-0.167584,0.368483,-0.05562,0.612064,-0.466743,0.40107
batter_avg_woba_,0.347915,-0.030609,-0.044787,0.982589,-0.511824,0.334938,0.686374,0.770836,0.662534,1.0,0.21889,-0.010848,0.561906,0.142627,0.552541,-0.509049,0.482666


In [660]:
df_all_weighed.columns

Index(['player_mlb_id', 'year', 'age', 'years_after_28', 'bats',
       'primary_position', 'ops_', 'k_rate_batter_', 'bb_rate_batter_',
       'batter_avg_exp_ba_', 'babip_batter_', 'batter_avg_xwoba_',
       'batter_avg_woba_', 'fly_balls_ratio_batter_',
       'ground_balls_ratio_batter_', 'line_drives_ratio_batter_',
       'popups_ratio_batter_', 'hr_fb_pct_batter_', 'avg_lineup_position_',
       'total_pa_'],
      dtype='object')

In [661]:
#regress everyone's stats to the mean of their position group by 5 PA (to help with players who had very few plate appearances)
stats_to_regress = ['ops_', 'k_rate_batter_',
       'bb_rate_batter_', 'batter_avg_exp_ba_', 'babip_batter_',
       'batter_avg_xwoba_', 'batter_avg_woba_', 'fly_balls_ratio_batter_',
       'ground_balls_ratio_batter_', 'line_drives_ratio_batter_',
       'popups_ratio_batter_', 'hr_fb_pct_batter_']

    

In [662]:
stat_averages_position_2021_2023 = stat_averages_position.drop(columns='year').reset_index().groupby('primary_position').mean().reset_index()

In [663]:
df_all_weighed_regress = df_all_weighed.merge(stat_averages_position_2021_2023, on='primary_position', suffixes=('', '_incorrect'))

In [664]:
df_all_weighed_regress.isna().sum()

player_mlb_id                 0
year                          0
age                           0
years_after_28                0
bats                          0
primary_position              0
ops_                          0
k_rate_batter_                0
bb_rate_batter_               0
batter_avg_exp_ba_            0
babip_batter_                 0
batter_avg_xwoba_             0
batter_avg_woba_              0
fly_balls_ratio_batter_       0
ground_balls_ratio_batter_    0
line_drives_ratio_batter_     0
popups_ratio_batter_          0
hr_fb_pct_batter_             0
avg_lineup_position_          0
total_pa_                     0
index                         0
age_incorrect                 0
years_after_28_incorrect      0
ops                           0
k_rate_batter                 0
bb_rate_batter                0
batter_avg_exp_ba             0
babip_batter                  0
fly_balls_ratio_batter        0
ground_balls_ratio_batter     0
line_drives_ratio_batter      0
popups_r

In [665]:
df_all_weighed_regress.columns

Index(['player_mlb_id', 'year', 'age', 'years_after_28', 'bats',
       'primary_position', 'ops_', 'k_rate_batter_', 'bb_rate_batter_',
       'batter_avg_exp_ba_', 'babip_batter_', 'batter_avg_xwoba_',
       'batter_avg_woba_', 'fly_balls_ratio_batter_',
       'ground_balls_ratio_batter_', 'line_drives_ratio_batter_',
       'popups_ratio_batter_', 'hr_fb_pct_batter_', 'avg_lineup_position_',
       'total_pa_', 'index', 'age_incorrect', 'years_after_28_incorrect',
       'ops', 'k_rate_batter', 'bb_rate_batter', 'batter_avg_exp_ba',
       'babip_batter', 'fly_balls_ratio_batter', 'ground_balls_ratio_batter',
       'line_drives_ratio_batter', 'popups_ratio_batter', 'hr_fb_pct_batter',
       'avg_lineup_position', 'batter_avg_xwoba', 'batter_avg_woba',
       'total_pa'],
      dtype='object')

In [666]:
for stat in stats_to_regress:
    stat_base = stat.rstrip('_')
    stat_regressed = f"{stat}reg"

    df_all_weighed_regress[stat_regressed] = (
        df_all_weighed_regress[stat] * df_all_weighed_regress["total_pa_"] 
        + df_all_weighed_regress[stat_base] * 5
    ) / (df_all_weighed_regress["total_pa_"] + 5)

    df_all_weighed_regress.drop(columns=[stat, stat_base], inplace=True)

# Remove incorrect columns and the index column
df_all_weighed_regress.drop(columns=df_all_weighed_regress.filter(like="_incorrect").columns, inplace=True)
df_all_weighed_regress.drop(columns="index", inplace=True)


In [667]:
df_all_weighed_regress.to_csv('batter_features_final_model.csv')