In [421]:
#import standard data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [422]:
df_players = pd.read_csv('df_players.csv')

In [423]:
#make only for batting
df_players_pitching = df_players[df_players['total_bf'] > 0]

In [424]:
#selected features
current_pitcher_features = [
    'player_mlb_id',
    'age',
    'year',
    'primary_position',
    'years_after_28',
    'throws',
    'whip',
    'baa',
    'sp_pct',
    'k_rate_pitcher',
    'pitcher_avg_xwoba',
    'pitcher_avg_woba',
    'bb_rate_pitcher',
    'inplay_ratio_pitcher',
    'pitcher_avg_exp_ba',
    'babip_pitcher',
    'fly_balls_ratio_pitcher',
    'ground_balls_ratio_pitcher',
    'popups_ratio_pitcher',
    'hr_fb_pct_pitcher',
    'avg_bf_per_outing',
    'zone_chase_pct',
    'avg_fb_vel',
    'starter',
    'reliever',
    'both_starter_reliever',
    'total_bf'
    #regress stats based on 50 league average plate appearances
]

df_players_pitching = df_players_pitching[current_pitcher_features]

In [425]:
#remove position players who pitched
df_players_pitching = df_players_pitching[df_players_pitching['primary_position'] == 'p']

In [426]:
df_players_pitching.head()

Unnamed: 0,player_mlb_id,age,year,primary_position,years_after_28,throws,whip,baa,sp_pct,k_rate_pitcher,...,ground_balls_ratio_pitcher,popups_ratio_pitcher,hr_fb_pct_pitcher,avg_bf_per_outing,zone_chase_pct,avg_fb_vel,starter,reliever,both_starter_reliever,total_bf
0,0629cd765e77d52b8a43cb56621c82d949e50323,36,2021,p,8,L,1.8125,0.283951,0.0,0.123457,...,inf,inf,0.083333,3.950617,0.588629,87.645714,0,1,0,81
1,33438ca2fdf93ff73b2e1bcc5da3058a04cf1661,26,2021,p,0,R,1.823529,0.243902,22.222222,0.134146,...,7.666667,1.666667,0.368421,7.341463,0.5625,92.608173,0,1,0,82
3,3d65b1ac983ce90ff1f21f56b627883affc7dad1,24,2021,p,0,R,1.606061,0.217391,0.0,0.223602,...,inf,inf,0.153846,3.453416,0.574746,95.641275,0,1,0,161
5,3304ddb1ae3c28e484f383e6609fd35f1778ddf7,27,2021,p,0,R,1.258065,0.204545,0.0,0.204545,...,inf,inf,0.272727,2.931818,0.637931,92.895402,0,1,0,44
9,45be69e7dbc2d4b228aeb07ec08590cfdc6ff091,30,2021,p,2,R,1.548387,0.2,0.0,0.36,...,inf,inf,0.25,2.76,0.573394,93.7632,0,1,0,50


In [427]:
#deal with n/a values here and in functions

#set infinite ratios to 1
df_players_pitching.replace(np.inf, 1, inplace=True)

In [428]:
# Create the 'primary_position' column based on the column with the value of 1
df_players_pitching['primary_position'] = np.select(
    [df_players_pitching['starter'] == 1, 
     df_players_pitching['reliever'] == 1, 
     df_players_pitching['both_starter_reliever'] == 1],
    ['starter', 'reliever', 'both_starter_reliever'],
    default='unknown'  # This will handle the case where none of them are 1, if applicable
)

df_players_pitching = df_players_pitching.drop(columns=['starter','reliever','both_starter_reliever'])

In [429]:
#get leaguewide averages for stats
numeric_cols = df_players_pitching.select_dtypes(include=np.number).columns.tolist()

stat_averages_league = df_players_pitching.groupby('year')[numeric_cols].apply(
    lambda group: group.apply(
        lambda col: np.average(col.dropna(), weights=group.loc[col.dropna().index, 'total_bf']) 
        if col.name not in ['total_bf'] and group.loc[col.dropna().index, 'total_bf'].sum() > 0
        else col.mean(),  # Use col.mean() for total_bf column
        axis=0
    )
)

In [430]:
#get averages by position
numeric_cols = df_players_pitching.select_dtypes(include=np.number).columns.tolist()
numeric_cols += ['primary_position']

df_batting_num_cols = df_players_pitching[numeric_cols]

#get means
stat_averages_position = df_batting_num_cols.groupby(['year', 'primary_position']).apply(
    lambda group: group.apply(
        lambda col: np.average(col.dropna(), weights=group.loc[col.dropna().index, 'total_bf']) 
        if col.name != 'total_bf' and group.loc[col.dropna().index, 'total_bf'].sum() > 0
        else col.mean(),  # Use col.mean() for total_bf column
        axis=0
    )
).fillna(0).reset_index()

In [431]:
#replace stats for players with unknown positions with league averages
common_cols = stat_averages_league.columns.intersection(stat_averages_position.columns.drop('primary_position'))
for year in stat_averages_position['year'].unique():
    mask = (stat_averages_position['year'] == year) & (stat_averages_position['primary_position'] == 'Unknown')
    stat_averages_position.loc[mask, common_cols] = stat_averages_league.loc[
        stat_averages_league['year'] == year, common_cols
    ].values

In [432]:
#fill nas
df_players_pitching['avg_fb_vel'] = df_players_pitching['avg_fb_vel'].fillna((92.756616 + 92.970260) / 2)
df_players_pitching['primary_position'] = df_players_pitching['primary_position'].fillna('both_starter_reliever')
df_players_pitching = df_players_pitching.fillna(0)

In [433]:
#put all of player's stats into one row
df_2021 = df_players_pitching[df_players_pitching['year'] == 2021]
df_2022 = df_players_pitching[df_players_pitching['year'] == 2022]
df_2023 = df_players_pitching[df_players_pitching['year'] == 2023][['player_mlb_id','total_bf']]
df_all = df_2021.merge(df_2022, on='player_mlb_id', how='outer', suffixes=("_2ya","_1ya")).drop(columns=['year_2ya','year_1ya'])
df_all = df_all.merge(df_2023, on='player_mlb_id',how='outer').drop(columns='player_mlb_id')

In [434]:
#remove players who did not have a bf in 2021 or 2022
df_all = df_all[(df_all['total_bf_1ya'] > 0) | (df_all['total_bf_2ya'] > 0)]

In [435]:
#check nulls
df_all.isna().sum()

age_2ya                           180
primary_position_2ya              180
years_after_28_2ya                180
throws_2ya                        180
whip_2ya                          180
baa_2ya                           180
sp_pct_2ya                        180
k_rate_pitcher_2ya                180
pitcher_avg_xwoba_2ya             180
pitcher_avg_woba_2ya              180
bb_rate_pitcher_2ya               180
inplay_ratio_pitcher_2ya          180
pitcher_avg_exp_ba_2ya            180
babip_pitcher_2ya                 180
fly_balls_ratio_pitcher_2ya       180
ground_balls_ratio_pitcher_2ya    180
popups_ratio_pitcher_2ya          180
hr_fb_pct_pitcher_2ya             180
avg_bf_per_outing_2ya             180
zone_chase_pct_2ya                180
avg_fb_vel_2ya                    180
total_bf_2ya                      180
age_1ya                           175
primary_position_1ya              175
years_after_28_1ya                175
throws_1ya                        175
whip_1ya    

In [436]:
#fill the non-numerical nulls
df_all['age'] = df_all['age_1ya'].fillna(df_all['age_2ya'] + 1)  # Fill missing age and adjust

df_all['years_after_28'] = df_all['years_after_28_1ya'].fillna(df_all['years_after_28_2ya'] + 1)  # Fill missing years_after_28 and adjust

df_all['primary_position'] = df_all['primary_position_1ya'].fillna(df_all['primary_position_2ya']).fillna('both_starter_reliever')

df_all['throws'] = df_all['throws_1ya'].fillna(df_all['throws_2ya'])

df_all = df_all.drop(columns=['age_1ya','age_2ya','years_after_28_1ya','years_after_28_2ya','throws_1ya','throws_2ya','primary_position_2ya','primary_position_1ya'])

In [437]:
#fill remaining with 0, won't have an impact because remaining nulls are in years where total_bf is 0
df_all = df_all.fillna(0)

In [438]:
df_all.columns

Index(['whip_2ya', 'baa_2ya', 'sp_pct_2ya', 'k_rate_pitcher_2ya',
       'pitcher_avg_xwoba_2ya', 'pitcher_avg_woba_2ya', 'bb_rate_pitcher_2ya',
       'inplay_ratio_pitcher_2ya', 'pitcher_avg_exp_ba_2ya',
       'babip_pitcher_2ya', 'fly_balls_ratio_pitcher_2ya',
       'ground_balls_ratio_pitcher_2ya', 'popups_ratio_pitcher_2ya',
       'hr_fb_pct_pitcher_2ya', 'avg_bf_per_outing_2ya', 'zone_chase_pct_2ya',
       'avg_fb_vel_2ya', 'total_bf_2ya', 'whip_1ya', 'baa_1ya', 'sp_pct_1ya',
       'k_rate_pitcher_1ya', 'pitcher_avg_xwoba_1ya', 'pitcher_avg_woba_1ya',
       'bb_rate_pitcher_1ya', 'inplay_ratio_pitcher_1ya',
       'pitcher_avg_exp_ba_1ya', 'babip_pitcher_1ya',
       'fly_balls_ratio_pitcher_1ya', 'ground_balls_ratio_pitcher_1ya',
       'popups_ratio_pitcher_1ya', 'hr_fb_pct_pitcher_1ya',
       'avg_bf_per_outing_1ya', 'zone_chase_pct_1ya', 'avg_fb_vel_1ya',
       'total_bf_1ya', 'total_bf', 'age', 'years_after_28', 'primary_position',
       'throws'],
      dtype='obje

In [439]:
#get columns to be averaged
cols_1ya = ['whip_1ya', 'baa_1ya', 'sp_pct_1ya', 'k_rate_pitcher_1ya','pitcher_avg_xwoba_1ya','pitcher_avg_woba_1ya',
         'bb_rate_pitcher_1ya', 'inplay_ratio_pitcher_1ya',
         'pitcher_avg_exp_ba_1ya', 'babip_pitcher_1ya',
         'fly_balls_ratio_pitcher_1ya', 'ground_balls_ratio_pitcher_1ya',
         'popups_ratio_pitcher_1ya', 'hr_fb_pct_pitcher_1ya',
         'avg_bf_per_outing_1ya', 'zone_chase_pct_1ya', 'avg_fb_vel_1ya']

cols_2ya = ['whip_2ya', 'baa_2ya', 'sp_pct_2ya', 'k_rate_pitcher_2ya','pitcher_avg_xwoba_2ya','pitcher_avg_woba_2ya',
           'bb_rate_pitcher_2ya', 'inplay_ratio_pitcher_2ya',
           'pitcher_avg_exp_ba_2ya', 'babip_pitcher_2ya',
           'fly_balls_ratio_pitcher_2ya', 'ground_balls_ratio_pitcher_2ya',
           'popups_ratio_pitcher_2ya', 'hr_fb_pct_pitcher_2ya',
           'avg_bf_per_outing_2ya', 'zone_chase_pct_2ya', 'avg_fb_vel_2ya']

pa_1ya = 'total_bf_1ya'
pa_2ya = 'total_bf_2ya'

# Calculate the weighted averages and replace the original columns
for col_1ya, col_2ya in zip(cols_1ya, cols_2ya):
    weighted_avg_col = col_1ya[:-3]  # Get the base column name without _1ya or _2ya
    df_all[weighted_avg_col] = (
        (5 * df_all[col_1ya] * df_all[pa_1ya] + 3 * df_all[col_2ya] * df_all[pa_2ya]) /
        (5 * df_all[pa_1ya] + 3 * df_all[pa_2ya])
    )

# Drop the original _1ya and _2ya columns if you no longer need them
df_all_weighed = df_all.drop(columns=cols_1ya + cols_2ya)

#do the same for plate appearances
df_all_weighed['total_bf_'] = (5*df_all_weighed['total_bf_1ya'] + 3*df_all_weighed['total_bf_2ya']) / 8

df_all_weighed.drop(columns=['total_bf_1ya','total_bf_2ya'],inplace=True)

In [440]:
df_all_weighed.head()

Unnamed: 0,total_bf,age,years_after_28,primary_position,throws,whip_,baa_,sp_pct_,k_rate_pitcher_,pitcher_avg_xwoba_,...,pitcher_avg_exp_ba_,babip_pitcher_,fly_balls_ratio_pitcher_,ground_balls_ratio_pitcher_,popups_ratio_pitcher_,hr_fb_pct_pitcher_,avg_bf_per_outing_,zone_chase_pct_,avg_fb_vel_,total_bf_
0,0.0,23.0,0.0,reliever,R,1.657444,0.253333,0.0,0.208889,0.393517,...,0.346964,0.333333,1.0,1.0,1.0,0.211111,3.711111,0.616088,92.90577,56.25
1,218.0,29.0,1.0,reliever,L,1.107042,0.15493,0.0,0.276995,0.348197,...,0.312168,0.246144,0.915493,1.0,0.0,0.228873,2.7277,0.628187,92.863438,26.625
2,809.0,29.0,1.0,starter,L,1.254408,0.198358,100.0,0.229993,0.324662,...,0.304624,0.279898,3.782746,20.186303,1.07323,0.21552,13.953488,0.635648,87.719689,731.0
3,278.0,28.0,0.0,reliever,R,1.012188,0.197001,6.711409,0.24819,0.345247,...,0.312014,0.26164,1.0,1.0,1.0,0.094191,4.580145,0.630394,96.613636,241.75
4,282.0,28.0,0.0,reliever,R,2.423077,0.268041,0.0,0.216495,0.368,...,0.348237,0.410714,1.0,1.0,1.0,0.333333,4.845361,0.588832,95.67,60.625


In [441]:
df_all_weighed.select_dtypes(include=np.number).corr()

Unnamed: 0,total_bf,age,years_after_28,whip_,baa_,sp_pct_,k_rate_pitcher_,pitcher_avg_xwoba_,pitcher_avg_woba_,bb_rate_pitcher_,...,pitcher_avg_exp_ba_,babip_pitcher_,fly_balls_ratio_pitcher_,ground_balls_ratio_pitcher_,popups_ratio_pitcher_,hr_fb_pct_pitcher_,avg_bf_per_outing_,zone_chase_pct_,avg_fb_vel_,total_bf_
total_bf,1.0,-0.082709,-0.08057,-0.21777,-0.122388,0.539914,0.22738,-0.1192,-0.270806,-0.215142,...,-0.09599,-0.082698,0.120624,0.114232,0.127403,-0.103603,0.486866,0.243596,0.071055,0.656807
age,-0.082709,1.0,0.943372,0.008509,0.055073,-0.094545,0.011605,0.06742,0.015837,-0.122954,...,0.057247,0.014503,0.077818,0.076338,0.086652,0.041463,-0.161388,0.032964,-0.332363,0.074635
years_after_28,-0.08057,0.943372,1.0,0.0379,0.078246,-0.040916,-0.02714,0.082688,0.054268,-0.098911,...,0.066219,0.026481,0.071832,0.068744,0.077902,0.062488,-0.101972,0.01204,-0.33144,0.030318
whip_,-0.21777,0.008509,0.0379,1.0,0.615018,-0.116939,-0.391155,0.284025,0.755196,0.431174,...,0.358282,0.571737,-0.079289,-0.064715,-0.118664,0.185473,-0.083801,-0.322064,-0.064031,-0.258195
baa_,-0.122388,0.055073,0.078246,0.615018,1.0,0.031886,-0.430601,0.330318,0.7728,-0.168035,...,0.422256,0.718907,0.005214,0.020665,-0.043331,0.295451,0.106934,0.023413,-0.220291,-0.097583
sp_pct_,0.539914,-0.094545,-0.040916,-0.116939,0.031886,1.0,-0.018845,0.016534,-0.07295,-0.181696,...,0.018722,-0.052892,0.217257,0.206206,0.163316,0.020813,0.91674,0.134709,-0.134654,0.637507
k_rate_pitcher_,0.22738,0.011605,-0.02714,-0.391155,-0.430601,-0.018845,1.0,-0.020791,-0.4914,-0.161486,...,-0.07383,-0.078776,0.008355,-0.009832,0.043789,-0.103888,-0.134311,0.39135,0.365037,0.209903
pitcher_avg_xwoba_,-0.1192,0.06742,0.082688,0.284025,0.330318,0.016534,-0.020791,1.0,0.522228,0.17229,...,0.900423,0.386159,0.001414,-0.017453,-0.026742,0.378806,0.031138,-0.089061,-0.046668,-0.09312
pitcher_avg_woba_,-0.270806,0.015837,0.054268,0.755196,0.7728,-0.07295,-0.4914,0.522228,1.0,0.352517,...,0.522887,0.622159,-0.058712,-0.057597,-0.105595,0.478134,-0.019572,-0.363403,-0.165009,-0.296981
bb_rate_pitcher_,-0.215142,-0.122954,-0.098911,0.431174,-0.168035,-0.181696,-0.161486,0.17229,0.352517,1.0,...,0.194942,0.217159,-0.105675,-0.102348,-0.117983,-0.033876,-0.203465,-0.707248,0.110928,-0.287336


In [442]:
df_all_weighed.columns

Index(['total_bf', 'age', 'years_after_28', 'primary_position', 'throws',
       'whip_', 'baa_', 'sp_pct_', 'k_rate_pitcher_', 'pitcher_avg_xwoba_',
       'pitcher_avg_woba_', 'bb_rate_pitcher_', 'inplay_ratio_pitcher_',
       'pitcher_avg_exp_ba_', 'babip_pitcher_', 'fly_balls_ratio_pitcher_',
       'ground_balls_ratio_pitcher_', 'popups_ratio_pitcher_',
       'hr_fb_pct_pitcher_', 'avg_bf_per_outing_', 'zone_chase_pct_',
       'avg_fb_vel_', 'total_bf_'],
      dtype='object')

In [443]:
#regress everyone's stats to the mean of their position group by 5 PA (to help with players who had very few plate appearances)
stats_to_regress = ['whip_', 'baa_', 'k_rate_pitcher_', 'bb_rate_pitcher_',
       'inplay_ratio_pitcher_', 'pitcher_avg_exp_ba_', 'babip_pitcher_',
       'fly_balls_ratio_pitcher_', 'ground_balls_ratio_pitcher_',
       'popups_ratio_pitcher_', 'hr_fb_pct_pitcher_', 'avg_bf_per_outing_','pitcher_avg_xwoba_','pitcher_avg_woba_',
       'zone_chase_pct_', 'avg_fb_vel_']

In [444]:
stat_averages_position_2021_2022 = stat_averages_position.drop(columns='year').reset_index().iloc[0:20].groupby('primary_position').mean().reset_index()

In [445]:
df_all_weighed_regress = df_all_weighed.merge(stat_averages_position_2021_2022, on='primary_position', suffixes=('', '_incorrect'))

In [446]:
for i in stats_to_regress:
    j = i.rstrip('_')
    k = i + 'reg'
    df_all_weighed_regress[k] = (df_all_weighed_regress[i]*df_all_weighed_regress['total_bf_'] + df_all_weighed_regress[j]*5 ) / (df_all_weighed_regress['total_bf_']+5)
    df_all_weighed_regress.drop([i, j ], axis=1, inplace=True)

df_all_weighed_regress.drop(df_all_weighed_regress.filter(like='_incorrect').columns, axis=1, inplace=True)
df_all_weighed_regress.drop(columns='index', inplace=True)

In [447]:
df_all_weighed_regress

Unnamed: 0,total_bf,age,years_after_28,primary_position,throws,sp_pct_,total_bf_,sp_pct,whip_reg,baa_reg,...,babip_pitcher_reg,fly_balls_ratio_pitcher_reg,ground_balls_ratio_pitcher_reg,popups_ratio_pitcher_reg,hr_fb_pct_pitcher_reg,avg_bf_per_outing_reg,pitcher_avg_xwoba_reg,pitcher_avg_woba_reg,zone_chase_pct_reg,avg_fb_vel_reg
0,0.0,23.0,0.0,reliever,R,0.000000,56.250,2.883527,1.634001,0.249757,...,0.329874,1.194588,1.344661,1.049938,0.207623,3.706988,0.390563,0.355679,0.616499,92.973393
1,218.0,29.0,1.0,reliever,L,0.000000,26.625,2.883527,1.148660,0.163561,...,0.253228,1.305724,1.667525,0.254821,0.219309,2.875195,0.349642,0.292201,0.627072,93.001101
2,809.0,29.0,1.0,starter,L,100.000000,731.000,95.853353,1.254859,0.198514,...,0.279968,3.804790,20.124974,1.082276,0.215285,13.940916,0.324974,0.289439,0.635585,87.752936
3,278.0,28.0,0.0,reliever,R,6.711409,241.750,2.883527,1.019444,0.197255,...,0.262234,1.048302,1.085554,1.012396,0.095694,4.561512,0.345491,0.269180,0.630207,96.555288
4,282.0,28.0,0.0,reliever,R,0.000000,60.625,2.883527,2.342864,0.263583,...,0.401590,1.181615,1.321683,1.046609,0.320766,4.755094,0.367187,0.408527,0.591293,95.522507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
891,258.0,24.0,0.0,reliever,R,2.027027,177.500,2.883527,1.120610,0.209165,...,0.277442,1.959143,9.905057,1.761623,0.163432,2.948236,0.299682,0.284890,0.686878,96.468267
892,553.0,32.0,4.0,starter,R,97.872340,303.750,95.853353,1.242974,0.210060,...,0.249905,1.097614,1.164514,1.022748,0.164209,11.457130,0.368161,0.320134,0.630690,91.648776
893,0.0,33.0,5.0,both_starter_reliever,R,40.425532,256.500,55.507716,1.528966,0.210454,...,0.273833,38.199109,89.291171,8.557356,0.144074,8.644320,0.334892,0.330372,0.579920,94.302002
894,237.0,32.0,4.0,reliever,L,0.000000,226.250,2.883527,1.336108,0.195341,...,0.269837,1.051540,1.091289,1.013227,0.104943,2.688337,0.356120,0.313528,0.609256,92.772464


In [448]:
df_all_weighed_regress.to_csv('pitcher_features.csv')