In [139]:
#import standard data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [140]:
df_players = pd.read_csv('df_players.csv')

In [141]:
#make only for batting
df_players_pitching = df_players[df_players['total_bf'] > 0]

In [142]:
#selected features
current_pitcher_features = [
    'player_mlb_id',
    'age',
    'year',
    'primary_position',
    'years_after_28',
    'throws',
    'whip',
    'baa',
    'sp_pct',
    'k_rate_pitcher',
    'pitcher_avg_xwoba',
    'pitcher_avg_woba',
    'bb_rate_pitcher',
    'inplay_ratio_pitcher',
    'pitcher_avg_exp_ba',
    'babip_pitcher',
    'fly_balls_ratio_pitcher',
    'ground_balls_ratio_pitcher',
    'line_drives_ratio_pitcher',
    'popups_ratio_pitcher',
    'hr_fb_pct_pitcher',
    'avg_bf_per_outing',
    'zone_chase_pct',
    'avg_fb_vel',
    'starter',
    'reliever',
    'both_starter_reliever',
    'total_bf'
    #regress stats based on 50 league average plate appearances
]

df_players_pitching = df_players_pitching[current_pitcher_features]

In [143]:
#remove position players who pitched
df_players_pitching = df_players_pitching[df_players_pitching['primary_position'] == 'p']

In [144]:
df_players_pitching.head()

Unnamed: 0,player_mlb_id,age,year,primary_position,years_after_28,throws,whip,baa,sp_pct,k_rate_pitcher,...,line_drives_ratio_pitcher,popups_ratio_pitcher,hr_fb_pct_pitcher,avg_bf_per_outing,zone_chase_pct,avg_fb_vel,starter,reliever,both_starter_reliever,total_bf
0,0629cd765e77d52b8a43cb56621c82d949e50323,36,2021,p,8,L,1.8125,0.283951,0.0,0.123457,...,inf,inf,0.083333,3.950617,0.588629,87.645714,0,1,0,81
1,33438ca2fdf93ff73b2e1bcc5da3058a04cf1661,26,2021,p,0,R,1.823529,0.243902,22.222222,0.134146,...,4.0,1.666667,0.368421,7.341463,0.5625,92.608173,0,1,0,82
3,3d65b1ac983ce90ff1f21f56b627883affc7dad1,24,2021,p,0,R,1.606061,0.217391,0.0,0.223602,...,inf,inf,0.153846,3.453416,0.574746,95.641275,0,1,0,161
5,3304ddb1ae3c28e484f383e6609fd35f1778ddf7,27,2021,p,0,R,1.258065,0.204545,0.0,0.204545,...,inf,inf,0.272727,2.931818,0.637931,92.895402,0,1,0,44
9,45be69e7dbc2d4b228aeb07ec08590cfdc6ff091,30,2021,p,2,R,1.548387,0.2,0.0,0.36,...,inf,inf,0.25,2.76,0.573394,93.7632,0,1,0,50


In [145]:
#deal with n/a values here and in functions

#set infinite ratios to 1
df_players_pitching.replace(np.inf, 1, inplace=True)

In [146]:
# Create the 'primary_position' column based on the column with the value of 1
df_players_pitching['primary_position'] = np.select(
    [df_players_pitching['starter'] == 1, 
     df_players_pitching['reliever'] == 1, 
     df_players_pitching['both_starter_reliever'] == 1],
    ['starter', 'reliever', 'both_starter_reliever'],
    default='unknown'  # This will handle the case where none of them are 1, if applicable
)

df_players_pitching = df_players_pitching.drop(columns=['starter','reliever','both_starter_reliever'])

In [147]:
#get leaguewide averages for stats
numeric_cols = df_players_pitching.select_dtypes(include=np.number).columns.tolist()

stat_averages_league = df_players_pitching.groupby('year')[numeric_cols].apply(
    lambda group: group.apply(
        lambda col: np.average(col.dropna(), weights=group.loc[col.dropna().index, 'total_bf']) 
        if col.name not in ['total_bf'] and group.loc[col.dropna().index, 'total_bf'].sum() > 0
        else col.mean(),  # Use col.mean() for total_bf column
        axis=0
    )
)

In [148]:
#get averages by position
numeric_cols = df_players_pitching.select_dtypes(include=np.number).columns.tolist()
numeric_cols += ['primary_position']

df_batting_num_cols = df_players_pitching[numeric_cols]

#get means
stat_averages_position = df_batting_num_cols.groupby(['year', 'primary_position']).apply(
    lambda group: group.apply(
        lambda col: np.average(col.dropna(), weights=group.loc[col.dropna().index, 'total_bf']) 
        if col.name != 'total_bf' and group.loc[col.dropna().index, 'total_bf'].sum() > 0
        else col.mean(),  # Use col.mean() for total_bf column
        axis=0
    )
).fillna(0).reset_index()

In [149]:
#replace stats for players with unknown positions with league averages
common_cols = stat_averages_league.columns.intersection(stat_averages_position.columns.drop('primary_position'))
for year in stat_averages_position['year'].unique():
    mask = (stat_averages_position['year'] == year) & (stat_averages_position['primary_position'] == 'Unknown')
    stat_averages_position.loc[mask, common_cols] = stat_averages_league.loc[
        stat_averages_league['year'] == year, common_cols
    ].values

In [150]:
#fill nas
df_players_pitching['avg_fb_vel'] = df_players_pitching['avg_fb_vel'].fillna((92.756616 + 92.970260) / 2)
df_players_pitching['primary_position'] = df_players_pitching['primary_position'].fillna('both_starter_reliever')
df_players_pitching = df_players_pitching.fillna(0)

In [151]:
#put all of player's stats into one row
df_2021 = df_players_pitching[df_players_pitching['year'] == 2021]
df_2022 = df_players_pitching[df_players_pitching['year'] == 2022]
df_2023 = df_players_pitching[df_players_pitching['year'] == 2023]
df_all = df_2021.merge(df_2022, on='player_mlb_id', how='outer', suffixes=("_2ya","_1ya")).drop(columns=['year_2ya','year_1ya'])
df_all = df_all.merge(df_2023, on='player_mlb_id',how='outer')

In [152]:
#check nulls
df_all.isna().sum()

player_mlb_id             0
age_2ya                 350
primary_position_2ya    350
years_after_28_2ya      350
throws_2ya              350
                       ... 
hr_fb_pct_pitcher       303
avg_bf_per_outing       303
zone_chase_pct          303
avg_fb_vel              303
total_bf                303
Length: 71, dtype: int64

In [153]:
#fill the non-numerical nulls
df_all['age'] = df_all['age'].fillna(df_all['age_1ya']+1).fillna(df_all['age_2ya'] + 2)  # Fill missing age and adjust

df_all['years_after_28'] = df_all['years_after_28'].fillna(df_all['years_after_28_1ya']+1).fillna(df_all['years_after_28_2ya'] + 2)  # Fill missing years_after_28 and adjust

df_all['primary_position'] = df_all['primary_position_1ya'].fillna(df_all['primary_position_2ya']).fillna('both_starter_reliever')

df_all['throws'] = df_all['throws_1ya'].fillna(df_all['throws_2ya'])

df_all = df_all.drop(columns=['age_1ya','age_2ya','years_after_28_1ya','years_after_28_2ya','throws_1ya','throws_2ya','primary_position_2ya','primary_position_1ya'])

In [154]:
#fill remaining with 0, won't have an impact because remaining nulls are in years where total_bf is 0
df_all = df_all.fillna(0)

In [155]:
df_all.columns

Index(['player_mlb_id', 'whip_2ya', 'baa_2ya', 'sp_pct_2ya',
       'k_rate_pitcher_2ya', 'pitcher_avg_xwoba_2ya', 'pitcher_avg_woba_2ya',
       'bb_rate_pitcher_2ya', 'inplay_ratio_pitcher_2ya',
       'pitcher_avg_exp_ba_2ya', 'babip_pitcher_2ya',
       'fly_balls_ratio_pitcher_2ya', 'ground_balls_ratio_pitcher_2ya',
       'line_drives_ratio_pitcher_2ya', 'popups_ratio_pitcher_2ya',
       'hr_fb_pct_pitcher_2ya', 'avg_bf_per_outing_2ya', 'zone_chase_pct_2ya',
       'avg_fb_vel_2ya', 'total_bf_2ya', 'whip_1ya', 'baa_1ya', 'sp_pct_1ya',
       'k_rate_pitcher_1ya', 'pitcher_avg_xwoba_1ya', 'pitcher_avg_woba_1ya',
       'bb_rate_pitcher_1ya', 'inplay_ratio_pitcher_1ya',
       'pitcher_avg_exp_ba_1ya', 'babip_pitcher_1ya',
       'fly_balls_ratio_pitcher_1ya', 'ground_balls_ratio_pitcher_1ya',
       'line_drives_ratio_pitcher_1ya', 'popups_ratio_pitcher_1ya',
       'hr_fb_pct_pitcher_1ya', 'avg_bf_per_outing_1ya', 'zone_chase_pct_1ya',
       'avg_fb_vel_1ya', 'total_bf_1ya', 'a

In [156]:
#get columns to be averaged
cols_0ya = ['whip', 'baa', 'sp_pct', 'k_rate_pitcher', 'pitcher_avg_xwoba', 'pitcher_avg_woba',
 'bb_rate_pitcher', 'inplay_ratio_pitcher', 'pitcher_avg_exp_ba', 'babip_pitcher',
 'fly_balls_ratio_pitcher', 'ground_balls_ratio_pitcher', 'line_drives_ratio_pitcher',
 'popups_ratio_pitcher', 'hr_fb_pct_pitcher', 'avg_bf_per_outing', 'zone_chase_pct', 'avg_fb_vel']


cols_1ya = ['whip_1ya', 'baa_1ya', 'sp_pct_1ya', 'k_rate_pitcher_1ya','pitcher_avg_xwoba_1ya','pitcher_avg_woba_1ya',
         'bb_rate_pitcher_1ya', 'inplay_ratio_pitcher_1ya',
         'pitcher_avg_exp_ba_1ya', 'babip_pitcher_1ya',
         'fly_balls_ratio_pitcher_1ya', 'ground_balls_ratio_pitcher_1ya', 'line_drives_ratio_pitcher_1ya',
         'popups_ratio_pitcher_1ya', 'hr_fb_pct_pitcher_1ya',
         'avg_bf_per_outing_1ya', 'zone_chase_pct_1ya', 'avg_fb_vel_1ya']

cols_2ya = ['whip_2ya', 'baa_2ya', 'sp_pct_2ya', 'k_rate_pitcher_2ya','pitcher_avg_xwoba_2ya','pitcher_avg_woba_2ya',
           'bb_rate_pitcher_2ya', 'inplay_ratio_pitcher_2ya',
           'pitcher_avg_exp_ba_2ya', 'babip_pitcher_2ya',
           'fly_balls_ratio_pitcher_2ya', 'ground_balls_ratio_pitcher_2ya', 'line_drives_ratio_pitcher_2ya',
           'popups_ratio_pitcher_2ya', 'hr_fb_pct_pitcher_2ya',
           'avg_bf_per_outing_2ya', 'zone_chase_pct_2ya', 'avg_fb_vel_2ya']

pa_0ya = 'total_bf'
pa_1ya = 'total_bf_1ya'
pa_2ya = 'total_bf_2ya'

# Calculate the weighted averages and replace the original columns
for col_0ya, col_1ya, col_2ya in zip(cols_0ya, cols_1ya, cols_2ya):
    new_col_name = col_0ya + '_'
    df_all[new_col_name] = (
        (5 * df_all[col_0ya] * df_all[pa_0ya] + 3 * df_all[col_1ya] * df_all[pa_1ya] + 2 * df_all[col_2ya] * df_all[pa_2ya]) /
        (5 * df_all[pa_0ya] + 3 * df_all[pa_1ya] + 2 * df_all[pa_2ya])
    )

# Drop the original _1ya and _2ya columns
df_all_weighed = df_all.drop(columns=cols_1ya + cols_2ya + cols_0ya)

#do the same for plate appearances
df_all_weighed['total_bf_'] = (5*df_all_weighed['total_bf'] + 3*df_all_weighed['total_bf_1ya'] + 2*df_all_weighed['total_bf_2ya']) / 10

df_all_weighed.drop(columns=['total_bf','total_bf_1ya','total_bf_2ya'],inplace=True)

In [157]:
df_all_weighed.head()

Unnamed: 0,player_mlb_id,age,year,primary_position,years_after_28,throws,whip_,baa_,sp_pct_,k_rate_pitcher_,...,babip_pitcher_,fly_balls_ratio_pitcher_,ground_balls_ratio_pitcher_,line_drives_ratio_pitcher_,popups_ratio_pitcher_,hr_fb_pct_pitcher_,avg_bf_per_outing_,zone_chase_pct_,avg_fb_vel_,total_bf_
0,007d7209f1d3287648e11f52c5a5148732f7a791,24.0,0.0,reliever,1.0,R,1.642009,0.252669,0.0,0.209964,...,0.333333,1.0,1.0,1.0,1.0,0.202847,3.708185,0.617289,92.968517,28.1
1,007dcc596b82af90fd37f3413e98812b87a6b305,30.0,2023.0,reliever,2.0,L,1.419595,0.205086,0.0,0.205906,...,0.290608,0.990156,1.0,0.990156,0.894176,0.126187,3.565217,0.650512,89.531435,121.9
2,0110ddbbd4e67bbd0a355dd5561021b075920eb3,30.0,2023.0,starter,2.0,L,1.228113,0.201878,100.0,0.238915,...,0.280905,2.414841,10.754956,3.271192,1.037233,0.203103,13.807903,0.634324,86.698159,766.8
3,01171c9c71028faf86b7447ec692000f50df9fac,29.0,2023.0,reliever,1.0,R,1.163752,0.187669,6.711409,0.275145,...,0.269804,1.0,1.0,1.0,1.0,0.135299,3.633526,0.626591,97.447531,259.5
4,01b9d70c4ec72b8a0f733bdf6bc47df596617dfd,29.0,2023.0,reliever,1.0,R,1.292942,0.222222,0.0,0.228101,...,0.304706,1.0,1.0,1.0,1.0,0.163983,3.095238,0.63705,96.371822,170.1


In [158]:
df_all_weighed.select_dtypes(include=np.number).corr()

Unnamed: 0,age,year,years_after_28,whip_,baa_,sp_pct_,k_rate_pitcher_,pitcher_avg_xwoba_,pitcher_avg_woba_,bb_rate_pitcher_,...,babip_pitcher_,fly_balls_ratio_pitcher_,ground_balls_ratio_pitcher_,line_drives_ratio_pitcher_,popups_ratio_pitcher_,hr_fb_pct_pitcher_,avg_bf_per_outing_,zone_chase_pct_,avg_fb_vel_,total_bf_
age,1.0,-0.268441,0.9458,-0.01928,0.065915,-0.105263,-0.023816,0.045899,0.012477,-0.126449,...,-0.002401,0.129743,0.127681,0.139251,0.136927,0.056983,-0.142932,0.043958,-0.335283,0.076484
year,-0.268441,1.0,-0.324657,-0.130058,-0.136679,0.142213,0.224449,-0.116941,-0.248107,-0.133344,...,-0.073807,-0.08271,-0.084207,-0.082419,-0.032692,-0.116038,0.133329,0.2072,0.148975,0.427963
years_after_28,0.9458,-0.324657,1.0,0.000114,0.077769,-0.057993,-0.050579,0.059486,0.047206,-0.096794,...,0.006819,0.129632,0.122843,0.135528,0.130305,0.07392,-0.104591,0.012531,-0.32447,0.020448
whip_,-0.01928,-0.130058,0.000114,1.0,0.452775,-0.104985,-0.370795,0.21044,0.676899,0.539974,...,0.470981,-0.059991,-0.048021,-0.049724,-0.11246,0.091871,-0.077303,-0.425461,-0.02968,-0.212309
baa_,0.065915,-0.136679,0.077769,0.452775,1.0,0.065451,-0.406892,0.351848,0.763959,-0.182942,...,0.742656,0.005334,0.017975,0.021495,-0.038798,0.269789,0.130504,0.046391,-0.236871,-0.075471
sp_pct_,-0.105263,0.142213,-0.057993,-0.104985,0.065451,1.0,-0.061502,0.027166,-0.044576,-0.185924,...,-0.05423,0.16614,0.153951,0.162887,0.124851,0.05685,0.933974,0.129492,-0.133402,0.605164
k_rate_pitcher_,-0.023816,0.224449,-0.050579,-0.370795,-0.406892,-0.061502,1.0,0.000937,-0.457956,-0.164228,...,-0.104095,-0.02866,-0.040503,-0.038176,0.004503,-0.066652,-0.154567,0.368813,0.339461,0.184968
pitcher_avg_xwoba_,0.045899,-0.116941,0.059486,0.21044,0.351848,0.027166,0.000937,1.0,0.489433,0.084355,...,0.394377,-0.005752,-0.020317,-0.004647,-0.04066,0.388801,0.040907,-0.055489,-0.04385,-0.081239
pitcher_avg_woba_,0.012477,-0.248107,0.047206,0.676899,0.763959,-0.044576,-0.457956,0.489433,1.0,0.372416,...,0.660318,-0.040699,-0.040831,-0.035496,-0.09441,0.423815,0.01115,-0.375258,-0.167283,-0.26014
bb_rate_pitcher_,-0.126449,-0.133344,-0.096794,0.539974,-0.182942,-0.185924,-0.164228,0.084355,0.372416,1.0,...,0.175612,-0.076605,-0.074377,-0.077853,-0.108948,-0.06472,-0.199684,-0.72344,0.124651,-0.259837


In [159]:
df_all_weighed.columns

Index(['player_mlb_id', 'age', 'year', 'primary_position', 'years_after_28',
       'throws', 'whip_', 'baa_', 'sp_pct_', 'k_rate_pitcher_',
       'pitcher_avg_xwoba_', 'pitcher_avg_woba_', 'bb_rate_pitcher_',
       'inplay_ratio_pitcher_', 'pitcher_avg_exp_ba_', 'babip_pitcher_',
       'fly_balls_ratio_pitcher_', 'ground_balls_ratio_pitcher_',
       'line_drives_ratio_pitcher_', 'popups_ratio_pitcher_',
       'hr_fb_pct_pitcher_', 'avg_bf_per_outing_', 'zone_chase_pct_',
       'avg_fb_vel_', 'total_bf_'],
      dtype='object')

In [160]:
#regress everyone's stats to the mean of their position group by 5 PA (to help with players who had very few plate appearances)
stats_to_regress = ['whip_', 'baa_', 'k_rate_pitcher_', 'bb_rate_pitcher_',
       'inplay_ratio_pitcher_', 'pitcher_avg_exp_ba_', 'babip_pitcher_',
       'fly_balls_ratio_pitcher_', 'ground_balls_ratio_pitcher_', 'line_drives_ratio_pitcher_',
       'popups_ratio_pitcher_', 'hr_fb_pct_pitcher_', 'avg_bf_per_outing_','pitcher_avg_xwoba_','pitcher_avg_woba_',
       'zone_chase_pct_', 'avg_fb_vel_']

In [161]:
stat_averages_position_2021_2023 = stat_averages_position.drop(columns='year').reset_index().groupby('primary_position').mean().reset_index()

In [162]:
df_all_weighed_regress = df_all_weighed.merge(stat_averages_position_2021_2023, on='primary_position', suffixes=('', '_incorrect'))

In [163]:
for stat in stats_to_regress:
    stat_base = stat.rstrip('_')
    stat_regressed = f"{stat}reg"

    df_all_weighed_regress[stat_regressed] = (
        df_all_weighed_regress[stat] * df_all_weighed_regress["total_bf_"] 
        + df_all_weighed_regress[stat_base] * 5
    ) / (df_all_weighed_regress["total_bf_"] + 5)

    df_all_weighed_regress.drop(columns=[stat, stat_base], inplace=True)

# Remove incorrect columns and the index column
df_all_weighed_regress.drop(columns=df_all_weighed_regress.filter(like="_incorrect").columns, inplace=True)
df_all_weighed_regress.drop(columns="index", inplace=True)

In [164]:
df_all_weighed_regress

Unnamed: 0,player_mlb_id,age,year,primary_position,years_after_28,throws,sp_pct_,total_bf_,sp_pct,total_bf,...,fly_balls_ratio_pitcher_reg,ground_balls_ratio_pitcher_reg,line_drives_ratio_pitcher_reg,popups_ratio_pitcher_reg,hr_fb_pct_pitcher_reg,avg_bf_per_outing_reg,pitcher_avg_xwoba_reg,pitcher_avg_woba_reg,zone_chase_pct_reg,avg_fb_vel_reg
0,007d7209f1d3287648e11f52c5a5148732f7a791,24.0,0.0,reliever,1.0,R,0.000000,28.1,2.883527,145.646085,...,1.360076,1.637779,1.329180,1.092409,0.197641,3.700998,0.387320,0.349836,0.617869,93.084172
1,007dcc596b82af90fd37f3413e98812b87a6b305,30.0,2023.0,reliever,2.0,L,0.000000,121.9,2.883527,145.646085,...,1.084464,1.166355,1.076405,0.922449,0.127849,3.568976,0.314967,0.324067,0.649354,89.697027
2,0110ddbbd4e67bbd0a355dd5561021b075920eb3,30.0,2023.0,starter,2.0,L,100.000000,766.8,95.853353,462.613470,...,2.444724,10.757571,3.291986,1.046092,0.202959,13.796857,0.358834,0.295012,0.634273,86.736481
3,01171c9c71028faf86b7447ec692000f50df9fac,29.0,2023.0,reliever,1.0,R,6.711409,259.5,2.883527,145.646085,...,1.045061,1.079813,1.041194,1.011564,0.135925,3.634038,0.369724,0.280398,0.626488,97.377335
4,01b9d70c4ec72b8a0f733bdf6bc47df596617dfd,29.0,2023.0,reliever,1.0,R,0.000000,170.1,2.883527,145.646085,...,1.068067,1.120562,1.062226,1.017468,0.164109,3.111382,0.347038,0.292177,0.636595,96.296504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1061,ff3b7d21e60c24b6e926e3ffc3fbeb78cc9e4057,25.0,2023.0,reliever,0.0,R,2.027027,217.1,2.883527,145.646085,...,1.445378,4.946918,2.746492,1.340201,0.125904,2.809559,0.306369,0.272758,0.671504,96.440758
1062,ffa57541bf6d7030fdf7206f4aa0141a2c918647,33.0,2023.0,starter,5.0,R,97.872340,422.3,95.853353,462.613470,...,1.070532,1.118871,1.064134,1.016437,0.136487,12.035839,0.355818,0.318762,0.629909,92.507638
1063,ffbad18fedea89e09f42dac340e9ebb0bc18303d,34.0,0.0,both_starter_reliever,6.0,R,40.425532,131.2,55.507716,236.178996,...,39.199464,91.614840,41.571129,8.761737,0.147379,8.864278,0.336234,0.331180,0.580588,94.216635
1064,ffc9b6a7a663bc322e74c70a5ed8756787a577d0,33.0,2023.0,reliever,5.0,L,0.000000,232.6,2.883527,145.646085,...,1.050162,1.088849,1.045858,1.012873,0.147913,2.719710,0.376257,0.330159,0.617098,92.712398


In [165]:
df_all_weighed_regress.to_csv('pitcher_features_final_model.csv')