In [3]:
from pybaseball import playerid_lookup
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Read the 2024 Statcast CSV export from the working directory.
df = pd.read_csv(filepath_or_buffer='full_statcast_2024.csv')

# Preview the top five rows as a quick sanity check on the load.
df.head(n=5)



Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,CH,2024-10-01,88.1,-1.65,6.12,"Brieske, Beau",518792,689225,field_out,hit_into_play,...,1,1,3.0,3.0,1.0,1.0,2.5,1.4,-1.4,46.4
1,CH,2024-10-01,87.1,-1.69,6.17,"Brieske, Beau",518792,689225,,swinging_strike,...,1,1,3.0,3.0,1.0,1.0,2.45,1.19,-1.19,44.1
2,CH,2024-10-01,89.7,-1.89,6.14,"Brieske, Beau",518792,689225,,ball,...,1,1,3.0,3.0,1.0,1.0,2.44,1.33,-1.33,47.4
3,FF,2024-10-01,97.5,-1.51,6.32,"Brieske, Beau",518792,689225,,foul,...,1,1,3.0,3.0,1.0,1.0,0.82,0.4,-0.4,55.0
4,CH,2024-10-01,88.6,-1.77,6.19,"Brieske, Beau",518792,689225,,blocked_ball,...,1,1,3.0,3.0,1.0,1.0,2.32,1.3,-1.3,47.0


In [4]:
# Resolve a player's cross-site identifiers. Note the argument order:
# playerid_lookup takes the LAST name first, then the first name.
player_metadata = playerid_lookup(last='Hedges', first='Austin')

# Show the matching rows; key_mlbam is the ID that appears in guardians_ids
# and is matched against the Statcast 'batter' column.
player_metadata.head()

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,hedges,austin,595978,hedga001,hedgeau01,12976,2015.0,2024.0


In [9]:
# MLBAM IDs of the Guardians hitters to keep.
guardians_ids = [
    680757, 665926, 608070, 647304, 657041, 677587, 666310, 681807,
    686823, 671289, 682177, 678877, 672356, 682657, 700932, 595978
]

# Boolean mask: True for every pitch thrown to one of the listed batters.
# NOTE(review): the filter is on batter ID only, so any rows a listed player
# logged for a different club (if such rows exist) are kept too -- confirm.
is_guardians_batter = df['batter'].isin(guardians_ids)
df_guardians_full = df[is_guardians_batter]

# Persist the unfiltered Guardians subset so later steps can start from disk.
df_guardians_full.to_csv('Guardians_Full_Dataset.csv', index=False)

# (rows, columns) of the subset.
df_guardians_full.shape


(22839, 113)

In [11]:
# Re-read the Guardians subset that was written to disk earlier.
df_guardians = pd.read_csv(filepath_or_buffer='Guardians_Full_Dataset.csv')

# Show the dimensions together with a five-row preview to confirm the round trip.
(df_guardians.shape, df_guardians.head(n=5))


((22839, 113),
   pitch_type   game_date  release_speed  release_pos_x  release_pos_z  \
 0         CH  2024-09-28           87.1          -2.12           5.78   
 1         CU  2024-09-28           80.9          -2.12           5.90   
 2         CU  2024-09-28           82.2          -2.05           5.85   
 3         CU  2024-09-28           82.4          -2.03           5.85   
 4         CU  2024-09-28           81.0          -2.06           5.84   
 
     player_name  batter  pitcher     events      description  ...  \
 0  Dubin, Shawn  677587   681869  strikeout    called_strike  ...   
 1  Dubin, Shawn  677587   681869        NaN             ball  ...   
 2  Dubin, Shawn  677587   681869        NaN             ball  ...   
 3  Dubin, Shawn  677587   681869        NaN  swinging_strike  ...   
 4  Dubin, Shawn  677587   681869        NaN  swinging_strike  ...   
 
    n_thruorder_pitcher  n_priorpa_thisgame_player_at_bat  \
 0                    1                                 

In [13]:
# Keep only the first pitch of each plate appearance (pitch_number == 1).
# This step keeps every first pitch, whether or not the batter swung.
is_first_pitch = df_guardians['pitch_number'].eq(1)
df_guardians = df_guardians.loc[is_first_pitch]

# (rows, columns) remaining after the filter.
df_guardians.shape


(5936, 113)

In [17]:
# Outcomes to keep: hits, sacrifice flies, and the listed out types.
# Rows whose 'events' value is missing or not in this list are dropped.
in_play_events = [
    'single', 'double', 'triple', 'home_run', 'sac_fly', 'sac_fly_double_play',
    'field_out', 'force_out', 'grounded_into_double_play', 'fielders_choice',
    'double_play', 'fielders_choice_out'
]
has_in_play_event = df_guardians['events'].isin(in_play_events)
df_guardians = df_guardians[has_in_play_event]

# (rows, columns) remaining after the filter.
df_guardians.shape




(629, 113)

In [21]:
# Write the first-pitch, in-play subset to disk; the row index is not stored.
df_guardians.to_csv(path_or_buf='Guardians_First_Pitch_Swings_in_play.csv', index=False)

In [23]:
# Re-read the first-pitch, in-play subset written in the previous step.
df_guardians = pd.read_csv(filepath_or_buffer='Guardians_First_Pitch_Swings_in_play.csv')

# Preview the top five rows to confirm the file loaded as expected.
df_guardians.head(n=5)


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,FS,2024-09-28,82.8,-1.25,6.12,"Neris, Héctor",686823,593576,single,hit_into_play,...,1,3,3.0,1.0,3.0,9.0,3.53,0.37,-0.37,42.0
1,CH,2024-09-28,85.6,-1.85,6.87,"Verlander, Justin",678877,434378,single,hit_into_play,...,2,1,8.0,1.0,,7.0,2.36,1.03,1.03,47.9
2,CU,2024-09-28,78.3,-1.94,6.94,"Verlander, Justin",700932,434378,field_out,hit_into_play,...,2,1,8.0,1.0,,7.0,4.94,-0.48,0.48,51.7
3,SI,2024-09-27,95.4,2.83,5.69,"Hader, Josh",677587,623352,field_out,hit_into_play,...,1,0,3.0,2.0,4.0,1.0,0.85,0.49,-0.49,33.1
4,FF,2024-09-25,93.2,-1.84,5.6,"Pagán, Emilio",680757,641941,single,hit_into_play,...,1,3,1.0,13.0,,2.0,1.12,0.19,-0.19,42.7


In [25]:
# Columns kept for modelling: 'events' (used later to derive the target),
# the categorical 'pitch_type', and the numeric pitch measurements.
ml_columns = [
    'events', 'pitch_type', 'release_speed', 'pfx_z', 'plate_x', 'plate_z',
    'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
    'release_spin_rate', 'release_extension', 'release_pos_y'
]

# Keep only the ML-relevant columns. The explicit .copy() makes the result an
# independent frame: later cells assign columns into df_guardians, and without
# the copy pandas treats the column subset as a possible view of the loaded
# frame and emits SettingWithCopyWarning on those assignments.
df_guardians = df_guardians[ml_columns].copy()

# Preview the selected columns.
df_guardians.head()


Unnamed: 0,events,pitch_type,release_speed,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,release_spin_rate,release_extension,release_pos_y
0,single,FS,82.8,-0.19,0.01,0.86,3.572215,-120.515904,-5.097357,-4.296812,24.513082,-33.193864,3.27,1.51,1068.0,6.0,54.51
1,single,CH,85.6,0.75,0.25,2.92,6.985398,-124.411673,-4.535563,-12.068063,24.942271,-23.612537,3.54,1.72,1869.0,6.1,54.43
2,field_out,CU,78.3,-1.21,0.59,1.21,4.531914,-113.946826,-3.251351,3.327798,21.71108,-42.302207,3.17,1.44,2763.0,6.0,54.45
3,field_out,SI,95.4,1.64,0.58,2.35,-7.052662,-138.765117,-6.936673,7.766018,28.303703,-9.561874,3.14,1.44,2397.0,6.7,53.8
4,single,FF,93.2,1.51,-0.01,2.05,5.151152,-135.525703,-6.704149,-3.494438,29.517357,-12.293563,3.21,1.47,2563.0,6.7,53.8


In [27]:
# Numeric feature columns: every retained column except 'events' and 'pitch_type'.
numerical_columns = [
    'release_speed', 'pfx_z', 'plate_x', 'plate_z', 'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'release_spin_rate',
    'release_extension', 'release_pos_y'
]

# Categorical gap: pitches with no recorded type get an explicit 'Unknown' label.
df_guardians['pitch_type'] = df_guardians['pitch_type'].fillna('Unknown')

# Numeric gaps: replace each missing value with that column's median.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(df_guardians[numerical_columns])
df_guardians[numerical_columns] = imputed_values

# Per-column count of remaining missing values (expected to be all zeros).
df_guardians.isna().sum()


events               0
pitch_type           0
release_speed        0
pfx_z                0
plate_x              0
plate_z              0
vx0                  0
vy0                  0
vz0                  0
ax                   0
ay                   0
az                   0
sz_top               0
sz_bot               0
release_spin_rate    0
release_extension    0
release_pos_y        0
dtype: int64

In [29]:
# Build one indicator column per pitch type, named 'pitch_type_<code>'.
df_pitch_encoded = pd.get_dummies(df_guardians['pitch_type'], prefix='pitch_type')

# Swap the original categorical column for its indicator columns: drop
# 'pitch_type' first, then append the indicators on the right-hand side.
features_without_pitch_type = df_guardians.drop(columns=['pitch_type'])
df_guardians = pd.concat([features_without_pitch_type, df_pitch_encoded], axis=1)

# Preview the encoded frame.
df_guardians.head()



Unnamed: 0,events,release_speed,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,ay,...,pitch_type_EP,pitch_type_FA,pitch_type_FC,pitch_type_FF,pitch_type_FS,pitch_type_KC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV
0,single,82.8,-0.19,0.01,0.86,3.572215,-120.515904,-5.097357,-4.296812,24.513082,...,False,False,False,False,True,False,False,False,False,False
1,single,85.6,0.75,0.25,2.92,6.985398,-124.411673,-4.535563,-12.068063,24.942271,...,False,False,False,False,False,False,False,False,False,False
2,field_out,78.3,-1.21,0.59,1.21,4.531914,-113.946826,-3.251351,3.327798,21.71108,...,False,False,False,False,False,False,False,False,False,False
3,field_out,95.4,1.64,0.58,2.35,-7.052662,-138.765117,-6.936673,7.766018,28.303703,...,False,False,False,False,False,False,True,False,False,False
4,single,93.2,1.51,-0.01,2.05,5.151152,-135.525703,-6.704149,-3.494438,29.517357,...,False,False,False,True,False,False,False,False,False,False


In [31]:
# Outcomes counted as a successful first-pitch result (hits and sacrifice flies).
successful_outcomes = ['single', 'double', 'triple', 'home_run', 'sac_fly', 'sac_fly_double_play']

# Binary target: 1 when the event is a successful outcome, otherwise 0.
# Series.isin is the vectorised equivalent of a row-wise membership test and
# treats a missing event as "not successful" (0). The explicit int64 cast
# keeps the column dtype the same on every platform.
df_guardians['target'] = df_guardians['events'].isin(successful_outcomes).astype('int64')

# 'events' has served its purpose; drop it so the label does not leak into the features.
df_guardians = df_guardians.drop(columns=['events'])

# Class balance of the target as proportions.
df_guardians['target'].value_counts(normalize=True)


target
0    0.686804
1    0.313196
Name: proportion, dtype: float64

In [33]:
# StandardScaler is also imported in the notebook's first cell; the import is
# repeated here so this cell can be run on its own.
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features to zero mean and unit variance.
# NOTE(review): the scaler is fit on the entire dataset. If a train/test split
# is made downstream, the test rows' statistics have already influenced the
# scaling -- confirm whether that is acceptable.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_guardians[numerical_columns])
df_guardians[numerical_columns] = scaled_values

# Preview the scaled frame.
df_guardians.head()


Unnamed: 0,release_speed,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,ay,az,...,pitch_type_FA,pitch_type_FC,pitch_type_FF,pitch_type_FS,pitch_type_KC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,target
0,-1.24995,-1.277075,0.060915,-3.132875,0.213579,1.250361,-0.460457,-0.130008,-0.503001,-1.188888,...,False,False,False,True,False,False,False,False,False,1
1,-0.751499,0.151979,0.522776,1.123725,0.833234,0.770934,-0.245613,-0.821492,-0.413877,-0.019984,...,False,False,False,False,False,False,False,False,False,1
2,-2.051034,-2.827751,1.17708,-2.409666,0.38781,2.058773,0.2455,0.548428,-1.084859,-2.300089,...,False,False,False,False,False,False,False,False,False,0
3,0.993083,1.505019,1.157836,-0.054072,-1.715344,-0.995449,-1.163855,0.94334,0.284151,1.694171,...,False,False,False,False,False,True,False,False,False,0
4,0.601442,1.307384,0.022426,-0.673965,0.500231,-0.596796,-1.074932,-0.058613,0.536176,1.36091,...,False,False,True,False,False,False,False,False,False,1


In [35]:
# Write the fully prepared (imputed, encoded, labelled, scaled) frame to disk
# without the row index.
df_guardians.to_csv(path_or_buf='Guardians_First_Pitch_Swings_Cleaned.csv', index=False)
