In [224]:
from pybaseball import statcast
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler

# Switch on pybaseball's caching so repeated runs of this notebook are faster
from pybaseball import cache
cache.enable()

# Silence FutureWarning messages for the remainder of the session
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Date window for the 2024 pull (both bounds are passed straight to statcast)
start_date, end_date = "2024-03-30", "2024-10-01"

# Download every pitch in the window into a single DataFrame
df = statcast(start_dt=start_date, end_dt=end_date)

# Keep a raw copy on disk so the download does not have to be repeated
df.to_csv("full_statcast_2024.csv", index=False)

# Report the size of the pull, then preview the first rows
print(f"Full dataset shape: {df.shape}")
df.head()


This is a large query, it may take a moment to complete


100%|████████████████████████████████████████████████████████████████████████████████| 186/186 [00:10<00:00, 17.37it/s]


Full dataset shape: (703168, 113)


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
603,CH,2024-10-01,88.1,-1.65,6.12,"Brieske, Beau",518792,689225,field_out,hit_into_play,...,1,1,3,3,1,1,2.5,1.4,-1.4,46.4
611,CH,2024-10-01,87.1,-1.69,6.17,"Brieske, Beau",518792,689225,,swinging_strike,...,1,1,3,3,1,1,2.45,1.19,-1.19,44.1
636,CH,2024-10-01,89.7,-1.89,6.14,"Brieske, Beau",518792,689225,,ball,...,1,1,3,3,1,1,2.44,1.33,-1.33,47.4
656,FF,2024-10-01,97.5,-1.51,6.32,"Brieske, Beau",518792,689225,,foul,...,1,1,3,3,1,1,0.82,0.4,-0.4,55.0
694,CH,2024-10-01,88.6,-1.77,6.19,"Brieske, Beau",518792,689225,,blocked_ball,...,1,1,3,3,1,1,2.32,1.3,-1.3,47.0


In [226]:
# Keep only the first pitch of each plate appearance (pitch_number == 1) and
# the columns used downstream. A single .loc call does the row and column
# selection in one step instead of chaining two indexing operations.
df_first_pitch = df.loc[
    df['pitch_number'] == 1,
    [
        'events', 'description', 'pitch_type', 'release_speed', 'launch_speed',
        'launch_angle', 'hit_location', 'bb_type', 'batter', 'pitcher',
    ],
]

# Keep only rows whose 'events' value is one of the listed ball-in-play
# outcomes. Rows with any other event, or with no event recorded, are dropped.
# The trailing .copy() makes the result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_first_pitch = df_first_pitch[df_first_pitch['events'].isin([
    'single', 'double', 'triple', 'home_run',
    'sac_fly', 'sac_fly_double_play',
    'field_out', 'force_out', 'grounded_into_double_play',
    'fielders_choice', 'double_play', 'fielders_choice_out',
])].copy()

# Display the first few rows of the filtered dataframe
df_first_pitch.head()



Unnamed: 0,events,description,pitch_type,release_speed,launch_speed,launch_angle,hit_location,bb_type,batter,pitcher
985,double,hit_into_play,SI,95.4,88.2,40,,fly_ball,682985,623352
883,single,hit_into_play,FF,92.4,85.9,20,7.0,line_drive,668670,687911
125,field_out,hit_into_play,CH,89.0,84.5,40,9.0,fly_ball,514888,669373
445,force_out,hit_into_play,SI,96.3,94.5,-27,4.0,ground_ball,678009,664285
1045,field_out,hit_into_play,CH,87.1,81.4,30,7.0,fly_ball,514888,669373


In [228]:
# Replace missing 'hit_location' values with 0, used here as a placeholder
# for "no location recorded".
df_first_pitch['hit_location'] = df_first_pitch['hit_location'].fillna(0)

# Actually verify the fill: a bare expression in the middle of a cell is
# evaluated and thrown away, so use an assertion that fails loudly instead.
assert df_first_pitch['hit_location'].notna().all(), \
    "hit_location still contains missing values after fillna(0)"

df_first_pitch.head()

Unnamed: 0,events,description,pitch_type,release_speed,launch_speed,launch_angle,hit_location,bb_type,batter,pitcher
985,double,hit_into_play,SI,95.4,88.2,40,0,fly_ball,682985,623352
883,single,hit_into_play,FF,92.4,85.9,20,7,line_drive,668670,687911
125,field_out,hit_into_play,CH,89.0,84.5,40,9,fly_ball,514888,669373
445,force_out,hit_into_play,SI,96.3,94.5,-27,4,ground_ball,678009,664285
1045,field_out,hit_into_play,CH,87.1,81.4,30,7,fly_ball,514888,669373


In [230]:
# Write the filtered first-pitch frame to disk, leaving the index out of the file
df_first_pitch.to_csv(path_or_buf='first_pitch_swing_results.csv', index=False)


In [232]:
# List every distinct pitch_type value present (missing values included)
df_first_pitch.pitch_type.unique()


array(['SI', 'FF', 'CH', 'SL', 'CU', 'FC', 'ST', 'KC', 'FS', 'EP', 'FA',
       'SV', 'KN', 'SC', 'CS', None, 'FO'], dtype=object)

In [234]:
# Give rows with no pitch_type an explicit 'Unknown' label so they get their
# own indicator column instead of being silently left out of the encoding.
df_first_pitch = df_first_pitch.assign(
    pitch_type=df_first_pitch['pitch_type'].fillna('Unknown')
)

# Build one indicator column per pitch type, named pitch_type_<code>
df_pitch_encoded = pd.get_dummies(
    df_first_pitch['pitch_type'],
    prefix='pitch_type',
)

# Append the indicator columns side by side with the existing ones.
# pd.concat is kept on purpose: both frames share the same index, so rows
# line up one-to-one.
df_first_pitch = pd.concat(
    [df_first_pitch, df_pitch_encoded],
    axis=1,
)

# Preview the widened frame
df_first_pitch.head()


Unnamed: 0,events,description,pitch_type,release_speed,launch_speed,launch_angle,hit_location,bb_type,batter,pitcher,...,pitch_type_FO,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown
985,double,hit_into_play,SI,95.4,88.2,40,0,fly_ball,682985,623352,...,False,False,False,False,False,True,False,False,False,False
883,single,hit_into_play,FF,92.4,85.9,20,7,line_drive,668670,687911,...,False,False,False,False,False,False,False,False,False,False
125,field_out,hit_into_play,CH,89.0,84.5,40,9,fly_ball,514888,669373,...,False,False,False,False,False,False,False,False,False,False
445,force_out,hit_into_play,SI,96.3,94.5,-27,4,ground_ball,678009,664285,...,False,False,False,False,False,True,False,False,False,False
1045,field_out,hit_into_play,CH,87.1,81.4,30,7,fly_ball,514888,669373,...,False,False,False,False,False,False,False,False,False,False


In [236]:
# Continuous features that get standardised
numerical_columns = ['release_speed', 'launch_speed', 'launch_angle']

# Learn per-column scaling statistics from the current frame.
# NOTE(review): the scaler is fit on every row of df_first_pitch; if a
# train/test split is made later, the held-out rows will already have
# influenced these statistics — confirm this is intended.
scaler = StandardScaler()
scaler.fit(df_first_pitch[numerical_columns])

# Overwrite the raw values with their standardised counterparts
df_first_pitch[numerical_columns] = scaler.transform(
    df_first_pitch[numerical_columns]
)

# Preview the scaled columns
df_first_pitch.head()


Unnamed: 0,events,description,pitch_type,release_speed,launch_speed,launch_angle,hit_location,bb_type,batter,pitcher,...,pitch_type_FO,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown
985,double,hit_into_play,SI,0.933544,-0.026175,0.882051,0,fly_ball,682985,623352,...,False,False,False,False,False,True,False,False,False,False
883,single,hit_into_play,FF,0.439291,-0.171918,0.19337,7,line_drive,668670,687911,...,False,False,False,False,False,False,False,False,False,False
125,field_out,hit_into_play,CH,-0.120862,-0.260631,0.882051,9,fly_ball,514888,669373,...,False,False,False,False,False,False,False,False,False,False
445,force_out,hit_into_play,SI,1.08182,0.373034,-1.42503,4,ground_ball,678009,664285,...,False,False,False,False,False,True,False,False,False,False
1045,field_out,hit_into_play,CH,-0.433889,-0.457067,0.53771,7,fly_ball,514888,669373,...,False,False,False,False,False,False,False,False,False,False


In [238]:
# Events labelled as a successful result (target = 1). Sacrifice flies and
# sacrifice bunts are deliberately counted alongside hits.
# NOTE(review): 'sac_bunt' is not among the events kept by the earlier
# filtering cell, so it probably never matches here — confirm whether that
# filter should include it or whether this entry can go.
successful_outcomes = ['single', 'double', 'triple', 'home_run', 'sac_fly', 'sac_fly_double_play', 'sac_bunt']

# Binary target: 1 when the event is in successful_outcomes, otherwise 0.
# Series.isin is vectorised and treats missing values as "not a member",
# which avoids a Python-level lambda call per row.
df_first_pitch['target'] = (
    df_first_pitch['events'].isin(successful_outcomes).astype('int64')
)

# 'events' is fully captured by 'target' now, so remove it from the features
df_first_pitch = df_first_pitch.drop(columns=['events'])

# Verify the final dataset
df_first_pitch.head()


Unnamed: 0,description,pitch_type,release_speed,launch_speed,launch_angle,hit_location,bb_type,batter,pitcher,pitch_type_CH,...,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown,target
985,hit_into_play,SI,0.933544,-0.026175,0.882051,0,fly_ball,682985,623352,False,...,False,False,False,False,True,False,False,False,False,1
883,hit_into_play,FF,0.439291,-0.171918,0.19337,7,line_drive,668670,687911,False,...,False,False,False,False,False,False,False,False,False,1
125,hit_into_play,CH,-0.120862,-0.260631,0.882051,9,fly_ball,514888,669373,True,...,False,False,False,False,False,False,False,False,False,0
445,hit_into_play,SI,1.08182,0.373034,-1.42503,4,ground_ball,678009,664285,False,...,False,False,False,False,True,False,False,False,False,0
1045,hit_into_play,CH,-0.433889,-0.457067,0.53771,7,fly_ball,514888,669373,True,...,False,False,False,False,False,False,False,False,False,0


In [192]:
# Write the current frame to disk without the index.
# NOTE(review): at this point in the cell order the frame still contains the
# raw 'description', 'pitch_type', 'bb_type', 'batter' and 'pitcher' columns
# that a later cell drops, and this cell's execution counter is out of
# sequence with its neighbours — confirm this intermediate file is still wanted.
df_first_pitch.to_csv(path_or_buf='first_pitch_swing_ml_ready.csv', index=False)


In [240]:
# Build one indicator column per batted-ball type, named bb_type_<value>
df_bb_type_encoded = pd.get_dummies(
    df_first_pitch['bb_type'],
    prefix='bb_type',
)

# Attach the indicator columns to the right of the existing columns
df_first_pitch = pd.concat(
    [df_first_pitch, df_bb_type_encoded],
    axis='columns',
)

# Preview the result
df_first_pitch.head()


Unnamed: 0,description,pitch_type,release_speed,launch_speed,launch_angle,hit_location,bb_type,batter,pitcher,pitch_type_CH,...,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown,target,bb_type_fly_ball,bb_type_ground_ball,bb_type_line_drive,bb_type_popup
985,hit_into_play,SI,0.933544,-0.026175,0.882051,0,fly_ball,682985,623352,False,...,True,False,False,False,False,1,True,False,False,False
883,hit_into_play,FF,0.439291,-0.171918,0.19337,7,line_drive,668670,687911,False,...,False,False,False,False,False,1,False,False,True,False
125,hit_into_play,CH,-0.120862,-0.260631,0.882051,9,fly_ball,514888,669373,True,...,False,False,False,False,False,0,True,False,False,False
445,hit_into_play,SI,1.08182,0.373034,-1.42503,4,ground_ball,678009,664285,False,...,True,False,False,False,False,0,False,True,False,False
1045,hit_into_play,CH,-0.433889,-0.457067,0.53771,7,fly_ball,514888,669373,True,...,False,False,False,False,False,0,True,False,False,False


In [242]:
# Remove the raw categorical columns (already one-hot encoded above), the
# player identifier columns, and the pitch description text.
df_first_pitch = df_first_pitch.drop(
    ['pitch_type', 'batter', 'pitcher', 'bb_type', 'description'],
    axis='columns',
)

# Preview the remaining columns
df_first_pitch.head()


Unnamed: 0,release_speed,launch_speed,launch_angle,hit_location,pitch_type_CH,pitch_type_CS,pitch_type_CU,pitch_type_EP,pitch_type_FA,pitch_type_FC,...,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown,target,bb_type_fly_ball,bb_type_ground_ball,bb_type_line_drive,bb_type_popup
985,0.933544,-0.026175,0.882051,0,False,False,False,False,False,False,...,True,False,False,False,False,1,True,False,False,False
883,0.439291,-0.171918,0.19337,7,False,False,False,False,False,False,...,False,False,False,False,False,1,False,False,True,False
125,-0.120862,-0.260631,0.882051,9,True,False,False,False,False,False,...,False,False,False,False,False,0,True,False,False,False
445,1.08182,0.373034,-1.42503,4,False,False,False,False,False,False,...,True,False,False,False,False,0,False,True,False,False
1045,-0.433889,-0.457067,0.53771,7,True,False,False,False,False,False,...,False,False,False,False,False,0,True,False,False,False


In [244]:
# Write the fully cleaned frame to disk, leaving the index out of the file
df_first_pitch.to_csv(path_or_buf='first_pitch_swing_ml_ready_cleaned.csv', index=False)
