In [65]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Read the full pitch-level CSV into memory
df = pd.read_csv(filepath_or_buffer='full_statcast_2024.csv')

# Show every column name so the features needed later can be checked by eye
df.columns.tolist()


['pitch_type',
 'game_date',
 'release_speed',
 'release_pos_x',
 'release_pos_z',
 'player_name',
 'batter',
 'pitcher',
 'events',
 'description',
 'spin_dir',
 'spin_rate_deprecated',
 'break_angle_deprecated',
 'break_length_deprecated',
 'zone',
 'des',
 'game_type',
 'stand',
 'p_throws',
 'home_team',
 'away_team',
 'type',
 'hit_location',
 'bb_type',
 'balls',
 'strikes',
 'game_year',
 'pfx_x',
 'pfx_z',
 'plate_x',
 'plate_z',
 'on_3b',
 'on_2b',
 'on_1b',
 'outs_when_up',
 'inning',
 'inning_topbot',
 'hc_x',
 'hc_y',
 'tfs_deprecated',
 'tfs_zulu_deprecated',
 'umpire',
 'sv_id',
 'vx0',
 'vy0',
 'vz0',
 'ax',
 'ay',
 'az',
 'sz_top',
 'sz_bot',
 'hit_distance_sc',
 'launch_speed',
 'launch_angle',
 'effective_speed',
 'release_spin_rate',
 'release_extension',
 'game_pk',
 'fielder_2',
 'fielder_3',
 'fielder_4',
 'fielder_5',
 'fielder_6',
 'fielder_7',
 'fielder_8',
 'fielder_9',
 'release_pos_y',
 'estimated_ba_using_speedangle',
 'estimated_woba_using_speedangle',
 'w

In [67]:
# Keep rows where pitch_number is 1, and carry forward only the outcome label
# plus the pitch / batted-ball measurements used in the following cells
df_first_pitch = df.loc[
    df['pitch_number'] == 1,
    [
        'events', 'pitch_type', 'release_speed', 'launch_speed', 'launch_angle',
        'pfx_z', 'plate_x', 'plate_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
        'sz_top', 'sz_bot', 'release_spin_rate', 'release_extension',
        'release_pos_y', 'bb_type',
    ],
]

# Retain only rows whose 'events' value is one of the listed results
# (rows with a missing or unlisted event are discarded)
df_first_pitch = df_first_pitch.loc[
    df_first_pitch['events'].isin([
        'single', 'double', 'triple', 'home_run', 'sac_fly', 'sac_fly_double_play',
        'field_out', 'force_out', 'grounded_into_double_play',
        'fielders_choice', 'double_play', 'fielders_choice_out',
    ])
]

# Preview the filtered frame
df_first_pitch.head()


Unnamed: 0,events,pitch_type,release_speed,launch_speed,launch_angle,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,release_spin_rate,release_extension,release_pos_y,bb_type
46,double,SI,95.4,88.2,40.0,1.75,-0.18,2.65,-9.273209,-138.455653,-6.649274,8.661045,34.126078,-8.302247,3.43,1.65,2459.0,6.7,53.82,fly_ball
153,single,FF,92.4,85.9,20.0,1.26,-0.54,3.08,-8.287732,-134.298834,-3.612801,2.455684,30.122055,-16.378992,3.34,1.53,2339.0,6.6,53.91,line_drive
159,field_out,CH,89.0,84.5,40.0,0.61,1.03,1.91,-4.309089,-129.316231,-6.346179,18.105344,30.680564,-24.109504,2.86,1.3,1889.0,6.4,54.11,fly_ball
288,force_out,SI,96.3,94.5,-27.0,0.49,-0.42,2.09,-5.893561,-139.856059,-6.387026,20.301146,36.453879,-24.365965,3.44,1.62,2206.0,5.9,54.55,ground_ball
307,field_out,CH,87.1,81.4,30.0,0.42,0.38,1.74,-5.182791,-126.572237,-5.885035,14.418523,28.669354,-26.534846,2.86,1.3,1558.0,6.3,54.19,fly_ball


In [69]:
# Outcomes labelled as the positive class (target = 1).
# Note that this is broader than "hits": sacrifice outcomes are included too.
successful_outcomes = ['single', 'double', 'triple', 'home_run', 'sac_fly', 'sac_fly_double_play', 'sac_bunt']

# Plate-appearance results that are not balls in play and must not be labelled
outcomes_to_ignore = ['walk', 'hit_by_pitch', 'strikeout', 'strikeout_double_play', 'truncated_pa', 'catcher_interf']

# Build the labelled frame in a single chain so that a new DataFrame is
# produced at each step (no column assignment on a filtered slice):
#   1. drop rows whose event is in outcomes_to_ignore
#   2. add 'target' (1 = event in successful_outcomes, 0 = any other event),
#      computed with a vectorised membership test instead of a row-wise lambda
#   3. drop 'events', which is now encoded by 'target'
df_first_pitch = (
    df_first_pitch
    .loc[~df_first_pitch['events'].isin(outcomes_to_ignore)]
    .assign(target=lambda frame: frame['events'].isin(successful_outcomes).astype('int64'))
    .drop(columns=['events'])
)

# Verify class distribution
df_first_pitch['target'].value_counts(normalize=True)


target
0    0.656877
1    0.343123
Name: proportion, dtype: float64

In [71]:
# Count the missing entries per column before any imputation
df_first_pitch.isnull().sum(axis=0)


pitch_type            5
release_speed         5
launch_speed         56
launch_angle         48
pfx_z                 5
plate_x               5
plate_z               5
vx0                   5
vy0                   5
vz0                   5
ax                    5
ay                    5
az                    5
sz_top                5
sz_bot                5
release_spin_rate    90
release_extension    26
release_pos_y         5
bb_type               0
target                0
dtype: int64

In [73]:
# Replace absent pitch types with the placeholder label 'Unknown'
df_first_pitch['pitch_type'] = df_first_pitch['pitch_type'].where(
    df_first_pitch['pitch_type'].notna(), 'Unknown'
)

# Numeric features whose gaps are filled below (order is kept as-is)
numerical_columns = [
    'release_speed', 'launch_speed', 'launch_angle',
    'pfx_z', 'plate_x', 'plate_z',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'sz_top', 'sz_bot',
    'release_spin_rate', 'release_extension', 'release_pos_y',
]

# Median imputer: learn each column's median, then fill the gaps with it
imputer = SimpleImputer(strategy='median')
imputer.fit(df_first_pitch[numerical_columns])
df_first_pitch[numerical_columns] = imputer.transform(df_first_pitch[numerical_columns])

# Every count shown here should now be zero
df_first_pitch.isna().sum()

pitch_type           0
release_speed        0
launch_speed         0
launch_angle         0
pfx_z                0
plate_x              0
plate_z              0
vx0                  0
vy0                  0
vz0                  0
ax                   0
ay                   0
az                   0
sz_top               0
sz_bot               0
release_spin_rate    0
release_extension    0
release_pos_y        0
bb_type              0
target               0
dtype: int64

In [75]:
# Indicator columns for each pitch type ('pitch_type_<code>')
df_pitch_encoded = pd.get_dummies(df_first_pitch['pitch_type'], prefix='pitch_type')

# Indicator columns for each batted-ball type ('bb_type_<kind>')
df_bb_type_encoded = pd.get_dummies(df_first_pitch['bb_type'], prefix='bb_type')

# Swap the two categorical columns for their indicator columns:
# the originals are removed and the encoded blocks are appended on the right
df_first_pitch = pd.concat(
    [
        df_first_pitch.drop(columns=['pitch_type', 'bb_type']),
        df_pitch_encoded,
        df_bb_type_encoded,
    ],
    axis=1,
)

# Preview the encoded frame
df_first_pitch.head()


Unnamed: 0,release_speed,launch_speed,launch_angle,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,...,pitch_type_SC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown,bb_type_fly_ball,bb_type_ground_ball,bb_type_line_drive,bb_type_popup
46,95.4,88.2,40.0,1.75,-0.18,2.65,-9.273209,-138.455653,-6.649274,8.661045,...,False,True,False,False,False,False,True,False,False,False
153,92.4,85.9,20.0,1.26,-0.54,3.08,-8.287732,-134.298834,-3.612801,2.455684,...,False,False,False,False,False,False,False,False,True,False
159,89.0,84.5,40.0,0.61,1.03,1.91,-4.309089,-129.316231,-6.346179,18.105344,...,False,False,False,False,False,False,True,False,False,False
288,96.3,94.5,-27.0,0.49,-0.42,2.09,-5.893561,-139.856059,-6.387026,20.301146,...,False,True,False,False,False,False,False,True,False,False
307,87.1,81.4,30.0,0.42,0.38,1.74,-5.182791,-126.572237,-5.885035,14.418523,...,False,False,False,False,False,False,True,False,False,False


In [81]:
# Numeric features to standardise (same set that was imputed earlier)
numerical_columns = [
    'release_speed', 'launch_speed', 'launch_angle',
    'pfx_z', 'plate_x', 'plate_z',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'sz_top', 'sz_bot',
    'release_spin_rate', 'release_extension', 'release_pos_y',
]

# Learn the scaling parameters from these columns, then apply them in place.
# NOTE(review): the scaler is fit on the whole frame; if a train/test split
# happens downstream, confirm whether it should be fit on the training rows only.
scaler = StandardScaler()
scaler.fit(df_first_pitch[numerical_columns])
df_first_pitch[numerical_columns] = scaler.transform(df_first_pitch[numerical_columns])

# Preview the scaled frame
df_first_pitch.head()

Unnamed: 0,release_speed,launch_speed,launch_angle,pfx_z,plate_x,plate_z,vx0,vy0,vz0,ax,...,pitch_type_SC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown,bb_type_fly_ball,bb_type_ground_ball,bb_type_line_drive,bb_type_popup
46,0.933593,-0.026802,0.883045,1.6743,-0.357528,0.457652,-2.066649,-0.901226,-1.082953,1.001185,...,False,True,False,False,False,False,True,False,False,False
153,0.439283,-0.172738,0.193549,0.922599,-1.092954,1.321427,-1.892359,-0.428338,0.053348,0.450314,...,False,False,False,False,False,False,False,False,True,False
159,-0.120936,-0.261568,0.883045,-0.074556,2.11432,-1.028845,-1.188703,0.138492,-0.96953,1.839586,...,False,False,False,False,False,False,True,False,False,False
288,1.081886,0.372933,-1.426768,-0.258646,-0.847812,-0.667265,-1.46893,-1.060539,-0.984815,2.034515,...,False,True,False,False,False,False,False,True,False,False
307,-0.433999,-0.458263,0.538297,-0.366032,0.786467,-1.370338,-1.343225,0.450654,-0.796962,1.512295,...,False,False,False,False,False,False,True,False,False,False


In [87]:
# Write the prepared frame to disk; the row index is not written out
df_first_pitch.to_csv(path_or_buf='Second_ML_Data_Set.csv', index=False)
