In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load full MLB dataset
df = pd.read_csv('../full_statcast_2024.csv')

# Define swing types (from 'description' column)
swing_types = ['hit_into_play', 'swinging_strike', 'foul']

# Filter for first-pitch swings
df_first_pitch = df[df['pitch_number'] == 1]

# Keep relevant columns
columns_to_keep = [
    'description', 'events', 'pitch_type', 'release_speed', 'launch_speed', 'pfx_z',
    'plate_x', 'plate_z', 'zone', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 
    'sz_top', 'sz_bot', 'release_spin_rate', 'release_extension', 'release_pos_y',
    'batter', 'pitcher', 'at_bat_number', 'game_date'
]
df_first_pitch = df_first_pitch[columns_to_keep]

# Keep only first-pitch swings based on 'description' column
df_first_pitch = df_first_pitch[df_first_pitch['description'].isin(swing_types)]

# Extract final pitch of each at-bat (final event outcome)
df_final_result = df.sort_values(by=['game_date', 'at_bat_number', 'pitch_number']) \
                    .drop_duplicates(subset=['game_date', 'at_bat_number'], keep='last') \
                    [['game_date', 'at_bat_number', 'events']]

# Merge to attach final at-bat outcome
df_first_pitch = df_first_pitch.merge(df_final_result, on=['game_date', 'at_bat_number'], suffixes=('', '_final'))

# Ensure we only override 'events' for swings that did NOT already end the at-bat
df_first_pitch.loc[df_first_pitch['description'] != 'hit_into_play', 'events'] = df_first_pitch['events_final']

# Drop the extra final event column (since we corrected 'events')
df_first_pitch = df_first_pitch.drop(columns=['events_final'])

# Save the dataset
df_first_pitch.to_csv('MLB_First_Pitch_Swings.csv', index=False)

print("Dataset saved: 'MLB_First_Pitch_Swings.csv'")
print("Total Rows:", len(df_first_pitch))
df_first_pitch.head()



Dataset saved: 'MLB_First_Pitch_Swings.csv'
Total Rows: 55660


Unnamed: 0,description,events,pitch_type,release_speed,launch_speed,pfx_z,plate_x,plate_z,zone,vx0,...,az,sz_top,sz_bot,release_spin_rate,release_extension,release_pos_y,batter,pitcher,at_bat_number,game_date
0,foul,field_out,SI,97.3,62.1,0.45,-1.48,2.7,11.0,4.203252,...,-25.465626,3.32,1.64,2089.0,6.7,53.82,608324,671345,73,2024-10-01
1,hit_into_play,double,SI,95.4,88.2,1.75,-0.18,2.65,5.0,-9.273209,...,-8.302247,3.43,1.65,2459.0,6.7,53.82,682985,623352,69,2024-10-01
2,foul,field_out,SI,91.0,72.7,1.04,-0.04,2.83,5.0,-6.99917,...,-19.267072,3.49,1.67,2201.0,6.4,54.07,663656,663947,67,2024-10-01
3,foul,walk,SL,86.8,71.1,0.71,0.46,3.07,3.0,3.347713,...,-24.264534,3.43,1.68,2140.0,6.4,54.09,663837,669854,63,2024-10-01
4,foul,field_out,FF,93.7,92.8,1.68,0.12,2.34,5.0,3.503582,...,-9.673175,3.64,1.72,2412.0,6.7,53.83,700242,669854,62,2024-10-01


In [3]:
df_ml = pd.read_csv('../CSVs/MLB_First_Pitch_Swings.csv')


In [5]:
print(df_ml.head())  # Display first few rows
print(df_ml.columns)  # Show all column names
print(df_ml.isna().sum())  # Count missing values per column




     description     events pitch_type  release_speed  launch_speed  pfx_z  \
0           foul  field_out         SI           97.3          62.1   0.45   
1  hit_into_play     double         SI           95.4          88.2   1.75   
2           foul  field_out         SI           91.0          72.7   1.04   
3           foul       walk         SL           86.8          71.1   0.71   
4           foul  field_out         FF           93.7          92.8   1.68   

   plate_x  plate_z  zone       vx0  ...         az  sz_top  sz_bot  \
0    -1.48     2.70  11.0  4.203252  ... -25.465626    3.32    1.64   
1    -0.18     2.65   5.0 -9.273209  ...  -8.302247    3.43    1.65   
2    -0.04     2.83   5.0 -6.999170  ... -19.267072    3.49    1.67   
3     0.46     3.07   3.0  3.347713  ... -24.264534    3.43    1.68   
4     0.12     2.34   5.0  3.503582  ...  -9.673175    3.64    1.72   

   release_spin_rate  release_extension  release_pos_y  batter  pitcher  \
0             2089.0         

In [7]:
# Fill missing launch_speed with its mean (explicit assignment to avoid warnings)
df_ml['launch_speed'] = df_ml['launch_speed'].fillna(df_ml['launch_speed'].mean())

# Fill other numerical columns with their mean using assignment
columns_to_fill = ['release_speed', 'pfx_z', 'plate_x', 'plate_z', 'zone', 'vx0', 'vy0', 'vz0',
                   'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'release_spin_rate', 
                   'release_extension', 'release_pos_y']

df_ml[columns_to_fill] = df_ml[columns_to_fill].apply(lambda x: x.fillna(x.mean()))

# Verify missing values are gone
df_ml.isna().sum()



description           0
events               19
pitch_type           18
release_speed         0
launch_speed          0
pfx_z                 0
plate_x               0
plate_z               0
zone                  0
vx0                   0
vy0                   0
vz0                   0
ax                    0
ay                    0
az                    0
sz_top                0
sz_bot                0
release_spin_rate     0
release_extension     0
release_pos_y         0
batter                0
pitcher               0
at_bat_number         0
game_date             0
dtype: int64

In [9]:
# Fill missing pitch_type with 'Unknown'
df_ml['pitch_type'] = df_ml['pitch_type'].fillna('Unknown')

# Verify missing values are gone
df_ml.isna().sum()


description           0
events               19
pitch_type            0
release_speed         0
launch_speed          0
pfx_z                 0
plate_x               0
plate_z               0
zone                  0
vx0                   0
vy0                   0
vz0                   0
ax                    0
ay                    0
az                    0
sz_top                0
sz_bot                0
release_spin_rate     0
release_extension     0
release_pos_y         0
batter                0
pitcher               0
at_bat_number         0
game_date             0
dtype: int64

In [11]:
# One-hot encode pitch_type
df_ml = pd.get_dummies(df_ml, columns=['pitch_type'], dummy_na=True)

# Verify the new columns
df_ml.head()


Unnamed: 0,description,events,release_speed,launch_speed,pfx_z,plate_x,plate_z,zone,vx0,vy0,...,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown,pitch_type_nan
0,foul,field_out,97.3,62.1,0.45,-1.48,2.7,11.0,4.203252,-141.523309,...,False,False,False,False,True,False,False,False,False,False
1,hit_into_play,double,95.4,88.2,1.75,-0.18,2.65,5.0,-9.273209,-138.455653,...,False,False,False,False,True,False,False,False,False,False
2,foul,field_out,91.0,72.7,1.04,-0.04,2.83,5.0,-6.99917,-132.30675,...,False,False,False,False,True,False,False,False,False,False
3,foul,walk,86.8,71.1,0.71,0.46,3.07,3.0,3.347713,-126.350361,...,False,False,False,False,False,True,False,False,False,False
4,foul,field_out,93.7,92.8,1.68,0.12,2.34,5.0,3.503582,-136.173796,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Assign target BEFORE scaling
def classify_target(row):
    if row['description'] == 'swinging_strike':  
        return 'swing_and_miss'
    elif row['description'] == 'foul':
        return 'foul'
    elif row['launch_speed'] >= 95:
        return 'hard_contact'
    else:
        return 'weak_contact'

# Apply function to create 'target' column
df_ml['target'] = df_ml.apply(classify_target, axis=1)




In [17]:
# Scale numerical features AFTER target assignment
from sklearn.preprocessing import StandardScaler

# Select numeric columns to scale
numeric_features = ['release_speed', 'launch_speed', 'pfx_z', 'plate_x', 'plate_z', 
                    'zone', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 
                    'sz_bot', 'release_spin_rate', 'release_extension', 'release_pos_y']

# Initialize scaler
scaler = StandardScaler()

# Apply scaling
df_ml[numeric_features] = scaler.fit_transform(df_ml[numeric_features])

# Verify everything
df_ml.head()


Unnamed: 0,description,events,release_speed,launch_speed,pfx_z,plate_x,plate_z,zone,vx0,vy0,...,pitch_type_KC,pitch_type_KN,pitch_type_SC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown,pitch_type_nan,target
0,foul,field_out,1.28074,-1.538068,-0.32303,-2.718627,0.465471,1.067556,0.314089,-1.284361,...,False,False,False,True,False,False,False,False,False,foul
1,hit_into_play,double,0.960678,0.418502,1.606665,-0.370886,0.386393,-0.479727,-2.083002,-0.927523,...,False,False,False,True,False,False,False,False,False,weak_contact
2,foul,field_out,0.219482,-0.743446,0.552755,-0.118052,0.671073,-0.479727,-1.678513,-0.212267,...,False,False,False,True,False,False,False,False,False,foul
3,foul,walk,-0.488023,-0.863389,0.062909,0.784925,1.050646,-0.995488,0.161912,0.480596,...,False,False,False,False,True,False,False,False,False,foul
4,foul,field_out,0.674307,0.763338,1.502758,0.170901,-0.103888,-0.479727,0.189637,-0.662092,...,False,False,False,False,False,False,False,False,False,foul


In [19]:
df_ml.drop(columns=['description', 'events', 'batter', 'pitcher', 'at_bat_number', 'game_date'], inplace=True)


In [21]:
df_ml.to_csv('../CSVs/MLB_First_Pitch_Swings_ML_Cleaned.csv', index=False)

df_ml.head()

Unnamed: 0,release_speed,launch_speed,pfx_z,plate_x,plate_z,zone,vx0,vy0,vz0,ax,...,pitch_type_KC,pitch_type_KN,pitch_type_SC,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV,pitch_type_Unknown,pitch_type_nan,target
0,1.28074,-1.538068,-0.32303,-2.718627,0.465471,1.067556,0.314089,-1.284361,0.034084,-1.791612,...,False,False,False,True,False,False,False,False,False,foul
1,0.960678,0.418502,1.606665,-0.370886,0.386393,-0.479727,-2.083002,-0.927523,-1.114983,1.008821,...,False,False,False,True,False,False,False,False,False,weak_contact
2,0.219482,-0.743446,0.552755,-0.118052,0.671073,-0.479727,-1.678513,-0.212267,-0.218773,1.743047,...,False,False,False,True,False,False,False,False,False,foul
3,-0.488023,-0.863389,0.062909,0.784925,1.050646,-0.995488,0.161912,0.480596,0.4507,0.306881,...,False,False,False,False,True,False,False,False,False,foul
4,0.674307,0.763338,1.502758,0.170901,-0.103888,-0.479727,0.189637,-0.662092,-1.779315,-0.414247,...,False,False,False,False,False,False,False,False,False,foul
