#### cleaning

In [1]:
import pybaseball
import pandas as pd
from pybaseball import statcast
import os
import numpy as np
pybaseball.cache.enable()

In [2]:
# data = pybaseball.statcast(start_dt="2024-03-01", end_dt="2024-10-01")
# data.to_csv('statcast_pitch_2024.csv')

### import, clean, and select cols

The Statcast dataset was created with the pybaseball library. The download takes a while, so it was exported to CSV once and is loaded from that file here. More data points will be used when building the model.

In [3]:
# set working directory
# NOTE(review): this is a machine-specific absolute path; every relative path
# used below depends on it, so the notebook will not run elsewhere as written.
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB/pitch_value')

# import datasets
# presumably statcast_pitch_2024.csv is the export produced by the commented-out
# pybaseball.statcast(...) cell above — confirm it was moved into ./data/datasets
data = pd.read_csv('./data/datasets/statcast_pitch_2024.csv')
# NOTE(review): count_data is not referenced in the cells visible here — confirm it is still needed
count_data = pd.read_csv('./data/datasets/combined_counts.csv')
# work on a copy so the raw frame in `data` stays untouched
statcast = data.copy()

Clean the data to keep only the needed columns. For the scope of this project I only want to examine the value of a pitch irrespective of the performance of the batter. The chosen stats represent this aim and create a dataset that is much more manageable, with only 25 features compared to the original 113. 
<br> 
Also applied some basic feature engineering so it is easier to analyze swinging and called strikes, which are the most valuable outcomes a pitch can produce from a pure pitching perspective.

In [4]:
# keep only the cols relevant to the actual pitch itself
cols_to_keep = ['player_name', 'pitch_name', 'effective_speed', 'release_pos_x', 'release_pos_z',
                 'description', 'type', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'vx0', 'vy0', 'ax', 'ay', 'az', 
                 'launch_speed', 'launch_angle', 'sz_top', 'sz_bot', 'release_spin_rate', 'release_pos_y', 'spin_axis', 
                 'estimated_woba_using_speedangle', 'arm_angle']

# Select from the untouched raw frame (`data`) and rename without inplace=True.
# Previously this cell overwrote `statcast` with a slice of itself and renamed it
# in place, so re-running the cell failed with a KeyError (the original column
# names were already gone) and the in-place edit of a slice could trigger
# SettingWithCopyWarning. Deriving from `data` makes the cell safe to re-run.
#
# NOTE(review): 'vy0' is renamed to 'velo_in_vert'; in Statcast vy0 appears to be
# the velocity along the y-axis (toward home plate), not the vertical component
# (vz0). The name is kept for downstream compatibility — confirm.
statcast = data[cols_to_keep].rename(columns={
    'effective_speed': 'pitch_velo_adj',
    'release_pos_x': 'horz_release_pos',
    'release_pos_z': 'vertical_release_pos',
    'pfx_x': 'horz_movement',
    'pfx_z': 'vertical_movement',
    'plate_x': 'horz_position_of_pitch',
    'plate_z': 'vertical_pos_of_pitch',
    'vx0': 'velo_in_horz',
    'vy0': 'velo_in_vert',
    'release_spin_rate': 'release_spin',
    'release_pos_y': 'release_position',
    'estimated_woba_using_speedangle': 'estimated_woba',
})

def clean_and_reorder(df):
    """Add strike-outcome flags and return the frame in a fixed column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Pitch-level data that must contain a 'description' column. Any other
        column named in the output order that is absent is created and filled
        with None.

    Returns
    -------
    pandas.DataFrame
        A new frame holding exactly the 25 columns listed in ``columns`` below,
        in that order. Columns of ``df`` that are not in the list are dropped.
        The input frame is left unmodified.

    Raises
    ------
    KeyError
        If ``df`` has no 'description' column.
    """
    # Define the desired column order
    columns = ['player_name','pitch_name', 'description', 'called_strike', 'swinging_strike', 'estimated_woba', 'pitch_velo_adj', 
               'horz_position_of_pitch', 'vertical_pos_of_pitch','horz_release_pos', 'vertical_release_pos', 'horz_movement', 'vertical_movement', 
               'velo_in_horz', 'velo_in_vert', 'ax', 'ay', 'az', 'release_spin', 'release_position', 'spin_axis', 'launch_speed', 'launch_angle', 'sz_top', 'sz_bot']

    # assign() returns a new frame, so the caller's DataFrame is never mutated
    # (the earlier version added these columns to the input in place).
    # NOTE(review): only the exact description 'swinging_strike' is flagged;
    # variants such as 'swinging_strike_blocked' are not — confirm this is intended.
    out = df.assign(
        called_strike=df['description'] == 'called_strike',
        swinging_strike=df['description'] == 'swinging_strike',
    )

    # Ensure all columns exist; missing ones are filled with None as before
    for col in columns:
        if col not in out.columns:
            out[col] = None

    # Reorder (and implicitly drop anything not listed)
    return out[columns]

# Add the called/swinging strike flags and reduce the frame to the ordered column list
statcast = clean_and_reorder(statcast)

# Preview the first rows of the cleaned frame
print(statcast.head())

     player_name       pitch_name      description  called_strike  \
0  Brieske, Beau         Changeup    hit_into_play          False   
1  Brieske, Beau         Changeup  swinging_strike          False   
2  Brieske, Beau         Changeup             ball          False   
3  Brieske, Beau  4-Seam Fastball             foul          False   
4  Brieske, Beau         Changeup     blocked_ball          False   

   swinging_strike  estimated_woba  pitch_velo_adj  horz_position_of_pitch  \
0            False           0.498            87.6                    0.21   
1             True             NaN            86.5                   -1.30   
2            False             NaN            89.1                    0.37   
3            False             NaN            96.5                   -0.40   
4            False             NaN            87.9                   -0.05   

   vertical_pos_of_pitch  horz_release_pos  ...         ax         ay  \
0                   2.19             -1.65 

### binary strike or ball

Add a new column that is a binary flag for whether the pitch is inside or outside the strike zone.

In [5]:
def is_strike(row):
    """Return whether a single pitch crossed the plate inside the strike zone.

    ``row`` is any mapping-like record (e.g. a DataFrame row) exposing
    'horz_position_of_pitch', 'vertical_pos_of_pitch', 'sz_bot' and 'sz_top'.
    All bounds are inclusive; a NaN coordinate yields a falsy result.
    """
    # presumably half of the 17-inch plate width expressed in feet — confirm
    half_plate_width = 0.70833

    plate_x = row['horz_position_of_pitch']
    plate_z = row['vertical_pos_of_pitch']

    # Left/right: the pitch must be within half a plate of the centre line
    within_width = abs(plate_x) <= half_plate_width

    # Up/down: the pitch must sit between the bottom and top of the zone
    within_height = row['sz_bot'] <= plate_z <= row['sz_top']

    return within_width and within_height

# Flag pitches located inside the strike zone.
# Computed column-wise instead of with a row-by-row apply(axis=1), which is very
# slow on a full season of pitches. The inclusive bounds mirror is_strike():
# |x| <= 0.70833 horizontally and sz_bot <= z <= sz_top vertically. Rows with a
# missing coordinate compare False, the same as the row-wise version.
statcast['in_strike_zone'] = (
    statcast['horz_position_of_pitch'].between(-0.70833, 0.70833)
    & statcast['vertical_pos_of_pitch'].between(statcast['sz_bot'], statcast['sz_top'])
)

# Preview the inputs next to the resulting flag
print(statcast[['horz_position_of_pitch', 'vertical_pos_of_pitch', 'sz_top', 'sz_bot', 'in_strike_zone']].head())

   horz_position_of_pitch  vertical_pos_of_pitch  sz_top  sz_bot  \
0                    0.21                   2.19    3.55    1.68   
1                   -1.30                   2.39    3.55    1.68   
2                    0.37                   1.05    3.52    1.68   
3                   -0.40                   2.83    3.55    1.68   
4                   -0.05                   0.63    3.49    1.68   

   in_strike_zone  
0            True  
1           False  
2           False  
3            True  
4           False  


### export to csv

In [6]:
# Write the cleaned frame (including in_strike_zone) to CSV; index=False leaves out the row index.
# presumably "unfilled" means missing values have not been imputed yet — confirm
statcast.to_csv('./data/datasets/cleaned_pitch_unfilled.csv', index=False)