In [2]:
import pandas as pd
import numpy as np
import pybaseball
from datetime import datetime, timedelta
import warnings
from typing import Dict, List, Tuple, Optional
import pickle
import os
warnings.filterwarnings('ignore')

In [5]:
import pandas as pd
import numpy as np
import pybaseball
from datetime import datetime, timedelta
import warnings
from typing import Dict, List, Tuple, Optional
import pickle
import os

warnings.filterwarnings('ignore')

class StatcastProcessor:
    """
    Comprehensive Statcast data processor for pitch-level run value analysis.
    Implements both MVP (PA-level) and Pro (pitch-level) run expectancy methods.
    """
    
    def __init__(self, use_cache: bool = True):
        self.use_cache = use_cache
        self.re24_table = self._initialize_standard_re24_table()  # Standard RE24 table
        self.re288_table = self._initialize_standard_re288_table()  # Standard RE288 table
        self.data = None
        
    def fetch_statcast_data(self, 
                           start_date: str, 
                           end_date: str, 
                           pitcher_ids: Optional[List[int]] = None,
                           sample_pitchers: bool = True) -> pd.DataFrame:
        """
        Fetch Statcast data for specified date range and pitchers.
        
        Args:
            start_date: 'YYYY-MM-DD' format
            end_date: 'YYYY-MM-DD' format  
            pitcher_ids: List of pitcher IDs, if None will sample some pitchers
            sample_pitchers: Whether to sample a few pitchers for fast iteration
        """
        print(f"Fetching Statcast data from {start_date} to {end_date}...")
        
        if pitcher_ids is None and sample_pitchers:
            # Sample some well-known pitchers for testing (1 RHP, 1 LHP)
            pitcher_ids = [676979, 694973]
            
        if pitcher_ids:
            # Fetch specific pitchers
            all_data = []
            for pitcher_id in pitcher_ids:
                print(f"Fetching data for pitcher {pitcher_id}...")
                try:
                    pitcher_data = pybaseball.statcast_pitcher(start_date, end_date, pitcher_id)
                    if not pitcher_data.empty:
                        all_data.append(pitcher_data)
                except Exception as e:
                    print(f"Error fetching pitcher {pitcher_id}: {e}")
                    continue
            
            if all_data:
                data = pd.concat(all_data, ignore_index=True)
            else:
                print("No data retrieved for specified pitchers")
                return pd.DataFrame()
        else:
            # Fetch league-wide data (use with caution - very large)
            data = pybaseball.statcast(start_date, end_date)
            
        print(f"Retrieved {len(data)} pitches")
        return data
    
    def clean_and_subset_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Clean and subset raw Statcast data, keeping only relevant columns.
        """
        print("Cleaning and subsetting data...")
        
        # Define columns to keep
        required_cols = [
            'game_date', 'pitcher', 'batter', 'p_throws', 'stand', 'pitch_type',
            'release_speed', 'spin_rate', 'plate_x', 'plate_z', 'balls', 'strikes',
            'outs_when_up', 'on_1b', 'on_2b', 'on_3b', 'inning', 'events', 
            'description', 'home_score', 'away_score', 'at_bat_number', 'pitch_number',
            'game_pk', 'game_year'
        ]
        
        # Keep only columns that exist in the data
        available_cols = [col for col in required_cols if col in data.columns]
        data = data[available_cols].copy()
        
        # Filter out unwanted pitch types
        exclude_pitches = ['PO', 'IN']  # Pitchouts, Intentional balls
        if 'description' in data.columns:
            data = data[~data['description'].isin(exclude_pitches)].copy()
            
        # Remove obvious tracking errors
        if 'plate_x' in data.columns and 'plate_z' in data.columns:
            data = data[
                (data['plate_x'].between(-3, 3)) & 
                (data['plate_z'].between(-1, 5))
            ].copy()
            
        # Convert game_date to datetime
        if 'game_date' in data.columns:
            data['game_date'] = pd.to_datetime(data['game_date'])
            
        # Fill missing base runners with 0 (no runner)
        base_cols = ['on_1b', 'on_2b', 'on_3b']
        for col in base_cols:
            if col in data.columns:
                data[col] = data[col].fillna(0)
        
        print(f"Cleaned data: {len(data)} pitches remaining")
        return data.sort_values(['game_pk', 'at_bat_number', 'pitch_number']).reset_index(drop=True)
    
    def _initialize_standard_re24_table(self) -> Dict[int, float]:
        """
        Initialize the standard RE24 table provided by the user.
        Run environment set at 4.15 runs per game.
        
        States are encoded as: bases * 3 + outs
        where bases = on_1b*4 + on_2b*2 + on_3b*1
        """
        # Standard RE24 matrix from user
        re_matrix = {
            # (runners, outs): expected_runs
            # Empty bases
            (0, 0): 0.461, (0, 1): 0.243, (0, 2): 0.095,
            # Runner on 1st
            (1, 0): 0.831, (1, 1): 0.489, (1, 2): 0.214,
            # Runner on 2nd  
            (2, 0): 1.068, (2, 1): 0.644, (2, 2): 0.305,
            # Runners on 1st and 2nd
            (3, 0): 1.373, (3, 1): 0.908, (3, 2): 0.343,
            # Runner on 3rd
            (4, 0): 1.426, (4, 1): 0.865, (4, 2): 0.413,
            # Runners on 1st and 3rd
            (5, 0): 1.798, (5, 1): 1.140, (5, 2): 0.471,
            # Runners on 2nd and 3rd
            (6, 0): 1.920, (6, 1): 1.352, (6, 2): 0.570,
            # Bases loaded
            (7, 0): 2.282, (7, 1): 1.520, (7, 2): 0.736
        }
        
        # Convert to state IDs (0-23)
        re24_table = {}
        for (runners, outs), re_value in re_matrix.items():
            state_id = runners * 3 + outs
            re24_table[state_id] = re_value
            
        return re24_table
    
    def _initialize_standard_re288_table(self) -> Dict[int, float]:
        """
        Initialize the standard RE288 table provided by the user.
        This table includes all 288 base-out-count combinations.
        
        States are encoded as: (base_state * 3 + outs) * 12 + (balls * 3 + strikes)
        where base_state = on_1b*1 + on_2b*2 + on_3b*4
        """
        # Standard RE288 matrix from user - organized by base-out state and count
        # Format: [base_state][outs][count] where count is ordered as shown in table
        re288_data = {
            # Empty bases (---)
            (0, 0): [0.51, 0.46, 0.41, 0.56, 0.5, 0.44, 0.62, 0.55, 0.47, 0.75, 0.68, 0.57],  # 0 outs
            (0, 1): [0.27, 0.24, 0.2, 0.3, 0.27, 0.22, 0.34, 0.3, 0.25, 0.42, 0.37, 0.32],   # 1 out
            (0, 2): [0.1, 0.09, 0.06, 0.12, 0.1, 0.07, 0.14, 0.12, 0.09, 0.17, 0.15, 0.13],  # 2 outs
            
            # Runner on 1st (1--)
            (1, 0): [0.89, 0.82, 0.75, 0.98, 0.87, 0.81, 1.09, 0.97, 0.87, 1.25, 1.15, 1.07],
            (1, 1): [0.54, 0.49, 0.42, 0.57, 0.52, 0.44, 0.62, 0.58, 0.5, 0.77, 0.66, 0.62],
            (1, 2): [0.22, 0.18, 0.13, 0.26, 0.22, 0.17, 0.29, 0.24, 0.19, 0.36, 0.32, 0.26],
            
            # Runner on 2nd (-2-)
            (2, 0): [1.14, 1.05, 0.95, 1.19, 1.08, 0.96, 1.27, 1.21, 1.08, 1.41, 1.34, 1.17],
            (2, 1): [0.69, 0.65, 0.59, 0.72, 0.66, 0.59, 0.78, 0.71, 0.68, 0.92, 0.87, 0.71],
            (2, 2): [0.33, 0.29, 0.2, 0.35, 0.3, 0.22, 0.38, 0.31, 0.25, 0.51, 0.38, 0.3],
            
            # Runners on 1st and 2nd (12-)
            (3, 0): [1.47, 1.4, 1.31, 1.53, 1.45, 1.33, 1.67, 1.5, 1.46, 2.0, 1.77, 1.62],
            (3, 1): [0.95, 0.88, 0.79, 1.04, 0.96, 0.81, 1.06, 1.01, 0.87, 1.16, 1.19, 1.04],
            (3, 2): [0.44, 0.38, 0.28, 0.46, 0.41, 0.33, 0.53, 0.46, 0.39, 0.67, 0.58, 0.5],
            
            # Runner on 3rd (--3)
            (4, 0): [1.4, 1.28, 1.13, 1.53, 1.49, 1.47, 1.48, 1.46, 1.34, 1.45, 1.71, 1.49],
            (4, 1): [0.89, 0.91, 0.79, 1.04, 0.98, 0.82, 1.17, 1.03, 0.87, 1.36, 1.19, 0.92],
            (4, 2): [0.37, 0.31, 0.24, 0.4, 0.34, 0.25, 0.44, 0.41, 0.3, 0.46, 0.41, 0.39],
            
            # Runners on 1st and 3rd (1-3)
            (5, 0): [1.76, 1.71, 1.6, 1.78, 1.66, 1.63, 1.94, 1.81, 1.64, 2.09, 2.1, 1.85],
            (5, 1): [1.2, 1.11, 1.01, 1.25, 1.15, 1.12, 1.26, 1.22, 1.16, 1.24, 1.36, 1.25],
            (5, 2): [0.5, 0.43, 0.35, 0.55, 0.47, 0.37, 0.57, 0.54, 0.4, 0.65, 0.66, 0.46],
            
            # Runners on 2nd and 3rd (-23)
            (6, 0): [1.93, 1.89, 1.62, 2.03, 1.88, 1.73, 2.3, 2.02, 1.78, 2.14, 2.06, 1.65],
            (6, 1): [1.37, 1.26, 1.06, 1.42, 1.31, 1.16, 1.45, 1.4, 1.26, 1.53, 1.41, 1.34],
            (6, 2): [0.58, 0.5, 0.34, 0.59, 0.56, 0.41, 0.61, 0.65, 0.42, 0.85, 0.68, 0.56],
            
            # Bases loaded (123)
            (7, 0): [2.28, 2.24, 2.28, 2.34, 2.26, 2.21, 2.35, 2.26, 2.15, 2.81, 2.55, 2.49],
            (7, 1): [1.51, 1.36, 1.18, 1.6, 1.49, 1.3, 1.79, 1.59, 1.5, 2.15, 1.88, 1.53],
            (7, 2): [0.74, 0.59, 0.41, 0.88, 0.68, 0.45, 1.11, 0.86, 0.65, 1.35, 1.09, 1.04]
        }
        
        # Convert to state IDs (0-287)
        re288_table = {}
        count_combinations = [
            (0,0), (0,1), (0,2),  # 0-0, 0-1, 0-2
            (1,0), (1,1), (1,2),  # 1-0, 1-1, 1-2
            (2,0), (2,1), (2,2),  # 2-0, 2-1, 2-2
            (3,0), (3,1), (3,2)   # 3-0, 3-1, 3-2
        ]
        
        for base_state in range(8):  # 0-7 for all base combinations
            for outs in range(3):   # 0-2 outs
                if (base_state, outs) in re288_data:
                    re_values = re288_data[(base_state, outs)]
                    for count_idx, (balls, strikes) in enumerate(count_combinations):
                        # Calculate state ID: (base_state * 3 + outs) * 12 + (balls * 3 + strikes)
                        base_out_state = base_state * 3 + outs
                        count_state = balls * 3 + strikes
                        state_id = base_out_state * 12 + count_state
                        
                        if count_idx < len(re_values):
                            re288_table[state_id] = re_values[count_idx]
                        else:
                            # Fallback for missing values
                            re288_table[state_id] = 0.5
                else:
                    # Fallback for missing base-out combinations
                    for count_idx in range(12):
                        base_out_state = base_state * 3 + outs
                        state_id = base_out_state * 12 + count_idx
                        re288_table[state_id] = 0.5
        
        return re288_table
    
    def _get_runners_state(self, row) -> int:
        """Convert base runners to runners state (0-7)"""
        return int(bool(row.get('on_1b', 0))) * 1 + \
               int(bool(row.get('on_2b', 0))) * 2 + \
               int(bool(row.get('on_3b', 0))) * 4
        """Convert base runners to runners state (0-7)"""
        return int(bool(row.get('on_1b', 0))) * 1 + \
               int(bool(row.get('on_2b', 0))) * 2 + \
               int(bool(row.get('on_3b', 0))) * 4
    def build_base_out_state(self, row) -> int:
        """Convert base-out situation to state ID (0-23 for RE24)"""
        runners = self._get_runners_state(row)
        outs = int(row.get('outs_when_up', 0))
        return runners * 3 + outs
    
    def build_base_out_count_state(self, row) -> int:
        """Convert base-out-count situation to state ID (0-287 for RE288)"""
        base_out_state = self.build_base_out_state(row)
        balls = int(row.get('balls', 0))
        strikes = int(row.get('strikes', 0))
        count_state = balls * 3 + strikes  # 0-11 for possible counts
        return base_out_state * 12 + count_state
    
    def calculate_re24_table(self, data: pd.DataFrame) -> Dict[int, float]:
        """
        DEPRECATED: Use standard RE24 table instead.
        This method is kept for backward compatibility but is no longer used.
        The standard RE24 table is initialized in __init__.
        """
        print("Using standard RE24 table instead of calculating from data...")
        return self.re24_table
    
    def calculate_re288_table(self, data: pd.DataFrame) -> Dict[int, float]:
        """
        DEPRECATED: Use standard RE288 table instead.
        This method is kept for backward compatibility but is no longer used.
        The standard RE288 table is initialized in __init__.
        """
        print("Using standard RE288 table instead of calculating from data...")
        return self.re288_table
    
    def calculate_mvp_run_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate MVP (PA-terminal) run values using standard RE24 table.
        Formula: RE24 = (End_RE + Runs_Scored) - Start_RE
        """
        print("Calculating MVP run values using standard RE24 table...")
            
        data_copy = data.copy()
        data_copy['run_value'] = 0.0
        data_copy['before_pa_re'] = np.nan
        data_copy['after_pa_re'] = np.nan
        data_copy['runs_during_pa'] = 0
        
        # Group by plate appearances
        for game_id in data_copy['game_pk'].unique():
            game_data = data_copy[data_copy['game_pk'] == game_id].copy()
            
            for at_bat in game_data['at_bat_number'].unique():
                pa_data = game_data[game_data['at_bat_number'] == at_bat].copy()
                if pa_data.empty:
                    continue
                    
                pa_indices = pa_data.index
                first_pitch_idx = pa_indices[0]
                last_pitch_idx = pa_indices[-1]
                
                # Get before-PA state from first pitch
                first_pitch = pa_data.iloc[0]
                before_state = self.build_base_out_state(first_pitch)
                before_re = self.re24_table.get(before_state, 0.5)
                
                # Get after-PA state (simplified - would need more logic for actual state transitions)
                last_pitch = pa_data.iloc[-1]
                
                # Determine outcome and calculate runs (simplified)
                # In practice, you'd need to implement full state transition logic
                # based on the actual outcome (events column)
                runs_during_pa = 0
                after_re = 0.5  # Placeholder - would need proper state transition logic
                
                # Calculate PA run value using standard formula
                pa_run_value = (after_re + runs_during_pa) - before_re
                
                # Assign to terminal pitch only
                data_copy.loc[last_pitch_idx, 'run_value'] = pa_run_value
                data_copy.loc[pa_indices, 'before_pa_re'] = before_re
                data_copy.loc[last_pitch_idx, 'after_pa_re'] = after_re
                data_copy.loc[last_pitch_idx, 'runs_during_pa'] = runs_during_pa
        
        return data_copy
    
    def calculate_pro_run_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate Pro (pitch-level) run values using standard RE288 table.
        Formula: Pitch Run Value = (PostPitch_RE + Runs_On_Pitch) - PrePitch_RE
        """
        print("Calculating Pro run values using standard RE288 table...")
            
        data_copy = data.copy()
        data_copy['run_value'] = 0.0
        data_copy['pre_pitch_re'] = np.nan
        data_copy['post_pitch_re'] = np.nan
        data_copy['runs_on_pitch'] = 0
        
        # Calculate run value for each pitch
        for idx, row in data_copy.iterrows():
            # Pre-pitch state
            pre_state = self.build_base_out_count_state(row)
            pre_re = self.re288_table.get(pre_state, 0.5)
            
            # Post-pitch state (simplified - would need full state transition logic)
            # In practice, you'd need to implement full state transition logic
            # based on the actual outcome (description, events columns)
            post_re = 0.5  # Placeholder - would need proper state transition logic
            runs_on_pitch = 0  # Placeholder - would extract from actual play outcome
            
            # Calculate pitch run value using standard formula
            pitch_run_value = (post_re + runs_on_pitch) - pre_re
            
            data_copy.loc[idx, 'run_value'] = pitch_run_value
            data_copy.loc[idx, 'pre_pitch_re'] = pre_re
            data_copy.loc[idx, 'post_pitch_re'] = post_re
            data_copy.loc[idx, 'runs_on_pitch'] = runs_on_pitch
        
        return data_copy
    
    def engineer_features(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Engineer features for modeling.
        """
        print("Engineering features...")
        
        data_copy = data.copy()
        
        # Count feature
        data_copy['count'] = data_copy['balls'].astype(str) + '-' + data_copy['strikes'].astype(str)
        
        # Location bins
        if 'plate_x' in data_copy.columns and 'plate_z' in data_copy.columns:
            # Use quantile-based binning
            data_copy['plate_x_bin'] = pd.qcut(data_copy['plate_x'].dropna(), 
                                              q=7, labels=False, duplicates='drop')
            data_copy['plate_z_bin'] = pd.qcut(data_copy['plate_z'].dropna(), 
                                              q=5, labels=False, duplicates='drop')
            data_copy['loc_bin'] = (data_copy['plate_x_bin'].astype(str) + '|' + 
                                   data_copy['plate_z_bin'].astype(str))
        
        # Previous pitch type (lag-1 within pitcher & PA)
        data_copy = data_copy.sort_values(['game_pk', 'pitcher', 'at_bat_number', 'pitch_number'])
        data_copy['prev_pitch_type'] = data_copy.groupby(['game_pk', 'pitcher', 'at_bat_number'])['pitch_type'].shift(1)
        
        # Standardized velocity
        if 'release_speed' in data_copy.columns:
            data_copy['release_speed_z'] = (data_copy['release_speed'] - data_copy['release_speed'].mean()) / data_copy['release_speed'].std()
        
        # Game context features
        if 'home_score' in data_copy.columns and 'away_score' in data_copy.columns:
            data_copy['score_diff'] = data_copy['home_score'] - data_copy['away_score']
        
        # Base-out state ID
        data_copy['base_out_state'] = data_copy.apply(self.build_base_out_state, axis=1)
        
        print(f"Feature engineering complete. Shape: {data_copy.shape}")
        return data_copy
    
    def create_train_test_split(self, data: pd.DataFrame, 
                               train_years: List[int] = [2023],
                               test_years: List[int] = [2024]) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Create temporal train/test split.
        """
        print(f"Creating train/test split. Train: {train_years}, Test: {test_years}")
        
        if 'game_year' in data.columns:
            train_data = data[data['game_year'].isin(train_years)].copy()
            test_data = data[data['game_year'].isin(test_years)].copy()
        else:
            # Fallback to date-based split
            data['year'] = data['game_date'].dt.year
            train_data = data[data['year'].isin(train_years)].copy()
            test_data = data[data['year'].isin(test_years)].copy()
        
        print(f"Train set: {len(train_data)} pitches")
        print(f"Test set: {len(test_data)} pitches")
        
        return train_data, test_data
    
    def final_cleaning(self, data: pd.DataFrame, target_col: str = 'run_value') -> pd.DataFrame:
        """
        Final data cleaning and preparation.
        """
        print("Performing final cleaning...")
        
        data_copy = data.copy()
        
        # Drop rows with missing critical fields
        critical_fields = ['pitch_type', 'balls', 'strikes']
        if 'plate_x' in data_copy.columns:
            critical_fields.extend(['plate_x', 'plate_z'])
            
        initial_count = len(data_copy)
        data_copy = data_copy.dropna(subset=critical_fields)
        print(f"Dropped {initial_count - len(data_copy)} rows with missing critical fields")
        
        # Cap extreme target values
        if target_col in data_copy.columns:
            data_copy[target_col] = data_copy[target_col].clip(-3, 3)
            
        print(f"Final dataset: {len(data_copy)} pitches")
        return data_copy
    
    def run_full_pipeline(self, 
                         start_date: str,
                         end_date: str,
                         pitcher_ids: Optional[List[int]] = None,
                         method: str = 'mvp',  # 'mvp' or 'pro'
                         save_path: str = 'statcast_processed.parquet') -> pd.DataFrame:
        """
        Run the full pipeline from raw data to analysis-ready dataset.
        """
        print(f"Starting full pipeline with {method} method...")
        
        # Step 1: Fetch data
        raw_data = self.fetch_statcast_data(start_date, end_date, pitcher_ids)
        if raw_data.empty:
            print("No data fetched. Exiting.")
            return pd.DataFrame()
        
        # Step 2: Clean and subset
        clean_data = self.clean_and_subset_data(raw_data)
        
        # Step 3: Calculate run values
        if method.lower() == 'mvp':
            data_with_rv = self.calculate_mvp_run_values(clean_data)
        else:  # pro
            data_with_rv = self.calculate_pro_run_values(clean_data)
        
        # Step 4: Engineer features
        featured_data = self.engineer_features(data_with_rv)
        
        # Step 5: Final cleaning
        final_data = self.final_cleaning(featured_data)
        
        # Step 6: Save
        if save_path:
            if save_path.endswith('.parquet'):
                final_data.to_parquet(save_path, index=False)
            else:
                final_data.to_csv(save_path, index=False)
            print(f"Saved processed data to {save_path}")
        
        self.data = final_data
        print("Pipeline complete!")
        return final_data

# Example usage and testing
if __name__ == "__main__":
    # Initialize processor
    processor = StatcastProcessor()
    
    # Run pipeline for sample pitchers
    try:
        # Test with a small date range first
        processed_data = processor.run_full_pipeline(
            start_date='2025-04-01',
            end_date='2025-08-01',
            pitcher_ids=[676979, 694973],  
            method='mvp',
            save_path='test_statcast_data.parquet'
        )
        
        print("\nSample of processed data:")
        print(processed_data.head())
        print(f"\nFeatures available: {list(processed_data.columns)}")
        print(f"\nRun value stats:")
        if 'run_value' in processed_data.columns:
            print(processed_data['run_value'].describe())
            
        # Create train/test split
        train_data, test_data = processor.create_train_test_split(
            processed_data,
            train_years=[2025],
            test_years=[2025]  # Same year for demo
        )
        
        print(f"\nTrain/test split complete:")
        print(f"Training set: {len(train_data)} pitches")
        print(f"Test set: {len(test_data)} pitches")
        
    except Exception as e:
        print(f"Error running pipeline: {e}")
        print("Make sure you have pybaseball installed: pip install pybaseball")

Starting full pipeline with mvp method...
Fetching Statcast data from 2025-04-01 to 2025-08-01...
Fetching data for pitcher 676979...
Gathering Player Data
Fetching data for pitcher 694973...
Gathering Player Data
Retrieved 4090 pitches
Cleaning and subsetting data...
Cleaned data: 4083 pitches remaining
Calculating MVP run values using standard RE24 table...
Engineering features...
Feature engineering complete. Shape: (4083, 36)
Performing final cleaning...
Dropped 0 rows with missing critical fields
Final dataset: 4083 pitches
Saved processed data to test_statcast_data.parquet
Pipeline complete!

Sample of processed data:
   game_date  pitcher  batter p_throws stand pitch_type  release_speed  \
0 2025-07-27   694973  682998        R     L         FF           98.3   
1 2025-07-27   694973  682998        R     L         FF           97.3   
2 2025-07-27   694973  682998        R     L         FS           92.9   
3 2025-07-27   694973  682998        R     L         FF           97.5  

In [11]:
df = pd.read_parquet(r"C:\Users\dmari\Downloads\bbayes\notebooks\test_statcast_data.parquet")
pd.set_option('display.max_columns', None)
df

Unnamed: 0,game_date,pitcher,batter,p_throws,stand,pitch_type,release_speed,plate_x,plate_z,balls,strikes,outs_when_up,on_1b,on_2b,on_3b,inning,events,description,home_score,away_score,at_bat_number,pitch_number,game_pk,game_year,run_value,before_pa_re,after_pa_re,runs_during_pa,count,plate_x_bin,plate_z_bin,loc_bin,prev_pitch_type,release_speed_z,score_diff,base_out_state
0,2025-07-27,694973,682998,R,L,FF,98.3,0.20,2.84,0,0,0,0.0,0.0,0.0,1,,swinging_strike,0,0,1,1,776984,2025,0.000,0.461,,0,0-0,4,3,4|3,,1.053101,0,0
1,2025-07-27,694973,682998,R,L,FF,97.3,-1.21,3.34,0,1,0,0.0,0.0,0.0,1,,ball,0,0,1,2,776984,2025,0.000,0.461,,0,0-1,0,4,0|4,FF,0.868199,0,0
2,2025-07-27,694973,682998,R,L,FS,92.9,-0.98,1.52,1,1,0,0.0,0.0,0.0,1,,ball,0,0,1,3,776984,2025,0.000,0.461,,0,1-1,0,0,0|0,FF,0.054628,0,0
3,2025-07-27,694973,682998,R,L,FF,97.5,-0.06,3.17,2,1,0,0.0,0.0,0.0,1,,swinging_strike,0,0,1,4,776984,2025,0.000,0.461,,0,2-1,3,4,3|4,FS,0.905179,0,0
4,2025-07-27,694973,682998,R,L,ST,84.9,0.07,2.45,2,2,0,0.0,0.0,0.0,1,strikeout,swinging_strike,0,0,1,5,776984,2025,0.039,0.461,0.5,0,2-2,3,2,3|2,FF,-1.424590,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4078,2025-04-02,676979,622761,L,R,CH,88.6,1.25,2.19,0,1,0,602104.0,0.0,0.0,8,,ball,0,3,58,2,778483,2025,0.000,0.831,,0,0-1,6,2,6|2,FF,-0.740452,-3,3
4079,2025-04-02,676979,622761,L,R,FC,89.3,0.69,2.76,1,1,0,602104.0,0.0,0.0,8,,called_strike,0,3,58,3,778483,2025,0.000,0.831,,0,1-1,5,3,5|3,CH,-0.611020,-3,3
4080,2025-04-02,676979,622761,L,R,FF,94.7,0.09,3.81,1,2,0,602104.0,0.0,0.0,8,field_out,hit_into_play,0,3,58,4,778483,2025,-0.331,0.831,0.5,0,1-2,3,4,3|4,FC,0.387453,-3,3
4081,2025-04-02,676979,656775,L,L,SI,94.9,1.08,2.98,0,0,1,602104.0,0.0,0.0,8,,foul,0,3,59,1,778483,2025,0.000,0.489,,0,0-0,6,3,6|3,,0.424433,-3,4
