In [1]:
import pandas as pd
import json
from sklearn.preprocessing import StandardScaler
import warnings
import numpy as np

# Suppress pandas warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

class DataProcessor:
    def __init__(self, raw_data, config_path, one_hot = True, seed=42):
        np.random.seed(seed)

        self.raw_data = raw_data
        self.config = self.load_config(config_path)
        self.selected_columns = list(self.config.keys())
        self.processed_data = self.raw_data[self.selected_columns].copy()
        self.one_hot = one_hot
        
        print(f"Selected Columns: {self.selected_columns}")
        print(f'Raw Data Shape: {self.raw_data.shape}')


    
    def load_config(self, config_path):
        with open(config_path, 'r') as file:
            config = json.load(file)
        return config
    
    def handle_missing_values(self):
        for column, settings in self.config.items():
            if column == 'events':  # Handle 'events' column separately
                events_to_map = set(['caught_stealing_2b', 'caught_stealing_3b', 'caught_stealing_home', 'stolen_base_2b', 'stolen_base_3b', 'stolen_base_home', 'passed_ball'])
                self.processed_data[column] = self.raw_data.apply(
                    lambda row: row['type'] if pd.isnull(row[column]) or row[column] in events_to_map else row[column], axis=1)

            initial_missing_count = self.processed_data[column].isnull().sum()
            strategy = settings.get('missing_value_strategy', 'drop')
            if strategy == 'drop':
                self.processed_data.dropna(subset=[column], inplace=True)
            elif strategy in ['mean', 'median', 'mode']:
                if strategy == 'mean':
                    fill_value = self.processed_data[column].mean()
                elif strategy == 'median':
                    fill_value = self.processed_data[column].median()
                elif strategy == 'mode':
                    fill_value = self.processed_data[column].mode().iloc[0]
                self.processed_data[column].fillna(fill_value, inplace=True)
            elif isinstance(strategy, (int, float, str)):
                self.processed_data[column].fillna(strategy, inplace=True)

            final_missing_count = self.processed_data[column].isnull().sum()
            handled_count = initial_missing_count - final_missing_count
            print(f"Handled {handled_count} missing values in column '{column}'")
            print(f"Missing values after handling in column '{column}': {final_missing_count}")

    def convert_data_types(self):
        for column, settings in self.config.items():
            if settings.get('categorical', False):
                initial_row_count = self.processed_data.shape[0]
                if 'value_map' in settings:
                    value_map = settings['value_map']
                    self.processed_data[column] = self.processed_data[column].replace(value_map)
                if 'drop_values' in settings:
                    drop_values = settings['drop_values']
                    self.processed_data = self.processed_data[~self.processed_data[column].isin(drop_values)]
                    final_row_count = self.processed_data.shape[0]
                    dropped_rows = initial_row_count - final_row_count
                    print(f"Dropped {dropped_rows} rows for column '{column}' due to drop values")
                self.processed_data[column] = self.processed_data[column].astype('category')
            elif 'datetime_format' in settings:
                self.processed_data[column] = pd.to_datetime(self.processed_data[column], format=settings['datetime_format'])
        
        print(f"Processed Data Shape after convert_data_types: {self.processed_data.shape}")

    def standardize_or_normalize(self):
        scaler = StandardScaler()
        for column, settings in self.config.items():
            if settings.get('standardize', False):
                self.processed_data[[column]] = scaler.fit_transform(self.processed_data[[column]])
            elif settings.get('normalize', False):
                self.processed_data[[column]] = (self.processed_data[column] - self.processed_data[column].min()) / (self.processed_data[column].max() - self.processed_data[column].min())
        print(f"Processed Data Shape after standardize_or_normalize: {self.processed_data.shape}")

    def one_hot_encode(self):
        categorical_columns = [col for col, settings in self.config.items() if settings.get('categorical', False) and not settings.get('metadata', False)]

        if categorical_columns:
            # Check for NaN values before encoding
            for col in categorical_columns:
                missing_values_count = self.processed_data[col].isnull().sum()
                if missing_values_count > 0:
                    print(f"Column '{col}' has {missing_values_count} missing values before encoding.")
            
            # Perform one-hot encoding using pd.get_dummies
            self.processed_data = pd.get_dummies(data=self.processed_data, columns=categorical_columns, prefix=categorical_columns, drop_first=True)
            

        print(f"Processed Data Shape after one_hot_encode: {self.processed_data.shape}")


    def get_processed_data(self):
        self.handle_missing_values()
        self.convert_data_types()
        self.standardize_or_normalize()
        if self.one_hot:
            self.one_hot_encode()
        print(f'New Data Shape: {self.processed_data.shape}')
        return self.processed_data

In [2]:
raw_data = pd.read_csv('statcast_2015-2024.csv')

  raw_data = pd.read_csv('statcast_2015-2024.csv')


In [3]:
#save 2023-2024 for validation
train = raw_data[raw_data['game_date'] < "2023-04-01"]
valid = raw_data[raw_data['game_date'] >= "2023-04-01"]

In [4]:
valid['pitch_type'].value_counts()

pitch_type
FF    355782
SL    187129
SI    173401
CH    118728
FC     88417
CU     74616
ST     58292
FS     28223
KC     20579
SV      3705
FA      1438
FO       927
KN       807
EP       696
SC        79
PO        71
CS        57
Name: count, dtype: int64

In [5]:
train['pitch_type'].value_counts()

pitch_type
FF    1914736
SI     989378
SL     875871
CH     579166
CU     429280
FC     338488
KC     136480
FS      85735
ST      56702
SV      13162
KN      11704
IN       6353
FA       2900
EP       1536
FO        846
PO        789
CS        606
SC        151
AB          3
Name: count, dtype: int64

In [6]:
config_path = 'config.json'

train_processor = DataProcessor(train, config_path)
train_data = train_processor.get_processed_data()

valid_processor = DataProcessor(valid, config_path)
valid_data = valid_processor.get_processed_data()

Selected Columns: ['events', 'launch_speed', 'pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z', 'batter', 'pitcher', 'stand', 'p_throws', 'hit_location', 'balls', 'strikes', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'outs_when_up', 'hc_x', 'hc_y', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'launch_angle', 'release_spin_rate', 'release_extension', 'game_pk', 'release_pos_y', 'at_bat_number', 'batter_name', 'pitcher_name']
Raw Data Shape: (5527000, 87)
Handled 0 missing values in column 'events'
Missing values after handling in column 'events': 0
Handled 3984227 missing values in column 'launch_speed'
Missing values after handling in column 'launch_speed': 0
Handled 83114 missing values in column 'pitch_type'
Missing values after handling in column 'pitch_type': 0
Handled 0 missing values in column 'game_date'
Missing values after handling in column 'game_date': 0
Handled 80910 missing values in column 'release_speed'
Missing values after handling in col

In [7]:
set(train_data.columns).difference(set(valid_data.columns))


set()

In [8]:
valid_data.columns

Index(['launch_speed', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'batter', 'pitcher', 'pfx_x', 'pfx_z', 'plate_x',
       'plate_z', 'hc_x', 'hc_y', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top',
       'sz_bot', 'launch_angle', 'release_spin_rate', 'release_extension',
       'game_pk', 'release_pos_y', 'at_bat_number', 'batter_name',
       'pitcher_name', 'events_S', 'events_double', 'events_field_out',
       'events_hit_by_pitch', 'events_home_run', 'events_single',
       'events_strikeout', 'events_triple', 'events_walk', 'pitch_type_CS',
       'pitch_type_CU', 'pitch_type_EP', 'pitch_type_FA', 'pitch_type_FC',
       'pitch_type_FF', 'pitch_type_FO', 'pitch_type_FS', 'pitch_type_KC',
       'pitch_type_KN', 'pitch_type_PO', 'pitch_type_SC', 'pitch_type_SI',
       'pitch_type_SL', 'pitch_type_ST', 'pitch_type_SV', 'stand_R',
       'p_throws_R', 'hit_location_1.0', 'hit_location_2.0',
       'hit_location_3.0', 'hit_location_4.0', 'hit_location_5.0',
  

In [9]:
train_data.to_csv("statcast_2015-2023_cleaned.csv",index=False)
valid_data.to_csv("statcast_2023-2024_cleaned.csv",index=False)