In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error
import xgboost as xgb
import pybaseball as pyb
import seaborn as sns

In [2]:
# Load the raw per-season CSV exports from the shared drive.
# `data` holds the 2021 file; the other two names carry their season explicitly.
data = pd.read_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/2021_data.csv")
data_2022 = pd.read_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/2022_data.csv")
# NOTE(review): "2023_MLB_Seaspm.csv" looks like a typo of "Season" — confirm it
# matches the file name on disk before changing it.
data_2023 = pd.read_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/2023_MLB_Seaspm.csv")

In [3]:
# Stack the raw 2022 and 2023 frames row-wise. Row labels are not renumbered,
# so each frame keeps the labels it was loaded with.
testing_data = pd.concat(objs=[data_2022, data_2023], axis="index")

In [26]:
def preprocess_data(df):
    """Build per-pitch features from a raw pitch-level DataFrame.

    The input frame is not modified; a filtered, re-sorted copy is returned.

    Steps, in order:
      1. Drop rows whose ``release_speed`` is null.
      2. Downcast ``int64`` columns to ``int32`` and ``float64`` columns to
         ``float32``.
      3. Collapse ``pitch_type`` codes into ``pitch_type_condensed``; codes
         that are missing or not in the mapping become ``"FAHCK"``.
      4. Add ``rolling_avg_velo`` (mean ``release_speed`` over the current and
         up to four preceding pitches within the same game, pitcher and
         condensed pitch type) and ``seasonal_avg_velo`` (mean
         ``release_speed`` per pitcher and condensed pitch type over the whole
         frame passed in).
      5. Add 0/1 flags ``swing``, ``whiff``, ``contact``, ``strike``, ``foul``
         (from ``description``) and ``fly_ball``, ``ground_ball``,
         ``line_drive`` (from ``bb_type``).
      6. Add ``total_movement``, ``release_pos_y`` (overwriting any existing
         column of that name), ``height_ratio`` and ``pitch_id_raw``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``release_speed``, ``pitch_type``,
        ``game_pk``, ``pitcher``, ``batter``, ``at_bat_number``,
        ``pitch_number``, ``inning``, ``description``, ``bb_type``, ``pfx_x``,
        ``pfx_z``, ``release_extension``, ``release_pos_z`` and
        ``height_numeric``.

    Returns
    -------
    pandas.DataFrame
        The processed copy, sorted by game, pitcher, condensed pitch type,
        at-bat number and pitch number.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.loc[df['release_speed'].notnull()].copy()

    # Downcast for memory efficiency.
    # NOTE(review): assumes every int64 value fits in int32 — confirm for ID columns.
    int_cols = df.select_dtypes(include=['int64']).columns
    float_cols = df.select_dtypes(include=['float64']).columns
    df[int_cols] = df[int_cols].astype('int32')
    df[float_cols] = df[float_cols].astype('float32')

    # Condense pitch types using a mapping dictionary.
    pitch_type_map = {
        "FF": "FF", "FT": "SI", "FC": "CT",
        "SL": "SL", "ST": "SL", "SV": "SL",
        "CH": "CH", "FS": "CH",
        "CU": "CB", "KC": "CB", "CS": "CB", "CB": "CB",
        "SI": "SI", "KN": "KN"
    }
    df['pitch_type_condensed'] = df['pitch_type'].map(pitch_type_map).fillna("FAHCK")

    # Pitcher velocity stats. The rolling window is order-sensitive, so pitches
    # must be chronological within each group. `at_bat_number` is part of the
    # sort key because `pitch_number` alone does not identify a pitch within a
    # game (`pitch_id_raw` below needs both); sorting on `pitch_number` only
    # interleaves pitches from different at-bats.
    group_keys = ['game_pk', 'pitcher', 'pitch_type_condensed']
    df = df.sort_values(by=group_keys + ['at_bat_number', 'pitch_number'])
    df['rolling_avg_velo'] = df.groupby(group_keys)['release_speed'].transform(
        lambda speeds: speeds.rolling(5, min_periods=1).mean()
    )
    seasonal_avg = (
        df.groupby(['pitcher', 'pitch_type_condensed'])['release_speed']
        .mean()
        .rename('seasonal_avg_velo')
    )
    df = df.join(seasonal_avg, on=['pitcher', 'pitch_type_condensed'])
    # Assign the result instead of calling fillna(inplace=True) on a column
    # selection: the in-place form acts on a temporary and is not guaranteed to
    # update `df`.
    df['rolling_avg_velo'] = df['rolling_avg_velo'].fillna(df['seasonal_avg_velo'])

    # Outcome categories, keyed on the `description` column.
    contact_conditions = ["foul_tip", "foul_bunt", "foul", "foul_pitchout", "hit_into_play", "hit_into_play_no_out", "hit_into_play_score", "pitchout_hit_into_play_score"]
    whiff_conditions = ["swinging_strike", "swinging_strike_blocked", "missed_bunt", "swinging_pitchout"]
    # A swing is either a whiff or contact.
    swing_conditions = whiff_conditions + contact_conditions
    # "foul_bunt" replaces a duplicated "foul_pitchout" so that bunt fouls are
    # flagged as fouls, consistent with the contact and strike lists.
    foul_conditions = ["foul_tip", "foul_bunt", "foul", "foul_pitchout"]
    strike_conditions = ["called_strike", "swinging_strike", "swinging_strike_blocked", "foul_tip", "foul", "foul_bunt", "foul_pitchout"]

    # Dict order fixes the order in which the flag columns are appended.
    description_flags = {
        'swing': swing_conditions,
        'whiff': whiff_conditions,
        'contact': contact_conditions,
        'strike': strike_conditions,
        'foul': foul_conditions,
    }
    for flag_name, descriptions in description_flags.items():
        df[flag_name] = df['description'].isin(descriptions).astype(int)

    # Batted-ball type flags; popups count as fly balls.
    df['fly_ball'] = df['bb_type'].isin(["fly_ball", "popup"]).astype(int)
    df['ground_ball'] = (df['bb_type'] == "ground_ball").astype(int)
    df['line_drive'] = (df['bb_type'] == "line_drive").astype(int)

    # Additional metrics.
    df['total_movement'] = np.sqrt(df['pfx_x']**2 + df['pfx_z']**2)
    # NOTE(review): 60.5 is presumably the rubber-to-plate distance in feet — confirm units.
    df['release_pos_y'] = 60.5 - df['release_extension']
    df['height_ratio'] = df['release_pos_z'] / df['height_numeric']

    # Underscore-joined identifier for each pitch.
    df['pitch_id_raw'] = df['game_pk'].astype(str) + "_" + df['batter'].astype(str) + "_" + df['pitcher'].astype(str) + "_" + df['pitch_number'].astype(str) + "_" + df['at_bat_number'].astype(str) + "_" + df['inning'].astype(str)

    return df

In [27]:
# Preprocess each season's raw frame and write the result to its own CSV.
# `breaking_ball` is rebound on every pass, so after the loop it holds the
# processed 2023 frame — the same final state the separate per-season cells left.
for season_label, season_raw in (("2021", data), ("2022", data_2022), ("2023", data_2023)):
    breaking_ball = preprocess_data(season_raw)
    breaking_ball.to_csv(
        f"Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/{season_label}_processed.csv",
        index=False,
    )