In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error
import xgboost as xgb
import pybaseball as pyb
import seaborn as sns

In [2]:
# Load the processed per-season pitch tables. `data` is the 2021 season;
# the other two names carry their season in the variable name.
data, data_2022, data_2023 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/2021_processed.csv",
        "Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/2022_processed.csv",
        "Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/2023_processed.csv",
    )
)

In [3]:
# Stack the 2022 and 2023 seasons row-wise (2022 rows first). Each frame keeps
# its own row labels, so labels can repeat in the combined frame.
seasons_to_stack = [data_2022, data_2023]
testing_data = pd.concat(seasons_to_stack)

In [4]:
def compute_differences(df):
    """Add previous-fastball reference columns and their differences to a pitch table.

    Rows whose ``pitch_type_condensed`` is 'FF', 'SI' or 'CT' are taken in their
    existing row order. For each such row, the values of the tracked columns
    (spin_axis, pfx_x, pfx_z, release_pos_z, release_speed) from the same
    pitcher's previous such row are stored as ``last_<col>``. ``diff_<col>`` is
    then ``<col> - last_<col>``.

    Rows of any other pitch type, and each pitcher's first FF/SI/CT row, get NaN
    in every ``last_*`` and ``diff_*`` column. Grouping is by pitcher only, so
    the "previous" row may belong to an earlier game.
    NOTE(review): the results are written to an "off_speed" folder, so the
    intent may have been to carry the latest fastball forward onto off-speed
    rows as well -- confirm with the author before changing this.

    The previous-pitch values are attached by row position instead of by a
    merge on (game_pk, pitcher, pitch_number). That key is not guaranteed to be
    unique, and a merge on a non-unique key duplicates rows and pairs them with
    the wrong previous pitch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'pitch_type_condensed', 'pitcher' and the five tracked
        columns. Rows are used in the order given; no sorting is done here, so
        the caller is responsible for chronological order. Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a fresh 0..n-1 index, the five ``last_*`` columns
        followed by the five ``diff_*`` columns appended, float64 columns cast
        to float32, and int64 columns cast to int32 when all their values fit.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    relevant_pitch_types = ['FF', 'SI', 'CT']
    cols_to_diff = ['spin_axis', 'pfx_x', 'pfx_z', 'release_pos_z', 'release_speed']

    # New frame with a unique positional index: keeps the caller's frame
    # untouched and makes the index-aligned assignments below exact.
    result = df.reset_index(drop=True)

    # Previous FF/SI/CT row of the same pitcher; the shifted frame keeps the
    # row labels of the rows it was computed from.
    is_relevant = result['pitch_type_condensed'].isin(relevant_pitch_types)
    previous = result[is_relevant].groupby('pitcher')[cols_to_diff].shift()

    # Assignment aligns on the index, so rows outside the subset receive NaN.
    for col in cols_to_diff:
        result[f'last_{col}'] = previous[col]

    for col in cols_to_diff:
        result[f'diff_{col}'] = result[col] - result[f'last_{col}']

    # Downcast to halve memory use. Integer columns are only narrowed when every
    # value is representable as int32; anything larger stays int64.
    int_cols = result.select_dtypes(include=['int64']).columns
    float_cols = result.select_dtypes(include=['float64']).columns
    int32_range = np.iinfo(np.int32)
    dtype_map = {col: 'float32' for col in float_cols}
    for col in int_cols:
        values = result[col]
        # Written as a negated "out of range" test so an empty column
        # (min/max are NaN) is still cast, as before.
        if not (values.min() < int32_range.min or values.max() > int32_range.max):
            dtype_map[col] = 'int32'

    return result.astype(dtype_map)


In [5]:
# Run compute_differences on the whole 2021 frame (`data` is read from
# 2021_processed.csv); the result gains the last_* and diff_* columns.
breaking_ball = compute_differences(data)

In [6]:
# Write the 2021 result to CSV without the row index. `breaking_ball` is
# reassigned for later seasons further down, so run this cell right after the
# 2021 compute cell.
breaking_ball.to_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/off_speed/2021_off_processed.csv", index=False)

In [9]:
# Same transformation for the 2022 season. This rebinds `breaking_ball`, so the
# 2021 result is no longer reachable under that name.
breaking_ball = compute_differences(data_2022)

In [10]:
# Write the 2022 result to CSV without the row index. `breaking_ball` must hold
# the 2022 result at this point (the name is reused for every season).
breaking_ball.to_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/off_speed/2022_off_processed.csv", index=False)

In [None]:
# Same transformation for the 2023 season. This rebinds `breaking_ball`, so the
# 2022 result is no longer reachable under that name.
breaking_ball = compute_differences(data_2023)

In [None]:
# Write the 2023 result to CSV without the row index. `breaking_ball` must hold
# the 2023 result at this point (the name is reused for every season).
breaking_ball.to_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/off_speed/2023_off_processed.csv", index=False)