In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error
import xgboost as xgb
#import pybaseball as pyb
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load one processed CSV per season (2021, 2022, 2023 according to the file names).
# NOTE(review): `data` is the 2021 season even though its name carries no year suffix.
# NOTE(review): the absolute Y:/ paths mean this cell only runs where that drive is available.
data = pd.read_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/2021_processed.csv")
data_2022 = pd.read_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/2022_processed.csv")
data_2023 = pd.read_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/2023_processed.csv")

In [3]:
def compute_differences_with_ff_si_comparison(df):
    """Attach "most recent fastball" reference values and differences to each row.

    Rows are handled in their existing order. A row whose ``pitch_type`` is
    ``'FF'`` or ``'SI'`` is treated as a fastball. For every column in
    ``cols_to_diff`` two columns are written:

    * ``last_<col>`` -- on non-fastball rows, the value ``<col>`` had on the
      most recent fastball row above it. NaN on fastball rows and on rows
      with no fastball above them.
    * ``diff_<col>`` -- ``<col>`` minus the value on the most recent fastball
      row strictly above the current row. This is filled for fastball and
      non-fastball rows alike. NaN when there is no earlier fastball or when
      either operand is missing.

    "Most recent fastball" means the latest FF/SI row anywhere above the
    current row; it does not have to be the immediately preceding pitch.
    A fastball row with a missing value still becomes the reference, so the
    rows that follow it get NaN rather than falling back to an older fastball.

    NOTE(review): the lookup is never reset (for example between pitchers or
    games, if the frame holds several) -- confirm the input ordering and
    grouping are what the analysis intends.

    Finally, ``float64`` columns are downcast to ``float32`` and ``int64``
    columns to ``int32``. An integer column is only downcast when all of its
    values fit in int32, so large identifiers are not silently wrapped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pitch_type`` and the numeric columns ``spin_axis``,
        ``pfx_x``, ``pfx_z``, ``release_pos_z`` and ``release_speed``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is modified in place, so the
        caller's frame gains the ``last_*`` / ``diff_*`` columns and the
        narrower dtypes as well.

    Raises
    ------
    KeyError
        If ``pitch_type`` or one of the compared columns is missing.
    """
    # Pitch types that act as the fastball reference
    relevant_pitch_types = ['FF', 'SI']

    # Columns for which to compute differences
    cols_to_diff = ['spin_axis', 'pfx_x', 'pfx_z', 'release_pos_z', 'release_speed']

    n_rows = len(df)
    is_fastball = df['pitch_type'].isin(relevant_pitch_types).to_numpy()

    # Positional index of the latest fastball strictly above each row (-1 when
    # there is none): running max of the fastball positions, shifted down one
    # row so that a fastball never references itself. Working with positions
    # instead of index labels keeps this correct for any index, including a
    # non-unique one, and avoids a Python-level loop over every row.
    positions = np.arange(n_rows)
    prev_fastball_pos = (
        pd.Series(np.where(is_fastball, positions, -1))
        .cummax()
        .shift(1, fill_value=-1)
        .to_numpy()
    )
    has_prev_fastball = prev_fastball_pos >= 0

    last_columns = {}
    diff_columns = {}
    for col in cols_to_diff:
        values = df[col].to_numpy(dtype='float64', na_value=np.nan)

        # Value of `col` on the reference fastball; NaN where no reference exists.
        reference = np.full(n_rows, np.nan)
        reference[has_prev_fastball] = values[prev_fastball_pos[has_prev_fastball]]

        # last_<col> is only reported for non-fastball rows.
        last_columns[f'last_{col}'] = np.where(is_fastball, np.nan, reference)
        diff_columns[f'diff_{col}'] = values - reference

    # Write all last_* columns first, then all diff_* columns, so the output
    # always has the same schema even when the frame contains no fastballs.
    for name, column in {**last_columns, **diff_columns}.items():
        df[name] = column

    # Convert data types for optimization. Only narrow an integer column when
    # every value fits in int32; a plain astype would wrap out-of-range values.
    int32_range = np.iinfo(np.int32)
    int_cols = [
        name for name in df.select_dtypes(include=['int64']).columns
        if n_rows == 0
        or (df[name].min() >= int32_range.min and df[name].max() <= int32_range.max)
    ]
    if int_cols:
        df[int_cols] = df[int_cols].astype('int32')

    float_cols = df.select_dtypes(include=['float64']).columns
    if len(float_cols):
        df[float_cols] = df[float_cols].astype('float32')

    return df

In [4]:
# Add the last_*/diff_* FF/SI comparison columns to the 2021 frame (`data`).
# The function works on the whole DataFrame, modifies it in place and returns that
# same object, so `breaking_ball` and `data` refer to one DataFrame after this cell.
breaking_ball = compute_differences_with_ff_si_comparison(data)

In [5]:
# Write the frame currently bound to `breaking_ball` (the 2021 result when cells run in order);
# index=False leaves the row index out of the CSV.
# NOTE(review): no rows are filtered before writing, so the file holds every pitch type
# despite the off_speed folder name.
breaking_ball.to_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/off_speed/2021_off_processed.csv", index=False)

In [6]:
# Same processing for the 2022 frame; this rebinds `breaking_ball`, and `data_2022`
# itself is modified in place because the function returns the object it was given.
breaking_ball = compute_differences_with_ff_si_comparison(data_2022)

In [7]:
# Write the frame currently bound to `breaking_ball` (the 2022 result when cells run in order);
# index=False leaves the row index out of the CSV.
breaking_ball.to_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/off_speed/2022_off_processed.csv", index=False)

In [8]:
# Same processing for the 2023 frame; this rebinds `breaking_ball`, and `data_2023`
# itself is modified in place because the function returns the object it was given.
breaking_ball = compute_differences_with_ff_si_comparison(data_2023)

In [9]:
# Write the frame currently bound to `breaking_ball` (the 2023 result when cells run in order);
# index=False leaves the row index out of the CSV.
breaking_ball.to_csv("Y:/departments/research_and_development/baseball_operations/clayton_goodiez/csv/off_speed/2023_off_processed.csv", index=False)