In [2]:
import datetime
import os
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from skopt import BayesSearchCV

In [3]:
csv_dir = '../../SEC Trackman Data/'
all_files = [os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith('.csv')]

df_list = []

for file in all_files:
    df = pd.read_csv(file)
    df_list.append(df)

all_pitches = pd.concat(df_list, ignore_index=True)

print(all_pitches.shape)
print(all_pitches.columns)

(270589, 167)
Index(['PitchNo', 'Date', 'Time', 'PAofInning', 'PitchofPA', 'Pitcher',
       'PitcherId', 'PitcherThrows', 'PitcherTeam', 'Batter',
       ...
       'ThrowTrajectoryZc1', 'ThrowTrajectoryZc2', 'PitchReleaseConfidence',
       'PitchLocationConfidence', 'PitchMovementConfidence',
       'HitLaunchConfidence', 'HitLandingConfidence',
       'CatcherThrowCatchConfidence', 'CatcherThrowReleaseConfidence',
       'CatcherThrowLocationConfidence'],
      dtype='object', length=167)


In [30]:
print(all_pitches['TaggedPitchType'].unique())
print(all_pitches['AutoPitchType'].unique())

['Fastball' 'Slider' 'ChangeUp' 'Curveball' 'Cutter' 'Other' 'Undefined'
 'Sinker' 'Knuckleball' 'Splitter' 'FourSeamFastBall' 'TwoSeamFastBall'
 'OneSeamFastBall']
['Sinker' 'Four-Seam' 'Curveball' 'Changeup' 'Slider' 'Cutter' nan
 'Splitter' 'Other']


In [42]:
len(all_pitches[all_pitches['TaggedPitchType']=='FourSeamFastBall'])

2563

In [31]:
ffs = all_pitches[all_pitches['AutoPitchType'].isin(['Four-Seam'])]
print(ffs.shape)

(91835, 167)


In [32]:
ffs['whiff'] = np.where(ffs['PitchCall']=='StrikeSwinging',1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs['whiff'] = np.where(ffs['PitchCall']=='StrikeSwinging',1,0)


In [33]:
preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']

In [34]:
ffs_X = ffs[preds]
ffs_y = ffs['whiff']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(ffs_X, ffs_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict([('colsample_bytree', 0.7779330049204607), ('gamma', 9), ('learning_rate', 0.014285310742471472), ('max_depth', 6), ('n_estimators', 97), ('reg_alpha', 4.259148185658276e-05), ('reg_lambda', 1.752203051791065e-07), ('subsample', 0.9006210125361986)])
Accuracy (Training): 0.9019
ROC AUC (Training): 0.5000
Accuracy (Test): 0.9022
ROC AUC (Test): 0.5000


In [36]:
ffs['Date'] = pd.to_datetime(ffs['Date'], format = 'mixed')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs['Date'] = pd.to_datetime(ffs['Date'], format = 'mixed')


In [37]:
aub_ffs_2024 = ffs[(ffs['PitcherTeam']=='AUB_TIG') & (ffs['Date'] > datetime.datetime(2024,1,1))]

In [38]:
# Get unique values
unique_values = ffs['Pitcher'].unique()
aub_unique_values = aub_ffs_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs[ffs['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_2024[aub_ffs_2024['Pitcher'] == value] for value in aub_unique_values}

['Allsup, Chase' 'Tilly, Cameron' 'Gonzalez, Joseph' 'Crotchfelt, Zach'
 'Cannon, Will' 'Myers, Carson' 'Herberholz, Christian' 'McBride, Connor'
 'Murphy, Hayden' 'Graves, Griffin' 'Watts, Dylan' 'Carlson, Parker'
 'Armstrong, John' 'Petrovic, Alexander' 'Nelson, Drew' 'Copeland, Konner'
 'Schorr, Ben' 'Keshock, Cameron' 'Bauman, Tanner']


In [39]:
def aub_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} Four Seam stuff+: {np.mean(stuff * 100)}")

In [40]:
for pitcher in aub_unique_values:
    aub_ffs_stuff(pitcher)

Allsup, Chase Four Seam stuff+: 111.99798583984375
Tilly, Cameron Four Seam stuff+: 94.66725158691406
Gonzalez, Joseph Four Seam stuff+: 83.2686767578125
Crotchfelt, Zach Four Seam stuff+: 101.47782897949219
Cannon, Will Four Seam stuff+: 102.28593444824219
Myers, Carson Four Seam stuff+: 84.28953552246094
Herberholz, Christian Four Seam stuff+: 103.02664184570312
McBride, Connor Four Seam stuff+: 99.83373260498047
Murphy, Hayden Four Seam stuff+: 99.209228515625
Graves, Griffin Four Seam stuff+: 115.5823745727539
Watts, Dylan Four Seam stuff+: 87.9808349609375
Carlson, Parker Four Seam stuff+: 89.33833312988281
Armstrong, John Four Seam stuff+: 105.04164123535156
Petrovic, Alexander Four Seam stuff+: 89.3357925415039
Nelson, Drew Four Seam stuff+: 95.1705322265625
Copeland, Konner Four Seam stuff+: 90.31842041015625
Schorr, Ben Four Seam stuff+: 79.26956176757812
Keshock, Cameron Four Seam stuff+: 143.6160888671875
Bauman, Tanner Four Seam stuff+: 69.08319854736328


In [45]:
import joblib

joblib.dump(best_model, '../../models/Four Seam Models/fourseam_all_model.pkl')

['../../models/Four Seam Models/fourseam_all_model.pkl']

In [46]:
ffs_rr = ffs[(ffs['PitcherThrows'] == 'Right') & (ffs['BatterSide'] == 'Right')]
ffs_rl = ffs[(ffs['PitcherThrows'] == 'Right') & (ffs['BatterSide'] == 'Left')]
ffs_lr = ffs[(ffs['PitcherThrows'] == 'Left') & (ffs['BatterSide'] == 'Right')]
ffs_ll = ffs[(ffs['PitcherThrows'] == 'Left') & (ffs['BatterSide'] == 'Left')]

print(ffs_rr.shape)
print(ffs_rl.shape)
print(ffs_lr.shape)
print(ffs_ll.shape)

(36666, 168)
(26985, 168)
(18281, 168)
(9772, 168)


In [47]:
ffs_rr['whiff'] = np.where(ffs_rr['PitchCall']=='StrikeSwinging',1,0)
ffs_rl['whiff'] = np.where(ffs_rl['PitchCall']=='StrikeSwinging',1,0)
ffs_lr['whiff'] = np.where(ffs_lr['PitchCall']=='StrikeSwinging',1,0)
ffs_ll['whiff'] = np.where(ffs_ll['PitchCall']=='StrikeSwinging',1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_rr['whiff'] = np.where(ffs_rr['PitchCall']=='StrikeSwinging',1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_rl['whiff'] = np.where(ffs_rl['PitchCall']=='StrikeSwinging',1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_lr['whiff'] = np.where(ffs_lr['PitchCall']=='StrikeSwinging',

In [48]:
preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']

In [49]:
ffs_rr_X = ffs_rr[preds]
ffs_rr_y = ffs_rr['whiff']

In [64]:
X_train, X_test, y_train, y_test = train_test_split(ffs_rr_X, ffs_rr_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict([('colsample_bytree', 0.6214987446240974), ('gamma', 1), ('learning_rate', 0.037595418101548164), ('max_depth', 8), ('n_estimators', 192), ('reg_alpha', 7.173506161250544e-05), ('reg_lambda', 0.05577909761565971), ('subsample', 0.6848019443859761)])
Accuracy (Training): 0.9233
ROC AUC (Training): 0.5781
Accuracy (Test): 0.9058
ROC AUC (Test): 0.5028


In [65]:
import joblib

joblib.dump(best_model, '../../models/Four Seam Models/fourseam_rr_model.pkl')

['../../models/Four Seam Models/fourseam_rr_model.pkl']

In [52]:
ffs_rr['Date'] = pd.to_datetime(ffs_rr['Date'], format = 'mixed')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_rr['Date'] = pd.to_datetime(ffs_rr['Date'], format = 'mixed')


In [53]:
aub_ffs_rr_2024 = ffs_rr[(ffs_rr['PitcherTeam']=='AUB_TIG') & (ffs_rr['Date'] > datetime.datetime(2024,1,1))]

In [54]:
# Get unique values
unique_values = ffs_rr['Pitcher'].unique()
aub_unique_values = aub_ffs_rr_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs_rr[ffs_rr['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_rr_2024[aub_ffs_rr_2024['Pitcher'] == value] for value in aub_unique_values}

['Allsup, Chase' 'Cannon, Will' 'Herberholz, Christian' 'McBride, Connor'
 'Murphy, Hayden' 'Tilly, Cameron' 'Watts, Dylan' 'Carlson, Parker'
 'Gonzalez, Joseph' 'Petrovic, Alexander' 'Schorr, Ben' 'Armstrong, John'
 'Keshock, Cameron']


In [55]:
def aub_rr_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_rr_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} RR Four Seam stuff+: {np.mean(stuff * 100)}")

In [56]:
for pitcher in aub_unique_values:
    aub_rr_ffs_stuff(pitcher)

Allsup, Chase RR Four Seam stuff+: 118.7047348022461
Cannon, Will RR Four Seam stuff+: 104.28482818603516
Herberholz, Christian RR Four Seam stuff+: 108.99883270263672
McBride, Connor RR Four Seam stuff+: 87.47892761230469
Murphy, Hayden RR Four Seam stuff+: 69.03852081298828
Tilly, Cameron RR Four Seam stuff+: 94.46532440185547
Watts, Dylan RR Four Seam stuff+: 85.68556213378906
Carlson, Parker RR Four Seam stuff+: 68.18839263916016
Gonzalez, Joseph RR Four Seam stuff+: 136.07276916503906
Petrovic, Alexander RR Four Seam stuff+: 57.50715637207031
Schorr, Ben RR Four Seam stuff+: 62.089141845703125
Armstrong, John RR Four Seam stuff+: 14.755199432373047
Keshock, Cameron RR Four Seam stuff+: 260.8507385253906


In [57]:
ffs_rl_X = ffs_rl[preds]
ffs_rl_y = ffs_rl['whiff']

In [66]:
X_train, X_test, y_train, y_test = train_test_split(ffs_rl_X, ffs_rl_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict([('colsample_bytree', 0.6), ('gamma', 0), ('learning_rate', 0.01), ('max_depth', 9), ('n_estimators', 252), ('reg_alpha', 1e-08), ('reg_lambda', 0.00042750556612102723), ('subsample', 0.8310000673126994)])
Accuracy (Training): 0.9091
ROC AUC (Training): 0.5224
Accuracy (Test): 0.9014
ROC AUC (Test): 0.5006


In [67]:
import joblib

joblib.dump(best_model, '../../models/Four Seam Models/fourseam_rl_model.pkl')

['../../models/Four Seam Models/fourseam_rl_model.pkl']

In [60]:
ffs_rl['Date'] = pd.to_datetime(ffs_rl['Date'], format='mixed')
aub_ffs_rl_2024 = ffs_rl[(ffs_rl['PitcherTeam']=='AUB_TIG') & (ffs_rl['Date'] > datetime.datetime(2024,1,1))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_rl['Date'] = pd.to_datetime(ffs_rl['Date'], format='mixed')


In [61]:
# Get unique values
unique_values = ffs_rl['Pitcher'].unique()
aub_unique_values = aub_ffs_rl_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs_rl[ffs_rl['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_rl_2024[aub_ffs_rl_2024['Pitcher'] == value] for value in aub_unique_values}

['Allsup, Chase' 'Tilly, Cameron' 'Gonzalez, Joseph' 'Cannon, Will'
 'Herberholz, Christian' 'McBride, Connor' 'Carlson, Parker'
 'Murphy, Hayden' 'Watts, Dylan' 'Armstrong, John' 'Petrovic, Alexander'
 'Schorr, Ben']


In [62]:
def aub_rl_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_rl_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} Four Seam RL stuff+: {np.mean(stuff * 100)}")

In [63]:
for pitcher in aub_unique_values:
    aub_rl_ffs_stuff(pitcher)

Allsup, Chase Four Seam RL stuff+: 118.23831939697266
Tilly, Cameron Four Seam RL stuff+: 92.6011734008789
Gonzalez, Joseph Four Seam RL stuff+: 126.34913635253906
Cannon, Will Four Seam RL stuff+: 100.24156188964844
Herberholz, Christian Four Seam RL stuff+: 102.43070220947266
McBride, Connor Four Seam RL stuff+: 139.75607299804688
Carlson, Parker Four Seam RL stuff+: 69.0164794921875
Murphy, Hayden Four Seam RL stuff+: 122.46546173095703
Watts, Dylan Four Seam RL stuff+: 88.46366882324219
Armstrong, John Four Seam RL stuff+: 96.18131256103516
Petrovic, Alexander Four Seam RL stuff+: 94.58028411865234
Schorr, Ben Four Seam RL stuff+: 78.57306671142578


In [68]:
ffs_lr_X = ffs_lr[preds]
ffs_lr_y = ffs_lr['whiff']

In [69]:
X_train, X_test, y_train, y_test = train_test_split(ffs_lr_X, ffs_lr_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict([('colsample_bytree', 0.9349553422213137), ('gamma', 9), ('learning_rate', 0.02806554771929606), ('max_depth', 10), ('n_estimators', 266), ('reg_alpha', 3.151399971510153e-08), ('reg_lambda', 1.2778159542148633e-07), ('subsample', 0.7414349590513672)])
Accuracy (Training): 0.8846
ROC AUC (Training): 0.5011
Accuracy (Test): 0.8899
ROC AUC (Test): 0.5007


In [70]:
import joblib

joblib.dump(best_model, '../../models/Four Seam Models/fourseam_lr_model.pkl')

['../../models/Four Seam Models/fourseam_lr_model.pkl']

In [71]:
ffs_lr['Date'] = pd.to_datetime(ffs_lr['Date'], format='mixed')
aub_ffs_lr_2024 = ffs_lr[(ffs_lr['PitcherTeam']=='AUB_TIG') & (ffs_lr['Date'] > datetime.datetime(2024,1,1))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_lr['Date'] = pd.to_datetime(ffs_lr['Date'], format='mixed')


In [72]:
# Get unique values
unique_values = ffs_lr['Pitcher'].unique()
aub_unique_values = aub_ffs_lr_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs_lr[ffs_lr['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_lr_2024[aub_ffs_lr_2024['Pitcher'] == value] for value in aub_unique_values}

['Crotchfelt, Zach' 'Myers, Carson' 'Murphy, Hayden' 'Graves, Griffin'
 'Nelson, Drew' 'Copeland, Konner' 'Bauman, Tanner']


In [73]:
def aub_lr_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_lr_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} Four Seam LR stuff+: {np.mean(stuff * 100)}")

In [74]:
for pitcher in aub_unique_values:
    aub_lr_ffs_stuff(pitcher)

Crotchfelt, Zach Four Seam LR stuff+: 106.07128143310547
Myers, Carson Four Seam LR stuff+: 79.92832946777344
Murphy, Hayden Four Seam LR stuff+: 68.79878234863281
Graves, Griffin Four Seam LR stuff+: 90.716064453125
Nelson, Drew Four Seam LR stuff+: 79.02799224853516
Copeland, Konner Four Seam LR stuff+: 80.22103881835938
Bauman, Tanner Four Seam LR stuff+: 50.878082275390625


In [60]:
ffs_ll_X = ffs_ll[preds]
ffs_ll_y = ffs_ll['whiff']

In [61]:
X_train, X_test, y_train, y_test = train_test_split(ffs_ll_X, ffs_ll_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict({'colsample_bytree': 0.7779330049204607, 'gamma': 9, 'learning_rate': 0.014285310742471472, 'max_depth': 6, 'n_estimators': 97, 'reg_alpha': 4.259148185658276e-05, 'reg_lambda': 1.752203051791065e-07, 'subsample': 0.9006210125361986})
Accuracy (Training): 0.9202
ROC AUC (Training): 0.5000
Accuracy (Test): 0.9170
ROC AUC (Test): 0.5000


In [67]:
import joblib

joblib.dump(best_model, '../../models/Four Seam Models/fourseam_ll_model.pkl')

['../../models/fourseam_ll_model.pkl']

In [63]:
ffs_ll['Date'] = pd.to_datetime(ffs_ll['Date'], format='mixed')
aub_ffs_ll_2024 = ffs_ll[(ffs_ll['PitcherTeam']=='AUB_TIG') & (ffs_ll['Date'] > datetime.datetime(2024,1,1))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_ll['Date'] = pd.to_datetime(ffs_ll['Date'], format='mixed')


In [64]:
# Get unique values
unique_values = ffs_ll['Pitcher'].unique()
aub_unique_values = aub_ffs_ll_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs_ll[ffs_ll['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_ll_2024[aub_ffs_ll_2024['Pitcher'] == value] for value in aub_unique_values}

['Myers, Carson' 'Graves, Griffin' 'Nelson, Drew' 'Copeland, Konner'
 'Bauman, Tanner' 'Crotchfelt, Zach']


In [65]:
def aub_ll_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_ll_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} Four Seam LL stuff+: {np.mean(stuff * 100)}")

In [66]:
for pitcher in aub_unique_values:
    aub_ll_ffs_stuff(pitcher)

Myers, Carson Four Seam LL stuff+: 80.69529724121094
Graves, Griffin Four Seam LL stuff+: 154.1129150390625
Nelson, Drew Four Seam LL stuff+: 111.88551330566406
Copeland, Konner Four Seam LL stuff+: 97.95450592041016
Bauman, Tanner Four Seam LL stuff+: 77.89878845214844
Crotchfelt, Zach Four Seam LL stuff+: 102.84469604492188
