In [12]:
import datetime
import os
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from skopt import BayesSearchCV

In [2]:
csv_dir = '../../SEC Trackman Data/'
all_files = [os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith('.csv')]

df_list = []

for file in all_files:
    df = pd.read_csv(file)
    df_list.append(df)

all_pitches = pd.concat(df_list, ignore_index=True)

print(all_pitches.shape)
print(all_pitches.columns)

(270589, 167)
Index(['PitchNo', 'Date', 'Time', 'PAofInning', 'PitchofPA', 'Pitcher',
       'PitcherId', 'PitcherThrows', 'PitcherTeam', 'Batter',
       ...
       'ThrowTrajectoryZc1', 'ThrowTrajectoryZc2', 'PitchReleaseConfidence',
       'PitchLocationConfidence', 'PitchMovementConfidence',
       'HitLaunchConfidence', 'HitLandingConfidence',
       'CatcherThrowCatchConfidence', 'CatcherThrowReleaseConfidence',
       'CatcherThrowLocationConfidence'],
      dtype='object', length=167)


In [3]:
ffs = all_pitches[all_pitches['TaggedPitchType'].isin(['FourSeamFastBall', 'Fastball'])]
print(ffs.shape)

(125719, 167)


In [4]:
ffs['whiff'] = np.where(ffs['PitchCall']=='StrikeSwinging',1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs['whiff'] = np.where(ffs['PitchCall']=='StrikeSwinging',1,0)


In [5]:
preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']

In [6]:
ffs_X = ffs[preds]
ffs_y = ffs['whiff']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(ffs_X, ffs_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict({'colsample_bytree': 0.817361227076125, 'gamma': 9, 'learning_rate': 0.05411797281355388, 'max_depth': 9, 'n_estimators': 131, 'reg_alpha': 2.5649353875576796e-08, 'reg_lambda': 0.0003683175570745102, 'subsample': 0.6103204340214085})
Accuracy (Training): 0.9113
ROC AUC (Training): 0.5020
Accuracy (Test): 0.9093
ROC AUC (Test): 0.5006


In [9]:
ffs['Date'] = pd.to_datetime(ffs['Date'], format = 'mixed')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs['Date'] = pd.to_datetime(ffs['Date'], format = 'mixed')


In [14]:
aub_ffs_2024 = ffs[(ffs['PitcherTeam']=='AUB_TIG') & (ffs['Date'] > datetime.datetime(2024,1,1))]

In [16]:
# Get unique values
unique_values = ffs['Pitcher'].unique()
aub_unique_values = aub_ffs_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs[ffs['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_2024[aub_ffs_2024['Pitcher'] == value] for value in aub_unique_values}

['Myers, Carson' 'Tilly, Cameron' 'Allsup, Chase' 'McBride, Connor'
 'Carlson, Parker' 'Graves, Griffin' 'Watts, Dylan' 'Murphy, Hayden'
 'Crotchfelt, Zach' 'Armstrong, John' 'Petrovic, Alexander' 'Schorr, Ben'
 'Cannon, Will' 'Herberholz, Christian' 'Nelson, Drew' 'Copeland, Konner'
 'Bauman, Tanner' 'Gonzalez, Joseph']


In [21]:
def aub_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} Four Seam stuff+: {np.mean(stuff * 100)}")

In [22]:
for pitcher in aub_unique_values:
    aub_ffs_stuff(pitcher)

Myers, Carson Four Seam stuff+: 70.63833618164062
Tilly, Cameron Four Seam stuff+: 99.46634674072266
Allsup, Chase Four Seam stuff+: 128.18869018554688
McBride, Connor Four Seam stuff+: 104.86662292480469
Carlson, Parker Four Seam stuff+: 87.67110443115234
Graves, Griffin Four Seam stuff+: 144.9161834716797
Watts, Dylan Four Seam stuff+: 87.02266693115234
Murphy, Hayden Four Seam stuff+: 102.93563079833984
Crotchfelt, Zach Four Seam stuff+: 117.78130340576172
Armstrong, John Four Seam stuff+: 108.89517211914062
Petrovic, Alexander Four Seam stuff+: 84.0595932006836
Schorr, Ben Four Seam stuff+: 70.13426208496094
Cannon, Will Four Seam stuff+: 111.48564910888672
Herberholz, Christian Four Seam stuff+: 108.4026870727539
Nelson, Drew Four Seam stuff+: 87.67931365966797
Copeland, Konner Four Seam stuff+: 77.89179229736328
Bauman, Tanner Four Seam stuff+: 60.70023727416992
Gonzalez, Joseph Four Seam stuff+: 69.5010986328125


In [23]:
import joblib

joblib.dump(best_model, '../../models/fourseam_all_model.pkl')

['../../models/fourseam_all_model.pkl']

In [24]:
ffs_rr = ffs[(ffs['PitcherThrows'] == 'Right') & (ffs['BatterSide'] == 'Right')]
ffs_rl = ffs[(ffs['PitcherThrows'] == 'Right') & (ffs['BatterSide'] == 'Left')]
ffs_lr = ffs[(ffs['PitcherThrows'] == 'Left') & (ffs['BatterSide'] == 'Right')]
ffs_ll = ffs[(ffs['PitcherThrows'] == 'Left') & (ffs['BatterSide'] == 'Left')]

print(ffs_rr.shape)
print(ffs_rl.shape)
print(ffs_lr.shape)
print(ffs_ll.shape)

(47555, 168)
(36801, 168)
(26799, 168)
(14427, 168)


In [25]:
ffs_rr['whiff'] = np.where(ffs_rr['PitchCall']=='StrikeSwinging',1,0)
ffs_rl['whiff'] = np.where(ffs_rl['PitchCall']=='StrikeSwinging',1,0)
ffs_lr['whiff'] = np.where(ffs_lr['PitchCall']=='StrikeSwinging',1,0)
ffs_ll['whiff'] = np.where(ffs_ll['PitchCall']=='StrikeSwinging',1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_rr['whiff'] = np.where(ffs_rr['PitchCall']=='StrikeSwinging',1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_rl['whiff'] = np.where(ffs_rl['PitchCall']=='StrikeSwinging',1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_lr['whiff'] = np.where(ffs_lr['PitchCall']=='StrikeSwinging',

In [26]:
preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']

In [27]:
ffs_rr_X = ffs_rr[preds]
ffs_rr_y = ffs_rr['whiff']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(ffs_rr_X, ffs_rr_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict({'colsample_bytree': 0.8468317434009265, 'gamma': 8, 'learning_rate': 0.03390942329293072, 'max_depth': 9, 'n_estimators': 198, 'reg_alpha': 0.0005714263732942848, 'reg_lambda': 0.0015496016767113624, 'subsample': 0.768833034380279})
Accuracy (Training): 0.9168
ROC AUC (Training): 0.5011
Accuracy (Test): 0.9112
ROC AUC (Test): 0.5004


In [29]:
import joblib

joblib.dump(best_model, '../../models/fourseam_rr_model.pkl')

['../../models/fourseam_rr_model.pkl']

In [30]:
ffs_rr['Date'] = pd.to_datetime(ffs_rr['Date'], format = 'mixed')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_rr['Date'] = pd.to_datetime(ffs_rr['Date'], format = 'mixed')


In [39]:
aub_ffs_rr_2024 = ffs_rr[(ffs_rr['PitcherTeam']=='AUB_TIG') & (ffs_rr['Date'] > datetime.datetime(2024,1,1))]

In [40]:
# Get unique values
unique_values = ffs_rr['Pitcher'].unique()
aub_unique_values = aub_ffs_rr_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs_rr[ffs_rr['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_rr_2024[aub_ffs_rr_2024['Pitcher'] == value] for value in aub_unique_values}

['Tilly, Cameron' 'Allsup, Chase' 'McBride, Connor' 'Carlson, Parker'
 'Watts, Dylan' 'Murphy, Hayden' 'Petrovic, Alexander' 'Armstrong, John'
 'Cannon, Will' 'Schorr, Ben' 'Herberholz, Christian' 'Gonzalez, Joseph']


In [41]:
def aub_rr_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_rr_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} RR Four Seam stuff+: {np.mean(stuff * 100)}")

In [42]:
for pitcher in aub_unique_values:
    aub_rr_ffs_stuff(pitcher)

Tilly, Cameron RR Four Seam stuff+: 108.41560363769531
Allsup, Chase RR Four Seam stuff+: 138.9651641845703
McBride, Connor RR Four Seam stuff+: 107.19381713867188
Carlson, Parker RR Four Seam stuff+: 75.41413116455078
Watts, Dylan RR Four Seam stuff+: 86.21150970458984
Murphy, Hayden RR Four Seam stuff+: 119.8906021118164
Petrovic, Alexander RR Four Seam stuff+: 101.44122314453125
Armstrong, John RR Four Seam stuff+: 94.03411102294922
Cannon, Will RR Four Seam stuff+: 127.3408203125
Schorr, Ben RR Four Seam stuff+: 72.4330062866211
Herberholz, Christian RR Four Seam stuff+: 115.09632110595703
Gonzalez, Joseph RR Four Seam stuff+: 65.45948791503906


In [43]:
ffs_rl_X = ffs_rl[preds]
ffs_rl_y = ffs_rl['whiff']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(ffs_rl_X, ffs_rl_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict({'colsample_bytree': 0.9349553422213137, 'gamma': 9, 'learning_rate': 0.02806554771929606, 'max_depth': 10, 'n_estimators': 266, 'reg_alpha': 3.151399971510153e-08, 'reg_lambda': 1.2778159542148633e-07, 'subsample': 0.7414349590513672})
Accuracy (Training): 0.9111
ROC AUC (Training): 0.5009
Accuracy (Test): 0.9079
ROC AUC (Test): 0.5004


In [38]:
import joblib

joblib.dump(best_model, '../../models/fourseam_rl_model.pkl')

['../../models/fourseam_rl_model.pkl']

In [46]:
ffs_rl['Date'] = pd.to_datetime(ffs_rl['Date'], format='mixed')
aub_ffs_rl_2024 = ffs_rl[(ffs_rl['PitcherTeam']=='AUB_TIG') & (ffs_rl['Date'] > datetime.datetime(2024,1,1))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_rl['Date'] = pd.to_datetime(ffs_rl['Date'], format='mixed')


In [48]:
# Get unique values
unique_values = ffs_rl['Pitcher'].unique()
aub_unique_values = aub_ffs_rl_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs_rl[ffs_rl['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_rl_2024[aub_ffs_rl_2024['Pitcher'] == value] for value in aub_unique_values}

['Tilly, Cameron' 'Allsup, Chase' 'McBride, Connor' 'Carlson, Parker'
 'Murphy, Hayden' 'Watts, Dylan' 'Armstrong, John' 'Petrovic, Alexander'
 'Schorr, Ben' 'Cannon, Will' 'Herberholz, Christian' 'Gonzalez, Joseph']


In [49]:
def aub_rl_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_rl_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} Four Seam RL stuff+: {np.mean(stuff * 100)}")

In [50]:
for pitcher in aub_unique_values:
    aub_rl_ffs_stuff(pitcher)

Tilly, Cameron Four Seam RL stuff+: 103.46245574951172
Allsup, Chase Four Seam RL stuff+: 138.93927001953125
McBride, Connor Four Seam RL stuff+: 139.33816528320312
Carlson, Parker Four Seam RL stuff+: 80.01825714111328
Murphy, Hayden Four Seam RL stuff+: 129.45579528808594
Watts, Dylan Four Seam RL stuff+: 96.42069244384766
Armstrong, John Four Seam RL stuff+: 112.68742370605469
Petrovic, Alexander Four Seam RL stuff+: 100.37000274658203
Schorr, Ben Four Seam RL stuff+: 69.6596450805664
Cannon, Will Four Seam RL stuff+: 119.47042083740234
Herberholz, Christian Four Seam RL stuff+: 113.9951400756836
Gonzalez, Joseph Four Seam RL stuff+: 83.19973754882812


In [51]:
ffs_lr_X = ffs_lr[preds]
ffs_lr_y = ffs_lr['whiff']

In [52]:
X_train, X_test, y_train, y_test = train_test_split(ffs_lr_X, ffs_lr_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict({'colsample_bytree': 0.8468317434009265, 'gamma': 8, 'learning_rate': 0.03390942329293072, 'max_depth': 9, 'n_estimators': 198, 'reg_alpha': 0.0005714263732942848, 'reg_lambda': 0.0015496016767113624, 'subsample': 0.768833034380279})
Accuracy (Training): 0.8990
ROC AUC (Training): 0.5036
Accuracy (Test): 0.8987
ROC AUC (Test): 0.5004


In [53]:
import joblib

joblib.dump(best_model, '../../models/fourseam_lr_model.pkl')

['../../models/fourseam_lr_model.pkl']

In [54]:
ffs_lr['Date'] = pd.to_datetime(ffs_lr['Date'], format='mixed')
aub_ffs_lr_2024 = ffs_lr[(ffs_lr['PitcherTeam']=='AUB_TIG') & (ffs_lr['Date'] > datetime.datetime(2024,1,1))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_lr['Date'] = pd.to_datetime(ffs_lr['Date'], format='mixed')


In [55]:
# Get unique values
unique_values = ffs_lr['Pitcher'].unique()
aub_unique_values = aub_ffs_lr_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs_lr[ffs_lr['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_lr_2024[aub_ffs_lr_2024['Pitcher'] == value] for value in aub_unique_values}

['Myers, Carson' 'Graves, Griffin' 'Crotchfelt, Zach' 'Nelson, Drew'
 'Copeland, Konner' 'Bauman, Tanner' 'Murphy, Hayden']


In [58]:
def aub_lr_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_lr_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} Four Seam LR stuff+: {np.mean(stuff * 100)}")

In [59]:
for pitcher in aub_unique_values:
    aub_lr_ffs_stuff(pitcher)

Myers, Carson Four Seam LR stuff+: 74.96627044677734
Graves, Griffin Four Seam LR stuff+: 104.0830307006836
Crotchfelt, Zach Four Seam LR stuff+: 117.79011535644531
Nelson, Drew Four Seam LR stuff+: 78.60383605957031
Copeland, Konner Four Seam LR stuff+: 80.51910400390625
Bauman, Tanner Four Seam LR stuff+: 57.797489166259766
Murphy, Hayden Four Seam LR stuff+: 83.45811462402344


In [60]:
ffs_ll_X = ffs_ll[preds]
ffs_ll_y = ffs_ll['whiff']

In [61]:
X_train, X_test, y_train, y_test = train_test_split(ffs_ll_X, ffs_ll_y, test_size=.33, random_state=25)

# Set up the XGBoost model
xgb_model = XGBClassifier(random_state=25)

# Define the parameter space for Bayesian hyperparameter tuning for XGBoost
param_space = {
    'learning_rate': (0.01, 0.3, 'log-uniform'),  # Learning rate
    'max_depth': (3, 10),  # Maximum depth of a tree
    'n_estimators': (50, 300),  # Number of boosting rounds
    'subsample': (0.6, 1.0),  # Subsample ratio of the training instances
    'colsample_bytree': (0.6, 1.0),  # Subsample ratio of columns for each tree
    'gamma': (0, 10),  # Minimum loss reduction required to make a further partition
    'reg_alpha': (1e-8, 1.0, 'log-uniform'),  # L1 regularization term
    'reg_lambda': (1e-8, 1.0, 'log-uniform'),  # L2 regularization term
}

# Set up Bayesian search using BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=25,  # Number of iterations
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Scoring metric
    n_jobs=-1,  # Use all available cores
    verbose=0,
    random_state=42
)

# Fit the model using Bayesian hyperparameter tuning
bayes_search.fit(X_train, y_train)

# Best hyperparameters found
best_params = bayes_search.best_params_
print(f"Best hyperparameters for Whiffs: {best_params}")

# Make predictions on the training and test sets using the best model
best_model = bayes_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate MSE and R-squared for both training and test sets
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test = roc_auc_score(y_test, y_test_pred)

print(f'Accuracy (Training): {accuracy_train:.4f}')
print(f'ROC AUC (Training): {auc_train:.4f}')
print(f'Accuracy (Test): {accuracy_test:.4f}')
print(f'ROC AUC (Test): {auc_test:.4f}')

Best hyperparameters for Whiffs: OrderedDict({'colsample_bytree': 0.7779330049204607, 'gamma': 9, 'learning_rate': 0.014285310742471472, 'max_depth': 6, 'n_estimators': 97, 'reg_alpha': 4.259148185658276e-05, 'reg_lambda': 1.752203051791065e-07, 'subsample': 0.9006210125361986})
Accuracy (Training): 0.9202
ROC AUC (Training): 0.5000
Accuracy (Test): 0.9170
ROC AUC (Test): 0.5000


In [67]:
import joblib

joblib.dump(best_model, '../../models/fourseam_ll_model.pkl')

['../../models/fourseam_ll_model.pkl']

In [63]:
ffs_ll['Date'] = pd.to_datetime(ffs_ll['Date'], format='mixed')
aub_ffs_ll_2024 = ffs_ll[(ffs_ll['PitcherTeam']=='AUB_TIG') & (ffs_ll['Date'] > datetime.datetime(2024,1,1))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ffs_ll['Date'] = pd.to_datetime(ffs_ll['Date'], format='mixed')


In [64]:
# Get unique values
unique_values = ffs_ll['Pitcher'].unique()
aub_unique_values = aub_ffs_ll_2024['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: ffs_ll[ffs_ll['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: aub_ffs_ll_2024[aub_ffs_ll_2024['Pitcher'] == value] for value in aub_unique_values}

['Myers, Carson' 'Graves, Griffin' 'Nelson, Drew' 'Copeland, Konner'
 'Bauman, Tanner' 'Crotchfelt, Zach']


In [65]:
def aub_ll_ffs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = best_model.predict_proba(ffs_ll_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = best_model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} Four Seam LL stuff+: {np.mean(stuff * 100)}")

In [66]:
for pitcher in aub_unique_values:
    aub_ll_ffs_stuff(pitcher)

Myers, Carson Four Seam LL stuff+: 80.69529724121094
Graves, Griffin Four Seam LL stuff+: 154.1129150390625
Nelson, Drew Four Seam LL stuff+: 111.88551330566406
Copeland, Konner Four Seam LL stuff+: 97.95450592041016
Bauman, Tanner Four Seam LL stuff+: 77.89878845214844
Crotchfelt, Zach Four Seam LL stuff+: 102.84469604492188
