In [1]:
import os
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import matplotlib.pyplot as plt
import seaborn as sns
import shap

In [2]:
csv_dir = '../../SEC Trackman Data/'
all_files = [os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith('.csv')]

df_list = []

for file in all_files:
    df = pd.read_csv(file)
    df_list.append(df)

all_pitches = pd.concat(df_list, ignore_index=True)

print(all_pitches.shape)
print(all_pitches.columns)

(270589, 167)
Index(['PitchNo', 'Date', 'Time', 'PAofInning', 'PitchofPA', 'Pitcher',
       'PitcherId', 'PitcherThrows', 'PitcherTeam', 'Batter',
       ...
       'ThrowTrajectoryZc1', 'ThrowTrajectoryZc2', 'PitchReleaseConfidence',
       'PitchLocationConfidence', 'PitchMovementConfidence',
       'HitLaunchConfidence', 'HitLandingConfidence',
       'CatcherThrowCatchConfidence', 'CatcherThrowReleaseConfidence',
       'CatcherThrowLocationConfidence'],
      dtype='object', length=167)


In [3]:
fs = all_pitches[all_pitches['TaggedPitchType'].isin(['Sinker', 'TwoSeamFastBall'])]
print(fs.shape)

(18740, 167)


In [4]:
fs_rr = fs[(fs['PitcherThrows'] == 'Right') & (fs['BatterSide'] == 'Right')]
fs_rl = fs[(fs['PitcherThrows'] == 'Right') & (fs['BatterSide'] == 'Left')]
fs_lr = fs[(fs['PitcherThrows'] == 'Left') & (fs['BatterSide'] == 'Right')]
fs_ll = fs[(fs['PitcherThrows'] == 'Left') & (fs['BatterSide'] == 'Left')]

print(fs_rr.shape)
print(fs_rl.shape)
print(fs_lr.shape)
print(fs_ll.shape)

(8453, 167)
(6374, 167)
(2511, 167)
(1398, 167)


In [5]:
fs_rr['whiff'] = np.where(fs_rr['PitchCall']=='StrikeSwinging',1,0)
fs_rl['whiff'] = np.where(fs_rl['PitchCall']=='StrikeSwinging',1,0)
fs_lr['whiff'] = np.where(fs_lr['PitchCall']=='StrikeSwinging',1,0)
fs_ll['whiff'] = np.where(fs_ll['PitchCall']=='StrikeSwinging',1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fs_rr['whiff'] = np.where(fs_rr['PitchCall']=='StrikeSwinging',1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fs_rl['whiff'] = np.where(fs_rl['PitchCall']=='StrikeSwinging',1,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fs_lr['whiff'] = np.where(fs_lr['PitchCall']=='StrikeSwinging',1,0)
A

In [7]:
preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']

In [8]:
fs_rr_X = fs_rr[preds]
fs_rr_y = fs_rr['whiff']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(fs_rr_X, fs_rr_y, test_size=.33, random_state=25)

model = XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 92.19%


In [21]:
aub_fs_rr = fs_rr[fs_rr['PitcherTeam']=='AUB_TIG']
print(aub_fs_rr.shape)

(431, 168)


In [24]:
# Get unique values
unique_values = fs_rr['Pitcher'].unique()
aub_unique_values = aub_fs_rr['Pitcher'].unique()

print(aub_unique_values)

# Create a dictionary to store subsets
all_subsets = {value: fs_rr[fs_rr['Pitcher'] == value] for value in unique_values}

aub_subsets = {value: fs_rr[fs_rr['Pitcher'] == value] for value in aub_unique_values}

['Armstrong, John' 'Keshock, Cameron' 'Gonzalez, Joseph' 'Watts, Dylan'
 'Carlson, Parker' 'Herberholz, Christian' 'Cannon, Will']


In [28]:
def aub_rr_fs_stuff(name):
    
    preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']
    
    all_preds = model.predict_proba(fs_rr_X)[:,1]
    mean_pred = np.mean(all_preds)

    pitcher = aub_subsets[name]
    pitcher_x = pitcher[preds]

    whiff_preds = model.predict_proba(pitcher_x)[:,1]

    stuff = whiff_preds/mean_pred
    
    print(f"{name} sinker stuff+: {np.mean(stuff * 100)}")

In [29]:
for pitcher in aub_unique_values:
    aub_rr_fs_stuff(pitcher)

Armstrong, John sinker stuff+: 86.91795349121094
Keshock, Cameron sinker stuff+: 121.16152954101562
Gonzalez, Joseph sinker stuff+: 85.65171813964844
Watts, Dylan sinker stuff+: 116.86866760253906
Carlson, Parker sinker stuff+: 42.93781661987305
Herberholz, Christian sinker stuff+: 39.542694091796875
Cannon, Will sinker stuff+: 43.248191833496094
