In [1]:
import os
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import matplotlib.pyplot as plt
import seaborn as sns
import shap

In [2]:
csv_dir = '../SEC Trackman Data/'
all_files = [os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith('.csv')]

df_list = []

for file in all_files:
    df = pd.read_csv(file)
    df_list.append(df)

all_pitches = pd.concat(df_list, ignore_index=True)

print(all_pitches.shape)
print(all_pitches.columns)

(270589, 167)
Index(['PitchNo', 'Date', 'Time', 'PAofInning', 'PitchofPA', 'Pitcher',
       'PitcherId', 'PitcherThrows', 'PitcherTeam', 'Batter',
       ...
       'ThrowTrajectoryZc1', 'ThrowTrajectoryZc2', 'PitchReleaseConfidence',
       'PitchLocationConfidence', 'PitchMovementConfidence',
       'HitLaunchConfidence', 'HitLandingConfidence',
       'CatcherThrowCatchConfidence', 'CatcherThrowReleaseConfidence',
       'CatcherThrowLocationConfidence'],
      dtype='object', length=167)


In [3]:
breaking_balls = all_pitches[all_pitches['TaggedPitchType'].isin(['Slider','Curveball','Knuckleball'])]

In [4]:
breaking_balls['whiff'] = np.where(breaking_balls['PitchCall']=='StrikeSwinging',1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  breaking_balls['whiff'] = np.where(breaking_balls['PitchCall']=='StrikeSwinging',1,0)


In [5]:
bb_preds = ['RelSpeed','VertRelAngle','HorzRelAngle','SpinRate','SpinAxis','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','pfxx','pfxz','x0','y0','z0','vx0','vy0','vz0','ax0','ay0','az0','SpeedDrop','PitchTrajectoryXc1','PitchTrajectoryXc2','PitchTrajectoryYc0','PitchTrajectoryYc1','PitchTrajectoryYc2','PitchTrajectoryZc0','PitchTrajectoryZc1','PitchTrajectoryZc2']

In [6]:

breaking_balls_X = breaking_balls[bb_preds]
breaking_balls_y = breaking_balls['whiff']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(breaking_balls_X, breaking_balls_y, test_size=.33)

model = XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 86.54%


In [10]:
# Get unique values
unique_values = breaking_balls['Pitcher'].unique()

#Print list of pitchers to choose from
print(unique_values, len(unique_values))

# Create a dictionary to store subsets
subsets = {value: breaking_balls[breaking_balls['Pitcher'] == value] for value in unique_values}

['Dettmer, Nathan' 'Tidwell, Blade' 'Menefee, Joseph' ...
 'Pingeton, Anthony' 'Causey, A.J.' "O'Brien Cameron"] 2034


In [23]:
all_preds = model.predict_proba(breaking_balls_X)[:,1]
mean_pred = np.mean(all_preds)


c_myers = subsets['Myers, Carson']
c_myers_X = c_myers[bb_preds]

whiff_preds = model.predict_proba(c_myers_X)[:,1]

stuff = whiff_preds/mean_pred

print(f"Carson Myers Breaking Ball stuff+:{np.mean(stuff*100)}")

Carson Myers Breaking Ball stuff+:107.67224884033203


In [24]:
print(c_myers['TaggedPitchType'].unique())

['Slider']


In [27]:
cm_spd_avg = np.mean(c_myers['RelSpeed'])
cm_spd_std = np.std(c_myers['RelSpeed'])

cm_hgt_avg = np.mean(c_myers['RelHeight'])
cm_hgt_std = np.std(c_myers['RelHeight'])

cm_side_avg = np.mean(c_myers['RelSide'])
cm_side_std = np.std(c_myers['RelSide'])

In [28]:
similar_sliders = breaking_balls[(breaking_balls['RelSpeed'] >= cm_spd_avg - cm_spd_std) &
                              (breaking_balls['RelSpeed'] <= cm_spd_avg + cm_spd_std) &
                              (breaking_balls['RelHeight'] >= cm_hgt_avg - cm_hgt_std) &
                              (breaking_balls['RelHeight'] <= cm_hgt_avg + cm_hgt_std) &
                              (breaking_balls['RelSide'] >= cm_side_avg - cm_side_std) &
                              (breaking_balls['RelSide'] <= cm_side_avg + cm_side_std)]

print(similar_sliders.shape)

(382, 168)


In [17]:
similar_sliders['stuff_plus'] = (model.predict_proba(similar_sliders[bb_preds])[:,1])/mean_pred * 100

print(f"Average Stuff+ of similar sliders:{np.mean(similar_sliders['stuff_plus'])}")

Average Stuff+ of similar sliders:118.46222686767578


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_sliders['stuff_plus'] = (model.predict_proba(similar_sliders[bb_preds])[:,1])/mean_pred * 100
