In [None]:
import os
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import shap

In [None]:
# Directory holding the Trackman CSV exports (relative to this notebook).
csv_dir = '../../SEC Trackman Data/'

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# `all_pitches` (and therefore any seeded train/test split) stable across
# machines and re-runs.
all_files = sorted(
    os.path.join(csv_dir, f) for f in os.listdir(csv_dir) if f.endswith('.csv')
)

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not all_files:
    raise FileNotFoundError(f"No .csv files found in {csv_dir!r}")

# Read every file once and stack them into a single frame with a fresh index.
df_list = [pd.read_csv(file) for file in all_files]
all_pitches = pd.concat(df_list, ignore_index=True)

print(all_pitches.shape)
print(all_pitches.columns)

In [None]:
# Collect the distinct labels found in each pitch-type column so the
# groupings used later can be checked against what is actually in the data.
tagged_pitch_types = all_pitches.loc[:, 'TaggedPitchType'].unique()
auto_pitch_types = all_pitches.loc[:, 'AutoPitchType'].unique()

print(tagged_pitch_types)
print(auto_pitch_types)

In [None]:
# TaggedPitchType labels that make up each pitch family.
fastball_tags = ['Fastball', 'Cutter', 'Sinker', 'FourSeamFastBall', 'TwoSeamFastBall', 'OneSeamFastBall']
off_speed_tags = ['ChangeUp', 'Splitter']
breaking_ball_tags = ['Slider', 'Curveball', 'Knuckleball']

# Split the full pitch table into one frame per family.
tagged_type = all_pitches['TaggedPitchType']
fastballs = all_pitches.loc[tagged_type.isin(fastball_tags)]
off_speed = all_pitches.loc[tagged_type.isin(off_speed_tags)]
breaking_balls = all_pitches.loc[tagged_type.isin(breaking_ball_tags)]

# Row/column counts per family.
print(fastballs.shape)
print(off_speed.shape)
print(breaking_balls.shape)

In [None]:
# Binary target: 1 when the pitch call is a swinging strike, else 0.
# The three frames are filtered slices of `all_pitches`; writing a column into
# them in place raises SettingWithCopyWarning (and does nothing under pandas
# copy-on-write). `.assign` returns a new frame, so the cell is also safe to
# re-run.
fastballs = fastballs.assign(
    whiff=np.where(fastballs['PitchCall'] == 'StrikeSwinging', 1, 0)
)
off_speed = off_speed.assign(
    whiff=np.where(off_speed['PitchCall'] == 'StrikeSwinging', 1, 0)
)
breaking_balls = breaking_balls.assign(
    whiff=np.where(breaking_balls['PitchCall'] == 'StrikeSwinging', 1, 0)
)

In [None]:
# Names of every numeric-dtype column in the fastball frame.
numeric_only = fastballs.select_dtypes(include='number')
numeric_cols = numeric_only.columns
print(numeric_cols)

In [None]:
# Predictor columns fed to the fastball whiff model, in training order.
fb_preds = [
    # release
    'RelSpeed', 'VertRelAngle', 'HorzRelAngle',
    'SpinRate', 'SpinAxis',
    'RelHeight', 'RelSide', 'Extension',
    # movement / approach
    'VertBreak', 'InducedVertBreak', 'HorzBreak',
    'ZoneSpeed', 'VertApprAngle', 'HorzApprAngle', 'ZoneTime',
    'pfxx', 'pfxz',
    # initial position, velocity, acceleration components
    'x0', 'y0', 'z0',
    'vx0', 'vy0', 'vz0',
    'ax0', 'ay0', 'az0',
    'SpeedDrop',
    # trajectory coefficient columns
    'PitchTrajectoryXc1', 'PitchTrajectoryXc2',
    'PitchTrajectoryYc0', 'PitchTrajectoryYc1', 'PitchTrajectoryYc2',
    'PitchTrajectoryZc0', 'PitchTrajectoryZc1', 'PitchTrajectoryZc2',
]

In [None]:
# Feature matrix (the fb_preds columns) and binary whiff target for fastballs.
fastballs_X = fastballs.loc[:, fb_preds]
fastballs_y = fastballs.loc[:, 'whiff']

In [None]:
# Hold out 33% of fastballs for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    fastballs_X,
    fastballs_y,
    test_size=.33,
    random_state=25,
)

# Fit an XGBoost classifier with library-default hyperparameters.
model = XGBClassifier()
model.fit(X_train, y_train)

# Class predictions on the held-out set, converted to plain Python ints.
y_pred = model.predict(X_test)
predictions = [round(label) for label in y_pred]

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100.0:.2f}%")

In [None]:
# Confusion matrix of held-out labels vs. predicted labels, drawn as a heatmap.
cm = confusion_matrix(y_test, predictions)

fig, ax = plt.subplots(figsize=(10, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Macro-averaged metrics: each class contributes equally regardless of size.
macro = {'average': 'macro'}
precision = precision_score(y_test, predictions, **macro)
recall = recall_score(y_test, predictions, **macro)
f1 = f1_score(y_test, predictions, **macro)

for metric_name, metric_value in (
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value:.4f}')

In [None]:
# Predicted probability of the positive class (whiff == 1) on the held-out set.
y_pred_prob = model.predict_proba(X_test)[:, 1]

# ROC AUC must be computed from scores/probabilities, not hard 0/1 predictions:
# with hard labels the curve collapses to a single operating point and the AUC
# is understated. This is a binary problem, so `multi_class` does not apply.
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(f'ROC AUC: {roc_auc:.4f}')

# ROC curve for the whiff class. `y_pred_prob` is the class-1 probability, so it
# has to be paired with the class-1 labels; pairing it with `y_test == 0`
# mirrors the curve below the diagonal.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr, label=f'Whiff ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()

In [None]:
# Built-in XGBoost feature-importance chart, limited to the top 20 features.
xgboost.plot_importance(
    model,
    max_num_features=20,
)

In [None]:
# Build a SHAP explainer for the fitted model, passing the training features
# as the second argument.
explainer = shap.Explainer(model, X_train)

# SHAP values for every training row.
shap_values = explainer(X_train)

# Summary plot of per-feature SHAP values across the training rows.
shap.summary_plot(
    shap_values,
    X_train,
    feature_names=X_train.columns,
)

In [None]:
# Same SHAP values, drawn with the summary plot's 'bar' plot type.
shap.summary_plot(
    shap_values,
    X_train,
    feature_names=X_train.columns,
    plot_type='bar',
)

In [None]:
# Fastballs whose PitcherTeam is 'AUB_TIG'.
is_auburn = fastballs['PitcherTeam'] == 'AUB_TIG'
aub_fbs = fastballs.loc[is_auburn]

In [None]:
# Distinct pitcher names: across all fastballs, and within the Auburn subset.
unique_values = fastballs['Pitcher'].unique()
aub_unique_values = aub_fbs['Pitcher'].unique()

print(aub_unique_values)

# Map each pitcher name to that pitcher's fastball rows (all teams).
all_subsets = {}
for pitcher_name in unique_values:
    all_subsets[pitcher_name] = fastballs.loc[fastballs['Pitcher'] == pitcher_name]

# Same mapping restricted to the Auburn fastballs.
aub_subsets = {}
for pitcher_name in aub_unique_values:
    aub_subsets[pitcher_name] = aub_fbs.loc[aub_fbs['Pitcher'] == pitcher_name]

In [None]:
def aub_fbs_stuff(name, mean_pred=None):
    """Print and return the mean fastball Stuff+ for one Auburn pitcher.

    Each of the pitcher's fastballs is scored as its predicted whiff
    probability divided by the mean predicted whiff probability over every row
    of ``fastballs_X``, times 100 (so 100 equals that mean). The per-pitch
    values are then averaged.

    Reads the notebook globals ``model``, ``fastballs_X``, ``fb_preds`` and
    ``aub_subsets``.

    Parameters
    ----------
    name : hashable
        Key into ``aub_subsets`` (a value from the ``Pitcher`` column).
    mean_pred : float, optional
        Precomputed baseline (mean class-1 probability over ``fastballs_X``).
        Pass it when scoring many pitchers to avoid re-predicting the whole
        fastball table on every call. Computed here when omitted.

    Returns
    -------
    float
        The pitcher's mean Stuff+ (the same number that is printed).

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``aub_subsets``.
    """
    # Baseline: average predicted whiff probability over all fastballs.
    if mean_pred is None:
        mean_pred = np.mean(model.predict_proba(fastballs_X)[:, 1])

    # Use the shared feature list so the columns always match what the model
    # was trained on, instead of a second hand-maintained copy.
    pitcher_x = aub_subsets[name][fb_preds]

    whiff_preds = model.predict_proba(pitcher_x)[:, 1]
    stuff_plus = np.mean(whiff_preds / mean_pred * 100)

    print(f"{name} Fastball stuff+: {stuff_plus}")
    return stuff_plus

In [None]:
# Report fastball Stuff+ for every Auburn pitcher, one line per pitcher.
for aub_pitcher in aub_unique_values:
    aub_fbs_stuff(aub_pitcher)

In [None]:
# NOTE(review): `p_skenes` was not defined anywhere in the visible notebook, so
# this cell raised NameError on Restart & Run All. It is rebuilt here from
# `fastballs`.
# Assumes the Pitcher column uses "Last, First" naming — TODO confirm the exact
# string against the printed pitcher names.
SKENES_PITCHER_NAME = 'Skenes, Paul'
p_skenes = fastballs[fastballs['Pitcher'] == SKENES_PITCHER_NAME]

# Fail loudly rather than propagating NaN means into the similarity filter.
if p_skenes.empty:
    raise ValueError(
        f"No fastballs found for Pitcher == {SKENES_PITCHER_NAME!r}; "
        "check the spelling against fastballs['Pitcher'].unique()"
    )

# Mean and (population, ddof=0) standard deviation of release speed,
# release height and release side for this pitcher.
ps_spd_avg = np.mean(p_skenes['RelSpeed'])
ps_spd_std = np.std(p_skenes['RelSpeed'])

ps_hgt_avg = np.mean(p_skenes['RelHeight'])
ps_hgt_std = np.std(p_skenes['RelHeight'])

ps_side_avg = np.mean(p_skenes['RelSide'])
ps_side_std = np.std(p_skenes['RelSide'])

In [None]:
def _within_one_std(values, center, spread):
    """Boolean mask: True where center - spread <= values <= center + spread."""
    return (values >= center - spread) & (values <= center + spread)


# Fastballs whose release speed, height and side all fall within one standard
# deviation of the reference pitcher's averages.
similar_fastballs = fastballs[
    _within_one_std(fastballs['RelSpeed'], ps_spd_avg, ps_spd_std)
    & _within_one_std(fastballs['RelHeight'], ps_hgt_avg, ps_hgt_std)
    & _within_one_std(fastballs['RelSide'], ps_side_avg, ps_side_std)
]

print(similar_fastballs.shape)

This should include both his fastballs and his sinkers.

In [None]:
# Baseline: mean predicted whiff probability over every fastball. This was
# previously only a local variable inside `aub_fbs_stuff`, so referencing it
# here raised NameError on a fresh kernel.
mean_pred = np.mean(model.predict_proba(fastballs_X)[:, 1])

# Stuff+ per pitch: predicted whiff probability relative to the baseline,
# scaled so 100 equals the baseline. `.assign` returns a new frame, which
# avoids SettingWithCopyWarning on the filtered slice and keeps the cell
# re-runnable.
similar_fastballs = similar_fastballs.assign(
    stuff_plus=model.predict_proba(similar_fastballs[fb_preds])[:, 1] / mean_pred * 100
)
print(similar_fastballs['stuff_plus'].head(10))


In [None]:
# Hexbin of pitch movement, each cell coloured by the mean Stuff+ of the
# pitches that fall in it.
plt.figure(figsize=(8, 6))

# x is horizontal break and y is vertical break so the data match the axis
# labels below (they were previously swapped).
hb = plt.hexbin(x=similar_fastballs['HorzBreak'], y=similar_fastballs['VertBreak'],
                C=similar_fastballs['stuff_plus'], cmap='inferno', reduce_C_function=np.mean,
                vmin=10, vmax=300)  # fixed colour range for Stuff+

plt.colorbar(hb, label='Mean Stuff+')

# Add labels and title
plt.xlabel('Horizontal Break')
plt.ylabel("Vertical Break")
plt.title("Mean Stuff+ by Pitch Movement (Similar Fastballs)")

# Show plot
plt.show()