In [77]:
import tensorflow

In [78]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, top_k_accuracy_score, average_precision_score, \
    roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import xgboost as xgb
from xgboost import XGBRFClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight

In [79]:
import pandas as pd

# Constants
# Game-situation features only (count, base runners, score, fielding alignment, ...).
game_sit_feat = ['month', 'year', 'inning', 'inning_topbot',
                 'outs', 'strikes', 'balls', 'pitch_number',
                 'on_1b', 'on_2b', 'on_3b', 'score_diff',
                 'of_std', 'of_strat', 'if_std', 'if_strat', 'if_shift']
# Game-situation features plus the pitcher/batter tendency columns listed below.
wo_recursive_feat = game_sit_feat + ['Pitcher_Tend_FF', 'Pitcher_Tend_CU', 'Pitcher_Tend_CH', 'Pitcher_Tend_SL', 'Pitcher_Tend_SI',
                                     'Pitcher_Strike_Tend_FF', 'Pitcher_Strike_Tend_CU', 'Pitcher_Strike_Tend_CH', 'Pitcher_Strike_Tend_SL', 'Pitcher_Strike_Tend_SI',
                                     'batter_stance', 'Strike_Tend_FF', 'Strike_Tend_CU', 'Strike_Tend_CH', 'Strike_Tend_SL', 'Strike_Tend_SI', 'Overall_Strike_Tend',
                                     'Slug_Avg_FF', 'Slug_Avg_CU', 'Slug_Avg_CH', 'Slug_Avg_SL', 'Slug_Avg_SI', 'Overall_Slug_Avg',
                                     'Zone_1_Strike_Pcnt', 'Zone_2_Strike_Pcnt', 'Zone_3_Strike_Pcnt', 'Zone_4_Strike_Pcnt', 'Zone_5_Strike_Pcnt', 'Zone_6_Strike_Pcnt', 'Zone_7_Strike_Pcnt', 'Zone_8_Strike_Pcnt', 'Zone_9_Strike_Pcnt', 'Zone_11_Strike_Pcnt', 'Zone_12_Strike_Pcnt', 'Zone_13_Strike_Pcnt', 'Zone_14_Strike_Pcnt']

# Import the data
verlander_df = pd.read_csv("./VerlanderOnlyDataset.csv", index_col=0)

X_whole = pd.read_csv("./VerlanderOnly_X_train.csv").to_numpy()
# NOTE(review): y is kept exactly as read (2-D); presumably a single label
# column, i.e. shape (n_samples, 1) -- confirm against the CSV.
y = pd.read_csv("./VerlanderOnly_y_train.csv").to_numpy()

# Column names come from the frame loaded above; no need to parse the CSV twice.
verlander_cols = np.array(verlander_df.columns)

# Positions of the selected features inside X_whole.
# The "- 1" shifts dataset column positions to X_whole positions.
# NOTE(review): this assumes X_whole holds the dataset columns in the same order
# minus one leading column (presumably the 'pitch_type' target) -- confirm.
game_sit_cols = np.where(np.isin(verlander_cols, game_sit_feat))[0] - 1
wo_recursive_cols = np.where(np.isin(verlander_cols, wo_recursive_feat))[0] - 1

X_game_sit = X_whole[:, game_sit_cols]
X_wo_recurse = X_whole[:, wo_recursive_cols]

# K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [83]:
def keras_mlp(train_data, train_labels, val_data, val_labels):
    """Train a class-weighted dense softmax classifier and evaluate it on both splits.

    Labels are integer-encoded with a ``LabelEncoder`` fitted on the training
    labels. "balanced" class weights are computed from the encoded training
    labels and passed to ``model.fit``. The network has three ReLU hidden
    layers (100, 50, 25 units) followed by a softmax layer with one unit per
    class found in the training labels.

    Parameters
    ----------
    train_data : 2-D array
        Training features; ``train_data.shape[1]`` sets the input width.
    train_labels : array-like
        Training labels, either 1-D or a single-column 2-D array.
    val_data : 2-D array
        Held-out features with the same number of columns as ``train_data``.
    val_labels : array-like
        Held-out labels. Every value must also occur in ``train_labels``,
        otherwise ``LabelEncoder.transform`` raises ``ValueError``.

    Returns
    -------
    tuple
        ``(model, test_results, train_results)``. Each ``*_results`` is the
        value returned by ``model.evaluate``: index 0 is the loss, index 1 the
        accuracy.
    """
    num_features = train_data.shape[1]

    # Flatten (n, 1) label columns to 1-D up front so scikit-learn does not emit
    # a DataConversionWarning on every call.
    train_labels = np.ravel(train_labels)
    val_labels = np.ravel(val_labels)

    le = LabelEncoder()
    encoded_y_train = le.fit_transform(train_labels)
    encoded_y_test = le.transform(val_labels)

    # Derive the class count from the data instead of hard-coding 5, so the
    # class-weight mapping and the output layer always agree with the labels.
    num_classes = len(le.classes_)
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.unique(encoded_y_train),
                                                      y=encoded_y_train)
    # Keras expects {class_index: weight}.
    weights = dict(enumerate(class_weights))

    model = Sequential()
    # Input of shape (num_features,) feeding a first hidden layer of 100 ReLU units
    model.add(Dense(100, input_shape=(num_features,), activation='relu'))
    # Second and third hidden layers: 50 and 25 ReLU units
    model.add(Dense(50, activation='relu'))
    model.add(Dense(25, activation='relu'))
    # Output layer: one softmax unit per class
    model.add(Dense(num_classes, activation='softmax'))
    # Sparse loss because the targets are integer class indices, not one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy', optimizer=tensorflow.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])
    model.fit(train_data, encoded_y_train, class_weight=weights, epochs=300, batch_size=100, verbose=0)

    test_results = model.evaluate(val_data, encoded_y_test, verbose=0)
    print("Test Loss", test_results[0])
    print("Test Accuracy", test_results[1])
    train_results = model.evaluate(train_data, encoded_y_train, verbose=0)
    print("Train Loss", train_results[0])
    print("Train Accuracy", train_results[1])
    return model, test_results, train_results

In [None]:
def _run_keras_mlp_cv(features, labels, banner, splitter):
    """Train and evaluate one ``keras_mlp`` model per fold of ``splitter``.

    Parameters
    ----------
    features : 2-D array
        Feature matrix, indexed by row with the fold indices.
    labels : array
        Labels aligned with the rows of ``features``.
    banner : str
        Text printed once at the start of every fold.
    splitter : object
        Provides ``split(features, labels)`` yielding ``(train_index, test_index)``.

    Returns
    -------
    tuple of five lists, one entry per fold, in this order:
        models, test accuracies, test losses, train accuracies, train losses.
    """
    models = []
    test_accuracies, test_losses = [], []
    train_accuracies, train_losses = [], []
    for train_index, test_index in splitter.split(features, labels):
        print(banner)
        model, test_results, train_results = keras_mlp(
            features[train_index], labels[train_index],
            features[test_index], labels[test_index],
        )
        models.append(model)
        # keras_mlp returns evaluate() results: index 0 is loss, index 1 is accuracy.
        test_accuracies.append(test_results[1])
        test_losses.append(test_results[0])
        train_accuracies.append(train_results[1])
        train_losses.append(train_results[0])
    return models, test_accuracies, test_losses, train_accuracies, train_losses


# Same 5-fold procedure for each feature set; only the result lists are kept
# in the notebook namespace.
(keras_mlp_models_x_whole,
 keras_mlp_test_accuracies_x_whole,
 keras_mlp_test_loss_x_whole,
 keras_mlp_train_accuracies_x_whole,
 keras_mlp_train_loss_x_whole) = _run_keras_mlp_cv(X_whole, y, "Whole", skf)

(keras_mlp_models_x_game_sit,
 keras_mlp_test_accuracies_x_game_sit,
 keras_mlp_test_loss_x_game_sit,
 keras_mlp_train_accuracies_x_game_sit,
 keras_mlp_train_loss_x_game_sit) = _run_keras_mlp_cv(X_game_sit, y, "Situational", skf)

(keras_mlp_models_x_wo_recurse,
 keras_mlp_test_accuracies_x_wo_recurse,
 keras_mlp_test_loss_x_wo_recurse,
 keras_mlp_train_accuracies_x_wo_recurse,
 keras_mlp_train_loss_x_wo_recurse) = _run_keras_mlp_cv(X_wo_recurse, y, "Without Recursive", skf)


Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 2.2240400314331055
Test Accuracy 0.41884443163871765
Train Loss 0.684012234210968
Train Accuracy 0.6927145719528198
Whole
Test Loss 2.421747922897339
Test Accuracy 0.41724443435668945
Train Loss 0.7355958819389343
Train Accuracy 0.6753789186477661
Whole
Test Loss 2.3301026821136475
Test Accuracy 0.4150071144104004
Train Loss 0.7298135757446289
Train Accuracy 0.6891279220581055
Whole
Test Loss 2.51381254196167
Test Accuracy 0.40629446506500244
Train Loss 0.7760492563247681
Train Accuracy 0.6674370765686035
Whole
Test Loss 2.288818597793579
Test Accuracy 0.4386557638645172
Train Loss 0.6284338235855103
Train Accuracy 0.7169081568717957
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.5356237888336182
Test Accuracy 0.345244437456131
Train Loss 1.038975715637207
Train Accuracy 0.49375471472740173
Situational
Test Loss 1.5548750162124634
Test Accuracy 0.35199999809265137
Train Loss 1.0893429517745972
Train Accuracy 0.46672889590263367
Situational
Test Loss 1.5680396556854248
Test Accuracy 0.31738975644111633
Train Loss 1.1781487464904785
Train Accuracy 0.4226153492927551
Situational
Test Loss 1.533976674079895
Test Accuracy 0.3547297418117523
Train Loss 1.0553096532821655
Train Accuracy 0.4934660792350769
Situational
Test Loss 1.6109873056411743
Test Accuracy 0.3325035572052002
Train Loss 1.16200852394104
Train Accuracy 0.44803982973098755
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.534987449645996
Test Accuracy 0.3763555586338043
Train Loss 0.964859127998352
Train Accuracy 0.524692177772522
Without Recursive
Test Loss 1.5468347072601318
Test Accuracy 0.3824000060558319
Train Loss 0.956275224685669
Train Accuracy 0.5224252343177795
Without Recursive
Test Loss 1.576246738433838
Test Accuracy 0.36788761615753174
Train Loss 0.98403400182724
Train Accuracy 0.5136456489562988
Without Recursive
Test Loss 1.6846352815628052
Test Accuracy 0.375177800655365
Train Loss 1.06391441822052
Train Accuracy 0.5122233033180237
Without Recursive
Test Loss 1.5599395036697388
Test Accuracy 0.393669992685318
Train Loss 0.9444283246994019
Train Accuracy 0.5357365012168884


In [87]:
# Mean train/test accuracy over the CV folds for each feature set.
# The last block used to be labelled "X whole test accuracy" although it reports
# the without-recursive feature set; labels are now generated from one name.
for feature_set, train_accuracies, test_accuracies in (
    ("whole", keras_mlp_train_accuracies_x_whole, keras_mlp_test_accuracies_x_whole),
    ("sit", keras_mlp_train_accuracies_x_game_sit, keras_mlp_test_accuracies_x_game_sit),
    ("without recursive", keras_mlp_train_accuracies_x_wo_recurse, keras_mlp_test_accuracies_x_wo_recurse),
):
    print(f"X {feature_set} train accuracy")
    avg_train_accuracy = np.mean(train_accuracies)
    print(avg_train_accuracy)
    print(f"X {feature_set} test accuracy")
    avg_test_accuracy = np.mean(test_accuracies)
    print(avg_test_accuracy)

X whole train accuracy
0.6883133292198181
X whole test accuracy
0.41920924186706543
X sit train accuracy
0.464920973777771
X sit test accuracy
0.34037349820137025
X without recursive train accuracy
0.5217445731163025
X whole test accuracy
0.3790981948375702


In [89]:
import pickle

# Persist the list of per-fold models trained on the full feature matrix.
# Reminder: only unpickle this file from a trusted location -- loading a pickle
# can execute arbitrary code.
with open('normal-weights-nodropout-whole.pkl', 'wb') as whole_models_file:
    pickle.dump(keras_mlp_models_x_whole, whole_models_file)


Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-12-06 19:35:31         2697
metadata.json                                  2022-12-06 19:35:31           64
variables.h5                                   2022-12-06 19:35:33       226

In [90]:
# Persist the per-fold model lists for the two reduced feature sets, in the same
# order as before: situational first, then without-recursive.
for pkl_path, fold_models in (
    ('normal-weights-nodropout-sit.pkl', keras_mlp_models_x_game_sit),
    ('normal-weights-nodropout-worecursive.pkl', keras_mlp_models_x_wo_recurse),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fold_models, f)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-12-06 19:36:47         2695
metadata.json                                  2022-12-06 19:36:47           64
variables.h5                                   2022-12-06 19:36:47       126

............1
......dense_3
.........vars
............0
............1
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-12-06 19:36:49         2695
metadata.json                                  2022-12-06 19:36:49           64
variables.h5                                   2022-12-06 19:36:49       170136
Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
..........

In [97]:
# Show the dataset column names that the feature-index lookups are computed from.
print(verlander_cols)

['pitch_type' 'month' 'year' 'inning' 'inning_topbot' 'outs' 'strikes'
 'balls' 'pitch_number' 'on_1b' 'on_2b' 'on_3b' 'score_diff' 'of_std'
 'of_strat' 'if_std' 'if_strat' 'if_shift' 'Pitcher_Tend_FF'
 'Pitcher_Tend_CU' 'Pitcher_Tend_CH' 'Pitcher_Tend_SL' 'Pitcher_Tend_SI'
 'Pitcher_Strike_Tend_FF' 'Pitcher_Strike_Tend_CU'
 'Pitcher_Strike_Tend_CH' 'Pitcher_Strike_Tend_SL'
 'Pitcher_Strike_Tend_SI' 'PrevPitch_FF' 'PrevPitch_CU' 'PrevPitch_CH'
 'PrevPitch_SL' 'PrevPitch_SI' 'PrevPitch_Strike' 'PrevPitch_Ball'
 'PrevPitch_InPlay' 'Prev5_Pcnt_FF' 'Prev5_FF_Strike' 'Prev5_Pcnt_CU'
 'Prev5_CU_Strike' 'Prev5_Pcnt_CH' 'Prev5_CH_Strike' 'Prev5_Pcnt_SL'
 'Prev5_SL_Strike' 'Prev5_Pcnt_SI' 'Prev5_SI_Strike' 'Prev10_Pcnt_FF'
 'Prev10_FF_Strike' 'Prev10_Pcnt_CU' 'Prev10_CU_Strike' 'Prev10_Pcnt_CH'
 'Prev10_CH_Strike' 'Prev10_Pcnt_SL' 'Prev10_SL_Strike' 'Prev10_Pcnt_SI'
 'Prev10_SI_Strike' 'Prev20_Pcnt_FF' 'Prev20_FF_Strike' 'Prev20_Pcnt_CU'
 'Prev20_CU_Strike' 'Prev20_Pcnt_CH' 'Prev20_CH_Strike' '

In [104]:
# Inspect one row (index 1) of the full feature matrix.
print(X_whole[1])

[ 8.00000000e+00  5.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  1.00000000e+00  3.00000000e+00  5.00000000e+00
  0.00000000e+00  1.00000000e+00  0.00000000e+00 -1.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  5.90393208e-01  1.71983914e-01  1.55138517e-01
  7.50670241e-02  7.41733691e-03  5.10406418e-01  4.39075084e-01
  4.38076037e-01  4.98809524e-01  4.03614458e-01  1.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.00000000e+00  0.00000000e+00  0.00000000e+00  6.00000000e-01
  4.00000000e-01  0.00000000e+00  0.00000000e+00  4.00000000e-01
  0.00000000e+00  0.00000000e+00  5.00000000e-01  0.00000000e+00
  0.00000000e+00  5.00000000e-01  4.28571429e-01  1.00000000e-01
  0.00000000e+00  2.00000000e-01  0.00000000e+00  2.00000000e-01
  5.00000000e-01  0.00000000e+00  0.00000000e+00  5.83333333e-01
  4.28571429e-01  8.33333333e-02  0.00000000e+00  1.66666667e-01
  0.00000000e+00  1.66666

In [111]:
# Number of samples per pitch-type label, printed in the order SL, CH, CU, FF, SI.
for pitch_code in ('SL', 'CH', 'CU', 'FF', 'SI'):
    label_count = np.sum(y == pitch_code)
    print(label_count)

3062
3787
4694
16388
191
