In [15]:
import tensorflow

In [16]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, top_k_accuracy_score, average_precision_score, \
    roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import xgboost as xgb
from xgboost import XGBRFClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight

In [17]:
# Constants
game_sit_feat = ['month', 'year', 'inning', 'inning_topbot', 
                 'outs', 'strikes', 'balls', 'pitch_number', 
                 'on_1b', 'on_2b', 'on_3b', 'score_diff', 
                 'of_std', 'of_strat', 'if_std', 'if_strat', 'if_shift']
wo_recursive_feat = game_sit_feat + ['Pitcher_Tend_FF', 'Pitcher_Tend_CU', 'Pitcher_Tend_CH', 'Pitcher_Tend_SL', 'Pitcher_Tend_SI', 
                                     'Pitcher_Strike_Tend_FF', 'Pitcher_Strike_Tend_CU', 'Pitcher_Strike_Tend_CH', 'Pitcher_Strike_Tend_SL', 'Pitcher_Strike_Tend_SI', 
                                     'batter_stance', 'Strike_Tend_FF', 'Strike_Tend_CU', 'Strike_Tend_CH', 'Strike_Tend_SL', 'Strike_Tend_SI', 'Overall_Strike_Tend', 
                                     'Slug_Avg_FF', 'Slug_Avg_CU', 'Slug_Avg_CH', 'Slug_Avg_SL', 'Slug_Avg_SI', 'Overall_Slug_Avg', 
                                     'Zone_1_Strike_Pcnt', 'Zone_2_Strike_Pcnt', 'Zone_3_Strike_Pcnt', 'Zone_4_Strike_Pcnt', 'Zone_5_Strike_Pcnt', 'Zone_6_Strike_Pcnt', 'Zone_7_Strike_Pcnt', 'Zone_8_Strike_Pcnt', 'Zone_9_Strike_Pcnt', 'Zone_11_Strike_Pcnt', 'Zone_12_Strike_Pcnt', 'Zone_13_Strike_Pcnt', 'Zone_14_Strike_Pcnt']

# Import the data
import pandas as pd
verlander_df = pd.read_csv("./VerlanderOnlyDataset.csv", index_col=0)

X_whole = pd.read_csv("./VerlanderOnly_X_train.csv").to_numpy()
y = pd.read_csv("./VerlanderOnly_y_train.csv").to_numpy()

verlander_cols = np.array(pd.read_csv("./VerlanderOnlyDataset.csv", index_col=0).columns)
game_sit_cols = np.where(np.isin(verlander_cols, game_sit_feat))[0] - 1
wo_recursive_cols = np.where(np.isin(verlander_cols, wo_recursive_feat))[0] - 1

X_game_sit = X_whole[:, game_sit_cols]
X_wo_recurse = X_whole[:, wo_recursive_cols]

# K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [18]:
def keras_mlp(train_data, train_labels, val_data, val_labels):
    num_features = train_data.shape[1]
    le = LabelEncoder()
    le.fit(train_labels)
    encoded_y_train = le.transform(train_labels)
    encoded_y_test = le.transform(val_labels)
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.unique(encoded_y_train),
                                                      y=encoded_y_train)
    num_classes = 5
    weights = {}
    for i in range(num_classes):
        weights[i] = class_weights[i]
    model = Sequential()
    # First outer layer expects input of shape (num_features, ) and has a first hidden layer with 12 nodes with relu activation
    model.add(Dense(100, input_shape=(num_features,), activation='relu'))
    model.add(Dropout(0.1))
    # Second hidden layer has 30 nodes with relu activation
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(25, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=tensorflow.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])
    model.fit(train_data, encoded_y_train, class_weight=weights, epochs=300, batch_size=100, verbose=0)
    test_results = model.evaluate(val_data, encoded_y_test, verbose=0)
    print("Test Loss", test_results[0])
    print("Test Accuracy", test_results[1])
    train_results = model.evaluate(train_data, encoded_y_train, verbose=0)
    print("Train Loss", train_results[0])
    print("Train Accuracy", train_results[1])
    return model, test_results, train_results

In [19]:
def normalize_data(data):
    scaler = MinMaxScaler()
    scaler.fit(data)
    x_bar = scaler.transform(data)
    return x_bar


In [20]:
keras_mlp_models_x_whole = []
keras_mlp_test_accuracies_x_whole = []
keras_mlp_test_loss_x_whole = []
keras_mlp_train_accuracies_x_whole = []
keras_mlp_train_loss_x_whole = []
for train_index, test_index in skf.split(X_whole, y):
    print("Whole")
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    X_train, X_test = normalize_data(X_train), normalize_data(X_test)
    y_train, y_test = y[train_index], y[test_index]
   
   
    model, test_results, train_results = keras_mlp(X_train, y_train, X_test, y_test)
    keras_mlp_models_x_whole.append(model)
    keras_mlp_test_accuracies_x_whole.append(test_results[1])
    keras_mlp_test_loss_x_whole.append(test_results[0])
    keras_mlp_train_accuracies_x_whole.append(train_results[1])
    keras_mlp_train_loss_x_whole.append(train_results[0])
    

keras_mlp_models_x_game_sit = []
keras_mlp_test_accuracies_x_game_sit = []
keras_mlp_test_loss_x_game_sit = []
keras_mlp_train_accuracies_x_game_sit = []
keras_mlp_train_loss_x_game_sit = []
for train_index, test_index in skf.split(X_game_sit, y):
    print("Situational")
    X_train, X_test = X_game_sit[train_index], X_game_sit[test_index]
    X_train, X_test = normalize_data(X_train), normalize_data(X_test)
    y_train, y_test = y[train_index], y[test_index]
   
   
    model, test_results, train_results = keras_mlp(X_train, y_train, X_test, y_test)
    keras_mlp_models_x_game_sit.append(model)
    keras_mlp_test_accuracies_x_game_sit.append(test_results[1])
    keras_mlp_test_loss_x_game_sit.append(test_results[0])
    keras_mlp_train_accuracies_x_game_sit.append(train_results[1])
    keras_mlp_train_loss_x_game_sit.append(train_results[0])
    
keras_mlp_models_x_wo_recurse = []
keras_mlp_test_accuracies_x_wo_recurse = []
keras_mlp_test_loss_x_wo_recurse = []
keras_mlp_train_accuracies_x_wo_recurse = []
keras_mlp_train_loss_x_wo_recurse = []
for train_index, test_index in skf.split(X_wo_recurse, y):
    print("Without Recursive")
    X_train, X_test = X_wo_recurse[train_index], X_wo_recurse[test_index]
    X_train, X_test = normalize_data(X_train), normalize_data(X_test)
    y_train, y_test = y[train_index], y[test_index]
   
   
    model, test_results, train_results = keras_mlp(X_train, y_train, X_test, y_test)
    keras_mlp_models_x_wo_recurse.append(model)
    keras_mlp_test_accuracies_x_wo_recurse.append(test_results[1])
    keras_mlp_test_loss_x_wo_recurse.append(test_results[0])
    keras_mlp_train_accuracies_x_wo_recurse.append(train_results[1])
    keras_mlp_train_loss_x_wo_recurse.append(train_results[0])



Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3549636602401733
Test Accuracy 0.34933334589004517
Train Loss 1.0057902336120605
Train Accuracy 0.46223941445350647
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.387795090675354
Test Accuracy 0.322133332490921
Train Loss 1.0593754053115845
Train Accuracy 0.434813529253006
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.339845895767212
Test Accuracy 0.35010668635368347
Train Loss 0.9731037020683289
Train Accuracy 0.4832429587841034
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.4292593002319336
Test Accuracy 0.3360597491264343
Train Loss 0.9738661050796509
Train Accuracy 0.4835096597671509
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.364197850227356
Test Accuracy 0.35188478231430054
Train Loss 1.0048736333847046
Train Accuracy 0.45910748839378357
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3587042093276978
Test Accuracy 0.31626665592193604
Train Loss 1.2222893238067627
Train Accuracy 0.37493887543678284
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3928056955337524
Test Accuracy 0.30631110072135925
Train Loss 1.2629684209823608
Train Accuracy 0.35435834527015686
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3721797466278076
Test Accuracy 0.3246799409389496
Train Loss 1.245854377746582
Train Accuracy 0.3717663884162903
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3287607431411743
Test Accuracy 0.31187766790390015
Train Loss 1.2078508138656616
Train Accuracy 0.36963286995887756
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3529958724975586
Test Accuracy 0.3246799409389496
Train Loss 1.2336807250976562
Train Accuracy 0.3665214776992798
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3378562927246094
Test Accuracy 0.3189333379268646
Train Loss 1.209864854812622
Train Accuracy 0.3607591986656189
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3484258651733398
Test Accuracy 0.31040000915527344
Train Loss 1.1931394338607788
Train Accuracy 0.36502644419670105
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.349979281425476
Test Accuracy 0.3181009888648987
Train Loss 1.1738431453704834
Train Accuracy 0.3772779703140259
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3453091382980347
Test Accuracy 0.3161450922489166
Train Loss 1.1606333255767822
Train Accuracy 0.3779447078704834
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.3217631578445435
Test Accuracy 0.3161450922489166
Train Loss 1.197927713394165
Train Accuracy 0.35754290223121643


In [21]:
print("X whole train accuracy")
avg_train_accuracy = np.mean(keras_mlp_train_accuracies_x_whole)
print(avg_train_accuracy)
print("X whole test accuracy")
avg_test_accuracy = np.mean(keras_mlp_test_accuracies_x_whole)
print(avg_test_accuracy)
print("X sit train accuracy")
avg_train_accuracy = np.mean(keras_mlp_train_accuracies_x_game_sit)
print(avg_train_accuracy)
print("X sit test accuracy")
avg_test_accuracy = np.mean(keras_mlp_test_accuracies_x_game_sit)
print(avg_test_accuracy)
print("X without recursive train accuracy")
avg_train_accuracy = np.mean(keras_mlp_train_accuracies_x_wo_recurse)
print(avg_train_accuracy)
print("X without recursive test accuracy")
avg_test_accuracy = np.mean(keras_mlp_test_accuracies_x_wo_recurse)
print(avg_test_accuracy)

X whole train accuracy
0.46458261013031005
X whole test accuracy
0.3419035792350769
X sit train accuracy
0.36744359135627747
X sit test accuracy
0.3167630612850189
X without recursive train accuracy
0.36771024465560914
X without recursive test accuracy
0.315944904088974


In [22]:
import pickle
with open('stdz-weights-dropout-whole.pkl', 'wb') as f:
    pickle.dump(keras_mlp_models_x_whole, f)
with open('stdz-weights-dropout-sit.pkl', 'wb') as f:
    pickle.dump(keras_mlp_models_x_game_sit, f)
with open('stdz-weights-dropout-worecursive.pkl', 'wb') as f:
    pickle.dump(keras_mlp_models_x_wo_recurse, f)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......dropout_2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-12-06 19:40:08         3136
metadata.json                                  2022-12-06 19:40:08  

File Name                                             Modified             Size
config.json                                    2022-12-06 19:40:10         3133
metadata.json                                  2022-12-06 19:40:10           64
variables.h5                                   2022-12-06 19:40:11       132192
Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......dropout_2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4