In [10]:
import tensorflow

In [11]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, top_k_accuracy_score, average_precision_score, \
    roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import xgboost as xgb
from xgboost import XGBRFClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight

In [12]:
# Constants
game_sit_feat = ['month', 'year', 'inning', 'inning_topbot', 
                 'outs', 'strikes', 'balls', 'pitch_number', 
                 'on_1b', 'on_2b', 'on_3b', 'score_diff', 
                 'of_std', 'of_strat', 'if_std', 'if_strat', 'if_shift']
wo_recursive_feat = game_sit_feat + ['Pitcher_Tend_FF', 'Pitcher_Tend_CU', 'Pitcher_Tend_CH', 'Pitcher_Tend_SL', 'Pitcher_Tend_SI', 
                                     'Pitcher_Strike_Tend_FF', 'Pitcher_Strike_Tend_CU', 'Pitcher_Strike_Tend_CH', 'Pitcher_Strike_Tend_SL', 'Pitcher_Strike_Tend_SI', 
                                     'batter_stance', 'Strike_Tend_FF', 'Strike_Tend_CU', 'Strike_Tend_CH', 'Strike_Tend_SL', 'Strike_Tend_SI', 'Overall_Strike_Tend', 
                                     'Slug_Avg_FF', 'Slug_Avg_CU', 'Slug_Avg_CH', 'Slug_Avg_SL', 'Slug_Avg_SI', 'Overall_Slug_Avg', 
                                     'Zone_1_Strike_Pcnt', 'Zone_2_Strike_Pcnt', 'Zone_3_Strike_Pcnt', 'Zone_4_Strike_Pcnt', 'Zone_5_Strike_Pcnt', 'Zone_6_Strike_Pcnt', 'Zone_7_Strike_Pcnt', 'Zone_8_Strike_Pcnt', 'Zone_9_Strike_Pcnt', 'Zone_11_Strike_Pcnt', 'Zone_12_Strike_Pcnt', 'Zone_13_Strike_Pcnt', 'Zone_14_Strike_Pcnt']

# Import the data
import pandas as pd
verlander_df = pd.read_csv("./VerlanderOnlyDataset.csv", index_col=0)

X_whole = pd.read_csv("./VerlanderOnly_X_train.csv").to_numpy()
y = pd.read_csv("./VerlanderOnly_y_train.csv").to_numpy()

verlander_cols = np.array(pd.read_csv("./VerlanderOnlyDataset.csv", index_col=0).columns)
game_sit_cols = np.where(np.isin(verlander_cols, game_sit_feat))[0] - 1
wo_recursive_cols = np.where(np.isin(verlander_cols, wo_recursive_feat))[0] - 1

X_game_sit = X_whole[:, game_sit_cols]
X_wo_recurse = X_whole[:, wo_recursive_cols]

# K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [13]:
def keras_mlp(train_data, train_labels, val_data, val_labels):
    num_features = train_data.shape[1]
    le = LabelEncoder()
    le.fit(train_labels)
    encoded_y_train = le.transform(train_labels)
    encoded_y_test = le.transform(val_labels)
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.unique(encoded_y_train),
                                                      y=encoded_y_train)
    num_classes = 5
    weights = {}
    for i in range(num_classes):
        weights[i] = class_weights[i]
    model = Sequential()
    # First outer layer expects input of shape (num_features, ) and has a first hidden layer with 12 nodes with relu activation
    model.add(Dense(100, input_shape=(num_features,), activation='relu'))
    # Second hidden layer has 30 nodes with relu activation
    model.add(Dense(50, activation='relu'))
    model.add(Dense(25, activation='relu'))
    # Final layer has num_classes nodes and done with softmax
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=tensorflow.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])
    model.fit(train_data, encoded_y_train, class_weight=weights, epochs=300, batch_size=100, verbose=0)
    test_results = model.evaluate(val_data, encoded_y_test, verbose=0)
    print("Test Loss", test_results[0])
    print("Test Accuracy", test_results[1])
    train_results = model.evaluate(train_data, encoded_y_train, verbose=0)
    print("Train Loss", train_results[0])
    print("Train Accuracy", train_results[1])
    return model, test_results, train_results

In [14]:
def reduce_data_dimensionality(data):
    scaler = MinMaxScaler()
    scaler.fit(data)
    x_bar = scaler.transform(data)
    pca = PCA(n_components=0.95, svd_solver='full')
    reduced_data = pca.fit_transform(x_bar)
    return reduced_data


In [15]:
keras_mlp_models_x_whole = []
keras_mlp_test_accuracies_x_whole = []
keras_mlp_test_loss_x_whole = []
keras_mlp_train_accuracies_x_whole = []
keras_mlp_train_loss_x_whole = []
for train_index, test_index in skf.split(X_whole, y):
    print("Whole")
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    x_train_bar = scaler.transform(X_train)
    x_test_bar = scaler.transform(X_test)
    pca = PCA(n_components=0.95, svd_solver='full')
    pca.fit(x_train_bar)
    X_train = pca.transform(x_train_bar)
    X_test = pca.transform(x_test_bar)
    y_train, y_test = y[train_index], y[test_index]
    
    model, test_results, train_results = keras_mlp(X_train, y_train, X_test, y_test)
    keras_mlp_models_x_whole.append(model)
    keras_mlp_test_accuracies_x_whole.append(test_results[1])
    keras_mlp_test_loss_x_whole.append(test_results[0])
    keras_mlp_train_accuracies_x_whole.append(train_results[1])
    keras_mlp_train_loss_x_whole.append(train_results[0])

keras_mlp_models_x_game_sit = []
keras_mlp_test_accuracies_x_game_sit = []
keras_mlp_test_loss_x_game_sit = []
keras_mlp_train_accuracies_x_game_sit = []
keras_mlp_train_loss_x_game_sit = []
for train_index, test_index in skf.split(X_game_sit, y):
    print("Situational")
    X_train, X_test = X_game_sit[train_index], X_game_sit[test_index]
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    x_train_bar = scaler.transform(X_train)
    x_test_bar = scaler.transform(X_test)
    pca = PCA(n_components=0.95, svd_solver='full')
    pca.fit(x_train_bar)
    X_train = pca.transform(x_train_bar)
    X_test = pca.transform(x_test_bar)
    y_train, y_test = y[train_index], y[test_index]
   
   
    model, test_results, train_results = keras_mlp(X_train, y_train, X_test, y_test)
    keras_mlp_models_x_game_sit.append(model)
    keras_mlp_test_accuracies_x_game_sit.append(test_results[1])
    keras_mlp_test_loss_x_game_sit.append(test_results[0])
    keras_mlp_train_accuracies_x_game_sit.append(train_results[1])
    keras_mlp_train_loss_x_game_sit.append(train_results[0])

keras_mlp_models_x_wo_recurse = []
keras_mlp_test_accuracies_x_wo_recurse = []
keras_mlp_test_loss_x_wo_recurse = []
keras_mlp_train_accuracies_x_wo_recurse = []
keras_mlp_train_loss_x_wo_recurse = []
for train_index, test_index in skf.split(X_wo_recurse, y):
    print("Without Recursive")
    X_train, X_test = X_wo_recurse[train_index], X_wo_recurse[test_index]
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    x_train_bar = scaler.transform(X_train)
    x_test_bar = scaler.transform(X_test)
    pca = PCA(n_components=0.95, svd_solver='full')
    pca.fit(x_train_bar)
    X_train = pca.transform(x_train_bar)
    X_test = pca.transform(x_test_bar)
    y_train, y_test = y[train_index], y[test_index]
   
   
    model, test_results, train_results = keras_mlp(X_train, y_train, X_test, y_test)
    keras_mlp_models_x_wo_recurse.append(model)
    keras_mlp_test_accuracies_x_wo_recurse.append(test_results[1])
    keras_mlp_test_loss_x_wo_recurse.append(test_results[0])
    keras_mlp_train_accuracies_x_wo_recurse.append(train_results[1])
    keras_mlp_train_loss_x_wo_recurse.append(train_results[0])



Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 2.81075119972229
Test Accuracy 0.3957333266735077
Train Loss 0.6601589918136597
Train Accuracy 0.7115615606307983
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 2.5834877490997314
Test Accuracy 0.41600000858306885
Train Loss 0.7037045955657959
Train Accuracy 0.697204053401947
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 2.6674411296844482
Test Accuracy 0.4018492102622986
Train Loss 0.6864692568778992
Train Accuracy 0.7036181092262268
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 2.709087610244751
Test Accuracy 0.41287338733673096
Train Loss 0.6699575781822205
Train Accuracy 0.7134856581687927
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 2.782548427581787
Test Accuracy 0.4214082360267639
Train Loss 0.6776745915412903
Train Accuracy 0.7162414193153381
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.5236833095550537
Test Accuracy 0.37119999527931213
Train Loss 1.0735305547714233
Train Accuracy 0.4887762665748596
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.5236902236938477
Test Accuracy 0.329066663980484
Train Loss 1.114698886871338
Train Accuracy 0.4639285206794739
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.497226595878601
Test Accuracy 0.3527738153934479
Train Loss 1.1058611869812012
Train Accuracy 0.48386523127555847
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.5747476816177368
Test Accuracy 0.3321479260921478
Train Loss 1.145969271659851
Train Accuracy 0.46377456188201904
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.4769039154052734
Test Accuracy 0.37571123242378235
Train Loss 1.066857099533081
Train Accuracy 0.50115567445755
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.7674362659454346
Test Accuracy 0.36231112480163574
Train Loss 0.9299919009208679
Train Accuracy 0.5591856837272644
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.6751458644866943
Test Accuracy 0.3939555585384369
Train Loss 0.9271185994148254
Train Accuracy 0.5625194311141968
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.5767836570739746
Test Accuracy 0.3668207824230194
Train Loss 0.9773209691047668
Train Accuracy 0.536225438117981
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.6352648735046387
Test Accuracy 0.3846017122268677
Train Loss 0.8892704248428345
Train Accuracy 0.5719619393348694
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.7045115232467651
Test Accuracy 0.3801564574241638
Train Loss 0.9505825042724609
Train Accuracy 0.5474708676338196


Situational
Loss 1.4792380332946777
Accuracy 0.34532782435417175
Situational
Loss 1.4939175844192505
Accuracy 0.35172805190086365
Situational
Loss 1.4827440977096558
Accuracy 0.3501635491847992
Situational


In [16]:
print("X whole train accuracy")
avg_train_accuracy = np.mean(keras_mlp_train_accuracies_x_whole)
print(avg_train_accuracy)
print("X whole test accuracy")
avg_test_accuracy = np.mean(keras_mlp_test_accuracies_x_whole)
print(avg_test_accuracy)
print("X sit train accuracy")
avg_train_accuracy = np.mean(keras_mlp_train_accuracies_x_game_sit)
print(avg_train_accuracy)
print("X sit test accuracy")
avg_test_accuracy = np.mean(keras_mlp_test_accuracies_x_game_sit)
print(avg_test_accuracy)
print("X without recursive train accuracy")
avg_train_accuracy = np.mean(keras_mlp_train_accuracies_x_wo_recurse)
print(avg_train_accuracy)
print("X without recursive test accuracy")
avg_test_accuracy = np.mean(keras_mlp_test_accuracies_x_wo_recurse)
print(avg_test_accuracy)

X whole train accuracy
0.7084221601486206
X whole test accuracy
0.409572833776474
X sit train accuracy
0.4803000509738922
X sit test accuracy
0.35217992663383485
X without recursive train accuracy
0.5554726719856262
X without recursive test accuracy
0.3775691270828247


In [17]:
import pickle
with open('pca-weights-nodropout-whole.pkl', 'wb') as f:
    pickle.dump(keras_mlp_models_x_whole, f)
with open('pca-weights-nodropout-sit.pkl', 'wb') as f:
    pickle.dump(keras_mlp_models_x_game_sit, f)
with open('pca-weights-nodropout-worecursive.pkl', 'wb') as f:
    pickle.dump(keras_mlp_models_x_wo_recurse, f)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-12-06 19:41:38         2694
metadata.json                                  2022-12-06 19:41:38           64
variables.h5                                   2022-12-06 19:41:39       148

File Name                                             Modified             Size
config.json                                    2022-12-06 19:41:41         2694
metadata.json                                  2022-12-06 19:41:41           64
variables.h5                                   2022-12-06 19:41:41       120936
Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive savi