In [15]:
import tensorflow

In [16]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, top_k_accuracy_score, average_precision_score, \
    roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import xgboost as xgb
from xgboost import XGBRFClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight

In [17]:
# Constants
# Feature names describing the game situation at the time of the pitch.
game_sit_feat = ['month', 'year', 'inning', 'inning_topbot', 
                 'outs', 'strikes', 'balls', 'pitch_number', 
                 'on_1b', 'on_2b', 'on_3b', 'score_diff', 
                 'of_std', 'of_strat', 'if_std', 'if_strat', 'if_shift']
# Game-situation features plus the pitcher/batter tendency features.
wo_recursive_feat = game_sit_feat + ['Pitcher_Tend_FF', 'Pitcher_Tend_CU', 'Pitcher_Tend_CH', 'Pitcher_Tend_SL', 'Pitcher_Tend_SI', 
                                     'Pitcher_Strike_Tend_FF', 'Pitcher_Strike_Tend_CU', 'Pitcher_Strike_Tend_CH', 'Pitcher_Strike_Tend_SL', 'Pitcher_Strike_Tend_SI', 
                                     'batter_stance', 'Strike_Tend_FF', 'Strike_Tend_CU', 'Strike_Tend_CH', 'Strike_Tend_SL', 'Strike_Tend_SI', 'Overall_Strike_Tend', 
                                     'Slug_Avg_FF', 'Slug_Avg_CU', 'Slug_Avg_CH', 'Slug_Avg_SL', 'Slug_Avg_SI', 'Overall_Slug_Avg', 
                                     'Zone_1_Strike_Pcnt', 'Zone_2_Strike_Pcnt', 'Zone_3_Strike_Pcnt', 'Zone_4_Strike_Pcnt', 'Zone_5_Strike_Pcnt',
                                     'Zone_6_Strike_Pcnt', 'Zone_7_Strike_Pcnt', 'Zone_8_Strike_Pcnt', 'Zone_9_Strike_Pcnt',
                                     'Zone_11_Strike_Pcnt', 'Zone_12_Strike_Pcnt', 'Zone_13_Strike_Pcnt', 'Zone_14_Strike_Pcnt']

# Import the data
import pandas as pd
verlander_df = pd.read_csv("./VerlanderOnlyDataset.csv", index_col=0)

X_whole = pd.read_csv("./VerlanderOnly_X_train.csv").to_numpy()
y = pd.read_csv("./VerlanderOnly_y_train.csv").to_numpy()

# Reuse the frame loaded above instead of parsing the same CSV a second time
# just to obtain its column names.
verlander_cols = np.array(verlander_df.columns)
# NOTE(review): the "- 1" shifts positions in verlander_cols onto columns of
# X_whole; presumably verlander_df has one extra leading column (e.g. the
# label) that X_whole lacks -- TODO confirm. If a selected feature ever sat at
# position 0, the index would become -1 and silently pick the LAST column.
game_sit_cols = np.where(np.isin(verlander_cols, game_sit_feat))[0] - 1
wo_recursive_cols = np.where(np.isin(verlander_cols, wo_recursive_feat))[0] - 1

# Column subsets of the full feature matrix.
X_game_sit = X_whole[:, game_sit_cols]
X_wo_recurse = X_whole[:, wo_recursive_cols]

# K-Fold Cross Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [18]:
def keras_mlp(train_data, train_labels, val_data, val_labels, use_class_weights=False):
    """Train a three-hidden-layer Keras MLP classifier and evaluate it.

    Labels are integer-encoded with a ``LabelEncoder`` fitted on the training
    labels only; the network is trained with sparse categorical cross-entropy
    and then evaluated on both the validation and the training data.

    Parameters
    ----------
    train_data : array-like of shape (n_train, n_features)
        Training feature matrix.
    train_labels : array-like of shape (n_train,) or (n_train, 1)
        Training class labels.
    val_data : array-like of shape (n_val, n_features)
        Validation feature matrix.
    val_labels : array-like of shape (n_val,) or (n_val, 1)
        Validation class labels. Every label must also occur in
        ``train_labels``, otherwise ``LabelEncoder.transform`` raises.
    use_class_weights : bool, default False
        When True, 'balanced' class weights computed from the training labels
        are passed to ``model.fit``. The default keeps the previous behaviour,
        where the weights were computed but never applied.

    Returns
    -------
    model : the fitted Sequential model
    test_results : ``[loss, accuracy]`` from ``model.evaluate`` on the validation data
    train_results : ``[loss, accuracy]`` from ``model.evaluate`` on the training data
    """
    num_features = train_data.shape[1]

    # Flatten (n, 1) label columns to 1-D so LabelEncoder does not emit a
    # DataConversionWarning on every fit/transform call.
    train_labels = np.ravel(train_labels)
    val_labels = np.ravel(val_labels)

    le = LabelEncoder()
    encoded_y_train = le.fit_transform(train_labels)
    encoded_y_test = le.transform(val_labels)

    # Size the output layer from the classes actually present in the training
    # labels instead of a hard-coded 5, which raised IndexError while building
    # the weight dict whenever a fold contained fewer classes.
    num_classes = len(le.classes_)

    weights = None
    if use_class_weights:
        class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                          classes=np.unique(encoded_y_train),
                                                          y=encoded_y_train)
        # Keras expects a {class_index: weight} mapping.
        weights = dict(enumerate(class_weights))

    model = Sequential()
    # Input of shape (num_features,) feeding a first hidden layer of 100 relu units
    model.add(Dense(100, input_shape=(num_features,), activation='relu'))
    model.add(Dropout(0.1))
    # Second hidden layer: 50 relu units
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.1))
    # Third hidden layer: 25 relu units
    model.add(Dense(25, activation='relu'))
    model.add(Dropout(0.1))
    # Final layer has num_classes nodes and done with softmax
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=tensorflow.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])
    model.fit(train_data, encoded_y_train, epochs=300, batch_size=100, verbose=0,
              class_weight=weights)

    test_results = model.evaluate(val_data, encoded_y_test, verbose=0)
    print("Test Loss", test_results[0])
    print("Test Accuracy", test_results[1])
    train_results = model.evaluate(train_data, encoded_y_train, verbose=0)
    print("Train Loss", train_results[0])
    print("Train Accuracy", train_results[1])
    return model, test_results, train_results

In [19]:
def reduce_data_dimensionality(data):
    """Min-max scale ``data`` and project it with PCA.

    Both the scaler and the PCA are fitted on ``data`` itself. The PCA keeps
    as many components as needed to explain 95% of the variance
    (``n_components=0.95`` with the full SVD solver).

    Returns the PCA-transformed array.
    """
    scaled = MinMaxScaler().fit_transform(data)
    projector = PCA(n_components=0.95, svd_solver='full')
    return projector.fit_transform(scaled)


In [21]:
def _run_keras_mlp_cv(features, labels, tag):
    """Cross-validate ``keras_mlp`` on one feature matrix.

    For every fold produced by the module-level ``skf`` splitter, a
    MinMaxScaler and a PCA (95% explained variance) are fitted on the
    training part only and applied to both parts; ``keras_mlp`` is then
    trained and evaluated on the projected data.

    Parameters
    ----------
    features : array indexable by the fold index arrays (rows = samples)
    labels : array of class labels aligned with ``features``
    tag : str
        Printed once at the start of every fold to identify the feature set.

    Returns
    -------
    tuple of five lists, one entry per fold:
        (models, test_accuracies, test_losses, train_accuracies, train_losses)
    """
    models = []
    test_accuracies, test_losses = [], []
    train_accuracies, train_losses = [], []
    for train_index, test_index in skf.split(features, labels):
        print(tag)
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # Fit the preprocessing on the training fold only so no information
        # from the held-out fold leaks into scaling or the PCA basis.
        scaler = MinMaxScaler()
        scaler.fit(X_train)
        x_train_bar = scaler.transform(X_train)
        x_test_bar = scaler.transform(X_test)
        pca = PCA(n_components=0.95, svd_solver='full')
        pca.fit(x_train_bar)
        X_train = pca.transform(x_train_bar)
        X_test = pca.transform(x_test_bar)

        model, test_results, train_results = keras_mlp(X_train, y_train, X_test, y_test)
        models.append(model)
        # evaluate() results are indexed as [loss, accuracy].
        test_accuracies.append(test_results[1])
        test_losses.append(test_results[0])
        train_accuracies.append(train_results[1])
        train_losses.append(train_results[0])
    return models, test_accuracies, test_losses, train_accuracies, train_losses


# All features.
(keras_mlp_models_x_whole,
 keras_mlp_test_accuracies_x_whole,
 keras_mlp_test_loss_x_whole,
 keras_mlp_train_accuracies_x_whole,
 keras_mlp_train_loss_x_whole) = _run_keras_mlp_cv(X_whole, y, "Whole")

# Game-situation features only.
(keras_mlp_models_x_game_sit,
 keras_mlp_test_accuracies_x_game_sit,
 keras_mlp_test_loss_x_game_sit,
 keras_mlp_train_accuracies_x_game_sit,
 keras_mlp_train_loss_x_game_sit) = _run_keras_mlp_cv(X_game_sit, y, "Situational")

# Everything except the recursive features.
(keras_mlp_models_x_wo_recurse,
 keras_mlp_test_accuracies_x_wo_recurse,
 keras_mlp_test_loss_x_wo_recurse,
 keras_mlp_train_accuracies_x_wo_recurse,
 keras_mlp_train_loss_x_wo_recurse) = _run_keras_mlp_cv(X_wo_recurse, y, "Without Recursive")


Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.1490204334259033
Test Accuracy 0.5660444498062134
Train Loss 0.6291082501411438
Train Accuracy 0.7329421639442444
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.1536647081375122
Test Accuracy 0.5610666871070862
Train Loss 0.6122778654098511
Train Accuracy 0.7363203763961792
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.142343282699585
Test Accuracy 0.558143675327301
Train Loss 0.6115859746932983
Train Accuracy 0.7431327104568481
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.1529185771942139
Test Accuracy 0.558854877948761
Train Loss 0.6194565892219543
Train Accuracy 0.7389990091323853
Whole


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.1609429121017456
Test Accuracy 0.5552987456321716
Train Loss 0.6212546229362488
Train Accuracy 0.7398435473442078
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.0954097509384155
Test Accuracy 0.5735111236572266
Train Loss 0.8859256505966187
Train Accuracy 0.6229274868965149
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.1019786596298218
Test Accuracy 0.5745777487754822
Train Loss 0.8898309469223022
Train Accuracy 0.6147041916847229
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.0882712602615356
Test Accuracy 0.5650782585144043
Train Loss 0.8893768787384033
Train Accuracy 0.6188994646072388
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.0813462734222412
Test Accuracy 0.5757467746734619
Train Loss 0.8912433981895447
Train Accuracy 0.6197440028190613
Situational


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.0888766050338745
Test Accuracy 0.5812588930130005
Train Loss 0.8940016627311707
Train Accuracy 0.6093430519104004
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.0535609722137451
Test Accuracy 0.5637333393096924
Train Loss 0.7750316262245178
Train Accuracy 0.6558652520179749
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.044082760810852
Test Accuracy 0.5777778029441833
Train Loss 0.7626246809959412
Train Accuracy 0.6491976976394653
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.0404220819473267
Test Accuracy 0.575035572052002
Train Loss 0.7778336405754089
Train Accuracy 0.643079400062561
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.0511950254440308
Test Accuracy 0.5693456530570984
Train Loss 0.7657018303871155
Train Accuracy 0.6567695140838623
Without Recursive


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Test Loss 1.0533021688461304
Test Accuracy 0.5704125165939331
Train Loss 0.7653228044509888
Train Accuracy 0.656991720199585


In [23]:
# Print the mean per-fold accuracy for every feature set, train before test.
accuracy_report = [
    ("X whole train accuracy", keras_mlp_train_accuracies_x_whole),
    ("X whole test accuracy", keras_mlp_test_accuracies_x_whole),
    ("X sit train accuracy", keras_mlp_train_accuracies_x_game_sit),
    ("X sit test accuracy", keras_mlp_test_accuracies_x_game_sit),
    ("X without recursive train accuracy", keras_mlp_train_accuracies_x_wo_recurse),
    ("X without recursive test accuracy", keras_mlp_test_accuracies_x_wo_recurse),
]
for heading, fold_scores in accuracy_report:
    print(heading)
    print(np.mean(fold_scores))

# Leave the two summary names holding the values of the last feature set
# reported, exactly as the sequential version did.
avg_train_accuracy = np.mean(keras_mlp_train_accuracies_x_wo_recurse)
avg_test_accuracy = np.mean(keras_mlp_test_accuracies_x_wo_recurse)

X whole train accuracy
0.738247561454773
X whole test accuracy
0.5598816871643066
X sit train accuracy
0.6171236395835876
X sit test accuracy
0.5740345597267151
X without recursive train accuracy
0.6523807168006897
X without recursive test accuracy
0.5712609767913819


In [25]:
import pickle

# Persist the per-fold model lists, one pickle file per feature set.
pickle_targets = (
    ('pca-noweights-dropout-whole.pkl', keras_mlp_models_x_whole),
    ('pca-noweights-dropout-sit.pkl', keras_mlp_models_x_game_sit),
    ('pca-noweights-dropout-worecursive.pkl', keras_mlp_models_x_wo_recurse),
)
for pickle_path, fold_models in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(fold_models, f)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......dropout_2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2022-12-06 19:43:15         3138
metadata.json                                  2022-12-06 19:43:15  

File Name                                             Modified             Size
config.json                                    2022-12-06 19:43:16         3140
metadata.json                                  2022-12-06 19:43:16           64
variables.h5                                   2022-12-06 19:43:16       126192
Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......dropout_2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4