In [1]:
# Libraries
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
# Constants
game_sit_feat = ['month', 'year', 'inning', 'inning_topbot', 
                 'outs', 'strikes', 'balls', 'pitch_number', 
                 'on_1b', 'on_2b', 'on_3b', 'score_diff', 
                 'of_std', 'of_strat', 'if_std', 'if_strat', 'if_shift']
wo_recursive_feat = game_sit_feat + ['Pitcher_Tend_FF', 'Pitcher_Tend_CU', 'Pitcher_Tend_CH', 'Pitcher_Tend_SL', 'Pitcher_Tend_SI', 
                                     'Pitcher_Strike_Tend_FF', 'Pitcher_Strike_Tend_CU', 'Pitcher_Strike_Tend_CH', 'Pitcher_Strike_Tend_SL', 'Pitcher_Strike_Tend_SI', 
                                     'batter_stance', 'Strike_Tend_FF', 'Strike_Tend_CU', 'Strike_Tend_CH', 'Strike_Tend_SL', 'Strike_Tend_SI', 'Overall_Strike_Tend', 
                                     'Slug_Avg_FF', 'Slug_Avg_CU', 'Slug_Avg_CH', 'Slug_Avg_SL', 'Slug_Avg_SI', 'Overall_Slug_Avg', 
                                     'Zone_1_Strike_Pcnt', 'Zone_2_Strike_Pcnt', 'Zone_3_Strike_Pcnt', 'Zone_4_Strike_Pcnt', 'Zone_5_Strike_Pcnt', 'Zone_6_Strike_Pcnt', 'Zone_7_Strike_Pcnt', 'Zone_8_Strike_Pcnt', 'Zone_9_Strike_Pcnt', 'Zone_11_Strike_Pcnt', 'Zone_12_Strike_Pcnt', 'Zone_13_Strike_Pcnt', 'Zone_14_Strike_Pcnt']

In [3]:
# Import the data
X_whole = pd.read_csv("../data/VerlanderOnly_X_train.csv").to_numpy()
y = pd.read_csv("../data/VerlanderOnly_y_train.csv").to_numpy().T[0]

verlander_cols = np.array(pd.read_csv("../data/VerlanderOnlyDataset.csv", index_col=0).columns)
game_sit_cols = np.where(np.isin(verlander_cols, game_sit_feat))[0] - 1
wo_recursive_cols = np.where(np.isin(verlander_cols, wo_recursive_feat))[0] - 1

X_game_sit = X_whole[:, game_sit_cols]
X_wo_recurse = X_whole[:, wo_recursive_cols]

## Non-Standardized

In [4]:
# Without Recursive Features: Non-standardized (53 features)
mlp_wo_recurse_models = []
mlp_wo_recurse_train_accs = []
mlp_wo_recurse_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_wo_recurse, y):
    X_train, X_test = X_wo_recurse[train_index], X_wo_recurse[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = MLPClassifier(hidden_layer_sizes=[70, 35, 20], 
                          learning_rate_init=0.0001, 
                          alpha=1,
                          batch_size=100, 
                          max_iter=1000)
    
    model.fit(X_train, y_train)
    
    test_acc = model.score(X_test, y_test)
    train_acc = model.score(X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(X_test)))
    print(np.unique(model.predict(X_test), return_counts=True)[1])
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1])
    
    mlp_wo_recurse_train_accs.append(train_acc)
    mlp_wo_recurse_test_accs.append(test_acc)
    mlp_wo_recurse_models.append(model)

Train Accuracy:  0.583233319998222
Test Accuracy:  0.5825777777777777
Predictions:
['CU' 'FF']
[ 208 5417]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613]
Train Accuracy:  0.5824776636884919
Test Accuracy:  0.5790222222222222
Predictions:
['CU' 'FF']
[ 390 5235]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613]
Train Accuracy:  0.5835629833763001
Test Accuracy:  0.5839260312944523
Predictions:
['CU' 'FF']
[ 218 5406]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5834740865854743
Test Accuracy:  0.5837482219061166
Predictions:
['CU' 'FF']
[ 223 5401]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5826740154680417
Test Accuracy:  0.5841038406827881
Predictions:
['CU' 'FF']
[ 136 5488]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612]


In [5]:
# Game Situational Features: Non-standardized (17 features)
mlp_game_sit_models = []
mlp_game_sit_train_accs = []
mlp_game_sit_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_game_sit, y):
    X_train, X_test = X_game_sit[train_index], X_game_sit[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = MLPClassifier(hidden_layer_sizes=[50, 25, 10], 
                          learning_rate_init=0.0001, 
                          alpha=1,
                          batch_size=100, 
                          max_iter=1000)
    
    model.fit(X_train, y_train)
    
    test_acc = model.score(X_test, y_test)
    train_acc = model.score(X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(X_test)))
    print(np.unique(model.predict(X_test), return_counts=True)[1])
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1])
    
    mlp_game_sit_train_accs.append(train_acc)
    mlp_game_sit_test_accs.append(test_acc)
    mlp_game_sit_models.append(model)

Train Accuracy:  0.5841667777925945
Test Accuracy:  0.5841777777777778
Predictions:
['CU' 'FF']
[ 240 5385]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613]
Train Accuracy:  0.5847890829888429
Test Accuracy:  0.5836444444444444
Predictions:
['CU' 'FF']
[ 185 5440]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613]
Train Accuracy:  0.5839185705396035
Test Accuracy:  0.5828591749644382
Predictions:
['CU' 'FF']
[  95 5529]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5831629478175838
Test Accuracy:  0.5857041251778093
Predictions:
['CU' 'FF']
[ 122 5502]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5837852253533647
Test Accuracy:  0.5817923186344239
Predictions:
['CU' 'FF']
[  84 5540]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612]


In [6]:
# All Features: Non-standardized (100 features)
mlp_whole_models = []
mlp_whole_train_accs = []
mlp_whole_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_whole, y):
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = MLPClassifier(hidden_layer_sizes=[100, 50, 25], 
                          learning_rate_init=0.0001,
                          alpha=1,
                          batch_size=50, 
                          max_iter=1000)
    
    model.fit(X_train, y_train)
    
    test_acc = model.score(X_test, y_test)
    train_acc = model.score(X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(X_test)))
    print(np.unique(model.predict(X_test), return_counts=True)[1])
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1])
    
    mlp_whole_train_accs.append(train_acc)
    mlp_whole_test_accs.append(test_acc)
    mlp_whole_models.append(model)

Train Accuracy:  0.5851891363292884
Test Accuracy:  0.5852444444444445
Predictions:
['CU' 'FF' 'SL']
[ 127 5486   12]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613]
Train Accuracy:  0.5858114415255368
Test Accuracy:  0.5845333333333333
Predictions:
['CU' 'FF' 'SL']
[ 140 5477    8]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613]
Train Accuracy:  0.5848964352386878
Test Accuracy:  0.5833926031294452
Predictions:
['CU' 'FF']
[ 138 5486]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5851186772157525
Test Accuracy:  0.5858819345661451
Predictions:
['CU' 'FF' 'SL']
[ 216 5402    6]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5862298871010756
Test Accuracy:  0.5849928876244666
Predictions:
['CU' 'FF' 'SL']
[ 127 5467   30]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612]


## Regularized

In [7]:
# Without Recursive Features: Standardized (53 features)
mlp_wo_recurse_stdzd_models = []
mlp_wo_recurse_stdzd_train_accs = []
mlp_wo_recurse_stdzd_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_wo_recurse, y):
    X_train, X_test = X_wo_recurse[train_index], X_wo_recurse[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    model = MLPClassifier(hidden_layer_sizes=[70, 35, 20], 
                          learning_rate_init=0.0001, 
                          alpha=1,
                          batch_size=100, 
                          max_iter=1000)
    
    model.fit(scaled_X_train, y_train)
    
    test_acc = model.score(scaled_X_test, y_test)
    train_acc = model.score(scaled_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(scaled_X_test)))
    print(np.unique(model.predict(scaled_X_test), return_counts=True)[1])
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1])
    
    mlp_wo_recurse_stdzd_train_accs.append(train_acc)
    mlp_wo_recurse_stdzd_test_accs.append(test_acc)
    mlp_wo_recurse_stdzd_models.append(model)

Train Accuracy:  0.5826999155442948
Test Accuracy:  0.5832888888888889
Predictions:
['CU' 'FF']
[  80 5545]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613]
Train Accuracy:  0.5833666711117038
Test Accuracy:  0.5825777777777777
Predictions:
['CU' 'FF']
[  79 5546]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613]
Train Accuracy:  0.5838741221441905
Test Accuracy:  0.582325746799431
Predictions:
['CU' 'FF']
[  83 5541]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5832073962129967
Test Accuracy:  0.5826813655761024
Predictions:
['CU' 'FF']
[  47 5577]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5827184638634545
Test Accuracy:  0.583570412517781
Predictions:
['CU' 'FF']
[  23 5601]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612]


In [8]:
# Game Situational Features: Standardized (17 features)
mlp_game_sit_stdzd_models = []
mlp_game_sit_stdzd_train_accs = []
mlp_game_sit_stdzd_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_game_sit, y):
    X_train, X_test = X_game_sit[train_index], X_game_sit[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    model = MLPClassifier(hidden_layer_sizes=[50, 25, 10], 
                          learning_rate_init=0.0001, 
                          alpha=1,
                          batch_size=100, 
                          max_iter=1000)
    
    model.fit(scaled_X_train, y_train)
    
    test_acc = model.score(scaled_X_test, y_test)
    train_acc = model.score(scaled_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(scaled_X_test)))
    print(np.unique(model.predict(scaled_X_test), return_counts=True)[1])
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1])
    
    mlp_game_sit_stdzd_train_accs.append(train_acc)
    mlp_game_sit_stdzd_test_accs.append(test_acc)
    mlp_game_sit_stdzd_models.append(model)

Train Accuracy:  0.5826554651731342
Test Accuracy:  0.5838222222222222
Predictions:
['CU' 'FF']
[  65 5560]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613]
Train Accuracy:  0.5826554651731342
Test Accuracy:  0.5818666666666666
Predictions:
['CU' 'FF']
[  41 5584]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613]
Train Accuracy:  0.5832962930038226
Test Accuracy:  0.5830369843527738
Predictions:
['CU' 'FF']
[  28 5596]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5831184994221709
Test Accuracy:  0.5833926031294452
Predictions:
['CU' 'FF']
[  37 5587]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5829851542359321
Test Accuracy:  0.5825035561877667
Predictions:
['CU' 'FF']
[  70 5554]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612]


In [9]:
# All Features: Standardized (100 features)
mlp_whole_stdzd_models = []
mlp_whole_stdzd_train_accs = []
mlp_whole_stdzd_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_whole, y):
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    model = MLPClassifier(hidden_layer_sizes=[100, 50, 25], 
                          learning_rate_init=0.0001, 
                          alpha=1,
                          batch_size=100, 
                          max_iter=1000)
    
    model.fit(scaled_X_train, y_train)
    
    test_acc = model.score(scaled_X_test, y_test)
    train_acc = model.score(scaled_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(scaled_X_test)))
    print(np.unique(model.predict(scaled_X_test), return_counts=True)[1])
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1])
    
    mlp_whole_stdzd_train_accs.append(train_acc)
    mlp_whole_stdzd_test_accs.append(test_acc)
    mlp_whole_stdzd_models.append(model)

Train Accuracy:  0.5927456994265902
Test Accuracy:  0.5909333333333333
Predictions:
['CH' 'CU' 'FF' 'SL']
[  26  346 5200   53]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613]
Train Accuracy:  0.5909232342090056
Test Accuracy:  0.5884444444444444
Predictions:
['CH' 'CU' 'FF' 'SL']
[  39  250 5317   19]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613]
Train Accuracy:  0.5905413814561294
Test Accuracy:  0.5896159317211949
Predictions:
['CH' 'CU' 'FF' 'SL']
[  30  220 5366    8]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5903635878744777
Test Accuracy:  0.5889046941678521
Predictions:
['CH' 'CU' 'FF' 'SL']
[  30  178 5381   35]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
Train Accuracy:  0.5912970041781491
Test Accuracy:  0.5881934566145093
Predictions:
['CH' 'CU' 'FF' 'SL']
[  22  160 5412   30]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612]


In [10]:
# Without Recursive Features: PCA 95% (53 features)
mlp_wo_recurse_pca_models = []
mlp_wo_recurse_pca_train_accs = []
mlp_wo_recurse_pca_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_wo_recurse, y):
    X_train, X_test = X_wo_recurse[train_index], X_wo_recurse[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    pca = PCA(n_components=0.95, svd_solver='full')
    reduced_X_train = pca.fit_transform(scaled_X_train)
    reduced_X_test = pca.transform(scaled_X_test)
    
    print("PCA Reduced Dimensions: ", reduced_X_train.shape)
    
    model = MLPClassifier(hidden_layer_sizes=[70, 35, 20], 
                          learning_rate_init=0.0001, 
                          alpha=1,
                          batch_size=100, 
                          max_iter=1000)
    
    model.fit(reduced_X_train, y_train)
    
    test_acc = model.score(reduced_X_test, y_test)
    train_acc = model.score(reduced_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(reduced_X_test)))
    print(np.unique(model.predict(reduced_X_test), return_counts=True)[1])
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1])
    
    mlp_wo_recurse_pca_train_accs.append(train_acc)
    mlp_wo_recurse_pca_test_accs.append(test_acc)
    mlp_wo_recurse_pca_models.append(model)

PCA Reduced Dimensions:  (22497, 18)
Train Accuracy:  0.5838556251944704
Test Accuracy:  0.5843555555555555
Predictions:
['CU' 'FF']
[ 103 5522]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613]
PCA Reduced Dimensions:  (22497, 19)
Train Accuracy:  0.5838556251944704
Test Accuracy:  0.5831111111111111
Predictions:
['CU' 'FF']
[  83 5542]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613]
PCA Reduced Dimensions:  (22498, 18)
Train Accuracy:  0.5846297448662103
Test Accuracy:  0.5812588904694168
Predictions:
['CU' 'FF']
[  87 5537]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
PCA Reduced Dimensions:  (22498, 19)
Train Accuracy:  0.5833851897946484
Test Accuracy:  0.582325746799431
Predictions:
['CU' 'FF']
[  79 5545]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
PCA Reduced Dimensions:  (22498, 19)
Train Accuracy:  0.5829407058405192
Test Accuracy:  0.5842816500711238
Predictions:
['CU' 'FF']
[  83 5541]
Actual:
['CH' 'CU' 'FF' 'SI'

In [26]:
X_game_sit.shape

(28122, 17)

In [11]:
# Game Situational Features: PCA 95% (17 features)
mlp_game_sit_pca_models = []
mlp_game_sit_pca_train_accs = []
mlp_game_sit_pca_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_game_sit, y):
    X_train, X_test = X_game_sit[train_index], X_game_sit[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    pca = PCA(n_components=0.95, svd_solver='full')
    reduced_X_train = pca.fit_transform(scaled_X_train)
    reduced_X_test = pca.transform(scaled_X_test)
    
    print("PCA Reduced Dimensions: ", reduced_X_train.shape)
    
    model = MLPClassifier(hidden_layer_sizes=[50, 25, 10], 
                          learning_rate_init=0.0001, 
                          alpha=1,
                          batch_size=100, 
                          max_iter=1000)
    
    model.fit(reduced_X_train, y_train)
    
    test_acc = model.score(reduced_X_test, y_test)
    train_acc = model.score(reduced_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(reduced_X_test)))
    print(np.unique(model.predict(reduced_X_test), return_counts=True)[1])
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1])
    
    mlp_game_sit_pca_train_accs.append(train_acc)
    mlp_game_sit_pca_test_accs.append(test_acc)
    mlp_game_sit_pca_models.append(model)

PCA Reduced Dimensions:  (22497, 12)
Train Accuracy:  0.5829666177712584
Test Accuracy:  0.5831111111111111
Predictions:
['CU' 'FF']
[  33 5592]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613]
PCA Reduced Dimensions:  (22497, 12)
Train Accuracy:  0.5826110148019736
Test Accuracy:  0.5827555555555556
Predictions:
['CU' 'FF']
[  30 5595]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613]
PCA Reduced Dimensions:  (22498, 12)
Train Accuracy:  0.5834296381900613
Test Accuracy:  0.5821479374110953
Predictions:
['CU' 'FF']
[  62 5562]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
PCA Reduced Dimensions:  (22498, 12)
Train Accuracy:  0.5831629478175838
Test Accuracy:  0.5846372688477952
Predictions:
['CU' 'FF']
[  94 5530]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
PCA Reduced Dimensions:  (22498, 12)
Train Accuracy:  0.5831629478175838
Test Accuracy:  0.582325746799431
Predictions:
['CU' 'FF']
[  26 5598]
Actual:
['CH' 'CU' 'FF' 'SI'

In [12]:
# All Features: PCA 95% (100 features)
mlp_whole_pca_models = []
mlp_whole_pca_train_accs = []
mlp_whole_pca_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_whole, y):
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    pca = PCA(n_components=0.95, svd_solver='full')
    reduced_X_train = pca.fit_transform(scaled_X_train)
    reduced_X_test = pca.transform(scaled_X_test)
    
    print("PCA Reduced Dimensions: ", reduced_X_train.shape)
    
    model = MLPClassifier(hidden_layer_sizes=[100, 50, 25], 
                          learning_rate_init=0.0001, 
                          alpha=1,
                          batch_size=100, 
                          max_iter=1000)

    model.fit(reduced_X_train, y_train)
    
    test_acc = model.score(reduced_X_test, y_test)
    train_acc = model.score(reduced_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(reduced_X_test)))
    print(np.unique(model.predict(reduced_X_test), return_counts=True)[1])
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1])
    
    mlp_whole_pca_train_accs.append(train_acc)
    mlp_whole_pca_test_accs.append(test_acc)
    mlp_whole_pca_models.append(model)

PCA Reduced Dimensions:  (22497, 35)
Train Accuracy:  0.5925678979419479
Test Accuracy:  0.5909333333333333
Predictions:
['CH' 'CU' 'FF' 'SL']
[  19  312 5243   51]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613]
PCA Reduced Dimensions:  (22497, 35)
Train Accuracy:  0.5920789438591813
Test Accuracy:  0.5904
Predictions:
['CH' 'CU' 'FF' 'SL']
[  28  281 5270   46]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613]
PCA Reduced Dimensions:  (22498, 35)
Train Accuracy:  0.5915192461552138
Test Accuracy:  0.5897937411095305
Predictions:
['CH' 'CU' 'FF' 'SL']
[  28  249 5282   65]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
PCA Reduced Dimensions:  (22498, 35)
Train Accuracy:  0.5915192461552138
Test Accuracy:  0.5899715504978663
Predictions:
['CH' 'CU' 'FF' 'SL']
[  24  177 5360   63]
Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612]
PCA Reduced Dimensions:  (22498, 35)
Train Accuracy:  0.591385900968975
Test Accuracy:  0.58766002844950

## Ensemble

In [15]:
# Ensemble of LogReg Models
ensemble_models = []
ensemble_train_accs = []
ensemble_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(skf.split(X_whole, y)):
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    pca = PCA(n_components=0.95, svd_solver='full')
    reduced_X_train = pca.fit_transform(scaled_X_train)
    reduced_X_test = pca.transform(scaled_X_test)
    
    mlp_pca_model = mlp_whole_pca_models[fold]
    mlp_stdz_model = mlp_whole_stdzd_models[fold]
    mlp_model = mlp_whole_models[fold]

    pca_mod_probs_train = mlp_pca_model.predict(reduced_X_train)
    stdz_mod_probs_train = mlp_stdz_model.predict(scaled_X_train)
    mod_probs_train = mlp_model.predict(X_train)
    
    pca_mod_probs_test = mlp_pca_model.predict(reduced_X_test)
    stdz_mod_probs_test = mlp_stdz_model.predict(scaled_X_test)
    mod_probs_test = mlp_model.predict(X_test)
    
#     print("Agreement PCA-STDZ: ", np.mean(pca_mod_probs_test == stdz_mod_probs_test))
#     print("Agreement PCA-REGULAR: ", np.mean(pca_mod_probs_test == mod_probs_test))
#     print("Agreement STDZ-REGULAR: ", np.mean(stdz_mod_probs_test == mod_probs_test))
    
    model_prob_votes_train = np.hstack([pca_mod_probs_train.reshape(-1, 1), stdz_mod_probs_train.reshape(-1, 1), mod_probs_train.reshape(-1, 1)])
    model_prob_votes_test = np.hstack([pca_mod_probs_test.reshape(-1, 1), stdz_mod_probs_test.reshape(-1, 1), mod_probs_test.reshape(-1, 1)])
    
    train_pred = []
    for votes in model_prob_votes_train:
        values, counts = np.unique(votes, return_counts=True)
        train_pred.append(values[np.argmax(counts)])
    train_pred = np.array(train_pred)
    
    test_pred = []
    for votes in model_prob_votes_test:
        values, counts = np.unique(votes, return_counts=True)
        test_pred.append(values[np.argmax(counts)])
    test_pred = np.array(test_pred)
    
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc, "\n")
    
    print("Predictions:")
    print(np.unique(test_pred))
    print(np.unique(test_pred, return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")

Train Accuracy:  0.5920344934880206
Test Accuracy:  0.5905777777777778 

Predictions:
['CH' 'CU' 'FF' 'SL']
[  19  289 5274   43] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 

Train Accuracy:  0.5901675778992754
Test Accuracy:  0.5879111111111112 

Predictions:
['CH' 'CU' 'FF' 'SL']
[  26  238 5341   20] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 

Train Accuracy:  0.5897413103386968
Test Accuracy:  0.5878378378378378 

Predictions:
['CH' 'CU' 'FF' 'SL']
[  23  210 5383    8] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.5893857231753934
Test Accuracy:  0.5892603129445235 

Predictions:
['CH' 'CU' 'FF' 'SL']
[  22  178 5390   34] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.5903635878744777
Test Accuracy:  0.5876600284495022 

Predictions:
['CH' 'CU' 'FF' 'SL']
[  14  163 5419   28] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612] 



In [21]:
print(np.mean(mlp_whole_train_accs), "\t", np.mean(mlp_whole_test_accs))

print(np.mean(mlp_wo_recurse_train_accs), "\t", np.mean(mlp_wo_recurse_test_accs))

print(np.mean(mlp_game_sit_train_accs), "\t", np.mean(mlp_game_sit_test_accs))

print(np.mean(mlp_whole_stdzd_train_accs), "\t", np.mean(mlp_whole_stdzd_test_accs))

print(np.mean(mlp_wo_recurse_stdzd_train_accs), "\t", np.mean(mlp_wo_recurse_stdzd_test_accs))

print(np.mean(mlp_game_sit_stdzd_train_accs), "\t", np.mean(mlp_game_sit_stdzd_test_accs))

print(np.mean(mlp_whole_pca_train_accs), "\t", np.mean(mlp_whole_pca_test_accs))

print(np.mean(mlp_wo_recurse_pca_train_accs), "\t", np.mean(mlp_wo_recurse_pca_test_accs))

print(np.mean(mlp_game_sit_pca_train_accs), "\t", np.mean(mlp_game_sit_pca_test_accs))

0.5854491154820682 	 0.584809040619567
0.583084413823306 	 0.5826756187766714
0.5839645208983979 	 0.5836355681997787
0.5911741814288705 	 0.5892183720562668
0.5831733137753281 	 0.5828888383119961
0.5829421754016387 	 0.582924406511775
0.5918142470161063 	 0.5897517306780464
0.5837333781780638 	 0.5830665908013277
0.5830666332796921 	 0.5829955239449977


In [22]:
print(np.mean(mlp_whole_pca_train_accs), "\t", np.mean(mlp_whole_pca_test_accs))

0.5918142470161063 	 0.5897517306780464


In [25]:
len(X_whole) + len(X_test)

33746