In [1]:
# Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
# Constants
game_sit_feat = ['month', 'year', 'inning', 'inning_topbot', 
                 'outs', 'strikes', 'balls', 'pitch_number', 
                 'on_1b', 'on_2b', 'on_3b', 'score_diff', 
                 'of_std', 'of_strat', 'if_std', 'if_strat', 'if_shift']
wo_recursive_feat = game_sit_feat + ['Pitcher_Tend_FF', 'Pitcher_Tend_CU', 'Pitcher_Tend_CH', 'Pitcher_Tend_SL', 'Pitcher_Tend_SI', 
                                     'Pitcher_Strike_Tend_FF', 'Pitcher_Strike_Tend_CU', 'Pitcher_Strike_Tend_CH', 'Pitcher_Strike_Tend_SL', 'Pitcher_Strike_Tend_SI', 
                                     'batter_stance', 'Strike_Tend_FF', 'Strike_Tend_CU', 'Strike_Tend_CH', 'Strike_Tend_SL', 'Strike_Tend_SI', 'Overall_Strike_Tend', 
                                     'Slug_Avg_FF', 'Slug_Avg_CU', 'Slug_Avg_CH', 'Slug_Avg_SL', 'Slug_Avg_SI', 'Overall_Slug_Avg', 
                                     'Zone_1_Strike_Pcnt', 'Zone_2_Strike_Pcnt', 'Zone_3_Strike_Pcnt', 'Zone_4_Strike_Pcnt', 'Zone_5_Strike_Pcnt', 'Zone_6_Strike_Pcnt', 'Zone_7_Strike_Pcnt', 'Zone_8_Strike_Pcnt', 'Zone_9_Strike_Pcnt', 'Zone_11_Strike_Pcnt', 'Zone_12_Strike_Pcnt', 'Zone_13_Strike_Pcnt', 'Zone_14_Strike_Pcnt']

In [3]:
# Import the data
X_whole = pd.read_csv("../data/VerlanderOnly_X_train.csv").to_numpy()
y = pd.read_csv("../data/VerlanderOnly_y_train.csv").to_numpy().T[0]

verlander_cols = np.array(pd.read_csv("../data/VerlanderOnlyDataset.csv", index_col=0).columns)
game_sit_cols = np.where(np.isin(verlander_cols, game_sit_feat))[0] - 1
wo_recursive_cols = np.where(np.isin(verlander_cols, wo_recursive_feat))[0] - 1

X_game_sit = X_whole[:, game_sit_cols]
X_wo_recurse = X_whole[:, wo_recursive_cols]

## Non-Regularized

In [4]:
# Logistic Regression - Without Recursive Features: Non-regularized (53 features)
log_reg_wo_recurse_models = []
log_reg_wo_recurse_train_accs = []
log_reg_wo_recurse_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_wo_recurse, y):
    X_train, X_test = X_wo_recurse[train_index], X_wo_recurse[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    classes, class_counts = np.unique(y_train, return_counts=True)
    class_weights = dict(zip(classes, len(y_train) / (len(classes) * class_counts)))
    
    model = LogisticRegression(max_iter=1000, class_weight=class_weights)
    
    model.fit(X_train, y_train)
    
    test_acc = model.score(X_test, y_test)
    train_acc = model.score(X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(X_test)))
    print(np.unique(model.predict(X_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    log_reg_wo_recurse_train_accs.append(train_acc)
    log_reg_wo_recurse_test_accs.append(test_acc)
    log_reg_wo_recurse_models.append(model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.32706583099968883
Test Accuracy:  0.32995555555555556
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1624 1085  965  457 1494] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.3341334400142241
Test Accuracy:  0.3319111111111111
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1585 1068 1062  432 1478] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.3322517557116188
Test Accuracy:  0.3204125177809388
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1512 1061  993  479 1579] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.32780691617032626
Test Accuracy:  0.31721194879089615
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1620 1024  994  498 1488] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.33176282336207663
Test Accuracy:  0.33908250355618774
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1620  983 1065  436 1520] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
# Logistic Regression - Game Situational Features: Non-regularized (17 features)
log_reg_game_sit_models = []
log_reg_game_sit_train_accs = []
log_reg_game_sit_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_game_sit, y):
    X_train, X_test = X_game_sit[train_index], X_game_sit[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    classes, class_counts = np.unique(y_train, return_counts=True)
    class_weights = dict(zip(classes, len(y_train) / (len(classes) * class_counts)))
    
    model = LogisticRegression(max_iter=1000, class_weight=class_weights)
    
    model.fit(X_train, y_train)
    
    test_acc = model.score(X_test, y_test)
    train_acc = model.score(X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(X_test)))
    print(np.unique(model.predict(X_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    log_reg_game_sit_train_accs.append(train_acc)
    log_reg_game_sit_test_accs.append(test_acc)
    log_reg_game_sit_models.append(model)

Train Accuracy:  0.27803707160954794
Test Accuracy:  0.2721777777777778
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 759 1137 1048 1180 1501] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 

Train Accuracy:  0.28341556651998046
Test Accuracy:  0.29084444444444446
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 734 1142 1197 1128 1424] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 

Train Accuracy:  0.2806471686372122
Test Accuracy:  0.27507112375533427
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 753 1130 1042 1118 1581] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.2803360298693217
Test Accuracy:  0.2748933143669986
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 731 1131 1102 1156 1504] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.27904702640234685
Test Accuracy:  0.2757823613086771
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 780 1053 1082 1153 1556] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 75

In [6]:
# Logistic Regression - All Features: Non-regularized (100 features)
log_reg_whole_models = []
log_reg_whole_train_accs = []
log_reg_whole_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_whole, y):
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    classes, class_counts = np.unique(y_train, return_counts=True)
    class_weights = dict(zip(classes, len(y_train) / (len(classes) * class_counts)))
    
    model = LogisticRegression(max_iter=1000, class_weight=class_weights)
    
    model.fit(X_train, y_train)
    
    test_acc = model.score(X_test, y_test)
    train_acc = model.score(X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(X_test)))
    print(np.unique(model.predict(X_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    log_reg_whole_train_accs.append(train_acc)
    log_reg_whole_test_accs.append(test_acc)
    log_reg_whole_models.append(model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.38907409876872473
Test Accuracy:  0.38204444444444446
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1515 1094 1333  240 1443] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.39360803662710586
Test Accuracy:  0.3927111111111111
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1463 1082 1403  228 1449] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.38927904702640237
Test Accuracy:  0.37357752489331436
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1389 1099 1325  272 1539] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.38959018579429283
Test Accuracy:  0.38015647226173543
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1541 1024 1328  261 1470] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.3921237443328296
Test Accuracy:  0.3869132290184922
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1496 1005 1395  286 1442] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Regularized

In [7]:
# Logistic Regression - Without Recursive Features: Regularized (53 features)
log_reg_wo_recurse_reg_models = []
log_reg_wo_recurse_reg_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_wo_recurse, y):
    X_train, X_test = X_wo_recurse[train_index], X_wo_recurse[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    classes, class_counts = np.unique(y_train, return_counts=True)
    class_weights = dict(zip(classes, len(y_train) / (len(classes) * class_counts)))
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    model = LogisticRegression(max_iter=1000, class_weight=class_weights)
    
    model.fit(scaled_X_train, y_train)
    
    test_acc = model.score(scaled_X_test, y_test)
    train_acc = model.score(scaled_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(scaled_X_test)))
    print(np.unique(model.predict(scaled_X_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    log_reg_wo_recurse_reg_test_accs.append(test_acc)
    log_reg_wo_recurse_reg_models.append(model)

Train Accuracy:  0.3296439525270036
Test Accuracy:  0.33013333333333333
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1613 1073  959  436 1544] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 

Train Accuracy:  0.3349335466951149
Test Accuracy:  0.33582222222222224
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1579 1063 1051  409 1523] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 

Train Accuracy:  0.3336741043648324
Test Accuracy:  0.3198790896159317
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1521 1061  967  468 1607] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.329140368032714
Test Accuracy:  0.32094594594594594
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1604 1010 1000  495 1515] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.3334963107831807
Test Accuracy:  0.34210526315789475
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1610  977 1075  425 1537] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758 

In [8]:
# Logistic Regression - Game Situational Features: Regularized (17 features)
log_reg_game_sit_reg_models = []
log_reg_game_sit_reg_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_game_sit, y):
    X_train, X_test = X_game_sit[train_index], X_game_sit[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    classes, class_counts = np.unique(y_train, return_counts=True)
    class_weights = dict(zip(classes, len(y_train) / (len(classes) * class_counts)))
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    model = LogisticRegression(max_iter=1000, class_weight=class_weights)
    
    model.fit(scaled_X_train, y_train)
    
    test_acc = model.score(scaled_X_test, y_test)
    train_acc = model.score(scaled_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(scaled_X_test)))
    print(np.unique(model.predict(scaled_X_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    log_reg_game_sit_reg_test_accs.append(test_acc)
    log_reg_game_sit_reg_models.append(model)

Train Accuracy:  0.27705916344401477
Test Accuracy:  0.272
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 762 1136 1046 1181 1500] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 

Train Accuracy:  0.28310441392185626
Test Accuracy:  0.29031111111111113
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 734 1146 1198 1132 1415] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 

Train Accuracy:  0.28069161703262513
Test Accuracy:  0.2748933143669986
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 754 1132 1039 1121 1578] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.28073606542803803
Test Accuracy:  0.27507112375533427
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 733 1135 1098 1155 1503] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.27918037158858566
Test Accuracy:  0.2759601706970128
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 778 1053 1082 1159 1552] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278

In [9]:
# Logistic Regression - All Features: Regularized (100 features)
log_reg_whole_reg_models = []
log_reg_whole_reg_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_whole, y):
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    classes, class_counts = np.unique(y_train, return_counts=True)
    class_weights = dict(zip(classes, len(y_train) / (len(classes) * class_counts)))
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    model = LogisticRegression(max_iter=1000, class_weight=class_weights)
    
    model.fit(scaled_X_train, y_train)
    
    test_acc = model.score(scaled_X_test, y_test)
    train_acc = model.score(scaled_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(scaled_X_test)))
    print(np.unique(model.predict(scaled_X_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    log_reg_whole_reg_test_accs.append(test_acc)
    log_reg_whole_reg_models.append(model)

Train Accuracy:  0.38840734320131576
Test Accuracy:  0.37866666666666665
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1517 1088 1302  236 1482] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.39071876250166687
Test Accuracy:  0.3893333333333333
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1458 1084 1396  226 1461] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.38856787269979554
Test Accuracy:  0.3742887624466572
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1381 1092 1345  269 1537] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.38963463418970573
Test Accuracy:  0.3758890469416785
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1536 1016 1300  266 1506] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.39105698284291934
Test Accuracy:  0.3874466571834993
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1489  987 1395  284 1469] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612] 



## PCA

In [10]:
# Logistic Regression - Without Recursive Features: PCA 95% (53 features)
log_reg_wo_recurse_pca_models = []
log_reg_wo_recurse_pca_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_wo_recurse, y):
    X_train, X_test = X_wo_recurse[train_index], X_wo_recurse[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    classes, class_counts = np.unique(y_train, return_counts=True)
    class_weights = dict(zip(classes, len(y_train) / (len(classes) * class_counts)))
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    pca = PCA(n_components=0.95, svd_solver='full')
    reduced_X_train = pca.fit_transform(scaled_X_train)
    reduced_X_test = pca.transform(scaled_X_test)
    
    model = LogisticRegression(max_iter=1000, class_weight=class_weights)
    
    model.fit(reduced_X_train, y_train)
    
    test_acc = model.score(reduced_X_test, y_test)
    print("Train Accuracy: ", model.score(reduced_X_train, y_train))
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(reduced_X_test)))
    print(np.unique(model.predict(reduced_X_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    log_reg_wo_recurse_pca_test_accs.append(test_acc)
    log_reg_wo_recurse_pca_models.append(model)

Train Accuracy:  0.31635329154998443
Test Accuracy:  0.31662222222222225
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1647 1068  913  475 1522] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 

Train Accuracy:  0.327599235453616
Test Accuracy:  0.3239111111111111
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1569 1082 1039  459 1476] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 

Train Accuracy:  0.3187394435060894
Test Accuracy:  0.3157894736842105
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1524 1069  943  469 1619] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.32416214774646634
Test Accuracy:  0.31258890469416784
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1585 1025  991  525 1498] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.3244732865143568
Test Accuracy:  0.3298364153627312
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1624  998 1014  459 1529] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758 

In [11]:
# Logistic Regression - Game Situational Features: PCA 95% (53 features)
log_reg_game_sit_pca_models = []
log_reg_game_sit_pca_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_game_sit, y):
    X_train, X_test = X_game_sit[train_index], X_game_sit[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    classes, class_counts = np.unique(y_train, return_counts=True)
    class_weights = dict(zip(classes, len(y_train) / (len(classes) * class_counts)))
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    pca = PCA(n_components=0.95, svd_solver='full')
    reduced_X_train = pca.fit_transform(scaled_X_train)
    reduced_X_test = pca.transform(scaled_X_test)
    
    model = LogisticRegression(max_iter=1000, class_weight=class_weights)
    
    model.fit(reduced_X_train, y_train)
    
    test_acc = model.score(reduced_X_test, y_test)
    print("Train Accuracy: ", model.score(reduced_X_train, y_train))
    print("Test Accuracy: ", test_acc)
    
    print("Predictions:")
    print(np.unique(model.predict(reduced_X_test)))
    print(np.unique(model.predict(reduced_X_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    log_reg_game_sit_pca_test_accs.append(test_acc)
    log_reg_game_sit_pca_models.append(model)

Train Accuracy:  0.28154865093123527
Test Accuracy:  0.2789333333333333
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 862 1199 1107 1161 1296] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 

Train Accuracy:  0.28674934435702537
Test Accuracy:  0.2869333333333333
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 771 1191 1211 1141 1311] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 

Train Accuracy:  0.2824251044537292
Test Accuracy:  0.2795163584637269
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 821 1181 1077 1105 1440] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.28162503333629657
Test Accuracy:  0.2802275960170697
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 787 1156 1142 1123 1416] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.27989154591519244
Test Accuracy:  0.28236130867709813
Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 886 1094 1105 1135 1404] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 75

In [12]:
# Logistic Regression - Whole Features: PCA 95% (53 features)
log_reg_whole_pca_models = []
log_reg_whole_pca_train_accs = []
log_reg_whole_pca_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in skf.split(X_whole, y):
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    classes, class_counts = np.unique(y_train, return_counts=True)
    class_weights = dict(zip(classes, len(y_train) / (len(classes) * class_counts)))
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    pca = PCA(n_components=0.95, svd_solver='full')
    reduced_X_train = pca.fit_transform(scaled_X_train)
    reduced_X_test = pca.transform(scaled_X_test)
    
    model = LogisticRegression(max_iter=1000, class_weight=class_weights)
    
    model.fit(reduced_X_train, y_train)
    
    test_acc = model.score(reduced_X_test, y_test)
    train_acc = model.score(reduced_X_train, y_train)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc, "\n")
    
    print("Predictions:")
    print(np.unique(model.predict(reduced_X_test)))
    print(np.unique(model.predict(reduced_X_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    log_reg_whole_pca_train_accs.append(train_acc)
    log_reg_whole_pca_test_accs.append(test_acc)
    log_reg_whole_pca_models.append(model)

Train Accuracy:  0.37613904076099036
Test Accuracy:  0.3701333333333333 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1510 1048 1306  281 1480] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 

Train Accuracy:  0.37782815486509314
Test Accuracy:  0.3758222222222222 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1483 1050 1375  275 1442] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 

Train Accuracy:  0.3746555249355498
Test Accuracy:  0.3712660028449502 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1395 1069 1315  312 1533] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.37576673482087297
Test Accuracy:  0.3712660028449502 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1538  997 1306  300 1483] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.37932260645390703
Test Accuracy:  0.38051209103840683 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[1458  998 1388  332 1448] 

Actual:
['CH' 'CU' 'FF' 'SI' 

## Ensemble

In [14]:
# Ensemble of LogReg Models
ensemble_models = []
ensemble_train_accs = []
ensemble_test_accs = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(skf.split(X_whole, y)):
    X_train, X_test = X_whole[train_index], X_whole[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    
    pca = PCA(n_components=0.95, svd_solver='full')
    reduced_X_train = pca.fit_transform(scaled_X_train)
    reduced_X_test = pca.transform(scaled_X_test)
    
    log_reg_pca_model = log_reg_whole_pca_models[fold]
    log_reg_stdz_model = log_reg_whole_reg_models[fold]
    log_reg_model = log_reg_whole_models[fold]
    
    pca_mod_probs_train = log_reg_pca_model.predict_proba(reduced_X_train)
    stdz_mod_probs_train = log_reg_stdz_model.predict_proba(scaled_X_train)
    mod_probs_train = log_reg_model.predict_proba(X_train)
    
    pca_mod_probs_test = log_reg_pca_model.predict_proba(reduced_X_test)
    stdz_mod_probs_test = log_reg_stdz_model.predict_proba(scaled_X_test)
    mod_probs_test = log_reg_model.predict_proba(X_test)
    
#     print("Agreement PCA-STDZ: ", np.mean(pca_mod_probs_test == stdz_mod_probs_test))
#     print("Agreement PCA-REGULAR: ", np.mean(pca_mod_probs_test == mod_probs_test))
#     print("Agreement STDZ-REGULAR: ", np.mean(stdz_mod_probs_test == mod_probs_test))
    
    model_prob_votes_train = np.hstack([pca_mod_probs_train, stdz_mod_probs_train, mod_probs_train])
    model_prob_votes_test = np.hstack([pca_mod_probs_test, stdz_mod_probs_test, mod_probs_test])
    
    ensemble = LogisticRegression()
    ensemble.fit(model_prob_votes_train, y_train)
    
    train_acc = ensemble.score(model_prob_votes_train, y_train) 
    test_acc = ensemble.score(model_prob_votes_test, y_test)
    print("Train Accuracy: ", train_acc)
    print("Test Accuracy: ", test_acc, "\n")
    
    print("Predictions:")
    print(np.unique(ensemble.predict(model_prob_votes_test)))
    print(np.unique(ensemble.predict(model_prob_votes_test), return_counts=True)[1], "\n")
    print("Actual:")
    print(np.unique(y_test))
    print(np.unique(y_test, return_counts=True)[1], "\n")
    
    ensemble_train_accs.append(train_acc)
    ensemble_test_accs.append(test_acc)
    ensemble_models.append(ensemble)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.5851446859581277
Test Accuracy:  0.5818666666666666 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[  54  402 4974   41  154] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  939 3277   38  613] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.5851002355869671
Test Accuracy:  0.5854222222222222 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[  90  354 4968   40  173] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3277   39  613] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.584807538447862
Test Accuracy:  0.5755689900426743 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[  83  382 4920   45  194] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy:  0.5844075028891457
Test Accuracy:  0.5782361308677099 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[  71  339 4963   43  208] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 757  939 3278   38  612] 

Train Accuracy:  0.5839185705396035
Test Accuracy:  0.5832147937411095 

Predictions:
['CH' 'CU' 'FF' 'SI' 'SL']
[  68  311 5015   39  191] 

Actual:
['CH' 'CU' 'FF' 'SI' 'SL']
[ 758  938 3278   38  612] 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
print("\t", np.mean(log_reg_whole_test_accs))

print("\t", np.mean(log_reg_wo_recurse_test_accs))

print("\t", np.mean(log_reg_game_sit_test_accs))

print("\t", np.mean(log_reg_whole_reg_test_accs))

print("\t", np.mean(log_reg_wo_recurse_reg_test_accs))

print("\t", np.mean(log_reg_game_sit_reg_test_accs))

print(np.mean(log_reg_whole_pca_train_accs), "\t", np.mean(log_reg_whole_pca_test_accs))

print("\t", np.mean(log_reg_wo_recurse_pca_test_accs))

print("\t", np.mean(log_reg_game_sit_pca_test_accs))

	 0.38308055634581956
	 0.32771472735893786
	 0.2777538043306465
	 0.381124893314367
	 0.3297771708550656
	 0.27764714398609136
0.3767424123672826 	 0.3737999304567725
	 0.31974962541488855
	 0.28159438596491226
