# ANN 5x2 cv t-test

### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras

In [2]:
pip install --upgrade xlrd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 3.6 MB/s 
[?25hInstalling collected packages: xlrd
  Attempting uninstall: xlrd
    Found existing installation: xlrd 1.1.0
    Uninstalling xlrd-1.1.0:
      Successfully uninstalled xlrd-1.1.0
Successfully installed xlrd-2.0.1


# Data Preprocessing

### Importing the dataset

In [3]:
data = pd.read_excel('Combined_Bergman.xls')
data.pop('Director')
data.pop('Year') # Does not serve as an input variale to the classification models
data.pop('COUNTRY') # Does not serve as an input variale to the classification models

X = data.iloc[: , 1:-1].values # These are the predictor, independent variables (the 1:-1 excludes the first and last columns)
y = data.iloc[: , -1].values # These are the target variables (or classes, in this case)
data.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,Total,...,POV,INS,BCU,CU,MCU,MS,MLS,LS,VLS,BERGMAN
0,10 Things I Hate About You,12,3,6,20,32,18,10,1,102,...,4.0,2.0,64,224,82,37,36,53,3,0
1,"Almost Perfect Affair, An",22,3,14,6,11,1,2,1,60,...,12.0,7.0,70,199,93,51,26,51,9,0
2,"Amityville Horror, The",23,13,29,10,18,1,18,4,116,...,7.0,18.0,138,141,50,34,41,90,6,0
3,Angela's Ashes,21,3,14,9,4,1,0,1,53,...,4.0,10.0,52,152,72,60,68,88,4,0
4,Another Thin Man,41,6,6,15,52,2,0,0,122,...,7.0,4.0,20,137,99,93,84,64,3,0


In [4]:
# print(X)

In [5]:
# print(y)

### Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 0)
# random_state: Pass an int for reproducible output across multiple function calls

### Feature Scaling

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train) # Feature scale even the one-hot-encoded variables for deep learning
X_test = sc.transform(X_test) # Not fitted to the test set to avoid information leakage
                              # Using the same scaler (fit mean and std dev) as the training data because 
                              # the test data is "unavailable"

In [8]:
# print(X_test)

In [9]:
# Feature scaling for hypothesis testing
X = sc.fit_transform(X)

# Construct Adam Scikit Learn Neural Network models

In [10]:
from sklearn.neural_network import MLPClassifier

clf_adam_1 = MLPClassifier(solver='adam', alpha=0.01,
                     hidden_layer_sizes=(28), max_iter=31)
clf_adam_2 = MLPClassifier(solver='adam', alpha=0.01,
                     hidden_layer_sizes=(28), max_iter=31)

score1 = clf_adam_1.fit(X_train, y_train).score(X_test, y_test)
score2 = clf_adam_2.fit(X_train, y_train).score(X_test, y_test)

print('22-neuron Adam model prediction accuracy: %.2f%%' % (score1*100))
print('28-neuron Adam model prediction accuracy: %.2f%%' % (score2*100))

22-neuron Adam model prediction accuracy: 76.56%
28-neuron Adam model prediction accuracy: 82.81%




# Implement the 5x2 cv paired t-test

H0: Both models have the same performance on the dataset.

H1: Both models doesn’t have the same performance on the dataset.

Paired-samples t-test:

1. If the result of the test suggests that there is insufficient evidence to reject the null hypothesis, then any observed difference in model skill is likely due to statistical chance.
2. If the result of the test suggests that there is sufficient evidence to reject the null hypothesis, then any observed difference in model skill is likely due to a difference in the models.

In [11]:
from mlxtend.evaluate import paired_ttest_5x2cv

t, p = paired_ttest_5x2cv(estimator1=clf_adam_1,
                          estimator2=clf_adam_2,
                          X=X, y=y,
                          random_seed=1)

# If p > α (0.05), we cannot reject the null hypothesis and may conclude that the performance of the two algorithms is not significantly different
print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

# interpret the result
if p <= 0.05:
	print('At a confidence of level of 0.95, the result of the test suggests that there is sufficient evidence to reject the null hypothesis. The difference between mean test auc performance is likely due to a difference in the models.')
else:
	print('At a confidence of level of 0.95, the result of the test suggests that there is insufficient evidence to reject the null hypothesis. The difference between mean test auc performance is likely due to statistical chance.')



t statistic: -0.271
p value: 0.797
At a confidence of level of 0.95, the result of the test suggests that there is insufficient evidence to reject the null hypothesis. The difference between mean test auc performance is likely due to statistical chance.




# Construct L-BFGS Scikit Learn Neural Network models

In [12]:
from sklearn.neural_network import MLPClassifier

clf_lbfgs_1 = MLPClassifier(solver='lbfgs', alpha=0.01,
                     hidden_layer_sizes=(28), max_iter=12)
clf_lbfgs_2 = MLPClassifier(solver='lbfgs', alpha=0.01,
                     hidden_layer_sizes=(28), max_iter=12)

score1 = clf_lbfgs_1.fit(X_train, y_train).score(X_test, y_test)
score2 = clf_lbfgs_2.fit(X_train, y_train).score(X_test, y_test)

print('22-neuron model prediction accuracy: %.2f%%' % (score1*100))
print('28-neuron model prediction accuracy: %.2f%%' % (score2*100))

22-neuron model prediction accuracy: 93.75%
28-neuron model prediction accuracy: 93.75%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


# Implement the 5x2 cv paired t-test

In [13]:
from mlxtend.evaluate import paired_ttest_5x2cv

t, p = paired_ttest_5x2cv(estimator1=clf_lbfgs_1,
                          estimator2=clf_lbfgs_2,
                          X=X, y=y,
                          random_seed=1)

# If p > α (0.05), we cannot reject the null hypothesis and may conclude that the performance of the two algorithms is not significantly different
print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

# interpret the result
if p <= 0.05:
	print('At a confidence of level of 0.95, the result of the test suggests that there is sufficient evidence to reject the null hypothesis. The difference between mean test auc performance is likely due to a difference in the models.')
else:
	print('At a confidence of level of 0.95, the result of the test suggests that there is insufficient evidence to reject the null hypothesis. The difference between mean test auc performance is likely due to statistical chance.')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

t statistic: 0.000
p value: 1.000
At a confidence of level of 0.95, the result of the test suggests that there is insufficient evidence to reject the null hypothesis. The difference between mean test auc performance is likely due to statistical chance.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [14]:
# Implement the 5x2 cv paired t-test

In [15]:
from mlxtend.evaluate import paired_ttest_5x2cv

t, p = paired_ttest_5x2cv(estimator1=clf_lbfgs_2,
                          estimator2=clf_adam_2,
                          X=X, y=y,
                          random_seed=1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [16]:
# If p > α (0.05), we cannot reject the null hypothesis and may conclude that the performance of the two algorithms is not significantly different
print('t statistic: %.3f' % t)
print('p value: %.3f' % p)

# interpret the result
if p <= 0.05:
	print('At a confidence of level of 0.95, the result of the test suggests that there is sufficient evidence to reject the null hypothesis. The difference between mean test auc performance is likely due to a difference in the models.')
else:
	print('At a confidence of level of 0.95, the result of the test suggests that there is insufficient evidence to reject the null hypothesis. The difference between mean test auc performance is likely due to statistical chance.')

t statistic: 2.102
p value: 0.090
At a confidence of level of 0.95, the result of the test suggests that there is insufficient evidence to reject the null hypothesis. The difference between mean test auc performance is likely due to statistical chance.
