In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits

# Load the digits dataset bundled with scikit-learn.
data = load_digits()

# Label of the first sample, to compare against the image plotted below.
print(data.target[0])

# Make grayscale the default image colormap. Setting the rc parameter directly
# (rather than calling plt.gray()) keeps the same global default but does not
# go through the "current figure", so no empty extra figure is created and
# displayed next to the digit.
plt.rc('image', cmap='gray')
plt.matshow(data.images[0])  # Plot one of the digits.
plt.savefig('digit.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Single hold-out split: half of the samples are reserved for testing, and the
# fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.5, random_state=1234
)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Fit both classifiers on the same training split (fit() returns the fitted
# estimator, so construction and training can be chained).
tree = DecisionTreeClassifier(random_state=123).fit(X_train, y_train)
bayes = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out samples with each model.
predictions_tree = tree.predict(X_test)
predictions_bayes = bayes.predict(X_test)

# Fraction of held-out samples each model labels correctly.
holdout_acc_tree = accuracy_score(y_test, predictions_tree)
holdout_acc_bayes = accuracy_score(y_test, predictions_bayes)

print(f"Decision Tree accuracy: {holdout_acc_tree:.3f}")
print(f"GaussianNB accuracy: {holdout_acc_bayes:.3f}")

## Monte Carlo cross-validation

In [None]:
n = 500  # number of random train/test splits to evaluate


def _split_accuracies(seed):
    """Score both classifiers on one random 50/50 split of the digits data.

    All intermediate objects (split arrays, fitted models, predictions) stay
    local to this function, so the variables created by the single-split
    cells earlier in the notebook are not overwritten when the loop runs.

    Parameters
    ----------
    seed : int
        ``random_state`` passed to ``train_test_split`` for this repetition.

    Returns
    -------
    tuple of (float, float)
        Test accuracy of the decision tree and of Gaussian naive Bayes,
        in that order.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(data.data,
                                              data.target,
                                              test_size=0.5,
                                              random_state=seed)

    # Fresh models for every split; the tree keeps a fixed random_state so
    # the only thing varying between repetitions is the split itself.
    tree_model = DecisionTreeClassifier(random_state=123).fit(X_tr, y_tr)
    bayes_model = GaussianNB().fit(X_tr, y_tr)

    return (accuracy_score(y_te, tree_model.predict(X_te)),
            accuracy_score(y_te, bayes_model.predict(X_te)))


accuracy_tree = []
accuracy_bayes = []

for i in range(n):
    # Same seed sequence as before (123, 124, ...), one distinct split per run.
    split_acc_tree, split_acc_bayes = _split_accuracies(123 + i)
    accuracy_tree.append(split_acc_tree)
    accuracy_bayes.append(split_acc_bayes)

In [None]:
# Average test accuracy of each classifier over all collected splits.
mean_accuracy_tree = np.mean(accuracy_tree)
mean_accuracy_bayes = np.mean(accuracy_bayes)

print(f"Decision Tree accuracy: {mean_accuracy_tree:.3f}")
print(f"GaussianNB accuracy: {mean_accuracy_bayes:.3f}")

In [None]:
import matplotlib.pyplot as plt

# Work on the current figure/axes objects explicitly; these are the same ones
# the equivalent pyplot-level calls would draw on.
fig = plt.gcf()
ax = fig.gca()

# Overlaid histograms of the per-split accuracies of both classifiers.
hist_style = dict(bins=15, alpha=0.5)  # alpha sets transparency
ax.hist(accuracy_tree, label='Decision Tree', **hist_style)
ax.hist(accuracy_bayes, label='GaussianNB', **hist_style)

# Dashed vertical line at the mean accuracy of each classifier.
mean_line_style = dict(linestyle='dashed', linewidth=2)
ax.axvline(np.mean(accuracy_tree), color='blue', **mean_line_style)
ax.axvline(np.mean(accuracy_bayes), color='orange', **mean_line_style)

ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.set_title('Accuracy')
ax.legend()

fig.savefig('histogram_dt-bayes.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Perform a Wilcoxon signed-rank test on the paired per-split accuracies
# (entry k of both lists comes from the same train/test split).
from scipy.stats import wilcoxon

# NOTE(review): all splits are resampled from the same dataset, so the paired
# differences may not be independent; treat the p-value as indicative and
# confirm whether a corrected test is needed.
stat, p = wilcoxon(accuracy_tree, accuracy_bayes)

# Scientific notation keeps very small p-values readable; a fixed-point
# format with 10 decimals would print them as 0.0000000000.
print(f"p-value: {p:.3e}")