## Multi-class classification for the MNIST dataset

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from typing import Type
from matplotlib import pyplot
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# * References
# MNIST dataset: http://yann.lecun.com/exdb/mnist/
# Image Classification with MNIST Dataset: https://debuggercafe.com/image-classification-with-mnist-dataset/
# Improving accuracy on MNIST: https://towardsdatascience.com/improving-accuracy-on-mnist-using-data-augmentation-b5c38eb5a903

### Prepare MNIST train and test data

In [None]:
# OpenML: https://www.openml.org/search?type=data&status=active

# mnist_data = fetch_openml("Fashion-MNIST", parser="auto", version="1")

# 0 T-shirt/top
# 1 Trouser
# 2 Pullover
# 3 Dress
# 4 Coat
# 5 Sandal
# 6 Shirt
# 7 Sneaker
# 8 Bag
# 9 Ankle boot

mnist_data = fetch_openml("mnist_784", parser="auto", version="1")

# The annotations describe instances (not classes, as Type[...] would).
# NOTE(review): the target is presumably a single label column, i.e. a Series — confirm.
data: pd.DataFrame = mnist_data["data"]
target: pd.Series = mnist_data["target"]

# Scale every pixel by 1/255. This is applied to the whole dataset, before the
# train/test split, so both splits share the same scaling.
# Plain Python lists are kept on purpose: the augmentation step further down
# grows copies of these containers with list.extend().
X, y = (data / 255).to_numpy().tolist(), target.to_numpy().tolist()

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Size of the un-augmented training split, recorded before any augmentation.
origin_train_length = len(X_train)

### Plot a sample of MNIST images

In [None]:
def PlotMnistImage(image: list[list[float]], labels: list, count: int = 9):
    """Draw the first `count` images in a 3-column grid, each titled with its label.

    Args:
        image: Sequence of flattened images; each one is reshaped to 28x28.
        labels: Labels in the same order as `image`.
        count: Maximum number of panels to draw. The default of 9 gives the
            3x3 grid this function has always produced.

    When fewer than `count` images or labels are supplied, only the available
    ones are drawn instead of raising IndexError.
    """
    columns = 3
    shown = min(count, len(image), len(labels))
    # Ceiling division; at least one row so subplot() always gets a valid grid.
    rows = max(1, -(-shown // columns))

    for i in range(shown):
        image_pixels = np.array(image[i]).reshape(28, 28)
        axis = pyplot.subplot(rows, columns, i + 1)
        # Hide the tick axes: pixel coordinates add nothing for a thumbnail.
        axis.get_xaxis().set_visible(False)
        axis.get_yaxis().set_visible(False)
        axis.set_title(labels[i])
        # pyplot.gray()  # uncomment to switch to a grayscale colormap
        axis.imshow(image_pixels)
    pyplot.show()


PlotMnistImage(X_train, y_train)

### 1: SGDClassifier accuracy

Use <span style='color:lightblue'><strong>SGDClassifier</strong></span> for the <span style='color:lightblue'><strong>MNIST</strong></span> dataset and measure the <span style='color:lightblue'><strong>accuracy</strong></span> (the ratio of correct predictions) using cross-validation (cv=3).

In [None]:
# Baseline: SGDClassifier with its default hyperparameters, trained on the
# un-augmented training split.
sgd_clf = SGDClassifier(random_state=42, n_jobs=-1)
sgd_clf.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` is a second handle on `sgd_clf`.
model = sgd_clf

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the current classifier on the held-out test split.
y_predict = sgd_clf.predict(X_test)

evaluation = confusion_matrix(y_test, y_predict)
# Deliberately not named `display`: in a notebook that global would shadow
# IPython's built-in display() helper for every later cell.
cm_display = ConfusionMatrixDisplay(evaluation, display_labels=sgd_clf.classes_)
cm_display.plot()
pyplot.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class metrics for the predictions computed in the previous cell.
print(
    f"Classification report for classifier {sgd_clf}:\n"
    f"{classification_report(y_test, y_predict)}\n"
)

In [None]:
from sklearn.base import BaseEstimator


def CrossValidationAccuracy(estimator: BaseEstimator, X_train, y_train, cv=3):
    """Print the per-fold and mean cross-validated accuracy of `estimator`.

    Args:
        estimator: Estimator instance handed to cross_val_score.
        X_train: Training samples.
        y_train: Training labels.
        cv: Forwarded unchanged to cross_val_score (fold count or splitter).
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        The array of per-fold accuracy scores, so callers can compare runs
        without parsing the printed output.
    """
    scores = cross_val_score(estimator=estimator, X=X_train, y=y_train, cv=cv, scoring="accuracy")
    print(f"Cross validation accuracy: {scores}")
    print(f"Cross validation accuracy mean: {np.mean(scores)}")
    return scores


def TestingAccuracy(estimator: BaseEstimator, X_test, y_test):
    """Print the accuracy of an already-fitted estimator on a held-out set.

    Args:
        estimator: Fitted estimator instance; only its predict() is used.
        X_test: Samples to predict.
        y_test: True labels for `X_test`.

    Returns nothing on purpose: this is usually the last call in a cell, and a
    return value would be echoed as the cell output next to the printed line.
    """
    predictions = estimator.predict(X_test)
    print(f"Accuracy of test set: {accuracy_score(y_test, predictions)}")


# Baseline numbers: 3-fold CV on the training split, then the held-out test split.
CrossValidationAccuracy(sgd_clf, X_train, y_train)
TestingAccuracy(sgd_clf, X_test, y_test)

### 2: Data Augmentation

Use <span style='color:lightblue'><strong>Data Augmentation</strong></span> (artificially growing the training set) to see whether the accuracy can be improved.

Note: You may write a function that can shift an MNIST image in any direction (left, right,
up, or down) by one pixel. Then, for each image in the training set, create four shifted
copies (one per direction) and add them to the training set. Finally, train your model
on this expanded training set and measure its accuracy on the test/validation set. (You
should not allow data obtained by augmentation of the training part leak into the
test/validation set.)

Ref: You can use the <span style='color:lightblue'><strong>shift()</strong></span> function from the <span style='color:lightblue'><strong>scipy.ndimage</strong></span> module (the older <span style='color:lightblue'><strong>scipy.ndimage.interpolation</strong></span> namespace is deprecated). For
example, shift(image, [2, 1], cval=0) shifts the image two pixels down and one pixel
to the right.

In [None]:
# scipy.ndimage.interpolation namespace is deprecated => use scipy.ndimage namespace
from scipy.ndimage import shift


def shift_image(image: np.ndarray, dy, dx, shape=(28, 28)):
    """Translate a flattened image by (dy, dx) pixels and return it flattened.

    Args:
        image: Flattened image (array or plain sequence) holding
            shape[0] * shape[1] values.
        dy: Shift along rows; positive moves the content down.
        dx: Shift along columns; positive moves the content right.
        shape: 2-D shape the flat image is viewed as. Defaults to (28, 28),
            the size that used to be hard-coded.

    Returns:
        1-D array of the shifted image. Pixels shifted in from outside the
        frame are filled with 0 (mode="constant", cval=0).
    """
    # asarray lets callers pass a plain list without converting it first.
    grid = np.asarray(image).reshape(shape)
    shifted_image = shift(grid, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape(-1)


def MnistDataAugmentation(X_train: list, y_train: list, shift_bias) -> None:
    """Append four shifted copies of every image to the training lists, in place.

    For each direction (left, right, up, down, in that order) every image in
    `X_train` is shifted by `shift_bias` pixels and collected together with its
    label. All copies are then appended after the original samples, so the
    lists end up five times their original length.

    Args:
        X_train: List of flattened images; extended in place.
        y_train: List of labels aligned with `X_train`; extended in place.
        shift_bias: Shift distance in pixels, applied in each direction.

    Returns:
        None. The result is delivered by mutating the two lists.
    """
    # (dy, dx) pairs: left, right, up, down
    offsets = [(0, -shift_bias), (0, shift_bias), (-shift_bias, 0), (shift_bias, 0)]
    shifted_images = []
    shifted_labels = []

    for dy, dx in offsets:
        shifted_images.extend(shift_image(np.array(sample), dy, dx) for sample in X_train)
        shifted_labels.extend(y_train)

    # Extend only after every copy is built, so the loop above never iterates
    # over freshly appended (already shifted) samples.
    X_train.extend(shifted_images)
    y_train.extend(shifted_labels)


print(f"Before augmentation training set length: {len(X_train)}")

# Augment shallow copies: MnistDataAugmentation grows its arguments in place,
# and the original training lists must keep their size.
expanded_X_train = list(X_train)
expanded_y_train = list(y_train)
MnistDataAugmentation(expanded_X_train, expanded_y_train, shift_bias=1)

print(f"After augmentation training set length: {len(expanded_X_train)}")

#### Reorganize the training data

In [None]:
# The shifted copies were appended in blocks after the originals, so shuffle
# images and labels with one shared, seeded permutation to keep them aligned.
np.random.seed(42)
shuffle_idx = np.random.permutation(len(expanded_X_train))
X_train_augmented = np.take(np.array(expanded_X_train), shuffle_idx, axis=0)
y_train_augmented = np.take(np.array(expanded_y_train), shuffle_idx, axis=0)

In [None]:
# Same default hyperparameters as the baseline, now trained on the augmented set.
sgd_clf = SGDClassifier(random_state=42, n_jobs=-1)
model = sgd_clf.fit(X_train_augmented, y_train_augmented)

# A plain 3-fold split would put shifted copies of one source image into both
# the training and the validation fold, which inflates the CV accuracy.
# Row i of the unshuffled expanded set derives from source image
# i % origin_train_length (originals first, then one block per shift
# direction), so that index is used as the group id and every image stays in
# a single fold together with its four copies.
from sklearn.model_selection import StratifiedGroupKFold

augmentation_groups = shuffle_idx % origin_train_length
grouped_scores = cross_val_score(
    estimator=sgd_clf,
    X=X_train_augmented,
    y=y_train_augmented,
    groups=augmentation_groups,
    cv=StratifiedGroupKFold(n_splits=3),
    scoring="accuracy",
)
print(f"Cross validation accuracy: {grouped_scores}")
print(f"Cross validation accuracy mean: {np.mean(grouped_scores)}")

# The test split was never augmented, so it is evaluated exactly as before.
TestingAccuracy(sgd_clf, X_test, y_test)

### 3: Improve the performance

<span style="color:red"><strong>(Bonus)</strong></span> Is there any technique (such as <span style="color:lightblue"><strong>normalization</strong></span> or <span style="color:lightblue"><strong>hyperparameter</strong></span> tuning for SGDClassifier)
that can further improve the performance?

In [None]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     "alpha": [0.0001, 0.001, 0.01, 0.1],
#     "max_iter": [100, 500, 1000],
#     "penalty": ["l1", "l2"],
# }

# grid_search = GridSearchCV(
#     estimator=SGDClassifier(
#         random_state=42,
#         n_jobs=-1,
#     ),
#     param_grid=param_grid,
#     verbose=1,
#     cv=5,
# )

# shuffle_idx = np.random.permutation(len(X_train_augmented))
# sample_size = np.array(range(int(len(shuffle_idx))))

# random_X_train = X_train_augmented[shuffle_idx[sample_size]]
# random_y_train = y_train_augmented[shuffle_idx[sample_size]]

# grid_search.fit(X_train_augmented, y_train_augmented)

# print(f"Best score: {grid_search.best_score_}")
# print(f"Best parameters: {grid_search.best_params_}")

# best_sgd_clf = grid_search.best_estimator_
# best_sgd_clf.fit(X_train_augmented, y_train_augmented)

# TestingAccuracy(best_sgd_clf, X_test, y_test)

#### Findings

- 如果訓練資料沒有經過標準化，訓練時間會加長，且模型精確度也會比標準化過的訓練資料低
- Partial fit (增量訓練) 可以增強模型訓練的效能
- Shuffle the new training set

### 4: Confusion matrix
Use <span style="color:lightblue"><strong>the confusion matrix</strong></span> to gain insights for performance evaluation/comparison.

In [None]:
# Confusion matrix of the most recently trained classifier on the test split.
y_predict = sgd_clf.predict(X_test)

evaluation = confusion_matrix(y_test, y_predict)
# Deliberately not named `display`: in a notebook that global would shadow
# IPython's built-in display() helper.
cm_display = ConfusionMatrixDisplay(evaluation, display_labels=sgd_clf.classes_)
cm_display.plot()
pyplot.show()

In [None]:
# Per-class metrics for the predictions computed in the previous cell.
print(
    f"Classification report for classifier {sgd_clf}:\n"
    f"{classification_report(y_test, y_predict)}\n"
)