# ENGR 510: Introduction to Non-Neural Network ML

In [1]:
import numpy as np
import matplotlib.pyplot as plt

# For importing the MNIST data set:
from sklearn.datasets import fetch_openml

# Scikit-learn PCA model:
from sklearn.decomposition import PCA

# Scikit-learn LDA model:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Scikit-learn SVM classifier:
from sklearn.svm import LinearSVC

# Scikit-learn decision tree classifier:
from sklearn.tree import DecisionTreeClassifier

First, let us import and visualize the MNIST data set by running the following code block.

**Please run the cell below. DO NOT edit this cell.**

In [None]:
# Load the MNIST data (flattened 28 x 28 digit images, one image per row).
mnist = fetch_openml("mnist_784", parser="auto")
X = np.array(mnist.data) / 255.0  # Scale the data to [0, 1]
y = np.array(mnist.target)  # Labels as loaded; later code casts them with .astype(int)

# Print some data information:
print("MNIST data loaded successfully!")
print(f"Image data: X.shape = {X.shape}")
print(f"Label data: y.shape = {y.shape}")
print()

# Plot some of the MNIST images (before centering, so pixel values are in [0, 1]).
plt.figure(figsize=(14, 7))
for i in range(15):
    plt.subplot(3, 5, i + 1)
    plt.title(f"Image {i + 1}\nLabel = {y[i]}")
    plt.imshow(X[i, :].reshape(28, 28))
    plt.colorbar()
plt.tight_layout()
plt.show()

# Mean center the data for PCA. The per-pixel mean is kept in X_mean so the
# original [0, 1] pixel values can still be recovered as X + X_mean.
X_mean = np.mean(X, axis=0)
X = X - X_mean

Notice that there are 70,000 images total, with each image being 28 x 28 pixels, or 784 pixels total.

Each image comes with a corresponding label, i.e. a number that indicates what digit is inside the image.

## Part 1: Dimensionality Reduction with PCA

In [3]:
### TODO: Perform SVD analysis.

In [4]:
### TODO: Save the first five dominant modes.
# The optional plot below iterates over A1.T and reshapes each row to 28 x 28,
# so it expects A1 to have shape (784, 5): one mode per column.
A1 = ...

# # OPTIONAL: Plot columns of V.
# plt.figure(figsize=(14, 2))
# for i, v in enumerate(A1.T):
#     plt.subplot(1, 5, i + 1)
#     plt.title(f"$v_{i + 1}$")
#     plt.imshow(v.reshape(28, 28))
#     plt.colorbar()
# plt.tight_layout()
# plt.show()

In [5]:
### TODO: Reconstruct the first MNIST image.
# Truncation ranks to try (presumably one reconstruction per value -- confirm
# against the assignment handout).
r_values = [5, 50, 100, 300]
# First row of X. X was mean-centered in the loading cell, so x is the centered
# image rather than the raw [0, 1] pixel values.
x = X[0]


In [6]:
### TODO: Plot the singular values.


### TODO: Compute the rank truncation r.
# NOTE(review): the criterion for choosing r is not stated in this notebook;
# take it from the assignment handout. (Part 2 below uses r = 350.)
A2 = ...


In [7]:
### TODO: Project data onto the 350 leading V modes.
# NOTE: Part 2 reassigns X_pca from scikit-learn's PCA(n_components=350), so the
# value computed here is overwritten once that cell runs.
X_pca = ...

### TODO: Plot projected data in 3-D.
### Once you define X_pca, uncomment and run the following plotting code!
# The plot scatters the first three columns of X_pca for the first num_plot
# samples, colored by the integer-cast label.

# num_plot = 500
# fig = plt.figure()
# ax = fig.add_subplot(projection="3d")
# sc = ax.scatter(
#     X_pca[:num_plot, 0],
#     X_pca[:num_plot, 1],
#     X_pca[:num_plot, 2],
#     c=y[:num_plot].astype(int),
#     cmap="tab10",
#     marker="o",
# )
# plt.colorbar(sc)
# plt.show()

## Part 2: MNIST Digit Classification

First, obtain the rank $r=350$ PCA-projected data.

**Please run the cell below. DO NOT edit this cell.**

In [None]:
# Use Scikit-learn PCA model to reduce the data.
# Depending on the scikit-learn version, PCA may pick a randomized SVD solver
# for data of this size. random_state makes that path reproducible and is
# ignored by the exact solvers; 1234 matches the seed used for the decision
# trees later in this notebook.
pca = PCA(n_components=350, random_state=1234)
X_pca = pca.fit_transform(X)

print(f"PCA image data: X_pca.shape = {X_pca.shape}")
print(f"Label data: y.shape = {y.shape}")

The following code has been provided to help you with digit extraction and train / test splitting.

You may use this code or write your own, but please read its documentation carefully!

In [None]:
def get_digit_data(
    X: np.ndarray,
    y: np.ndarray,
    digit_list: list,
    n_test: int,
) -> tuple:
    """
    Helper function that keeps only the samples whose label is one of the
    desired digits and returns a training / testing data split of them.

    For every digit in digit_list, the first n_test samples of that digit (in
    the order they appear in X) go to the test set and the remaining samples of
    that digit go to the training set. If a digit has fewer than n_test
    samples, all of them go to the test set.

    The rows of each returned array are ordered by their original position in
    X, NOT grouped by digit.

    Args:
        X = (n_samples, n_features) np.ndarray of data.
        y = (n_samples,) np.ndarray of corresponding labels. Must be castable
            to int with y.astype(int).
        digit_list = list of integer digits to keep.
        n_test = non-negative integer number of samples PER DIGIT to take for
            the test set.
    Returns:
        1. (n_train, n_features) np.ndarray of training data.
        2. (n_train,) np.ndarray of training data labels.
        3. (n_test_total, n_features) np.ndarray of test data, where
           n_test_total is at most n_test * (number of distinct digits).
        4. (n_test_total,) np.ndarray of test data labels.
        Labels are returned with the same dtype as y (they are not cast).
    Raises:
        ValueError: if n_test is negative.
    """
    if n_test < 0:
        # A negative value would slice from the end of each digit's samples and
        # silently produce a nonsensical split.
        raise ValueError(f"n_test must be non-negative, got {n_test}.")

    # Cast the labels once up front instead of once per requested digit.
    labels = y.astype(int)

    inds_test = np.array([], dtype=int)
    inds_train = np.array([], dtype=int)

    for digit in digit_list:
        digit_inds = np.where(labels == digit)[0]
        # np.union1d keeps the accumulated indices sorted and free of
        # duplicates, so repeating a digit in digit_list is harmless.
        inds_test = np.union1d(inds_test, digit_inds[:n_test])
        inds_train = np.union1d(inds_train, digit_inds[n_test:])

    return X[inds_train], y[inds_train], X[inds_test], y[inds_test]

# EXAMPLE: How to form a training / testing data split on the PCA projected data
# using the digits 0, 9 while setting the first 100 samples aside per digit for testing.
X_train, y_train, X_test, y_test = get_digit_data(X_pca, y, digit_list=[0, 9], n_test=100)
# Only a short preview of each label array is printed, so the two lines match
# and the output stays compact.
print(f"Training data: X_train.shape = {X_train.shape}")
print(f"Training labels: y_train.shape = {y_train.shape} y_train[:10] = {y_train[:10]}")
print(f"Testing data: X_test.shape = {X_test.shape}")
print(f"Testing labels: y_test.shape = {y_test.shape} y_test[:10] = {y_test[:10]}")

In [10]:
### Classification accuracy percentage.
def compute_accuracy(y, y_true):
    """
    Compute the percentage of computed labels that equal the true labels.

    Labels are compared element-wise exactly as given, so both arrays must use
    the same label representation (e.g. the string "3" does not equal the
    integer 3).

    Args:
        y = (n_samples,) np.ndarray of computed labels.
        y_true = (n_samples,) np.ndarray of true labels.
    Returns:
        Classification accuracy percentage as a float, between 0.0 and 100.0.
    Raises:
        ValueError: if y and y_true have different shapes, or contain no
            samples (the accuracy would be undefined).
    """
    y = np.asarray(y)
    y_true = np.asarray(y_true)

    if y.shape != y_true.shape:
        raise ValueError(
            f"y and y_true must have the same shape, got {y.shape} and {y_true.shape}."
        )
    if y.size == 0:
        raise ValueError("Cannot compute the accuracy of zero samples.")

    # The mean of the boolean match array is the fraction of correct labels.
    return float(100.0 * np.mean(y == y_true))

In [None]:
# TODO: Fit an LDA model using the digits 3, 4.
# Replace the ... placeholder with real arguments: get_digit_data takes
# (X, y, digit_list, n_test); see its example call earlier in the notebook.
# NOTE(review): n_test is not specified in this notebook -- take it from the
# assignment handout.
X_train, y_train, X_test, y_test = get_digit_data(...)

lda = LinearDiscriminantAnalysis()
# DO SOMETHING WITH lda! (presumably fit it on X_train, y_train -- see the TODO above)

# Labels computed by the model for the training and test splits.
y_train_predict = ...
y_test_predict = ...

# compute_accuracy takes (computed labels, true labels), in that order.
train_accuracy = compute_accuracy(y_train_predict, y_train)
test_accuracy = compute_accuracy(y_test_predict, y_test)

# Answer array: [training accuracy, test accuracy].
A3 = np.array([train_accuracy, test_accuracy])

In [None]:
# TODO: Fit an LDA model using the digits 3, 5, 9.
# Replace the ... placeholder with real arguments: get_digit_data takes
# (X, y, digit_list, n_test) and accepts any number of digits in digit_list.
# NOTE(review): n_test is not specified in this notebook -- take it from the
# assignment handout.
X_train, y_train, X_test, y_test = get_digit_data(...)

lda = LinearDiscriminantAnalysis()
# DO SOMETHING WITH lda! (presumably fit it on X_train, y_train -- see the TODO above)

# Labels computed by the model for the training and test splits.
y_train_predict = ...
y_test_predict = ...

# compute_accuracy takes (computed labels, true labels), in that order.
train_accuracy = compute_accuracy(y_train_predict, y_train)
test_accuracy = compute_accuracy(y_test_predict, y_test)

# Answer array: [training accuracy, test accuracy].
A4 = np.array([train_accuracy, test_accuracy])

In [None]:
# TODO: Fit an LDA model using all digit pairs.


In [None]:
# TODO: Plot the hardest digit pair and the easiest digit pair.


In [None]:
# TODO: Fit an SVM model using the digits 7, 9.
X_train, y_train, X_test, y_test = get_digit_data(...)

# Seed the solver so repeated runs report the same accuracies; 1234 matches the
# seed used for the decision trees in this notebook.
svm = LinearSVC(random_state=1234)
# DO SOMETHING WITH svm!

# Labels computed by the model for the training and test splits.
y_train_predict = ...
y_test_predict = ...

# compute_accuracy takes (computed labels, true labels), in that order.
train_accuracy = compute_accuracy(y_train_predict, y_train)
test_accuracy = compute_accuracy(y_test_predict, y_test)

# Answer array: [training accuracy, test accuracy].
A9 = np.array([train_accuracy, test_accuracy])

In [None]:
# TODO: Fit an SVM model using the digits 1, 6.
X_train, y_train, X_test, y_test = get_digit_data(...)

# Seed the solver so repeated runs report the same accuracies; 1234 matches the
# seed used for the decision trees in this notebook.
svm = LinearSVC(random_state=1234)
# DO SOMETHING WITH svm!

# Labels computed by the model for the training and test splits.
y_train_predict = ...
y_test_predict = ...

# compute_accuracy takes (computed labels, true labels), in that order.
train_accuracy = compute_accuracy(y_train_predict, y_train)
test_accuracy = compute_accuracy(y_test_predict, y_test)

# Answer array: [training accuracy, test accuracy].
A10 = np.array([train_accuracy, test_accuracy])

In [None]:
# TODO: Fit a decision tree using the digits 4, 9.
# Replace the ... placeholder with real arguments: get_digit_data takes
# (X, y, digit_list, n_test); see its example call earlier in the notebook.
# NOTE(review): n_test is not specified in this notebook -- take it from the
# assignment handout.
X_train, y_train, X_test, y_test = get_digit_data(...)

# The seed is fixed, so the tree does not change between runs.
tree = DecisionTreeClassifier(random_state=1234)
# DO SOMETHING WITH tree! (presumably fit it on X_train, y_train -- see the TODO above)

# Labels computed by the model for the training and test splits.
y_train_predict = ...
y_test_predict = ...

# compute_accuracy takes (computed labels, true labels), in that order.
train_accuracy = compute_accuracy(y_train_predict, y_train)
test_accuracy = compute_accuracy(y_test_predict, y_test)

# Answer array: [training accuracy, test accuracy].
A11 = np.array([train_accuracy, test_accuracy])

In [None]:
# TODO: Fit a decision tree using the digits 0, 1.
# Replace the ... placeholder with real arguments: get_digit_data takes
# (X, y, digit_list, n_test); see its example call earlier in the notebook.
# NOTE(review): n_test is not specified in this notebook -- take it from the
# assignment handout.
X_train, y_train, X_test, y_test = get_digit_data(...)

# The seed is fixed, so the tree does not change between runs.
tree = DecisionTreeClassifier(random_state=1234)
# DO SOMETHING WITH tree! (presumably fit it on X_train, y_train -- see the TODO above)

# Labels computed by the model for the training and test splits.
y_train_predict = ...
y_test_predict = ...

# compute_accuracy takes (computed labels, true labels), in that order.
train_accuracy = compute_accuracy(y_train_predict, y_train)
test_accuracy = compute_accuracy(y_test_predict, y_test)

# Answer array: [training accuracy, test accuracy].
A12 = np.array([train_accuracy, test_accuracy])