In [None]:
import numpy as np
import sklearn.datasets
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnnotationBbox, OffsetImage

# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# Default size used for every figure created below.
plt.rcParams['figure.figsize'] = (8,8)

# MNIST data

Now let's look at a slightly larger and more interesting dataset: the MNIST dataset of handwritten digit images.

In [None]:
# Keep only every `thin_by`-th example of each split.
thin_by = 3
mnist_data = np.load('../data/mnist.npz')

# The stored feature arrays are transposed so that each row is one example
# (presumably they are stored as features x examples -- confirm against the file).
# Thin *before* casting to float: the values are identical, but only the kept
# rows are converted, and the result no longer holds a reference to a
# full-size float copy of the dataset.
mnist_train_features = mnist_data['train'].T[::thin_by].astype(float)
mnist_train_labels = mnist_data['train_labels'].flatten()[::thin_by]
mnist_test_features = mnist_data['test'].T[::thin_by].astype(float)
mnist_test_labels = mnist_data['test_labels'].flatten()[::thin_by]

Our training data is now in a $20{,}000 \times 784$ array: there are 20,000 examples, each being a 784-dimensional vector.

In [None]:
# Display the (examples, features) dimensions of the thinned training array.
mnist_train_features.shape

In [None]:
# Show the raw, flattened feature vector of the first training example.
mnist_train_features[0]

Each of these vectors is actually a 28x28 image, "flattened" into a vector. We can reshape and visualize it:

In [None]:
# Un-flatten one training example into a 28-row image and draw it in
# grayscale with no axis ticks, then save the figure next to the notebook.
fig, ax = plt.subplots()
example_image = mnist_train_features[8_000].reshape(28, -1)
ax.imshow(example_image, cmap='gray')
ax.set_xticks([])
ax.set_yticks([])
fig.savefig('three.pdf', bbox_inches='tight')

## Classification

First, we'll add some Gaussian noise to the training images, then pull the sevens and threes out of the training and testing sets.

In [None]:
# Standard deviation of the zero-mean Gaussian noise added to every training pixel.
# NOTE(review): presumably the noise keeps X.T @ X invertible (pixels that are
# constant across the training set would otherwise make it singular) -- confirm.
NOISE_SCALE = 10

# Draw the noise from a dedicated, seeded generator so the noisy features (and
# the weights fitted from them) are the same on every Restart & Run All, and so
# the draw does not depend on how much of NumPy's global random stream other
# cells have consumed.
noise_rng = np.random.default_rng(0)
noisy_mnist_train_features = mnist_train_features + NOISE_SCALE * noise_rng.normal(
    0, 1, size=mnist_train_features.shape
)

In [None]:
# Boolean masks marking the sevens and the threes in each split.
train_is_seven = (mnist_train_labels == 7)
test_is_seven = (mnist_test_labels == 7)
train_is_three = (mnist_train_labels == 3)
test_is_three = (mnist_test_labels == 3)

# Training examples are taken from the noisy features...
training_sevens, training_threes = (
    noisy_mnist_train_features[mask]
    for mask in (train_is_seven, train_is_three)
)

# ...while the test examples are taken from the original, noise-free features.
testing_sevens, testing_threes = (
    mnist_test_features[mask]
    for mask in (test_is_seven, test_is_three)
)

In [None]:
def stack_with_labels(sevens, threes):
    """Build a design matrix and 0/1 label vector from two groups of images.

    Parameters
    ----------
    sevens : array of shape (n_sevens, n_features)
        Rows that receive label 1.
    threes : array of shape (n_threes, n_features)
        Rows that receive label 0.

    Returns
    -------
    design : array of shape (n_sevens + n_threes, n_features + 1)
        The sevens stacked on top of the threes, with a leading column of
        ones (the intercept term).
    labels : array of shape (n_sevens + n_threes,)
        Ones for the seven rows followed by zeros for the three rows.
    """
    images = np.vstack([sevens, threes])
    design = np.column_stack([np.ones(len(images)), images])
    # Count labels from the stacked groups themselves so the label vector
    # always lines up with the rows of the design matrix.
    labels = np.concatenate([np.ones(len(sevens)), np.zeros(len(threes))])
    return design, labels


# Same construction for both splits: sevens are the positive class (1),
# threes are the negative class (0).
X, y = stack_with_labels(training_sevens, training_threes)
X_test, y_test = stack_with_labels(testing_sevens, testing_threes)

Now we train the model:

In [None]:
# Fit the least-squares weights w minimising ||X w - y||^2.
# lstsq solves the problem directly from X rather than forming X.T @ X:
# forming the normal equations squares the condition number, and
# np.linalg.solve raises LinAlgError if X.T @ X is singular, whereas lstsq
# returns the minimum-norm solution in that case. For a full-rank X the two
# agree up to floating-point error.
w = np.linalg.lstsq(X, y, rcond=None)[0]

And we can test it:

In [None]:
# Draw 20 test examples at random (independently, so repeats are possible)
# and show each one with the model's guess as the title.
for _draw in range(20):
    sample_idx = np.random.choice(len(X_test))
    sample = X_test[sample_idx]
    # Skip the leading intercept entry before un-flattening into a 28-row image.
    plt.matshow(sample[1:].reshape(28, -1))
    # Sevens were labelled 1 and threes 0, so a score above one half means "seven".
    if w @ sample > .5:
        prediction = 7
    else:
        prediction = 3
    plt.title(f'I think that this is a {prediction}.')
    

We can also take a look at the images that were misclassified.

In [None]:
# Threshold the scores at one half to get 0/1 predictions (1 = seven, 0 = three),
# then keep the test rows whose prediction disagrees with the true label.
predicted_labels = np.where(X_test @ w > .5, 1, 0)
is_wrong = predicted_labels != y_test
misclassified = X_test[is_wrong]
misclassified

In [None]:
# Show up to the first ten misclassified test images.
# Slicing (rather than indexing with range(10)) keeps this cell from raising
# IndexError when the classifier gets fewer than ten test images wrong.
for wrong_image in misclassified[:10]:
    # Skip the leading intercept entry before un-flattening into a 28-row image.
    plt.matshow(wrong_image[1:].reshape((28, -1)))