<table align="center">
   <td align="center"><a target="_blank" href="https://colab.research.google.com/github/umbcdata602/fall2020/blob/master/lab_mnist_knn.ipynb">
<img src="http://introtodeeplearning.com/images/colab/colab.png?v2.0"  style="padding-bottom:5px;" />Run in Google Colab</a></td>
</table>

# Lab -- MNIST KNN

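Using scikit-learn's `load_digits` dataset (8-by-8 images of handwritten digits, a small MNIST-like dataset), Bonaccorso fits a k-nearest-neighbors (KNN) model and then finds the nearest neighbors of a "noisy" sample.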

* Bonaccorso Chapter 9, [knn.py](https://github.com/giuseppebonaccorso/Machine-Learning-Algorithms-Second-Edition/blob/master/Chapter09/knn.py) -- github

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Fix the global NumPy seed so the random noise drawn later is reproducible
np.random.seed(1000)

# Load the digits dataset
digits = load_digits()
pixels = digits['data']

# Center each feature on its mean; with_std=False means no variance scaling
ss = StandardScaler(with_std=False).fit(pixels)
X = ss.transform(pixels)

# Build a brute-force nearest-neighbor index (25 neighbors per query)
# over the centered samples
knn = NearestNeighbors(algorithm="brute", n_neighbors=25).fit(X)

# Create a noisy copy of one centered sample by adding zero-mean Gaussian
# noise to each of its features, then show it next to the original image.
SAMPLE_IDX = 50
NOISE_STD = 15.0  # standard deviation of the added Gaussian noise

# A single draw from the seeded global RNG; no discarded intermediate samples
X_noise = X[SAMPLE_IDX] + np.random.normal(0.0, NOISE_STD, size=X[SAMPLE_IDX].shape)

fig, ax = plt.subplots(1, 2, figsize=(4, 8))

# Left panel: the untouched image
ax[0].imshow(digits['images'][SAMPLE_IDX], cmap='gray')
ax[0].set_title('Original')
ax[0].set_xticks([])
ax[0].set_yticks([])

# Right panel: undo the centering so the noisy sample is drawn on the original
# pixel scale. inverse_transform gets a 2D (1, n_features) array, matching the
# (n_samples, n_features) layout the scaler was fitted on.
noisy_image = ss.inverse_transform(X_noise.reshape(1, -1)).reshape((8, 8))
ax[1].imshow(noisy_image, cmap='gray')
ax[1].set_title('Noisy')
ax[1].set_xticks([])
ax[1].set_yticks([])

plt.show()

# Query the fitted index for the neighbors of the noisy sample.
# kneighbors takes a 2D array, so the single sample becomes one row.
query = X_noise[np.newaxis, :]
distances, neighbors = knn.kneighbors(query, return_distance=True)

print('Distances:\n')
print(distances[0])

# Draw the neighbors on a 5x5 grid, filling it row by row in returned order
fig, ax = plt.subplots(5, 5, figsize=(8, 8))

for panel, idx in zip(ax.ravel(), neighbors[0]):
    panel.matshow(digits['images'][idx], cmap='gray')
    panel.set_xticks([])
    panel.set_yticks([])

plt.show()

# 28-by-28 MNIST

In [2]:
# Load MNIST 70K 28-by-28 pixel images with scikit-learn (less than a minute)
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# as_frame=False requests NumPy arrays. Newer scikit-learn releases otherwise
# return a pandas DataFrame/Series for this dataset, which breaks positional
# indexing such as X_train[i].
X_raw, y_raw = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

# Cast the labels to integers
y = y_raw.astype(int)

# Rescale pixels to [-1, 1] (assuming raw values lie in [0, 255]).
# Deriving X from X_raw keeps this cell idempotent when re-run.
X = ((X_raw / 255.) - .5) * 2

# Hold out 2000 samples for testing; stratify=y keeps class proportions
# the same in both splits, and random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=2000, random_state=123, stratify=y)