In [312]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = X[5600:], X[:5600], y[5600:], y[:5600]

In [313]:
'''Exercise 1: Try to build a classisier for the MNIST dataset that achieves over 97% accuracy on the test set.'''
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

#Use a Grid Search to find the best paramaters for the model
param_grid = {
    'n_neighbors':[3,4,5],
    'weights':['uniform', 'distance'],
    'metric':['minkowski', 'euclidean']
    }

knn = KNeighborsClassifier()
knn_grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', verbose=3)
knn_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END metric=minkowski, n_neighbors=3, weights=uniform;, score=0.969 total time=  23.5s
[CV 2/5] END metric=minkowski, n_neighbors=3, weights=uniform;, score=0.972 total time=  26.0s
[CV 3/5] END metric=minkowski, n_neighbors=3, weights=uniform;, score=0.968 total time=  24.3s
[CV 4/5] END metric=minkowski, n_neighbors=3, weights=uniform;, score=0.966 total time=  24.4s
[CV 5/5] END metric=minkowski, n_neighbors=3, weights=uniform;, score=0.972 total time=  21.7s
[CV 1/5] END metric=minkowski, n_neighbors=3, weights=distance;, score=0.970 total time=  21.1s
[CV 2/5] END metric=minkowski, n_neighbors=3, weights=distance;, score=0.973 total time=  27.1s
[CV 3/5] END metric=minkowski, n_neighbors=3, weights=distance;, score=0.970 total time=  31.1s
[CV 4/5] END metric=minkowski, n_neighbors=3, weights=distance;, score=0.967 total time=  24.6s
[CV 5/5] END metric=minkowski, n_neighbors=3, weights=distance;, score=0.973 tot

In [314]:
#Update model with the best paramaters
knn = KNeighborsClassifier(n_neighbors=4, weights='distance', metric='euclidean')
knn.fit(X_train, y_train)

#Calculate the Accuracy(%) on the test set
y_pred = knn.predict(X_test)
n_correct = sum(y_pred == y_test)
print(f'KNeighbors Classifier Percent Accurate: {(n_correct / len(y_pred))*100}')

KNeighbors Classifier Percent Accurate: 97.78571428571429


In [None]:
'''Exercise 2: Write a function that shifts an MNIST image in any direction by one pixel for each image in the training set,
creates four shifted copies (one per direction) then adds them to the traning set. Finally, train your best model on this expanded
training set.'''
from scipy.ndimage import shift
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

def image_shifter(df, labels):
    shifted_images = []
    shifted_labels= []
    for image, label in zip(df.values, labels):
        test_image = image.reshape(28,28)

        shifted_down = shift(test_image, [1,0], cval=0).flatten()
        shifted_images.append(shifted_down)
        shifted_labels.append(label)

        shifted_up = shift(test_image, [-1,0], cval=0).flatten()
        shifted_images.append(shifted_up)
        shifted_labels.append(label)

        shifted_right = shift(test_image, [0,1], cval=0).flatten()
        shifted_images.append(shifted_right)
        shifted_labels.append(label)
        
        shifted_left= shift(test_image, [0,-1], cval=0).flatten()
        shifted_images.append(shifted_left)
        shifted_labels.append(label)

    shifted_images_df = pd.DataFrame(shifted_images, columns=df.columns)

    updated_df = pd.concat([df, shifted_images_df], ignore_index=True)
    updated_labels = pd.concat([labels, pd.Series(shifted_labels)], ignore_index=True)

    return updated_df, updated_labels

X_augmented, y_augmented = image_shifter(X_train, y_train)

X_augmented = np.array(X_augmented)
y__augmented = np.array(y_augmented)

shuffle_idx = np.random.permutation(len(X_augmented))

X_augmented = X_augmented[shuffle_idx]
y_augmented = y_augmented[shuffle_idx]

knn = KNeighborsClassifier(n_neighbors=4, weights='distance', metric='euclidean')
knn.fit(X_augmented, y_augmented)
y_pred = knn.predict(X_test)
n_correct = sum(y_pred == y_test)
print(f'KNeighbors Classifier Percent Accurate: {(n_correct / len(y_pred))*100}')