# Task

Try to build a classifier for the MNIST dataset that achieves over 97% accuracy on the test set.

## Loading libraries and the data

In [2]:
import sklearn
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silence every warning category for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation and convergence warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')



In [3]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the 'mnist_784' dataset from OpenML as plain arrays
# (as_frame=False), then list the fields available on the returned bunch.
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [4]:
# Separate the feature matrix from the label vector.
X = mnist['data']
y = mnist['target']

In [5]:
# Sanity check: X and y must have the same number of rows.
(X.shape, y.shape)

((70000, 784), (70000,))

## Splitting the data

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the samples for the final evaluation; the fixed seed keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Testing KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
# Baseline model: k-nearest neighbours with default hyperparameters.
# fit() returns the estimator itself, so the assignment keeps the fitted model
# and the trailing expression reproduces the cell's displayed repr.
knn_cl = KNeighborsClassifier().fit(X_train, y_train)
knn_cl

KNeighborsClassifier()

In [11]:
# Label the held-out samples with the baseline model.
y_knn_pred = knn_cl.predict(X=X_test)

In [12]:
from sklearn.metrics import accuracy_score
# Fraction of held-out samples the baseline model labels correctly.
accuracy_score(y_true=y_test, y_pred=y_knn_pred)

0.9693571428571428

## Fine-tuning the hyperparameters

In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [14]:
# Candidate hyperparameters for the k-NN search (4 x 2 x 2 = 16 combinations).
grid_params = dict(
    n_neighbors=[4, 5, 11, 19],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

In [18]:
# Randomized search over `grid_params` for the k-NN classifier, scored by
# accuracy with 3-fold cross-validation on all available cores.
# n_iter is left at its default, so only a random subset of the grid is tried
# (the fit log reports 10 candidates). random_state pins that subset so that
# Restart & Run All evaluates the same candidates and reports the same winner.
rs = RandomizedSearchCV(knn_cl,
                       grid_params,
                       scoring='accuracy',
                       n_jobs=-1,
                       cv=3,
                       verbose=1,
                       random_state=0)

In [19]:
# Run the randomized search on the training split only; the value returned by
# fit() is kept under its own name for inspecting the results below.
rs_results = rs.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [22]:
# Best mean cross-validated accuracy and the hyperparameters that produced it.
(rs_results.best_score_, rs_results.best_params_)

(0.9700714129270621,
 {'weights': 'distance', 'n_neighbors': 4, 'metric': 'euclidean'})

In [23]:
from scipy.stats import randint

In [28]:
# Narrower follow-up grid: only small neighbour counts.
grid_params = dict(n_neighbors=[2, 3, 4])

In [29]:
# Second-stage search: keep the metric and weighting that won the randomized
# search and exhaustively try the n_neighbors values in `grid_params`.
# The winners are read from rs_results.best_params_ instead of being retyped
# from an earlier cell's output, so they cannot go stale when the search is re-run.
best_rs_params = rs_results.best_params_
knn_cl_bestr = KNeighborsClassifier(metric=best_rs_params['metric'],
                                    weights=best_rs_params['weights'])
gs = GridSearchCV(knn_cl_bestr,
                 grid_params,
                 verbose=1,
                 cv=3,
                 n_jobs=-2,  # all CPU cores but one
                 scoring='accuracy')

In [30]:
# Run the exhaustive grid search on the training split only.
gs_results = gs.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [31]:
# Best mean 3-fold cross-validated accuracy among the n_neighbors candidates.
# NOTE(review): best_params_ is not displayed here, so the output does not show which n_neighbors won.
gs_results.best_score_

0.9700714129270621

## Predictions on test set

In [32]:
from sklearn.metrics import accuracy_score

In [35]:
# Keep a handle on the estimator selected by the grid search.
# NOTE(review): despite the name, this holds an estimator object, not a value of k.
best_k = gs_results.best_estimator_

In [37]:
# Final evaluation on the held-out test split.
# Predict with `best_k`, the estimator pulled out of the grid search in the
# previous cell, which was otherwise assigned and never used. With the search's
# default refit behaviour this is the same model gs_results.predict() delegates
# to, so the reported accuracy is unchanged.
test_preds = best_k.predict(X_test)
accuracy_score(y_test, test_preds)

0.9734285714285714