In [2]:
import numpy as np
np.random.seed(42)

# 1. Import data

In [3]:
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata('MNIST original')
X, y = mnist['data'], mnist['target']

# 2. Split test set and train set

In [4]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

## 3.3 K Nearest Neighbors Classification

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict, KFold

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

## 3.4 Fine tuning weights & algorithm

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

params_grid = {
                "weights": ["uniform", "distance"],
                "algorithm": ["ball_tree", "kd_tree"]
              }


rand_search = RandomizedSearchCV(knn_clf, params_grid, n_iter=4, cv=2, scoring='accuracy', n_jobs=-1)
rand_search.fit(X_train, y_train)

# Display the results of the RandomizedSearch
cvres = rand_search.cv_results_
for mean_score, std_score, params in zip(cvres['mean_test_score'], cvres['std_test_score'], cvres['params']):
    print(np.sqrt(mean_score), np.sqrt(std_score), params)

nan 0.020496219214783288 {'weights': 'uniform', 'algorithm': 'ball_tree'}
nan 0.01303648728951759 {'weights': 'distance', 'algorithm': 'ball_tree'}
nan 0.020496219214783288 {'weights': 'uniform', 'algorithm': 'kd_tree'}
nan 0.01303648728951759 {'weights': 'distance', 'algorithm': 'kd_tree'}




Best is {'weights': 'distance', 'algorithm': 'kd_tree'}
Distance than  has a few biggger accuracy than uniform on test set: 96.71% > 96.57% and a better accuracy 00.02% vs 00.04%
And the algorithm kd_tree is faster than ball_tree  35.46013749 vs 45.2507925

## 3.5 Fine tuning n_neighbors & leaf_size

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

params_grid = {
                "n_neighbors": sp_randint(1, 20),
                "leaf_size": sp_randint(10, 90),
                "weights": ["distance"],
                "algorithm": ["kd_tree"]
              }


rand_search = RandomizedSearchCV(knn_clf, params_grid, n_iter=20, cv=3, scoring='accuracy', n_jobs=-1)
rand_search.fit(X_train, y_train)

# Display the results of the RandomizedSearch
cvres = rand_search.cv_results_
for mean_score, std_score, params in zip(cvres['mean_test_score'], cvres['std_test_score'], cvres['params']):
    print(np.sqrt(mean_score), np.sqrt(std_score), params)

The accuracy decreases when **n_neighbors** increase ; Best value would be n_neighbors = 2. **Leaf size** does not seem to impact the accuracy but the time. 