# Goal: build a classifier that achieves over 97% accuracy for the MNIST Dataset 

Load the data

In [2]:
import os,sys

#stats
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold \
                                    , cross_val_score ,cross_val_predict
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score \
                            ,precision_recall_curve, roc_curve, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

# data
import sklearn.datasets
from sklearn.datasets import fetch_mldata #mldata.org is down, don't use
from tensorflow.examples.tutorials.mnist import input_data

#graphs 
import matplotlib
import matplotlib.pyplot as plt

#some variables
seed = 42 #random seed
np.random.seed(seed)

#magic
%matplotlib  inline



In [3]:
# doesn't work :(
# mnist = fetch_mldata("MNIST original")
mnist = input_data.read_data_sets('MNIST_data', one_hot=False)

X_train = mnist.train.images
y_train = mnist.train.labels

X_test = mnist.test.images
y_test = mnist.test.labels

X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])

print("images shape is {} \nlabels shape is {}".format(X.shape,
                                                       y.shape
                                                      ))

#shuffle data and create test and train sets
shuffle_index = np.random.permutation(X.shape[0])

X = X[shuffle_index]
y = y[shuffle_index]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = seed
                                                   )

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
images shape is (65000, 784) 
labels shape is (65000,)


Lets start withe the SGD classifier to get a benchmark

In [17]:
sgd_clf =  SGDClassifier(random_state = seed, max_iter = 100)
scores = cross_val_score(sgd_clf, X, y, cv = 3, scoring = "accuracy")

In [20]:
print("Stochastic Gradient Descent accuracy scores: {}\n\
Stochastic Gradient Descent accuracy mean: {}\n\
Stochastic Gradient Descent accuracy devation: {}".format(
                np.round(scores, decimals = 3),
                np.round(np.mean(scores), decimals = 3),
                np.round(np.std(scores), decimals = 3)
    ))

Stochastic Gradient Descent accuracy scores: [0.912 0.91  0.919]
Stochastic Gradient Descent accuracy mean: 0.913
Stochastic Gradient Descent accuracy devation 0.004


Ok, so an accuracy of 91.3%, some distance from the target of 97%.

We can try to optimise some of its parameters with Gridsearch

In [39]:
sgd_clf = SGDClassifier(random_state = seed, max_iter = 100)
param_grid = param_grid = {
    "loss": ["log"],
    "penalty": ["l1", "l2", "elasticnet"],
    "alpha": [10 ** x for x in range(-6, 1)],
    "l1_ratio": [0, 0.01, 0.1,1],

}

clf_grid = GridSearchCV(estimator=sgd_clf, 
                        param_grid=param_grid,
                        n_jobs=-1, 
                        scoring='accuracy',
                        verbose = 10)

clf_grid.fit(X,y)

Fitting 3 folds for each of 84 candidates, totalling 252 fits
[CV] alpha=1e-06, l1_ratio=0, loss=log, penalty=l1 ...................
[CV] alpha=1e-06, l1_ratio=0, loss=log, penalty=l1 ...................
[CV] alpha=1e-06, l1_ratio=0, loss=log, penalty=l1 ...................
[CV] alpha=1e-06, l1_ratio=0, loss=log, penalty=l2 ...................
[CV]  alpha=1e-06, l1_ratio=0, loss=log, penalty=l2, score=0.859345 - 1.5min
[CV] alpha=1e-06, l1_ratio=0, loss=log, penalty=l2 ...................
[CV]  alpha=1e-06, l1_ratio=0, loss=log, penalty=l2, score=0.874412 - 1.4min
[CV] alpha=1e-06, l1_ratio=0, loss=log, penalty=l2 ...................
[CV]  alpha=1e-06, l1_ratio=0, loss=log, penalty=l1, score=0.852238 - 3.3min
[CV] alpha=1e-06, l1_ratio=0, loss=log, penalty=elasticnet ...........
[CV]  alpha=1e-06, l1_ratio=0, loss=log, penalty=l1, score=0.878750 - 3.3min
[CV] alpha=1e-06, l1_ratio=0, loss=log, penalty=elasticnet ...........
[CV]  alpha=1e-06, l1_ratio=0, loss=log, penalty=l1, score=0.8

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  3.4min


[CV]  alpha=1e-06, l1_ratio=0, loss=log, penalty=l2, score=0.880308 - 1.5min
[CV] alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l1 ................
[CV]  alpha=1e-06, l1_ratio=0, loss=log, penalty=elasticnet, score=0.859345 - 3.5min
[CV] alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l1 ................
[CV]  alpha=1e-06, l1_ratio=0, loss=log, penalty=elasticnet, score=0.874412 - 3.5min
[CV] alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l1 ................
[CV]  alpha=1e-06, l1_ratio=0, loss=log, penalty=elasticnet, score=0.880308 - 3.5min
[CV] alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l2 ................
[CV]  alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l1, score=0.852238 - 3.2min
[CV] alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l2 ................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  7.6min


[CV]  alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l2, score=0.859345 - 1.5min
[CV] alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l2 ................
[CV]  alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l2, score=0.874412 - 1.5min
[CV] alpha=1e-06, l1_ratio=0.01, loss=log, penalty=elasticnet ........
[CV]  alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l2, score=0.880308 - 1.5min
[CV] alpha=1e-06, l1_ratio=0.01, loss=log, penalty=elasticnet ........
[CV]  alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l1, score=0.878750 - 3.3min
[CV] alpha=1e-06, l1_ratio=0.01, loss=log, penalty=elasticnet ........
[CV]  alpha=1e-06, l1_ratio=0.01, loss=log, penalty=l1, score=0.885017 - 3.3min
[CV] alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l1 .................
[CV]  alpha=1e-06, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.873096 - 3.8min
[CV] alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l1 .................
[CV]  alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l1, score=0.852238 - 3.2min
[CV] al

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 13.3min


[CV]  alpha=1e-06, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.878104 - 3.9min
[CV] alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l2 .................
[CV]  alpha=1e-06, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.880216 - 3.8min
[CV] alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l2 .................
[CV]  alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l2, score=0.859345 - 1.5min
[CV] alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l2 .................
[CV]  alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l2, score=0.874412 - 1.5min
[CV] alpha=1e-06, l1_ratio=0.1, loss=log, penalty=elasticnet .........
[CV]  alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l1, score=0.878750 - 3.3min
[CV] alpha=1e-06, l1_ratio=0.1, loss=log, penalty=elasticnet .........
[CV]  alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l2, score=0.880308 - 1.5min
[CV] alpha=1e-06, l1_ratio=0.1, loss=log, penalty=elasticnet .........
[CV]  alpha=1e-06, l1_ratio=0.1, loss=log, penalty=l1, score=0.885017 - 3.3min
[CV

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 16.7min


[CV]  alpha=1e-06, l1_ratio=0.1, loss=log, penalty=elasticnet, score=0.859022 - 3.9min
[CV] alpha=1e-06, l1_ratio=1, loss=log, penalty=l1 ...................
[CV]  alpha=1e-06, l1_ratio=1, loss=log, penalty=l1, score=0.852238 - 3.2min
[CV] alpha=1e-06, l1_ratio=1, loss=log, penalty=l1 ...................
[CV]  alpha=1e-06, l1_ratio=0.1, loss=log, penalty=elasticnet, score=0.879719 - 3.8min
[CV] alpha=1e-06, l1_ratio=1, loss=log, penalty=l2 ...................
[CV]  alpha=1e-06, l1_ratio=0.1, loss=log, penalty=elasticnet, score=0.876246 - 3.8min
[CV] alpha=1e-06, l1_ratio=1, loss=log, penalty=l2 ...................
[CV]  alpha=1e-06, l1_ratio=1, loss=log, penalty=l2, score=0.859345 - 1.5min
[CV] alpha=1e-06, l1_ratio=1, loss=log, penalty=l2 ...................
[CV]  alpha=1e-06, l1_ratio=1, loss=log, penalty=l2, score=0.874412 - 1.5min
[CV] alpha=1e-06, l1_ratio=1, loss=log, penalty=elasticnet ...........
[CV]  alpha=1e-06, l1_ratio=1, loss=log, penalty=l1, score=0.878750 - 3.3min
[CV] 

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 23.2min


[CV]  alpha=1e-06, l1_ratio=1, loss=log, penalty=elasticnet, score=0.852238 - 3.2min
[CV] alpha=1e-05, l1_ratio=0, loss=log, penalty=l1 ...................
[CV]  alpha=1e-06, l1_ratio=1, loss=log, penalty=elasticnet, score=0.878750 - 3.2min
[CV] alpha=1e-05, l1_ratio=0, loss=log, penalty=l1 ...................
[CV]  alpha=1e-06, l1_ratio=1, loss=log, penalty=elasticnet, score=0.885017 - 3.3min
[CV] alpha=1e-05, l1_ratio=0, loss=log, penalty=l2 ...................
[CV]  alpha=1e-05, l1_ratio=0, loss=log, penalty=l1, score=0.891278 - 3.6min
[CV] alpha=1e-05, l1_ratio=0, loss=log, penalty=l2 ...................
[CV]  alpha=1e-05, l1_ratio=0, loss=log, penalty=l2, score=0.900600 - 1.5min
[CV] alpha=1e-05, l1_ratio=0, loss=log, penalty=l2 ...................
[CV]  alpha=1e-05, l1_ratio=0, loss=log, penalty=l2, score=0.905474 - 1.5min
[CV] alpha=1e-05, l1_ratio=0, loss=log, penalty=elasticnet ...........
[CV]  alpha=1e-05, l1_ratio=0, loss=log, penalty=l1, score=0.892643 - 3.5min
[CV] alpha=

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 29.4min


[CV]  alpha=1e-05, l1_ratio=0, loss=log, penalty=elasticnet, score=0.900600 - 3.5min
[CV] alpha=1e-05, l1_ratio=0.01, loss=log, penalty=l1 ................
[CV]  alpha=1e-05, l1_ratio=0, loss=log, penalty=elasticnet, score=0.905474 - 3.5min
[CV] alpha=1e-05, l1_ratio=0.01, loss=log, penalty=l1 ................
[CV]  alpha=1e-05, l1_ratio=0, loss=log, penalty=elasticnet, score=0.912482 - 3.5min
[CV] alpha=1e-05, l1_ratio=0.01, loss=log, penalty=l2 ................
[CV]  alpha=1e-05, l1_ratio=0.01, loss=log, penalty=l1, score=0.891278 - 3.5min
[CV] alpha=1e-05, l1_ratio=0.01, loss=log, penalty=l2 ................
[CV]  alpha=1e-05, l1_ratio=0.01, loss=log, penalty=l2, score=0.900600 - 1.5min
[CV] alpha=1e-05, l1_ratio=0.01, loss=log, penalty=l2 ................
[CV]  alpha=1e-05, l1_ratio=0.01, loss=log, penalty=l2, score=0.905474 - 1.6min
[CV] alpha=1e-05, l1_ratio=0.01, loss=log, penalty=elasticnet ........
[CV]  alpha=1e-05, l1_ratio=0.01, loss=log, penalty=l1, score=0.892643 - 3.6min

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 39.3min


[CV]  alpha=1e-05, l1_ratio=0.1, loss=log, penalty=l1, score=0.891278 - 3.6min
[CV] alpha=1e-05, l1_ratio=0.1, loss=log, penalty=l2 .................
[CV]  alpha=1e-05, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.912666 - 3.9min
[CV] alpha=1e-05, l1_ratio=0.1, loss=log, penalty=l2 .................
[CV]  alpha=1e-05, l1_ratio=0.1, loss=log, penalty=l2, score=0.900600 - 1.5min
[CV] alpha=1e-05, l1_ratio=0.1, loss=log, penalty=l2 .................
[CV]  alpha=1e-05, l1_ratio=0.1, loss=log, penalty=l2, score=0.905474 - 1.5min
[CV] alpha=1e-05, l1_ratio=0.1, loss=log, penalty=elasticnet .........
[CV]  alpha=1e-05, l1_ratio=0.1, loss=log, penalty=l1, score=0.892643 - 3.6min
[CV] alpha=1e-05, l1_ratio=0.1, loss=log, penalty=elasticnet .........
[CV]  alpha=1e-05, l1_ratio=0.1, loss=log, penalty=l2, score=0.912482 - 1.5min
[CV] alpha=1e-05, l1_ratio=0.1, loss=log, penalty=elasticnet .........
[CV]  alpha=1e-05, l1_ratio=0.1, loss=log, penalty=l1, score=0.900988 - 3.6min
[CV] alpha=1

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 46.4min


[CV]  alpha=1e-05, l1_ratio=1, loss=log, penalty=l2, score=0.900600 - 1.5min
[CV] alpha=1e-05, l1_ratio=1, loss=log, penalty=l2 ...................
[CV]  alpha=1e-05, l1_ratio=1, loss=log, penalty=l2, score=0.905474 - 1.5min
[CV] alpha=1e-05, l1_ratio=1, loss=log, penalty=elasticnet ...........
[CV]  alpha=1e-05, l1_ratio=1, loss=log, penalty=l1, score=0.892643 - 3.6min
[CV] alpha=1e-05, l1_ratio=1, loss=log, penalty=elasticnet ...........
[CV]  alpha=1e-05, l1_ratio=1, loss=log, penalty=l2, score=0.912482 - 1.5min
[CV] alpha=1e-05, l1_ratio=1, loss=log, penalty=elasticnet ...........
[CV]  alpha=1e-05, l1_ratio=1, loss=log, penalty=l1, score=0.900988 - 3.5min
[CV] alpha=0.0001, l1_ratio=0, loss=log, penalty=l1 ..................
[CV]  alpha=1e-05, l1_ratio=1, loss=log, penalty=elasticnet, score=0.891278 - 3.6min
[CV] alpha=0.0001, l1_ratio=0, loss=log, penalty=l1 ..................
[CV]  alpha=1e-05, l1_ratio=1, loss=log, penalty=elasticnet, score=0.892643 - 3.5min
[CV] alpha=0.0001, 

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 55.7min


[CV]  alpha=0.0001, l1_ratio=0, loss=log, penalty=l2, score=0.918575 - 1.5min
[CV] alpha=0.0001, l1_ratio=0.01, loss=log, penalty=l1 ...............
[CV]  alpha=0.0001, l1_ratio=0, loss=log, penalty=elasticnet, score=0.911906 - 3.5min
[CV] alpha=0.0001, l1_ratio=0.01, loss=log, penalty=l1 ...............
[CV]  alpha=0.0001, l1_ratio=0, loss=log, penalty=elasticnet, score=0.909490 - 3.4min
[CV] alpha=0.0001, l1_ratio=0.01, loss=log, penalty=l1 ...............
[CV]  alpha=0.0001, l1_ratio=0, loss=log, penalty=elasticnet, score=0.918575 - 3.4min
[CV] alpha=0.0001, l1_ratio=0.01, loss=log, penalty=l2 ...............
[CV]  alpha=0.0001, l1_ratio=0.01, loss=log, penalty=l1, score=0.907383 - 3.6min
[CV] alpha=0.0001, l1_ratio=0.01, loss=log, penalty=l2 ...............
[CV]  alpha=0.0001, l1_ratio=0.01, loss=log, penalty=l2, score=0.911906 - 1.5min
[CV] alpha=0.0001, l1_ratio=0.01, loss=log, penalty=l2 ...............
[CV]  alpha=0.0001, l1_ratio=0.01, loss=log, penalty=l2, score=0.909490 - 1.

[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 65.9min


[CV]  alpha=0.0001, l1_ratio=0.1, loss=log, penalty=l1, score=0.907383 - 3.6min
[CV] alpha=0.0001, l1_ratio=0.1, loss=log, penalty=l2 ................
[CV]  alpha=0.0001, l1_ratio=0.1, loss=log, penalty=l2, score=0.911906 - 1.5min
[CV] alpha=0.0001, l1_ratio=0.1, loss=log, penalty=l2 ................
[CV]  alpha=0.0001, l1_ratio=0.1, loss=log, penalty=l2, score=0.909490 - 1.6min
[CV] alpha=0.0001, l1_ratio=0.1, loss=log, penalty=elasticnet ........
[CV]  alpha=0.0001, l1_ratio=0.1, loss=log, penalty=l1, score=0.899935 - 3.7min
[CV] alpha=0.0001, l1_ratio=0.1, loss=log, penalty=elasticnet ........
[CV]  alpha=0.0001, l1_ratio=0.1, loss=log, penalty=l2, score=0.918575 - 1.5min
[CV] alpha=0.0001, l1_ratio=0.1, loss=log, penalty=elasticnet ........
[CV]  alpha=0.0001, l1_ratio=0.1, loss=log, penalty=l1, score=0.910451 - 3.7min
[CV] alpha=0.0001, l1_ratio=1, loss=log, penalty=l1 ..................
[CV]  alpha=0.0001, l1_ratio=0.1, loss=log, penalty=elasticnet, score=0.911721 - 3.6min
[CV] a

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 75.7min


[CV]  alpha=0.0001, l1_ratio=1, loss=log, penalty=elasticnet, score=0.907383 - 3.7min
[CV] alpha=0.001, l1_ratio=0, loss=log, penalty=l1 ...................
[CV]  alpha=0.0001, l1_ratio=1, loss=log, penalty=elasticnet, score=0.899935 - 3.6min
[CV] alpha=0.001, l1_ratio=0, loss=log, penalty=l1 ...................
[CV]  alpha=0.001, l1_ratio=0, loss=log, penalty=l1, score=0.876557 - 3.4min
[CV] alpha=0.001, l1_ratio=0, loss=log, penalty=l2 ...................
[CV]  alpha=0.0001, l1_ratio=1, loss=log, penalty=elasticnet, score=0.910451 - 3.7min
[CV] alpha=0.001, l1_ratio=0, loss=log, penalty=l2 ...................
[CV]  alpha=0.001, l1_ratio=0, loss=log, penalty=l2, score=0.906414 - 1.5min
[CV] alpha=0.001, l1_ratio=0, loss=log, penalty=l2 ...................
[CV]  alpha=0.001, l1_ratio=0, loss=log, penalty=l2, score=0.902289 - 1.5min
[CV] alpha=0.001, l1_ratio=0, loss=log, penalty=elasticnet ...........
[CV]  alpha=0.001, l1_ratio=0, loss=log, penalty=l1, score=0.880966 - 3.4min
[CV] alp

[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 87.0min


[CV]  alpha=0.001, l1_ratio=0.01, loss=log, penalty=l1, score=0.880966 - 3.5min
[CV] alpha=0.001, l1_ratio=0.01, loss=log, penalty=elasticnet ........
[CV]  alpha=0.001, l1_ratio=0.01, loss=log, penalty=l1, score=0.895956 - 3.4min
[CV] alpha=0.001, l1_ratio=0.01, loss=log, penalty=elasticnet ........
[CV]  alpha=0.001, l1_ratio=0.01, loss=log, penalty=l2, score=0.913728 - 1.5min
[CV] alpha=0.001, l1_ratio=0.1, loss=log, penalty=l1 .................
[CV]  alpha=0.001, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.906045 - 3.6min
[CV] alpha=0.001, l1_ratio=0.1, loss=log, penalty=l1 .................
[CV]  alpha=0.001, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.902289 - 3.6min
[CV] alpha=0.001, l1_ratio=0.1, loss=log, penalty=l1 .................
[CV]  alpha=0.001, l1_ratio=0.1, loss=log, penalty=l1, score=0.876557 - 3.4min
[CV] alpha=0.001, l1_ratio=0.1, loss=log, penalty=l2 .................
[CV]  alpha=0.001, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.913636 

[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 99.5min


[CV]  alpha=0.001, l1_ratio=1, loss=log, penalty=l2, score=0.902289 - 1.5min
[CV] alpha=0.001, l1_ratio=1, loss=log, penalty=elasticnet ...........
[CV]  alpha=0.001, l1_ratio=1, loss=log, penalty=l1, score=0.880966 - 3.4min
[CV] alpha=0.001, l1_ratio=1, loss=log, penalty=elasticnet ...........
[CV]  alpha=0.001, l1_ratio=1, loss=log, penalty=l1, score=0.895956 - 3.4min
[CV] alpha=0.001, l1_ratio=1, loss=log, penalty=elasticnet ...........
[CV]  alpha=0.001, l1_ratio=1, loss=log, penalty=l2, score=0.913728 - 1.5min
[CV] alpha=0.01, l1_ratio=0, loss=log, penalty=l1 ....................
[CV]  alpha=0.001, l1_ratio=1, loss=log, penalty=elasticnet, score=0.876557 - 3.4min
[CV] alpha=0.01, l1_ratio=0, loss=log, penalty=l1 ....................
[CV]  alpha=0.001, l1_ratio=1, loss=log, penalty=elasticnet, score=0.880966 - 3.4min
[CV] alpha=0.01, l1_ratio=0, loss=log, penalty=l1 ....................
[CV]  alpha=0.01, l1_ratio=0, loss=log, penalty=l1, score=0.797462 - 3.2min
[CV] alpha=0.01, l1_

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 110.5min


[CV]  alpha=0.01, l1_ratio=0.01, loss=log, penalty=l2, score=0.885187 - 1.5min
[CV] alpha=0.01, l1_ratio=0.01, loss=log, penalty=l2 .................
[CV]  alpha=0.01, l1_ratio=0.01, loss=log, penalty=l2, score=0.883596 - 1.5min
[CV] alpha=0.01, l1_ratio=0.01, loss=log, penalty=elasticnet .........
[CV]  alpha=0.01, l1_ratio=0.01, loss=log, penalty=l1, score=0.795671 - 3.2min
[CV] alpha=0.01, l1_ratio=0.01, loss=log, penalty=elasticnet .........
[CV]  alpha=0.01, l1_ratio=0.01, loss=log, penalty=l1, score=0.812777 - 3.2min
[CV] alpha=0.01, l1_ratio=0.01, loss=log, penalty=elasticnet .........
[CV]  alpha=0.01, l1_ratio=0.01, loss=log, penalty=l2, score=0.897803 - 1.5min
[CV] alpha=0.01, l1_ratio=0.1, loss=log, penalty=l1 ..................
[CV]  alpha=0.01, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.884033 - 3.4min
[CV] alpha=0.01, l1_ratio=0.1, loss=log, penalty=l1 ..................
[CV]  alpha=0.01, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.882027 - 3.4min
[CV] 

[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 123.9min


[CV]  alpha=0.01, l1_ratio=1, loss=log, penalty=l2, score=0.883596 - 1.5min
[CV] alpha=0.01, l1_ratio=1, loss=log, penalty=elasticnet ............
[CV]  alpha=0.01, l1_ratio=1, loss=log, penalty=l1, score=0.795671 - 3.2min
[CV] alpha=0.01, l1_ratio=1, loss=log, penalty=elasticnet ............
[CV]  alpha=0.01, l1_ratio=1, loss=log, penalty=l1, score=0.812777 - 3.2min
[CV] alpha=0.01, l1_ratio=1, loss=log, penalty=elasticnet ............
[CV]  alpha=0.01, l1_ratio=1, loss=log, penalty=l2, score=0.897803 - 1.5min
[CV] alpha=0.1, l1_ratio=0, loss=log, penalty=l1 .....................
[CV]  alpha=0.01, l1_ratio=1, loss=log, penalty=elasticnet, score=0.797462 - 3.2min
[CV] alpha=0.1, l1_ratio=0, loss=log, penalty=l1 .....................
[CV]  alpha=0.01, l1_ratio=1, loss=log, penalty=elasticnet, score=0.795671 - 3.2min
[CV] alpha=0.1, l1_ratio=0, loss=log, penalty=l1 .....................
[CV]  alpha=0.01, l1_ratio=1, loss=log, penalty=elasticnet, score=0.812777 - 3.2min
[CV] alpha=0.1, l1

[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 136.0min


[CV]  alpha=0.1, l1_ratio=0.01, loss=log, penalty=l1, score=0.112527 - 3.1min
[CV] alpha=0.1, l1_ratio=0.01, loss=log, penalty=elasticnet ..........
[CV]  alpha=0.1, l1_ratio=0.01, loss=log, penalty=l1, score=0.112537 - 3.1min
[CV] alpha=0.1, l1_ratio=0.01, loss=log, penalty=elasticnet ..........
[CV]  alpha=0.1, l1_ratio=0.01, loss=log, penalty=l2, score=0.865630 - 1.5min
[CV] alpha=0.1, l1_ratio=0.1, loss=log, penalty=l1 ...................
[CV]  alpha=0.1, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.841440 - 3.3min
[CV] alpha=0.1, l1_ratio=0.1, loss=log, penalty=l1 ...................
[CV]  alpha=0.1, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.845426 - 3.3min
[CV] alpha=0.1, l1_ratio=0.1, loss=log, penalty=l1 ...................
[CV]  alpha=0.1, l1_ratio=0.01, loss=log, penalty=elasticnet, score=0.860691 - 3.3min
[CV] alpha=0.1, l1_ratio=0.1, loss=log, penalty=l2 ...................
[CV]  alpha=0.1, l1_ratio=0.1, loss=log, penalty=l1, score=0.112506 - 3.0min
[CV] 

[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 149.0min


[CV]  alpha=0.1, l1_ratio=1, loss=log, penalty=elasticnet, score=0.112506 - 3.0min
[CV] alpha=1, l1_ratio=0, loss=log, penalty=l1 .......................
[CV]  alpha=0.1, l1_ratio=1, loss=log, penalty=elasticnet, score=0.112527 - 3.0min
[CV] alpha=1, l1_ratio=0, loss=log, penalty=l1 .......................
[CV]  alpha=0.1, l1_ratio=1, loss=log, penalty=elasticnet, score=0.112537 - 3.0min
[CV] alpha=1, l1_ratio=0, loss=log, penalty=l2 .......................
[CV]  alpha=1, l1_ratio=0, loss=log, penalty=l1, score=0.097877 - 3.0min
[CV] alpha=1, l1_ratio=0, loss=log, penalty=l2 .......................
[CV]  alpha=1, l1_ratio=0, loss=log, penalty=l2, score=0.771574 - 1.6min
[CV] alpha=1, l1_ratio=0, loss=log, penalty=l2 .......................
[CV]  alpha=1, l1_ratio=0, loss=log, penalty=l2, score=0.786024 - 1.6min
[CV] alpha=1, l1_ratio=0, loss=log, penalty=elasticnet ...............
[CV]  alpha=1, l1_ratio=0, loss=log, penalty=l1, score=0.112527 - 3.1min
[CV] alpha=1, l1_ratio=0, loss=lo

[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed: 163.3min


[CV]  alpha=1, l1_ratio=0.1, loss=log, penalty=l1, score=0.097877 - 3.0min
[CV] alpha=1, l1_ratio=0.1, loss=log, penalty=l2 .....................
[CV]  alpha=1, l1_ratio=0.1, loss=log, penalty=l2, score=0.771574 - 1.5min
[CV] alpha=1, l1_ratio=0.1, loss=log, penalty=l2 .....................
[CV]  alpha=1, l1_ratio=0.1, loss=log, penalty=l2, score=0.786024 - 1.6min
[CV] alpha=1, l1_ratio=0.1, loss=log, penalty=elasticnet .............
[CV]  alpha=1, l1_ratio=0.1, loss=log, penalty=l1, score=0.112527 - 3.1min
[CV] alpha=1, l1_ratio=0.1, loss=log, penalty=elasticnet .............
[CV]  alpha=1, l1_ratio=0.1, loss=log, penalty=l1, score=0.112537 - 3.1min
[CV] alpha=1, l1_ratio=0.1, loss=log, penalty=elasticnet .............
[CV]  alpha=1, l1_ratio=0.1, loss=log, penalty=l2, score=0.806730 - 1.5min
[CV] alpha=1, l1_ratio=1, loss=log, penalty=l1 .......................
[CV]  alpha=1, l1_ratio=0.1, loss=log, penalty=elasticnet, score=0.097877 - 3.1min
[CV] alpha=1, l1_ratio=1, loss=log, penal

[Parallel(n_jobs=-1)]: Done 252 out of 252 | elapsed: 174.8min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=100, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'loss': ['log'], 'penalty': ['l1', 'l2', 'elasticnet'], 'alpha': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1], 'l1_ratio': [0, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=10)

In [40]:
print("Accuracy: {}\n\
Best Params\n\
alpha: {} \n\
l1: {} \n\
penalty: {}".format(clf_grid.best_score_,
                    clf_grid.best_params_["alpha"],
                    clf_grid.best_params_["l1_ratio"],
                    clf_grid.best_params_["penalty"]
                   ))

Accuracy: 0.9133230769230769
Best Params
alpha: 0.0001 
l1: 0 
penalty: l2


Well, appears as if parameter tuning does nothing...

Lets try a few more classifiers and see what ones work best

**Random Forest**

In [45]:
rnf_clf = RandomForestClassifier(n_jobs = 2, random_state = seed)

rnf_scores = cross_val_score(rnf_clf,
                          X_train,
                          y_train,
                          cv = 3,
                          scoring = "accuracy"
                         )

In [47]:
print("Random Forest accuracy scores: {}\n\
Random Forest accuracy mean: {}\n\
Random Forest accuracy devation: {}".format(
                np.round(rnf_scores, decimals = 3),
                np.round(np.mean(rnf_scores), decimals = 3),
                np.round(np.std(rnf_scores), decimals = 3)
    ))

Random Forest accuracy scores: [0.937 0.937 0.94 ]
Random Forest accuracy mean: 0.938
Random Forest accuracy devation: 0.002


Better, by ~2% maybe this could work?

In [57]:
param_grid = {
    "n_estimators": [int(x) for x in range(200,2100,200)],
    "max_depth": [int(x) for x in range(10,110,10)],
 }

In [58]:
rmf_clf_grid = GridSearchCV(estimator=rnf_clf, 
                        param_grid=param_grid,
                        n_jobs=-1, 
                        scoring='accuracy',
                        verbose = 10)

rmf_clf_grid.fit(X_train,y_train)

Fitting 3 folds for each of 190 candidates, totalling 570 fits
[CV] max_depth=10, n_estimators=200 ..................................
[CV] max_depth=10, n_estimators=200 ..................................
[CV] max_depth=10, n_estimators=200 ..................................
[CV] max_depth=10, n_estimators=300 ..................................
[CV] ......... max_depth=10, n_estimators=200, score=0.943666 -  41.6s
[CV] max_depth=10, n_estimators=300 ..................................
[CV] ......... max_depth=10, n_estimators=200, score=0.945936 -  41.7s
[CV] max_depth=10, n_estimators=300 ..................................
[CV] ......... max_depth=10, n_estimators=200, score=0.941799 -  42.0s
[CV] max_depth=10, n_estimators=400 ..................................
[CV] ......... max_depth=10, n_estimators=300, score=0.944048 - 1.0min
[CV] max_depth=10, n_estimators=400 ..................................
[CV] ......... max_depth=10, n_estimators=300, score=0.941744 - 1.0min
[CV] max_depth

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.7min


[CV] ......... max_depth=10, n_estimators=300, score=0.946590 - 1.0min
[CV] max_depth=10, n_estimators=500 ..................................
[CV] ......... max_depth=10, n_estimators=400, score=0.944866 - 1.4min
[CV] max_depth=10, n_estimators=500 ..................................
[CV] ......... max_depth=10, n_estimators=400, score=0.941526 - 1.4min
[CV] max_depth=10, n_estimators=500 ..................................
[CV] ......... max_depth=10, n_estimators=400, score=0.946809 - 1.4min
[CV] max_depth=10, n_estimators=600 ..................................
[CV] ......... max_depth=10, n_estimators=500, score=0.944102 - 1.8min
[CV] max_depth=10, n_estimators=600 ..................................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.5min


KeyboardInterrupt: 

**SVM**

In [64]:
sv_clf = SVC(random_state = seed)

sv_scores = cross_val_score(sv_clf,
                          X_train,
                          y_train,
                          cv = 3,
                          scoring = "accuracy",
                          verbose = 10
                         )

[CV]  ................................................................
[CV] ....................... , score=0.9318318154550908, total= 7.5min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.5min remaining:    0.0s


[CV] ........................ , score=0.930780559646539, total= 7.5min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 15.0min remaining:    0.0s


[CV] ....................... , score=0.9353518821603928, total= 7.8min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 22.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 22.8min finished


In [65]:
print("Support Vector Classifer scores: {}\n\
Support Vector Classifer mean: {}\n\
Support Vector Classifer devation: {}".format(
                np.round(sv_scores, decimals = 3),
                np.round(np.mean(sv_scores), decimals = 3),
                np.round(np.std(sv_scores), decimals = 3)
    ))

Support Vector Classifer scores: [0.932 0.931 0.935]
Support Vector Classifer mean: 0.933
Support Vector Classifer devation: 0.002


In [66]:
lsv_clf = LinearSVC(random_state = seed)

lsv_scores = cross_val_score(lsv_clf,
                          X_train,
                          y_train,
                          cv = 3,
                          scoring = "accuracy",
                          verbose = 10
                         )

[CV]  ................................................................
[CV] ....................... , score=0.9072912690189235, total=  56.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   56.0s remaining:    0.0s


[CV] ....................... , score=0.9062892052582774, total=  57.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.9min remaining:    0.0s


[CV] ........................ , score=0.911620294599018, total=  59.7s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.9min finished


In [67]:
print("Linear Support Vector Classifer scores: {}\n\
Linear Support Vector Classifer mean: {}\n\
Linear Support Vector Classifer devation: {}".format(
                np.round(lsv_scores, decimals = 3),
                np.round(np.mean(lsv_scores), decimals = 3),
                np.round(np.std(lsv_scores), decimals = 3)
    ))

Support Vector Classifer scores: [0.907 0.906 0.912]
Support Vector Classifer mean: 0.908
Support Vector Classifer devation: 0.002


In [75]:
rnn_clf = RadiusNeighborsClassifier()
param_grid = {
    "radius": [x for x in range(10,100,10)]
 }
               
rnn_clf_grid = GridSearchCV(estimator=rnn_clf, 
                        param_grid=param_grid,
                        n_jobs=-1, 
                        scoring='accuracy',
                        verbose = 10)

rnn_scores = cross_val_score(rnn_clf_grid,
                          X_train,
                          y_train,
                          cv = 3,
                          scoring = "accuracy",
                          verbose = 10
                         )

[CV]  ................................................................
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] radius=10 .......................................................
[CV] radius=10 .......................................................
[CV] radius=10 .......................................................
[CV] radius=20 .......................................................
[CV] .............................. radius=10, score=0.421139 -29.2min
[CV] radius=20 .......................................................
[CV] .............................. radius=10, score=0.433459 -29.4min
[CV] radius=20 .......................................................
[CV] .............................. radius=20, score=0.112375 -17.2min
[CV] radius=30 .......................................................
[CV] .............................. radius=30, score=0.112320 - 4.6min
[CV] radius=30 .......................................................
[CV] ............

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 55.5min


[CV] .............................. radius=30, score=0.112375 - 4.4min
[CV] radius=40 .......................................................
[CV] .............................. radius=40, score=0.112320 - 4.5min
[CV] radius=40 .......................................................
[CV] .............................. radius=40, score=0.112348 - 4.4min
[CV] radius=40 .......................................................
[CV] .............................. radius=40, score=0.112375 - 4.4min
[CV] radius=50 .......................................................
[CV] .............................. radius=50, score=0.112320 - 4.5min
[CV] radius=50 .......................................................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 77.6min


[CV] .............................. radius=50, score=0.112348 - 4.5min
[CV] radius=50 .......................................................
[CV] .............................. radius=50, score=0.112375 - 4.4min
[CV] radius=60 .......................................................
[CV] .............................. radius=60, score=0.112320 - 4.5min
[CV] radius=60 .......................................................
[CV] .............................. radius=60, score=0.112348 - 4.4min
[CV] radius=60 .......................................................
[CV] .............................. radius=60, score=0.112375 - 4.4min
[CV] radius=70 .......................................................
[CV] .............................. radius=70, score=0.112320 - 4.3min
[CV] radius=70 .......................................................
[CV] .............................. radius=70, score=0.112348 - 4.3min
[CV] radius=70 .......................................................


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 108.4min


[CV] .............................. radius=70, score=0.112375 - 4.3min
[CV] radius=80 .......................................................
[CV] .............................. radius=80, score=0.112320 - 4.3min
[CV] radius=80 .......................................................
[CV] .............................. radius=80, score=0.112348 - 4.4min
[CV] radius=80 .......................................................
[CV] .............................. radius=80, score=0.112375 - 4.4min
[CV] radius=90 .......................................................
[CV] .............................. radius=90, score=0.112320 - 4.4min
[CV] radius=90 .......................................................
[CV] .............................. radius=90, score=0.112348 - 4.3min
[CV] radius=90 .......................................................


[Parallel(n_jobs=-1)]: Done  23 out of  27 | elapsed: 134.4min remaining: 23.4min


[CV] .............................. radius=90, score=0.112375 - 4.3min


Process ForkPoolWorker-51:
Traceback (most recent call last):
  File "/home/edd/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/edd/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/edd/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/edd/.local/share/virtualenvs/ml_projects-PtpjDyvs/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 362, in get
    return recv()
  File "/home/edd/anaconda3/lib/python3.6/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/edd/anaconda3/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)


KeyboardInterrupt: 

In [None]:
print("Radial Neighbors Classifer scores: {}\n\
Radial Neighbors Classifer mean: {}\n\
Radial Neighbors Classifer devation: {}".format(
                np.round(rnn_scores, decimals = 3),
                np.round(np.mean(rnn_scores), decimals = 3),
                np.round(np.std(rnn_scores), decimals = 3)
    ))

In [4]:
knn_clf = KNeighborsClassifier()

In [None]:


knn_scores = cross_val_score(knn_clf,
                          X_train,
                          y_train,
                          cv = 3,
                          scoring = "accuracy",
                          verbose = 10
                         )

[CV]  ................................................................


In [77]:
print("K Neighbors Classifer scores: {}\n\
K Neighbors Classifer mean: {}\n\
K Neighbors Classifer devation: {}".format(
                np.round(knn_scores, decimals = 3),
                np.round(np.mean(knn_scores), decimals = 3),
                np.round(np.std(knn_scores), decimals = 3)
    ))

K Neighbors Classifer scores: [0.966 0.966 0.967]
K Neighbors Classifer mean: 0.966
K Neighbors Classifer devation: 0.0


oooooh so close, i think we have a winner so far though....

In [12]:
k_range = range(1, 15)
weight_options = ['uniform', 'distance']
leaf_range = range(20,40)
param_dist = {"n_neighbors" : k_range,
              "weights" : weight_options,
              "leaf_size": leaf_range
             }

knn_rand = RandomizedSearchCV(knn_clf,
                          param_dist,
                          cv=3,
                          scoring='accuracy',
                          n_iter=10,
                          random_state=seed,
                          n_jobs = 2,
                          verbose = 10)

knn_rand.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] weights=distance, n_neighbors=3, leaf_size=36 ...................
[CV] weights=distance, n_neighbors=3, leaf_size=36 ...................
[CV]  weights=distance, n_neighbors=3, leaf_size=36, score=0.968018 - 7.7min
[CV] weights=distance, n_neighbors=3, leaf_size=36 ...................


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  7.7min


[CV]  weights=distance, n_neighbors=3, leaf_size=36, score=0.966515 - 7.7min
[CV] weights=distance, n_neighbors=3, leaf_size=32 ...................
[CV]  weights=distance, n_neighbors=3, leaf_size=36, score=0.965313 - 7.7min
[CV] weights=distance, n_neighbors=3, leaf_size=32 ...................
[CV]  weights=distance, n_neighbors=3, leaf_size=32, score=0.966515 - 7.7min
[CV] weights=distance, n_neighbors=3, leaf_size=32 ...................


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed: 15.4min


[CV]  weights=distance, n_neighbors=3, leaf_size=32, score=0.965313 - 7.7min
[CV] weights=distance, n_neighbors=5, leaf_size=26 ...................
[CV]  weights=distance, n_neighbors=3, leaf_size=32, score=0.968018 - 7.7min
[CV] weights=distance, n_neighbors=5, leaf_size=26 ...................
[CV]  weights=distance, n_neighbors=5, leaf_size=26, score=0.966185 - 9.6min
[CV] weights=distance, n_neighbors=5, leaf_size=26 ...................
[CV]  weights=distance, n_neighbors=5, leaf_size=26, score=0.968282 - 9.6min
[CV] weights=uniform, n_neighbors=2, leaf_size=23 ....................
[CV]  weights=distance, n_neighbors=5, leaf_size=26, score=0.965181 - 9.6min
[CV] weights=uniform, n_neighbors=2, leaf_size=23 ....................


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 42.4min


[CV]  weights=uniform, n_neighbors=2, leaf_size=23, score=0.957880 - 9.6min
[CV] weights=uniform, n_neighbors=2, leaf_size=23 ....................
[CV]  weights=uniform, n_neighbors=2, leaf_size=23, score=0.958589 - 9.6min
[CV] weights=uniform, n_neighbors=13, leaf_size=31 ...................
[CV]  weights=uniform, n_neighbors=2, leaf_size=23, score=0.957069 - 9.6min
[CV] weights=uniform, n_neighbors=13, leaf_size=31 ...................
[CV]  weights=uniform, n_neighbors=13, leaf_size=31, score=0.955837 - 8.6min
[CV] weights=uniform, n_neighbors=13, leaf_size=31 ...................
[CV]  weights=uniform, n_neighbors=13, leaf_size=31, score=0.959314 - 8.6min
[CV] weights=uniform, n_neighbors=1, leaf_size=25 ....................


[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 60.6min


[CV]  weights=uniform, n_neighbors=13, leaf_size=31, score=0.958520 - 8.2min
[CV] weights=uniform, n_neighbors=1, leaf_size=25 ....................
[CV]  weights=uniform, n_neighbors=1, leaf_size=25, score=0.965329 -10.1min
[CV] weights=uniform, n_neighbors=1, leaf_size=25 ....................
[CV]  weights=uniform, n_neighbors=1, leaf_size=25, score=0.965381 - 9.6min
[CV] weights=uniform, n_neighbors=11, leaf_size=29 ...................
[CV]  weights=uniform, n_neighbors=1, leaf_size=25, score=0.965247 - 9.6min
[CV] weights=uniform, n_neighbors=11, leaf_size=29 ...................
[CV]  weights=uniform, n_neighbors=11, leaf_size=29, score=0.957551 - 9.6min
[CV] weights=uniform, n_neighbors=11, leaf_size=29 ...................
[CV]  weights=uniform, n_neighbors=11, leaf_size=29, score=0.960963 - 9.6min
[CV] weights=uniform, n_neighbors=9, leaf_size=30 ....................
[CV]  weights=uniform, n_neighbors=9, leaf_size=30, score=0.959001 - 7.7min
[CV] weights=uniform, n_neighbors=9, le

[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 97.7min


[CV]  weights=uniform, n_neighbors=11, leaf_size=29, score=0.960367 - 9.7min
[CV] weights=uniform, n_neighbors=9, leaf_size=30 ....................
[CV]  weights=uniform, n_neighbors=9, leaf_size=30, score=0.962348 - 7.8min
[CV] weights=distance, n_neighbors=9, leaf_size=23 ...................
[CV]  weights=uniform, n_neighbors=9, leaf_size=30, score=0.960960 - 7.8min
[CV] weights=distance, n_neighbors=9, leaf_size=23 ...................
[CV]  weights=distance, n_neighbors=9, leaf_size=23, score=0.960583 - 9.7min
[CV] weights=distance, n_neighbors=9, leaf_size=23 ...................
[CV]  weights=distance, n_neighbors=9, leaf_size=23, score=0.963864 - 9.7min
[CV] weights=uniform, n_neighbors=8, leaf_size=38 ....................
[CV]  weights=uniform, n_neighbors=8, leaf_size=38, score=0.961110 - 7.7min
[CV] weights=uniform, n_neighbors=8, leaf_size=38 ....................
[CV]  weights=distance, n_neighbors=9, leaf_size=23, score=0.962147 - 9.7min
[CV] weights=uniform, n_neighbors=8, l

[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed: 132.5min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params={}, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'n_neighbors': range(1, 15), 'weights': ['uniform', 'distance'], 'leaf_size': range(20, 40)},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          scoring='accuracy', verbose=10)

In [13]:
print("Accuracy: {}\n\
Best params: {} \n\
Best estimator {}:".format(knn_rand.best_score_,
                           knn_rand.best_params_,
                           knn_rand.best_estimator_))
      

Accuracy: 0.9666153846153847
Best params: {'weights': 'distance', 'n_neighbors': 3, 'leaf_size': 36} 
Best estimator KNeighborsClassifier(algorithm='auto', leaf_size=36, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='distance'):


NICE, just over the 97%

more tuning with gridsearch might get us a bit further...

In [14]:
param_dist = {"n_neighbors" : [x for x in range(3, 6,1)],
              "weights" : ["distance"],
              "leaf_size" : [x for x in range(25,40,5)]
             }

knn_grid = GridSearchCV(knn_clf,
                          param_dist,
                          cv=3,
                          scoring='accuracy',
                          n_jobs  = 2,
                          verbose = 10)

knn_grid.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] leaf_size=25, n_neighbors=3, weights=distance ...................
[CV] leaf_size=25, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=25, n_neighbors=3, weights=distance, score=0.966515 - 9.7min
[CV] leaf_size=25, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=25, n_neighbors=3, weights=distance, score=0.968018 - 9.7min


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  9.7min


[CV] leaf_size=25, n_neighbors=4, weights=distance ...................
[CV]  leaf_size=25, n_neighbors=3, weights=distance, score=0.965313 - 9.7min
[CV] leaf_size=25, n_neighbors=4, weights=distance ...................
[CV]  leaf_size=25, n_neighbors=4, weights=distance, score=0.967306 - 9.7min
[CV] leaf_size=25, n_neighbors=4, weights=distance ...................


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed: 19.4min


[CV]  leaf_size=25, n_neighbors=4, weights=distance, score=0.966170 - 9.7min
[CV] leaf_size=25, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=25, n_neighbors=4, weights=distance, score=0.968546 - 9.7min
[CV] leaf_size=25, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=25, n_neighbors=5, weights=distance, score=0.966185 - 9.7min
[CV] leaf_size=25, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=25, n_neighbors=5, weights=distance, score=0.968282 - 9.8min
[CV] leaf_size=30, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=3, weights=distance, score=0.966515 - 7.7min
[CV] leaf_size=30, n_neighbors=3, weights=distance ...................


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 46.5min


[CV]  leaf_size=25, n_neighbors=5, weights=distance, score=0.965181 - 9.7min
[CV] leaf_size=30, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=3, weights=distance, score=0.968018 - 7.8min
[CV] leaf_size=30, n_neighbors=4, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=3, weights=distance, score=0.965313 - 7.8min
[CV] leaf_size=30, n_neighbors=4, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=4, weights=distance, score=0.967306 - 7.7min
[CV] leaf_size=30, n_neighbors=4, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=4, weights=distance, score=0.968546 - 7.7min
[CV] leaf_size=30, n_neighbors=5, weights=distance ...................


[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 64.0min


[CV]  leaf_size=30, n_neighbors=4, weights=distance, score=0.966170 - 7.7min
[CV] leaf_size=30, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=5, weights=distance, score=0.966185 - 7.8min
[CV] leaf_size=30, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=5, weights=distance, score=0.968282 - 7.8min
[CV] leaf_size=35, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=30, n_neighbors=5, weights=distance, score=0.965181 - 7.8min
[CV] leaf_size=35, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=35, n_neighbors=3, weights=distance, score=0.966515 - 7.7min
[CV] leaf_size=35, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=35, n_neighbors=3, weights=distance, score=0.968018 - 7.8min
[CV] leaf_size=35, n_neighbors=4, weights=distance ...................
[CV]  leaf_size=35, n_neighbors=3, weights=distance, score=0.965313 - 7.8min
[CV] leaf_size=35, n_neighbors=4, w

[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 93.0min


[CV]  leaf_size=35, n_neighbors=4, weights=distance, score=0.967306 - 7.8min
[CV] leaf_size=35, n_neighbors=4, weights=distance ...................
[CV]  leaf_size=35, n_neighbors=4, weights=distance, score=0.968546 - 7.7min
[CV] leaf_size=35, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=35, n_neighbors=4, weights=distance, score=0.966170 - 7.8min
[CV] leaf_size=35, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=35, n_neighbors=5, weights=distance, score=0.966185 - 7.8min
[CV] leaf_size=35, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=35, n_neighbors=5, weights=distance, score=0.968282 - 7.8min
[CV]  leaf_size=35, n_neighbors=5, weights=distance, score=0.965181 - 7.4min


[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed: 115.9min remaining:    0.0s
[Parallel(n_jobs=2)]: Done  27 out of  27 | elapsed: 115.9min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'n_neighbors': [3, 4, 5], 'weights': ['distance'], 'leaf_size': [25, 30, 35]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=10)

In [18]:
print("Accuracy: {}\n\
Best params: {} \n\
Best estimator {}:".format(knn_grid.best_score_,
                           knn_grid.best_params_,
                           knn_grid.best_estimator_))
      

Accuracy: 0.9673406593406594
Best params: {'leaf_size': 25, 'n_neighbors': 4, 'weights': 'distance'} 
Best estimator KNeighborsClassifier(algorithm='auto', leaf_size=25, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance'):


In [20]:
y_pred = knn_grid.predict(X_test)

In [21]:
print("Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))

Test set accuracy: 0.973948717948718


ok, that was suprisingly good, lets see what the optimum answer is 

In [4]:
knn_clf = KNeighborsClassifier(leaf_size = 30, 
                               n_neighbors = 4, 
                               weights = "distance"
                              )

knn_scores = cross_val_score(knn_clf,
                          X_train,
                          y_train,
                          cv = 3,
                          scoring = "accuracy",
                          verbose = 10
                         )

[CV]  ................................................................
[CV] ....................... , score=0.9686222808174028, total= 7.8min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.8min remaining:    0.0s


[CV] ....................... , score=0.9667040284828905, total= 7.9min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 15.7min remaining:    0.0s


[CV] ....................... , score=0.9692672953900943, total= 8.5min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 24.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 24.2min finished


In [28]:
print("CV accuracy: {} \ :".format(knn_scores))


CV accuracy: [0.96730604 0.96854599 0.96616988] \ :


In [7]:
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance')

In [8]:
y_pred = knn_clf.predict(X_test)

In [9]:
print("Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))

Test set accuracy: 0.9707179487179487


Whooo, better that the optimum!