# 3.2 K-Nearest

In [1]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

Load the small image data set.

In [2]:
npzfile = np.load('../data/image_data_10.npz')

In [3]:
X = npzfile['arr_0']
y = npzfile['arr_1']

In [4]:
# reshape 
X = np.reshape(X, (X.shape[0], X.shape[1]*X.shape[2]*X.shape[3]))

In [5]:
# split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# define grid for grid search
n_neighbors_range = np.linspace(1, 10, 10, dtype=np.int8)
metrics_range = ['euclidean', 'minkowski']
param_grid = dict(n_neighbors=n_neighbors_range, metric=metrics_range)

In [7]:
# find the best parameters for the model
grid = GridSearchCV(
    KNeighborsClassifier(), 
    param_grid=param_grid,  
    cv=StratifiedKFold(n_splits=3),
    n_jobs=8,
    verbose=1000)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Memmaping (shape=(1212, 67500), dtype=float64) to new file C:\Users\flori\AppData\Local\Temp\joblib_memmaping_pool_4588_1606588853888\4588-1606589071656-2898dbb030e98a4a31c8c039fc3d6992.pkl
Pickling array (shape=(1212,), dtype=int32).
Pickling array (shape=(806,), dtype=int32).
Pickling array (shape=(406,), dtype=int32).
Memmaping (shape=(1212, 67500), dtype=float64) to old file C:\Users\flori\AppData\Local\Temp\joblib_memmaping_pool_4588_1606588853888\4588-1606589071656-2898dbb030e98a4a31c8c039fc3d6992.pkl
Pickling array (shape=(1212,), dtype=int32).
Pickling array (shape=(808,), dtype=int32).
Pickling array (shape=(404,), dtype=int32).
Memmaping (shape=(1212, 67500), dtype=float64) to old file C:\Users\flori\AppData\Local\Temp\joblib_memmaping_pool_4588_1606588853888\4588-1606589071656-2898dbb030e98a4a31c8c039fc3d6992.pkl
Pickling array (shape=(1212,), dtype=int32).
Pickling array (shape=(810,), dtype=int32).
Pickling array

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=8,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int8), 'metric': ['euclidean', 'minkowski']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1000)

In [8]:
# {'metric': 'euclidean', 'n_neighbors': 10} were the best fitting parameters in this setup
grid.best_params_

{'metric': 'euclidean', 'n_neighbors': 10}

In [9]:
# use the best parameters for the model
best_n_neighbors = grid.best_params_['n_neighbors']
best_metric = grid.best_params_['metric']

In [10]:
# fit the model with the best parameters
clf = KNeighborsClassifier(n_neighbors=best_n_neighbors, metric=best_metric)
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [11]:
# score on test data
clf.score(X_test, y_test)

0.41584158415841582

In [12]:
# score on training data
clf.score(X_train, y_train)

0.40099009900990101