# 3.2 K-Nearest Neighbors

In [1]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

Load the small image data set.

In [2]:
# Open the .npz archive; its arrays are looked up by key in the next cell.
data_path = 'data/image_data.npz'
npzfile = np.load(data_path)

In [3]:
# Pull the two stored arrays out of the archive: X is used as the feature
# array and y as the label array in the cells that follow.
X, y = npzfile['arr_0'], npzfile['arr_1']

In [4]:
# Flatten every sample into a single feature vector: (n_samples, ...) ->
# (n_samples, n_features). The feature count is derived from all trailing
# axes (X.shape[1:]) rather than hard-coding three of them, so the result is
# unchanged for 4-D input while re-running this cell on the already
# flattened X is a harmless no-op instead of an IndexError.
n_features = int(np.prod(X.shape[1:]))
X = np.reshape(X, (X.shape[0], n_features))

In [5]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Hyper-parameter candidates for the grid search: k = 1..10 and two metrics.
n_neighbors = np.arange(1, 11, dtype=np.int8)
# NOTE(review): with KNeighborsClassifier's default p=2, 'minkowski' is the
# same distance as 'euclidean', so these two candidates presumably score
# identically -- confirm whether another metric (e.g. 'manhattan') was meant.
metrics = ['euclidean', 'minkowski']
param_grid = dict(n_neighbors=n_neighbors, metric=metrics)

In [7]:
# Exhaustive search over param_grid, scoring each candidate with 3-fold
# stratified cross-validation on the training split only.
knn = KNeighborsClassifier()
cv_splitter = StratifiedKFold(n_splits=3)
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    n_jobs=8,  # number of parallel workers
    cv=cv_splitter,
)
grid.fit(X_train, y_train)

# Show the winning parameter combination as the cell output.
grid.best_params_

{'metric': 'euclidean', 'n_neighbors': 7}

In [8]:
# Unpack the winning hyper-parameters found by the grid search.
best_params = grid.best_params_
best_n_neighbors = best_params['n_neighbors']
best_metric = best_params['metric']

In [9]:
# Train a fresh classifier on the full training split using the selected
# hyper-parameters. fit() returns the estimator itself, so it can be chained.
best_knn_kwargs = {'n_neighbors': best_n_neighbors, 'metric': best_metric}
clf = KNeighborsClassifier(**best_knn_kwargs).fit(X_train, y_train)
clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [10]:
# Mean accuracy of the fitted classifier on the held-out test split.
test_accuracy = clf.score(X_test, y_test)
test_accuracy

0.5

In [11]:
# Mean accuracy on the data the classifier was fitted on, for comparison
# with the test-split score.
train_accuracy = clf.score(X_train, y_train)
train_accuracy

0.46916076845298282