## K-Nearest Neighbors Classifier Testing
In this notebook, we test a K-Nearest Neighbors classifier from scikit-learn for image classification of the sea lion chips.

In [1]:
import pathlib
import pickle
import random

import cv2
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

##### Retrieve the image arrays and labels via pickle. Assign the arrays to X (our features) and the labels to y (our target classes); both are split into training and test sets in the next step

In [2]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.

# Feature arrays, one entry per image chip.
with pathlib.Path('image_arrays.pkl').open('rb') as arrays_file:
    X = pickle.load(arrays_file)

# Integer class labels, in the same order as the arrays above
# (presumably aligned one-to-one with X — TODO confirm against the script that wrote them).
with pathlib.Path('image_labels.pkl').open('rb') as labels_file:
    y = pickle.load(labels_file)

##### Create train test split

In [3]:
# Hold out 12% of the samples for testing; the fixed random_state keeps the
# shuffled split reproducible across runs.
# NOTE(review): the split is not stratified; with imbalanced classes consider
# passing stratify=y — confirm before changing, as it alters every score below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.12, random_state=42, shuffle=True
)

##### Initialize the classifier and fit to the training data:
- Some testing indicated that n_neighbors = 6 was the ideal number of neighbors

In [17]:
# Short tag that identifies this model in the results table built at the end
# of the notebook.
model = 'knn'

# k = 6 neighbors; every other hyperparameter is left at its default.
knn_clf = KNeighborsClassifier(n_neighbors=6)
knn_clf.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                     weights='uniform')

##### Predict on a subset of the test data

In [5]:
# Print the label-code legend so the integer classes below are readable.
print(
    "Label codes:  {'adult_females': 0, 'adult_males': 1, 'juveniles': 2, 'pups': 3, 'subadult_males': 4}"
)

# Compare predicted and true classes for the first 20 held-out samples.
preview = slice(0, 20)
print("Predictions: ", knn_clf.predict(X_test[preview]))
print("Actual classes: ", y_test[preview])

Label codes:  {'adult_females': 0, 'adult_males': 1, 'juveniles': 2, 'pups': 3, 'subadult_males': 4}
Predictions:  [3 0 0 2 0 0 2 3 0 2 3 3 3 2 2 0 0 3 0 3]
Actual classes:  [3, 0, 0, 2, 0, 1, 1, 3, 1, 2, 3, 3, 3, 0, 0, 0, 0, 3, 0, 3]


##### Create confusion matrix - good explanation here if unfamiliar: https://www.geeksforgeeks.org/confusion-matrix-machine-learning/

In [7]:
# Out-of-fold predictions for the training set using 3-fold cross-validation.
# NOTE(review): this uses cv=3 while the mean-accuracy cell uses cv=5, so the
# F1 scores (derived from these predictions) and the accuracy come from
# different fold counts — confirm whether that is intentional.
y_train_pred = cross_val_predict(estimator=knn_clf, X=X_train, y=y_train, cv=3)

# Rows are the true classes and columns the predicted classes.
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
conf_matrix

array([[392,   4, 166,  39,   3],
       [ 41,   1,  21,   3,   1],
       [ 71,   0,  37,  18,   0],
       [ 14,   0,  14, 212,   0],
       [ 32,   1,  10,   1,   0]], dtype=int64)

### Scoring Metrics

##### Mean accuracy

In [14]:
# Accuracy averaged over 5 cross-validation folds of the training set.
cross_val_mean = np.mean(
    cross_val_score(knn_clf, X_train, y_train, scoring='accuracy', cv=5)
)
cross_val_mean

0.6085932734883235

##### F1 Scores

In [15]:
# average=None yields one F1 score per class instead of a single aggregate,
# computed here from the cross-validated training predictions.
f1_scores = f1_score(y_true=y_train, y_pred=y_train_pred, average=None)
f1_scores

array([0.67937608, 0.02739726, 0.19786096, 0.82651072, 0.        ])

In [18]:
# Collect the scores into one table for export/visualization later.
# `f1_scores` holds one value per class, so the frame gets one row per class
# (row index == class code); the scalar `model` tag and `cross_val_mean` are
# repeated on every row.
# pandas is imported in the notebook's top import cell rather than here, so
# all dependencies are declared in one place.
results = pd.DataFrame({
    'model': model,
    'f1_scores': f1_scores,
    'cross_val': cross_val_mean,
})
results

Unnamed: 0,model,f1_scores,cross_val
0,knn,0.679376,0.608593
1,knn,0.027397,0.608593
2,knn,0.197861,0.608593
3,knn,0.826511,0.608593
4,knn,0.0,0.608593
