k-NN
- simplest model for classification
- classifies a sample by measuring its distance to neighboring data points
- not really learning
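- requires saving the training data along with the model, since predictions are made directly from it
- computationally expensive: all training data must be kept in storage, and each prediction compares the input against every stored sample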
- ranks the stored samples by their similarity to the input
- answers the question: is the input most like category a, b or c?

Import packages

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imutils import paths
import numpy as np
import argparse
import cv2
import os

Build image preprocessor

In [4]:
class SimplePreprocessor:
    """Resize images to a fixed width and height.

    The aspect ratio of the input image is not preserved.
    """

    def __init__(self, width, height, inter=cv2.INTER_AREA):
        # remember the output size and the interpolation flag that is
        # later handed to cv2.resize
        self.inter = inter
        self.height = height
        self.width = width

    def preprocess(self, image):
        """Return `image` resized to (self.width, self.height)."""
        target_size = (self.width, self.height)
        return cv2.resize(image, target_size, interpolation=self.inter)


Build image loader

In [5]:
class SimpleDatasetLoader:
    """Load images from disk, preprocess them, and collect their labels.

    The class label of an image is the name of the directory that
    contains it, so paths are expected to look like
    ``/path/to/dataset/{class}/{image}.jpg``.
    """

    def __init__(self, preprocessors=None):
        # preprocessors are applied in list order; each one must provide
        # a `preprocess(image)` method. None means "no preprocessing".
        self.preprocessors = [] if preprocessors is None else preprocessors

    def load(self, imagePaths, verbose=-1):
        """Read every image in `imagePaths` and return (data, labels).

        Args:
            imagePaths: sequence of image file paths (must support
                `len()`, which is used for the progress message).
            verbose: when > 0, print a progress line after every
                `verbose` paths have been handled.

        Returns:
            A tuple `(np.array(data), np.array(labels))` holding the
            preprocessed images and their class labels, index-aligned.
            Paths that cannot be read as images are skipped with a
            warning, so the result may be shorter than `imagePaths`.
        """
        data = []
        labels = []

        for (i, imagePath) in enumerate(imagePaths):
            image = cv2.imread(imagePath)

            if image is None:
                # cv2.imread reports an unreadable/corrupt file by
                # returning None instead of raising; skip the file here
                # rather than failing later inside a preprocessor
                print("[WARNING] could not read image, skipping: {}".format(
                    imagePath))
            else:
                # the parent directory name is the class label
                label = imagePath.split(os.path.sep)[-2]

                # apply each preprocessor in turn
                for p in self.preprocessors:
                    image = p.preprocess(image)

                # treat the processed image as a "feature vector"
                data.append(image)
                labels.append(label)

            # show an update every `verbose` paths (including the very
            # first one when verbose == 1)
            if verbose > 0 and (i + 1) % verbose == 0:
                print("[INFO] processed {}/{}".format(i + 1,
                    len(imagePaths)))

        return (np.array(data), np.array(labels))

Implement k-NN

In [6]:
# settings used by the cells below:
#   dataset   - directory that is scanned for images
#   neighbors - value passed to the classifier as n_neighbors
#   jobs      - value passed to the classifier as n_jobs
args = dict(dataset="dataset/animals", neighbors=1, jobs=-1)

Get list of images to process

In [7]:
# collect the path of every image under the dataset directory
print("[INFO] loading images...")
imagePaths = list(paths.list_images(args["dataset"]))

# resize every image to 32x32 while loading the dataset from disk, then
# flatten each image into a single row of 3072 values
# (3072 = 32 * 32 * 3, i.e. this assumes 3-channel images)
sp = SimplePreprocessor(32, 32)
sdl = SimpleDatasetLoader(preprocessors=[sp])
data, labels = sdl.load(imagePaths, verbose=500)
data = data.reshape((len(data), 3072))

# report the size of the feature matrix in megabytes
print("[INFO] features matrix: {:.1f}MB".format(
    data.nbytes / 1024.0 / 1024.0))

[INFO] loading images...
[INFO] processed 500/3000
[INFO] processed 1000/3000
[INFO] processed 1500/3000
[INFO] processed 2000/3000
[INFO] processed 2500/3000
[INFO] processed 3000/3000
[INFO] features matrix: 8.8MB


Encode labels as integers

In [8]:
# map the string class labels to integer codes; `le` is kept so the
# original class names can be recovered later
le = LabelEncoder().fit(labels)
labels = le.transform(labels)

# hold out 25% of the samples for testing and train on the other 75%
trainX, testX, trainY, testY = train_test_split(
    data,
    labels,
    test_size=0.25,
    random_state=42,
)

Train and evaluate k-NN classifier

In [9]:
print("[INFO] evaluating k-NN classifier...")

# build the classifier from the notebook settings and fit it on the
# training split
model = KNeighborsClassifier(
    n_neighbors=args["neighbors"],
    n_jobs=args["jobs"],
).fit(trainX, trainY)

# predict the held-out split and print the per-class report, using the
# encoder's class names as row labels
print(classification_report(
    testY,
    model.predict(testX),
    target_names=le.classes_,
))

[INFO] evaluating k-NN classifier...
              precision    recall  f1-score   support

        cats       0.40      0.56      0.46       249
        dogs       0.41      0.47      0.43       262
       panda       0.80      0.32      0.46       239

    accuracy                           0.45       750
   macro avg       0.53      0.45      0.45       750
weighted avg       0.53      0.45      0.45       750

