According to Karpathy's CS231n (Stanford) course, nearest neighbors "can be" used for image classification, even though it is rarely used in practice. My transfer-learning model with the VGG-16 architecture actually achieves a better log-loss than this one; however, I wanted to try out KNN on image data. Here, KNN is implemented on the normalized color histograms of the input images. No parameter tuning has been done so far.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
import cv2
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [None]:
# Extracts raw pixel array from images
def get_image_vector(image, size=(64, 64)):
    """Resize ``image`` to ``size`` and return its pixels as a flat 1-D array.

    Args:
        image: Image array accepted by ``cv2.resize``.
        size: Target size passed straight to ``cv2.resize``.

    Returns:
        The resized image flattened into one dimension.
    """
    resized = cv2.resize(image, size)
    return resized.flatten()

In [None]:
# Extracts the color histogram from images
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Return a flattened, normalized 3-D HSV color histogram of ``image``.

    Args:
        image: BGR image array, e.g. the result of ``cv2.imread``.
        bins: Number of histogram bins for the H, S and V channels.

    Returns:
        1-D array holding the ``bins[0] * bins[1] * bins[2]`` histogram cells.

    Raises:
        ValueError: If ``image`` is None, which is what ``cv2.imread`` returns
            when a file is missing or is not a readable image.
    """
    if image is None:
        raise ValueError(
            "image is None; cv2.imread could not read the file as an image"
        )

    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # For 8-bit images OpenCV stores hue in [0, 180), not [0, 256). Using the
    # true range spreads hue over all bins instead of leaving the top ones empty.
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])
    # Normalize in place (cv2.normalize defaults) so image size does not matter.
    cv2.normalize(hist, hist)
    return hist.flatten()

In [None]:
# Input folders. Both end with "/" because later cells build file paths by
# plain string concatenation.
train_dir = "../input/train/"
test_dir = "../input/test_stg1/"

In [None]:
# Class names are the sub-directories of the training folder. Select them
# explicitly (skipping files and hidden entries such as ".DS_Store") instead of
# dropping whatever happens to sort first, which would silently discard a real
# class when no stray file is present.
classes = sorted(
    entry
    for entry in os.listdir(train_dir)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(train_dir, entry))
)
print(classes)

In [None]:
# Get the training data paths (one class folder at a time, in `classes` order)

image_path_list = []
for c in classes:
    class_dir = train_dir + c + '/'
    image_path_list += [class_dir + item for item in os.listdir(class_dir)]

In [None]:
# Get the labels: repeat each class name once per file in its folder, in the
# same class order used to build the image paths

labels = []
for c in classes:
    n_images = len(os.listdir(train_dir + c + '/'))
    labels += [c] * n_images
    

In [None]:
# Encode the labels: map each class-name string to an integer code

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [None]:
# Show the encoded label array as the cell output
labels

In [None]:
# Get the color histograms from the images: one feature vector per training
# image, with a progress message every 1000 images

features = []
for i, image_path in enumerate(image_path_list):
    features.append(extract_color_histogram(cv2.imread(image_path)))
    if i % 1000 == 0:
        print(str(i) + "  completed")

In [None]:
# Hold out 25% of the labelled data for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
 log_loss_scorer = make_scorer(log_loss, greater_is_better = False, needs_proba = True)

In [None]:
# Tune k with 5-fold cross-validation. The log-loss scorer is passed explicitly
# so the search selects on the metric this notebook reports; without `scoring`
# GridSearchCV falls back to the classifier's default accuracy score.
# NOTE: with this scorer, best_score_ and score() are negated log-loss values.
grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid={"n_neighbors": [1, 2, 3, 4, 5]},
    scoring=log_loss_scorer,
    cv=5,
)

In [None]:
# Run the cross-validated parameter search on the training split
grid_search.fit(X_train,y_train)

In [None]:
# Evaluate the fitted search on the held-out split, using whichever metric the
# grid search was configured with
grid_search.score(X_test,y_test)

In [None]:
# Cross-validated score of the best parameter setting
print(grid_search.best_score_)

In [None]:
# The classifier instance configured with the winning parameters
print(grid_search.best_estimator_)

In [None]:
# The winning parameter values from the searched grid
print(grid_search.best_params_)

In [None]:

 # model = KNeighborsClassifier(n_neighbors = 2, n_jobs = -1)

In [None]:
# model.fit(X_train,y_train)

In [None]:
# accuracy = model.score(X_test,y_test)

In [None]:
# print(accuracy)

In [None]:
# preds_validation = model.predict_proba(X_test)

In [None]:
# log_loss(y_test,preds_validation)

In [None]:
# File names (not full paths) of everything in the test folder;
# os.listdir already returns a fresh list
test_files = os.listdir(test_dir)

In [None]:
# Compute one color-histogram feature vector per test image, in the same order
# as `test_files`, with a progress message every 1000 images.
test_features = []
for i, image_path in enumerate(test_files):
    # os.path.join avoids the doubled "/" that plain concatenation produced,
    # since test_dir already ends with a separator.
    image = cv2.imread(os.path.join(test_dir, image_path))
    hist = extract_color_histogram(image)
    test_features.append(hist)
    if i % 1000 == 0:
        print(str(i) + "  completed")

In [None]:
# Per-class probability predictions for every test image
preds = grid_search.predict_proba(test_features)

In [None]:
# Unclipped submission table: one probability column per class name, with the
# image file name inserted as the first column.
# NOTE(review): assumes the predict_proba column order matches `classes` — confirm.
submission1 = pd.DataFrame(preds, columns= classes)
submission1.insert(0, 'image', test_files)
submission1.head()

In [None]:
# Clip the probabilities into [min_prob, max_prob] so that no prediction is
# exactly 0 or 1.
max_prob = 0.82
min_prob = (1 - max_prob) / 7  # presumably 7 = number of remaining classes; confirm
clipped_preds = np.clip(preds, min_prob, max_prob)

# Same table layout as the unclipped submission: image name first, then one
# probability column per class.
submission2 = pd.DataFrame(clipped_preds, columns=classes)
submission2.insert(0, 'image', test_files)
submission2.head()

In [None]:
# Write the clipped submission to disk, omitting the DataFrame index column
submission2.to_csv("K_neighbors_submission.csv",index = False)

Leaderboard score = 

Inspiration: Adrian Rosebrock's blog post on k-NN image classification (http://www.pyimagesearch.com/2016/08/08/k-nn-classifier-for-image-classification/)