# K Nearest Neighbors
An implementation of the k-nearest-neighbors algorithm: it stores a labelled data set and predicts the class of a new case from the classes of its 'k' nearest items.

# Imports

In [59]:
import numpy as np
from collections import Counter

# Test data
We're going to create some test data. Let's say we've got cats, dogs and dinosaurs with 3 measures - ear length, tail length, leg length.

0 - cat, 1 - dog, 2 - dinosaur

In [68]:
# Human-readable name for each numeric class label.
dic = {0: 'cat', 1: 'dog', 2: 'dinosaur'}

# 30 random cases x 3 features (ear length, tail length, leg length).
X = np.random.rand(30, 3)

# Shift each class into its own region of feature space; the three offsets
# in a row apply to columns 0, 1 and 2 respectively.
X[:10] += [0.2, 2, 4]      # rows 0-9:   cats
X[10:20] += [0.5, 6, 6]    # rows 10-19: dogs
X[20:] += 10               # rows 20-29: dinosaurs, same offset on every feature

m, n = X.shape

# Class labels as an (m, 1) float column; rows 0-9 keep the initial 0 (cat).
y = np.zeros((m, 1))
y[10:20] = 1
y[20:] = 2

In [63]:
# Number of nearest neighbours that take part in the vote.
k = 7

In [70]:
# The case to classify: one row holding the three feature values.
example = np.array([8, 5.5, 6])[np.newaxis, :]

In [71]:
# Euclidean distance from every row of X to the example, kept as an (m, 1) column
offsets = X - example
distances = ((offsets ** 2).sum(axis=1) ** 0.5).reshape(m, 1)
# Pair each distance with its class label: column 0 = distance, column 1 = label
X_y_distance = np.concatenate((distances, y), axis=1)
# Reorder the rows from the nearest case to the farthest
nearest_first = X_y_distance[:, 0].argsort()
X_y_distance = X_y_distance[nearest_first]

In [73]:
# Tally how often each class label occurs among the k closest rows
k_nearest_labels = X_y_distance[:k, 1]
cnt = Counter(k_nearest_labels)

In [75]:
# Print the name of every label that reaches the highest vote count
# (more than one name is printed when the vote is tied), then the raw tally.
top_votes = max(cnt.values(), default=0)
for label, votes in cnt.items():
    if votes == top_votes:
        print(dic[label])
print(cnt)

dinosaur
Counter({2.0: 4, 1.0: 3})


# KNN as class

In [79]:
class KNN:
    """K Nearest Neighbors classifier.

    Stores a labelled data set and classifies a new case by majority vote
    among the ``k`` stored cases that are closest to it (Euclidean distance).

    Parameters
    ------------
    X : numpy array
        Array should hold all relevant criteria.
        Data should be organized by case x feature (2-D).
    y : numpy array
        Can be a flat array or with dimensions case x 1.
        Can hold categorical data as strings or integers:
         np.array(['a','b','a','c']) or np.array([0,1,0,2])

    Attributes
    ------------
    X : numpy array
        The stored cases, shape (m, n).
    m, n : int
        Number of cases and number of features.
    y : numpy array
        Labels encoded as float codes 0..c-1, shape (m, 1).
    y_dic : dict
        Maps each integer code back to the original label.

    Available methods
    -------------
    categorize : function
        Encodes labels as numeric codes and fills ``y_dic``
    predict : function
        Used for the prediction of nearest neighbor

    Raises
    ------------
    ValueError
        If X is not 2-D or y does not hold exactly one label per case."""

    def __init__(self, X, y):
        self.y_dic = {}
        # asarray leaves an existing ndarray untouched and lets plain
        # nested lists be passed as well.
        self.X = np.asarray(X)
        if self.X.ndim != 2:
            raise ValueError(
                'X must be a 2-D array organized by case x feature')
        self.m, self.n = self.X.shape
        # reshape fails with ValueError when y does not have m entries.
        self.y = self.categorize(np.asarray(y).reshape(self.m, 1))

    def categorize(self, y):
        """Encode labels as numeric codes.

        Parameters
        ------------
        y : numpy array
            Labels, one per case (case x 1 or flat).

        Returns
        ------------
        new_y : numpy array
            Float array of shape (number of cases, 1) where each label is
            replaced by its index in the sorted unique labels. The mapping
            index -> label is recorded in ``self.y_dic``."""
        y = np.asarray(y)
        new_y = np.zeros((y.shape[0], 1))
        for i, item in enumerate(np.unique(y)):
            self.y_dic[i] = item
            new_y[y == item] = i
        return new_y

    def predict(self, case, k):
        """Prediction method
        Parameters
        ------------
        case : numpy array
            An array of features
        k    : integer
            The number of nearest neighbors
            that should be compared. Must be at least 1;
            values larger than the number of stored cases
            use every stored case.
            For best results use odd numbers
        Returns
        ------------
        return_item : string/int, or list
            Returns case classified based
            on K neighbors. When several classes tie for
            the highest vote count, a list with all tied
            classes is returned instead of a single label.
        Raises
        ------------
        ValueError
            If k is smaller than 1."""
        # k == 0 would leave no votes to count and a negative k would slice
        # from the wrong end, so reject both explicitly.
        if k < 1:
            raise ValueError(
                'k must be a positive integer, got {0}'.format(k))

        # Euclidean distance from every stored case to the queried case.
        distances = np.sum((self.X - case) ** 2, axis=1) ** 0.5
        # A stable sort keeps equidistant cases in their stored order, so the
        # chosen neighbours do not depend on the sort implementation.
        nearest = np.argsort(distances, kind='stable')[:k]

        # Count the encoded labels of the k nearest cases.
        votes = Counter(self.y[nearest, 0])
        top_count = max(votes.values())
        # Codes are stored as floats; convert back to the integer keys of y_dic.
        winners = [self.y_dic[int(code)]
                   for code, count in votes.items() if count == top_count]

        if len(winners) > 1:
            print('More than one item returned. Please set k to odd')
            return_item = winners
        else:
            return_item = winners[0]
        print('Nearest item: {0}'.format(return_item))
        return return_item

# Testing

In [80]:
# Build the classifier from the generated cases X and their labels y
a = KNN(X=X, y=y)

In [82]:
# Classify the example case using its 7 nearest neighbours
a.predict(case=example, k=7)

Nearest item: 2.0


2.0