In [1]:
import numpy as np
import scipy
import scipy.spatial.distance

Parameters:	

k = int, optional (default = 3)
Number of neighbours per neighbourhood

metric = string, optional (default = 'euclidean')
Distance metric used to compare points. 'euclidean' is the Minkowski distance with p=2; 'minkowski' selects the Minkowski distance with p=1.

Methods:

classify(x,class_x,y) where x = training datapoints (NxD),
                            class_x = class labels of the training datapoints in x, one per row (N),
                            y = datapoints to be classified (test set) (PxD)
                      Returns:
                            predictions = array of class predictions, one per test datapoint (P)
                
error(class_y) where class_y = true classes corresponding to the test set (P)
               Returns:
                     score = percentage accuracy of the predictions made by the most recent classify() call
                         
                            

In [9]:
class KNN:
    """k-nearest-neighbours classifier based on a Minkowski distance.

    Parameters
    ----------
    k : int, optional (default = 3)
        Number of nearest training points that vote on each prediction.
    metric : str, optional (default = 'euclidean')
        Name of the distance metric. 'euclidean' is the Minkowski distance
        with p=2 and 'manhattan' is the Minkowski distance with p=1.
        'minkowski' is kept as an alias for p=1 so existing callers keep
        getting the same distances as before.

    Attributes
    ----------
    p : int
        Order of the Minkowski distance selected by ``metric``.
    prediction : numpy.ndarray or None
        Predictions from the most recent ``classify`` call (None before the
        first call); ``error`` scores these.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """

    # Minkowski order for every supported metric name.
    _METRIC_ORDERS = {'euclidean': 2, 'manhattan': 1, 'minkowski': 1}

    def __init__(self,k=3,metric='euclidean'):
        # Fail at construction time instead of leaving self.p undefined and
        # crashing later inside distance_metric.
        if metric not in self._METRIC_ORDERS:
            raise ValueError(
                'Unsupported metric %r; expected one of %s'
                % (metric, sorted(self._METRIC_ORDERS)))
        self.k = k
        self.metric = metric
        self.p = self._METRIC_ORDERS[metric]
        self.prediction = None

    def distance_metric(self,x,y):
        """Return the Minkowski distance of order ``self.p`` between two points."""
        return scipy.spatial.distance.minkowski(x,y,self.p)

    def classify(self,x,class_x,y):
        """Predict a class for every row of ``y`` by majority vote of its k nearest rows in ``x``.

        Parameters
        ----------
        x : array-like, N rows
            Training datapoints.
        class_x : array-like, N labels
            Class label of each training datapoint. Labels are counted with
            ``numpy.bincount``, so they must be non-negative integers.
        y : array-like, P rows
            Datapoints to classify.

        Returns
        -------
        numpy.ndarray of int, length P
            Predicted class per test datapoint. When several classes tie for
            the most votes, the smallest label wins.

        Raises
        ------
        ValueError
            If k is smaller than 1 or larger than the number of training
            rows, or if the number of labels differs from the number of
            training rows.
        """
        train = np.asarray(x)
        queries = np.asarray(y)
        # Flatten so both (N,) and (N, 1) label arrays are accepted.
        labels = np.asarray(class_x).ravel()

        n_train = len(train)
        n_queries = len(queries)
        if self.k > n_train:
            raise ValueError('K cant be greater than the number of training examples')
        if self.k < 1:
            # A zero or negative k would silently slice the wrong neighbours.
            raise ValueError('k must be at least 1')
        if labels.size != n_train:
            raise ValueError(
                'class_x must hold exactly one label per training example '
                '(got %d labels for %d examples)' % (labels.size, n_train))

        # Distance matrix: one row per test point, one column per training
        # point. Preallocated so filling it does not copy the array each step.
        distances = np.empty((n_queries, n_train))
        for row, query in enumerate(queries):
            for col, sample in enumerate(train):
                distances[row, col] = self.distance_metric(sample, query)

        # Majority vote among the k closest training points of each test point.
        predictions = np.empty(n_queries, dtype=int)
        for row in range(n_queries):
            nearest = np.argsort(distances[row])[:self.k]
            predictions[row] = np.argmax(np.bincount(labels[nearest]))

        self.prediction = predictions
        # Hand back a copy so callers cannot alter the stored predictions.
        return predictions.copy()

    def error(self,class_y):
        """Return the percentage of stored predictions that match ``class_y``.

        Despite its name, this returns the accuracy (0-100) of the
        predictions made by the most recent ``classify`` call.

        Parameters
        ----------
        class_y : array-like, P labels
            True classes of the datapoints passed to ``classify``.

        Raises
        ------
        RuntimeError
            If ``classify`` has not been called yet.
        ValueError
            If ``class_y`` is empty or its size differs from the number of
            stored predictions.
        """
        if self.prediction is None:
            raise RuntimeError('classify() must be called before error()')
        # Flatten so a (P, 1) column vector is compared element-wise instead
        # of being broadcast against the predictions into a P x P matrix.
        truth = np.asarray(class_y).ravel()
        if truth.size != self.prediction.size:
            raise ValueError(
                'class_y must hold exactly one label per prediction '
                '(got %d labels for %d predictions)'
                % (truth.size, self.prediction.size))
        if truth.size == 0:
            raise ValueError('class_y is empty; accuracy is undefined')
        count = int(np.count_nonzero(self.prediction == truth))
        return (count/truth.size)*100


In [10]:
# Load the iris dataset: X takes iris.data (features), Y takes iris.target (classes).
from sklearn import datasets
iris = datasets.load_iris()

X = iris.data
Y = iris.target

# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split and score.
# NOTE(review): test_size=.75 holds out 75% of the rows for testing and trains
# on only 25% - confirm this is intended rather than test_size=.25.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .75, random_state = 42)

# Fit on the training split, predict the held-out split, then score it.
k = KNN(k=3,metric='euclidean')
predictions = k.classify(X_train,Y_train,X_test)
score = k.error(Y_test)
print("Class predictions: ",predictions)
print("Classifier score:",score)

Class predictions:  [2 1 0 2 1 0 1 0 1 1 0 1 0 1 0 2 0 0 0 1 1 1 2 0 0 1 2 1 0 2 0 1 0 1 0 0 0
 2 1 1 0 2 1 2 2 1 2 2 0 2 0 2 1 2 0 0 2 0 0 1 2 2 1 0 0 0 0 2 0 0 0 0 1 2
 0 2 0 2 1 1 2 0 2 0 1 1 2 1 2 0 0 0 2 1 1 2 0 2 1 1 2 2 1 1 1 2 2 1 1 0 0
 1 0]
Classifier score: 96.46017699115043
