In [33]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model, datasets, neighbors
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score


In [34]:
def euclideDistance(row1, row2):
    return np.sum((row1-row2) ** 2)

In [35]:
class KNearestNeighbors:
    
    def __init__(self, k = 3):
        '''
        Parameters:
            k: number of neighbor
        '''
        self.k = k
    
    def fit(self, data_train, label_train):
        self.data_train = data_train
        self.label_train = label_train
        
    def getNeighbors(self, data_test):
        
        # Get distance list from data test to each data train
        distance = []
        for i in range(len(self.data_train)):
            d = euclideDistance(self.data_train[i], data_test)
            distance.append((self.data_train[i], self.label_train[i], d))
            
        # Sort to get smallest distance data
        distance.sort(key=lambda tup: tup[-1])
        
        # Select k smallest data
        return distance[:self.k-1][:1]
            
        
    def predict(self, data_test):
        
        label_pred = []
        
        for data in data_test:
            
            # Get k neighbors
            neighbor = self.getNeighbors(data)
        
            # Get label only
            neighbor_label = [row[1] for row in neighbor]
        
            # Get label with most appeared time
            label_pred.append(max(set(neighbor_label), key=neighbor_label.count))
            
        return np.array(label_pred)
        
        

In [36]:
np.random.seed(3)

In [37]:
iris = datasets.load_iris()

In [38]:
X = iris.data
y = iris.target

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)

In [40]:
# Our model
model = KNearestNeighbors(k = 10)

In [41]:
model.fit(X_train, y_train)

In [42]:
# Predict
y_pred = model.predict(X_test)

In [43]:
# Sklearn model
model1 = neighbors.KNeighborsClassifier(n_neighbors=10)

In [44]:
model1.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [45]:
# Sklearn predict
y_pred1 = model1.predict(X_test)

In [46]:
# Compare our output, Sklearn output and actual output
print("Our output:     ", y_pred[:30])
print("Sklearn output: ", y_pred1[:30])
print("Actual output:  ", y_test[:30]) 

Our output:      [0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 2 2 2 0 2 2 2 1 0 2 2 1 1 1]
Sklearn output:  [0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 1 2 2 1 0 2 2 1 1 1]
Actual output:   [0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1]


In [47]:
# Our model accuracy
accuracy_score(y_test, y_pred)

0.9666666666666667

In [48]:
# Sklearn model accuracy
accuracy_score(y_test, y_pred1)

0.9333333333333333