<a href="https://colab.research.google.com/github/dropthejase/ml_training/blob/main/ml_from_scratch/knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sortedcontainers import SortedList # sorts things in a list for you
from collections import Counter

In [None]:
# calculate the distance between X_test vs all of X_train
# add their y_targets and their distances
# sort based on ascending distance
# slice out the first k (the k nearest neighbours)
# capture the votes
# most votes = predicted class

class KNN():
  """k-nearest-neighbours classifier (Euclidean distance, majority vote).

  Neighbours are ranked by ascending distance; equal distances are ordered by
  ascending label. When several classes tie on votes, the class of the
  closest neighbour among the tied classes wins.
  """

  def __init__(self, k=3):
    """Store the number of neighbours that vote on each prediction.

    Args:
      k: positive integer number of neighbours. If it exceeds the number of
        training samples, every training sample votes.

    Raises:
      ValueError: if k is not a positive integer.
    """
    if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
      raise ValueError(f"k must be a positive integer, got {k!r}")
    self.k = k

  def fit(self, X, y):
    """Memorise the training samples and their labels.

    Args:
      X: array-like of shape (n_samples, n_features); a 1-D input is treated
        as n_samples observations of a single feature.
      y: array-like of shape (n_samples,) holding the class labels.

    Raises:
      ValueError: if X is not 1-D/2-D, y is not 1-D, X is empty, or X and y
        have different lengths.
    """
    # Cast to float so unsigned / narrow integer features cannot wrap around
    # when differences are taken in predict().
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)

    if X.ndim == 1:
      X = X.reshape(-1, 1)
    if X.ndim != 2:
      raise ValueError(f"X must be 1-D or 2-D, got {X.ndim} dimensions")
    if y.ndim != 1:
      raise ValueError(f"y must be 1-D, got {y.ndim} dimensions")
    if len(X) == 0:
      raise ValueError("fit requires at least one training sample")
    if len(X) != len(y):
      raise ValueError(
          f"X and y must have the same length, got {len(X)} and {len(y)}")

    self.X = X
    self.y = y

  def predict(self, X):
    """Predict a class label for every row of X.

    Args:
      X: array-like of shape (n_samples, n_features). A 1-D input is read as
        one sample, unless the model was fitted on a single feature, in which
        case it is read as n_samples one-feature observations.

    Returns:
      np.ndarray of predicted labels, one per sample (empty for empty input).

    Raises:
      ValueError: if the feature count differs from the training data.
    """
    X = np.asarray(X, dtype=float)
    if X.size == 0:
      return np.array([])

    n_features = self.X.shape[1]
    if X.ndim == 1:
      X = X.reshape(-1, 1) if n_features == 1 else X.reshape(1, -1)
    if X.ndim != 2 or X.shape[1] != n_features:
      raise ValueError(
          f"X must have {n_features} feature(s) per sample, got shape {X.shape}")

    # Pre-order the training set by label once. A stable sort on distance then
    # yields neighbours ordered by (distance, label).
    label_order = np.argsort(self.y, kind="stable")
    X_by_label = self.X[label_order]
    y_by_label = self.y[label_order]

    y_pred = []
    for x_test in X:
      # Euclidean distance from this sample to every training sample at once.
      dists = np.sqrt(np.sum((X_by_label - x_test) ** 2, axis=1))
      nearest = np.argsort(dists, kind="stable")[:self.k]

      # Counter keeps insertion order, so a vote tie goes to the class of the
      # closest neighbour.
      votes = Counter(y_by_label[nearest])
      y_pred.append(votes.most_common(1)[0][0])

    return np.array(y_pred)

  def score(self, X_test, y_test):
    """Return the accuracy (fraction of correct predictions) on a test set.

    Args:
      X_test: array-like of samples, as accepted by predict().
      y_test: array-like of true labels, one per sample.

    Raises:
      ValueError: if the number of labels differs from the number of samples.
    """
    y_pred = self.predict(X_test)
    # Flatten so a column vector or pandas Series is compared positionally
    # instead of broadcasting against the predictions.
    y_true = np.asarray(y_test).ravel()
    if y_true.shape != y_pred.shape:
      raise ValueError(
          f"y_test has {y_true.size} label(s) but X_test has {y_pred.size} sample(s)")
    return np.mean(y_pred == y_true)

**TEST DATA**

In [None]:
# Toy fixture: three 4-feature training points on the diagonal, one per class.
X_train = np.array([[value] * 4 for value in (1, 2, 3)])
y_train = np.array([1, 2, 3])

# Queries: one near class 1, one below every training point, one far above.
X_test = np.array([
    [1, 2, 1, 1],
    [0] * 4,
    [10] * 4,
])
y_test = np.array([1, 1, 3])

In [None]:
# Sanity check on the toy data: fit a 2-neighbour model and display the
# predictions for X_test (the last expression is the cell's output).
knn_test = KNN(k=2)
knn_test.fit(X_train, y_train)
knn_test.predict(X_test)

array([1, 1, 3])

In [None]:
# Accuracy of the toy model: fraction of X_test predictions equal to y_test.
knn_test.score(X_test, y_test)

1.0

**TEST WITH SCIKIT DATA**

In [None]:
from sklearn import datasets
# Load scikit-learn's bundled iris dataset; .data and .target are used below.
iris = datasets.load_iris()

In [None]:
# Put the four iris measurements into a DataFrame with readable column names,
# then append the class labels as a 'target' column.
df = pd.DataFrame(iris.data, columns=['sepal_length','sepal_width','petal_length','petal_width'])
df['target'] = iris.target

# Preview the first five rows.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
# Shuffle the rows: sampling with frac=1 returns every row in random order.
# No random_state is passed, so the order differs between runs.
df = df.sample(frac=1)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
101,5.8,2.7,5.1,1.9,2
88,5.6,3.0,4.1,1.3,1
60,5.0,2.0,3.5,1.0,1
105,7.6,3.0,6.6,2.1,2
124,6.7,3.3,5.7,2.1,2


In [None]:
# Separate the feature columns (X) from the class labels (y).
X = df.drop('target',axis=1)
y = df['target']

# Explicit conversion is left disabled: KNN.fit and KNN.predict already call
# np.array on their inputs.
#X = np.array(X)
#y = np.array(y)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; random_state=42 fixes the split for a
# given row order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Fit the classifier with its default k=3 on the iris training split and
# display the predicted classes for the held-out rows.
knn = KNN()
knn.fit(X_train, y_train)
knn.predict(X_test)

array([2, 0, 0, 1, 2, 2, 1, 0, 1, 2, 2, 2, 2, 1, 0, 0, 0, 2, 1, 2, 1, 0,
       2, 2, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 2, 1, 0, 0, 1, 1,
       0, 0, 1, 2, 2, 0])

In [None]:
# Accuracy on the held-out iris rows.
knn.score(X_test, y_test)

0.98