# KNearestNeighbor class 
- `fit()` method
- `predict()` method

In [23]:
from math import sqrt

In [74]:
class KNearestNeighbor:
    
    def __init__(self, n_neighbors):
        self.n_neighbors = n_neighbors
        self.train = None
    
    def __euclidean_distance(self, row1, row2):
        """
        The square root of the sum of the squared differences between two vectors.
        The smaller the value, the more similar two records will be.
        Value of 0 indicates no difference.
        
        euclidian distance = sqrt(sum i to N (x1_i - x2_i)^2)
        """
        # 0.0 so that distance will float
        distance = 0.0

        # loop for columns
        for i in range(len(row1) - 1):
            # squared difference between the two vectors
            distance += (row1[i] - row2[i])**2

        return sqrt(distance)
    
    
    def fit(self, train):
        """Fits model to training data"""
        self.train = train
        
    def __get_neighbors(self, train, new_obs, k):
        """
        Locates most similar neighbors via euclidian distance.

        Params: 

            train: a dataset

            new_obs: a new observation; observation for which neighbors are to be found

            k: k-neighbors; the number of neighbors to be found (int)
        """

        distances = []
        neighbors = []
        
        # Rules for whether or not train is a pandas.DataFrame
        if type(train) == pd.core.frame.DataFrame:
            
            for i,row in train.iterrows():
                # calculate distance
                d = self.__euclidean_distance(new_obs, list(row))

                # fill distances list with tuples of row index and distance
                distances.append((i, d))

                # sort distances by second value in tuple
                distances.sort(key=lambda tup: tup[1])
        else:
            
            for i,row in enumerate(train):
                # calculate distance
                d = self.__euclidean_distance(new_obs, row)

                # fill distances list with tuples of row index and distance
                distances.append((i, d))

                # sort distances by second value in tuple
                distances.sort(key=lambda tup: tup[1])

        for i in range(k):
            # Grabs k-records from distances list
            neighbors.append(distances[i])

        return neighbors
    
    
    def predict(self, train, new_obs):
        """
        Predicts a class label on a new observation from provided training data.

        Params: 

            new_obs: a new observation; observation for which neighbors are to be found

            k: k-neighbors; the number of neighbors to be found (int)
        """
        self.train = train #> for some reason, defining the model again with passing
                           #> in train with method call brought accuracy up to 95%,
                           #> whereas without this, accuracy was 31%. Not clear why 
                           #> this is the case since self.train is already defined in
                           #> the `model.fit()` call ...
        
        # Compile list of neighbors
        neighbors = self.__get_neighbors(self.train, new_obs, self.n_neighbors)
        
        # Grab index of the closest neighbor
        n_index = neighbors[0][0]
        
        # Add rules for if train is a pandas.DataFrame
        if type(self.train) == pd.core.frame.DataFrame:
            # Assumes labels are in last column of dataframe
            loc = self.train.columns[-1]
            pred = self.train[loc][n_index]
        else:
            # Prediction is the label from train record at n_index location. Assumes label
            # is at end of record.
            pred = self.train[n_index][-1]

        return pred
    
    def score(self, x, y):
        """
        Calculates accuracy of predictions (on classification problems).

        Params:

            x: actual, or correct labels

            y: predicated labels
        """

        correct = 0

        for i in range(len(x)):
            # Rules for if `x` is a pandas.Series
            if type(x) == pd.core.series.Series:
                if x.iloc[i] == y[i]:
                    correct += 1

            else:
                if x[i] == y[i]:
                    correct += 1

        return correct / float(len(x)) * 100.0

In [2]:
dataset = [
    [2.7810836,2.550537003,0],
    [1.465489372,2.362125076,0],
    [3.396561688,4.400293529,0],
    [1.38807019,1.850220317,0],
    [3.06407232,3.005305973,0],
    [7.627531214,2.759262235,1],
    [5.332441248,2.088626775,1],
    [6.922596716,1.77106367,1],
    [8.675418651,-0.242068655,1],
    [7.673756466,3.508563011,1]
]

In [75]:
nn = KNearestNeighbor(n_neighbors=3)

In [76]:
nn.fit(dataset)

In [77]:
new = [8.675418651,-0.242068655,1]
nn.predict(dataset, new)

1

In [79]:
samp = [
    [2.7810836,2.550537003],
    [1.465489372,2.362125076],
    [3.396561688,4.400293529],
    [1.38807019,1.850220317],
    [3.06407232,3.005305973],
    [7.627531214,2.759262235],
    [5.332441248,2.088626775],
    [6.922596716,1.77106367],
    [8.675418651,-0.242068655],
    [7.673756466,3.508563011]
]

predictions = []

for obs in samp:
    pred = nn.predict(dataset, obs)
    predictions.append(pred)

predictions

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

# Load and split iris data

In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Load iris dataset

cols = [
    "sepal_len",
    "sepal_wid",
    "petal_len",
    "petal_wid",
    "class"
]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
df = pd.read_csv(url, names=cols)

# Cleanup class names
names = []
for x in df["class"]:
    x = x.replace("Iris-","")
    names.append(x)
    
df["class"] = names

# Encode class names
labels = []
for x in df["class"]:
    x = x.replace("versicolor","0")
    x = x.replace("virginica","1")
    x = x.replace("setosa","2")
    x = int(x)
    labels.append(x)
    
df["class"] = labels


### Split iris data into train and test sets

train, test = train_test_split(df, train_size=0.70, test_size=0.30, random_state=5)

# X feature matrices, y target vectors
target = "class"

X_test = test.drop(target, axis=1)
y_test = test[target]

# Generate predictions with KNearestNeighbor

In [80]:
predictions = []

for _, obs in X_test.iterrows():
    pred = nn.predict(train, list(obs))
    predictions.append(pred)

len(predictions)

45

In [70]:
y_test.iloc[3]

2

In [71]:
predictions[3]

1

In [82]:
print(f"ABW KNearestNeighbors (OOP) Accuracy: {nn.score(y_test, predictions):.2f}")

ABW KNearestNeighbors (OOP) Accuracy: 95.56


# Benchmark scikit-learn version to compare

In [84]:
# Split train into x and y for use with sklearn model

X_train = train.drop(target, axis=1)
y_train = train[target]

# print(X_train.shape, y_train.shape)

# Import KNeighborsClassifier for comparison to my own
# KNearestNeighbor (classifier)
from sklearn.neighbors import KNeighborsClassifier

# create instance object
sk_nn = KNeighborsClassifier(n_neighbors=3) #> to match

# fit model
sk_nn.fit(X_train, y_train)

# Generate predictions
sk_preds = sk_nn.predict(X_test)

print(f"Sklearn KNeighborsClassifier Accuracy: {sk_nn.score(X_test, y_test):.2f}")

Sklearn KNeighborsClassifier Accuracy: 0.96


# TODO:

- blog post
- cleanup notebooks / repo