# K-Nearest Neighbors with Python 

Instructions on how nearest neighbors works and how to implement it without scikit-learn were accessed at: https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/. 

Iris dataset from UCI can be found here: https://archive.ics.uci.edu/ml/datasets/iris

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

In [17]:
# Load the iris dataset from the UCI repository

# Column names handed to read_csv through `names=`
cols = ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
df = pd.read_csv(url, names=cols)

# Clean up class names by removing the "Iris-" prefix from each label
names = [label.replace("Iris-", "") for label in df["class"]]
df["class"] = names

print(df.shape)
df.head()

(150, 5)


Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [18]:
# Count how many rows fall into each class
df["class"].value_counts()

versicolor    50
virginica     50
setosa        50
Name: class, dtype: int64

# Step I: Euclidean Distance

In [19]:
# the square root of the sum of the squared differences between two vectors
# the smaller the value, the more similar two records will be
# value of 0 indicates no difference

# Euclidean distance = sqrt(sum over i = 1..N of (x1_i - x2_i)^2)

# x1 is first row of data, x2 is second row, i is the index to a specific column
# as we sum across all columns

def euclidean_distance(row1, row2):
    """
    Computes the Euclidean distance between two rows.

    The last position of `row1` is left out of the calculation: only
    indices 0 through len(row1) - 2 are compared.

    Params:

        row1: first row; its length decides how many positions are compared

        row2: second row; indexed at the same positions as row1

    Returns the square root of the summed squared differences (float).
    """

    # Float accumulator for the running sum of squared differences
    total = 0.0

    # Visit every position except the final one
    for col in range(len(row1) - 1):
        diff = row1[col] - row2[col]
        total += diff ** 2

    return sqrt(total)

In [23]:
# Try the distance function on a small dataset.
# The last value of each row is skipped by euclidean_distance.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

row0 = dataset[0]

# Print the distance from the first record to every record, itself included
for row in dataset:
    print(euclidean_distance(row0, row))

0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795


# Step II: Get nearest neighbors

In [24]:
# The "neighbors" of a new observation are the `k` closest instances according to the
# distance measure. Locating them involves calculating the distance from the new
# observation to each observation in the dataset

In [50]:
def get_neighbors(train, new_obs, k):
    """
    Locates most similar neighbors via Euclidean distance.

    Params:

        train: a dataset; an iterable of rows, each of which is compared
            to `new_obs` with `euclidean_distance`

        new_obs: a new observation; observation for which neighbors are to be found

        k: k-neighbors; the number of neighbors to be found (int)

    Returns:

        A list of at most `k` tuples `(row_index, distance)`, ordered from
        nearest to farthest. Rows at equal distance keep the order they
        have in `train`. When `train` holds fewer than `k` rows, all rows
        are returned; a `k` of zero or less yields an empty list.
    """

    # Tuples of (row index, distance between new_obs and that row)
    distances = [
        (i, euclidean_distance(new_obs, row)) for i, row in enumerate(train)
    ]

    # Sort by the second value in each tuple, once, after every distance is
    # known; re-sorting after each append gives the same order at higher cost
    distances.sort(key=lambda tup: tup[1])

    # Slice rather than index so that k larger than the dataset does not
    # raise IndexError; max() keeps a negative k from slicing off the tail
    return distances[:max(k, 0)]

In [52]:
# Try get_neighbors: the 3 nearest neighbors of the first record

nays = get_neighbors(dataset, dataset[0], 3)
for neighbor in nays:
    print(neighbor)

# As expected, the first record is most similar to itself

(0, 0.0)
(4, 0.5356280721938492)
(1, 1.3290173915275787)


In [54]:
# Display the record at index 1 for inspection
dataset[1]

[1.465489372, 2.362125076, 0]

In [49]:
# Display the record at index 2 for inspection
dataset[2]

[3.396561688, 4.400293529, 0]

# Step III: Make predictions

In [82]:
# For classification, we can return the most represented class among the neighbors of the
# new observation

# We can do this by calling `max()` on the neighbors' labels, keyed by how often each label occurs
# For example, if class labels are 0 or 1 and three out of 5 neighbors have a 1,
# then 1 has the highest count, so `max()` returns 1, which we use as the predicted class

def predict_classification(train, new_obs, k):
    """
    Predicts a class label on a new observation from provided training data.

    Params:

        train: a dataset; must support integer indexing (e.g. a list of
            rows), with the class label stored as the last value of each row

        new_obs: a new observation; observation for which neighbors are to be found

        k: k-neighbors; the number of neighbors to be found (int)

    Returns:

        The label that occurs most often among the neighbors. When several
        labels share the highest count, the one belonging to the nearest
        neighbor is returned.

    Raises:

        ValueError: if no neighbors are found (`max()` of an empty list).
    """
    # Compile list of neighbors as (row index, distance) tuples
    neighbors = get_neighbors(train, new_obs, k)

    # Compile list of class labels for the neighbors. The tuples hold an
    # index and a distance, not the row, so the label is read from `train`;
    # taking the tuple's last value would give the distance instead
    out_vals = [train[idx][-1] for idx, _ in neighbors]

    # Get prediction by max count rather than max val. Iterating the list
    # (nearest first) instead of a set makes ties resolve deterministically
    pred = max(out_vals, key=out_vals.count)

    return pred

In [86]:
# Predict the class of the first record from its 3 nearest neighbors and
# compare with the label stored as the record's last value.
# NOTE(review): the `.0f` format rounds y_pred to a whole number before
# printing, so a non-integer return value would not be visible here.
y_pred = predict_classification(dataset, dataset[0], 3)
print(f"Expected: {dataset[0][-1]} \nPrediction: {y_pred:.0f}")

Expected: 0 
Prediction: 0


In [87]:
def accuracy_metric(x, y):
    """
    Calculates accuracy of predictions (on classification problems).

    Params:

        x: actual, or correct labels

        y: predicted labels

    Returns the percentage (float, 0.0 to 100.0) of positions `i` in `x`
    where `x[i] == y[i]`. Only the first len(x) entries of `y` are read.
    """

    total = len(x)

    # One tick for every position where actual and predicted labels agree
    matches = sum(1 for i in range(total) if x[i] == y[i])

    return matches / float(total) * 100.0

# TODO:

- Split iris into train and test sets to gauge nn model performance
- Convert model to OOP implementation
    - KNearestNeighbor class
        - `.fit()` method
        - `.predict()` method
- Import nn model from sklearn and compare