In [171]:
import numpy as np
from scipy import stats
import pandas as pd

# The Plan

Making a basic KNN algorithm has three main steps:

1. Measure the distance between points
    - this is generally Euclidean distance
    
2. Sort the data by its distance from the query point
    - once the data is sorted, we can simply take the top neighbors
    - how many we take depends on the number of neighbors the user wants
    
3. Return the requested number of neighbors, along with the prediction for the point's class (or its numerical value, for regression)

In [175]:
#first make a function that measures Euclidean distance

def e_distance(original_arr, test_arr):
    """Return the Euclidean distance between two rows, ignoring the label slot.

    The last item of ``original_arr`` is assumed to be a class label (or
    regression target) and is skipped, so only the first
    ``len(original_arr) - 1`` positions of both inputs are compared.
    ``test_arr`` may therefore be one item shorter than ``original_arr``.

    Parameters
    ----------
    original_arr : sequence of numbers
        A labelled row: feature values followed by one trailing label/target.
    test_arr : sequence of numbers
        The point to compare against; needs at least
        ``len(original_arr) - 1`` items.

    Returns
    -------
    numpy.float64
        Square root of the summed squared differences. A row with no
        feature columns (length 0 or 1) yields ``0.0``.

    Raises
    ------
    IndexError
        If ``test_arr`` has fewer items than there are feature columns.
    """
    squared_total = 0

    # Accumulate the squared difference of every feature column; the final
    # position of original_arr is the label and is deliberately left out.
    for idx in range(len(original_arr) - 1):
        squared_total += (original_arr[idx] - test_arr[idx]) ** 2

    # Take the root once, after the loop. Doing it here (rather than inside
    # the loop) also guarantees a result when there are no feature columns.
    return np.sqrt(squared_total)

In [176]:
# Sanity-check e_distance on hand-made rows.
# e_distance skips the last item of its first argument (it is treated as the
# label), so only the first four values of each list contribute below.

# Identical rows: the expected distance is 0.
arr1 = [15, 25, 35, 45, 55]
arr2 = [15, 25, 35, 45, 55]

print('Distance between two identical arrays:', e_distance(arr1, arr2))

# Rows that differ by 5 in two positions: expected sqrt(5**2 + 5**2) ~= 7.07.
arr3 = [15, 25, 35, 45, 55]
arr4 = [15, 25, 30, 50, 55]

print('Distance between two similar arrays:', e_distance(arr3, arr4))

Distance between two identical arrays: 0.0
Distance between two similar arrays: 7.0710678118654755


In [182]:
#rank the rows by their e_distance from the test point, then predict the class from the k nearest

#essentially I will be iterating over each row in the array
def knearest_neighbors_classification(original_array, test_array, k):
    """Classify ``test_array`` by majority vote among its ``k`` nearest rows.

    Every row of ``original_array`` is compared with ``test_array`` using
    ``e_distance``. The ``k`` closest rows are printed (nearest first) and the
    most common value found in their last position is reported as the class.

    Parameters
    ----------
    original_array : iterable of sequences
        Labelled rows; the last item of each row is read as its class label.
    test_array : sequence of numbers
        The point to classify; passed unchanged to ``e_distance``.
    k : int
        Number of neighbours to use. Must be at least 1.

    Returns
    -------
    str
        A sentence naming the predicted class.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``original_array`` has no rows.
    """
    # A negative k would make the [:k] slice drop rows from the far end
    # instead of selecting the nearest ones, and k == 0 leaves nothing to
    # vote on, so both are rejected up front.
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')

    # Pair each row with its distance from the test point.
    all_distances = []
    for row in original_array:
        all_distances.append([e_distance(row, test_array), row])

    if not all_distances:
        raise ValueError('original_array must contain at least one row')

    # Plain list ordering: smallest distance first; rows at exactly the same
    # distance are ordered by comparing the rows themselves.
    all_distances.sort()

    # Report the k nearest rows and collect their labels in a single pass.
    nearest_classes = []
    for count, neighbor in enumerate(all_distances[:k], start=1):
        print(f'This is the {count} closest variable:', neighbor)
        # neighbor is [distance, row]; the label is the row's last item.
        nearest_classes.append(neighbor[-1][-1])

    # Depending on the SciPy version, the mode comes back either as a
    # one-element array or as a scalar. Normalise to a scalar so the message
    # reads "... the 0 class." rather than "... the [0] class.".
    prediction = np.atleast_1d(stats.mode(nearest_classes)[0])[0]
    return f'Prediction is that this is in the {prediction} class.'

In [193]:
# Toy classification data: every row is [feature, feature, class label];
# knearest_neighbors_classification reads the last item of a row as its class.
# NOTE(review): what the two feature columns measure is not stated here - confirm.
cat_data = [[12, 23, 0],
             [4.4, 11, 1],
             [5.2, 12, 1],
             [11, 30, 0],
             [4.7, 8, 1],
             [14, 27, 0],
             [3.9, 9, 1], 
             [9, 27, 0],
             [10, 28, 0],
             [3.7, 10, 1]]

# Query point. Only its first two values are compared: e_distance iterates
# over len(row) - 1 positions, so the trailing 0 here is never read.
your_cat = [12, 20, 0]

# Print the 2 nearest rows and display the returned prediction message.
knearest_neighbors_classification(cat_data, your_cat, 2)

This is the 1 closest variable: [3.0, [12, 23, 0]]
This is the 2 closest variable: [7.280109889280518, [14, 27, 0]]


'Prediction is that this is in the [0] class.'

In [194]:
# Toy regression data: every row is [feature, feature, numeric target].
# knearest_neighbors_regression averages the last item of the nearest rows.
regression_dummy_data = [[22, 23, 56],
             [10, 11, 134],
             [11, 15, 166],
             [29, 30, 87],
             [9, 13, 123],
             [25, 27, 55],
             [10, 12, 111], 
             [25, 27, 98],
             [24, 28, 75],
             [14, 16, 147]]

In [195]:
#rank the rows by their e_distance from the test point, then predict a value from the k nearest

#essentially I will be iterating over each row in the array
def knearest_neighbors_regression(original_array, test_array, k):
    """Predict a numeric value for ``test_array`` from its ``k`` nearest rows.

    Every row of ``original_array`` is compared with ``test_array`` using
    ``e_distance``. The ``k`` closest rows are printed (nearest first) and the
    mean of the values in their last position is reported as the prediction.

    Parameters
    ----------
    original_array : iterable of sequences
        Rows whose last item is the numeric target value.
    test_array : sequence of numbers
        The point to predict for; passed unchanged to ``e_distance``.
    k : int
        Number of neighbours to average. Must be at least 1.

    Returns
    -------
    str
        A sentence containing the predicted value.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``original_array`` has no rows.
    """
    # A negative k would make the [:k] slice drop rows from the far end
    # instead of selecting the nearest ones, and k == 0 would average an
    # empty list (nan), so both are rejected up front.
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')

    # Pair each row with its distance from the test point.
    all_distances = []
    for row in original_array:
        all_distances.append([e_distance(row, test_array), row])

    if not all_distances:
        raise ValueError('original_array must contain at least one row')

    # Plain list ordering: smallest distance first; rows at exactly the same
    # distance are ordered by comparing the rows themselves.
    all_distances.sort()

    # Report the k nearest rows and collect their target values in one pass.
    nearest_values = []
    for count, neighbor in enumerate(all_distances[:k], start=1):
        print(f'This is the {count} closest variable:', neighbor)
        # neighbor is [distance, row]; the target is the row's last item.
        nearest_values.append(neighbor[-1][-1])

    # The prediction is the plain (unweighted) mean of the neighbours' targets.
    prediction = np.mean(nearest_values)
    return f'Given the input value, the prediction is {prediction}.'

In [196]:
# Query point with feature values only (no target). This is enough because
# e_distance indexes just the first len(row) - 1 positions of each input.
test_arr = [12, 13]

# The call prints the 2 nearest rows itself; print() then shows the returned message.
print(knearest_neighbors_regression(regression_dummy_data, test_arr, 2))

This is the 1 closest variable: [2.23606797749979, [10, 12, 111]]
This is the 2 closest variable: [2.23606797749979, [11, 15, 166]]
Given the input value, the prediction is 138.5.
