# Own implementation of k nearest neighbors algorithm

Implement KNN by hand for just 2 dimensions with normalization. 
Some substeps are needed for it.


In [37]:
import numpy as np 
import math 
from sklearn import preprocessing 
from collections import Counter

## funct: You normalize your data in another table

In [38]:
def normalize(array):
    min_max_scaler = preprocessing.MinMaxScaler()
    return min_max_scaler.fit_transform(array)

## funct: You code a simple euclid distance function

In [39]:
def distance(pointA, pointB):
    x_delta = (pointA[0] - pointB[0]) ** 2
    y_delta = (pointA[1] - pointB[1]) ** 2
    return math.sqrt(x_delta + y_delta)

## funct: You take a point and calculate the distance to all points

In [40]:
def calc_distances_to_sample(sample, points):
    distances = []
    for point in points:
        distance_to_sample = distance(sample,point)
        distances.append(distance_to_sample)
    
    return distances
    

## funct: You take the list from above and sort it

In [41]:
def sort_by_distance(dataset, distance_column_index):
    return dataset[np.argsort(dataset[:,distance_column_index])]

## funct: You take the *k* nearest and determine the mode of them 

In [42]:
def mode_of_k_nearest(k, dataset, class_column):
    k_nearest = dataset[:k,class_column]
    return Counter(k_nearest).most_common(1)[0][0]

# Let's test our algorithm with the films example

In [43]:
data = np.genfromtxt("data/films.csv", delimiter=',')
data

array([[  3., 104.,   1.],
       [  2., 100.,   1.],
       [  1.,  81.,   1.],
       [101.,  10.,   0.],
       [ 99.,   5.,   0.],
       [ 98.,   2.,   0.]])

In [44]:
normalized_values = normalize(data[:,0:2])
data[:,0:2] = normalized_values
data

array([[0.02      , 1.        , 1.        ],
       [0.01      , 0.96078431, 1.        ],
       [0.        , 0.7745098 , 1.        ],
       [1.        , 0.07843137, 0.        ],
       [0.98      , 0.02941176, 0.        ],
       [0.97      , 0.        , 0.        ]])

In [45]:
sample = [18,90]
distances = calc_distances_to_sample(sample, data)
distances

[90.79801980219612,
 90.8384391655147,
 91.02300863369925,
 91.51441692116809,
 91.5662991902853,
 91.5970572671415]

In [46]:
data = np.insert(data, 3, distances, axis = 1)
data

array([[2.00000000e-02, 1.00000000e+00, 1.00000000e+00, 9.07980198e+01],
       [1.00000000e-02, 9.60784314e-01, 1.00000000e+00, 9.08384392e+01],
       [0.00000000e+00, 7.74509804e-01, 1.00000000e+00, 9.10230086e+01],
       [1.00000000e+00, 7.84313725e-02, 0.00000000e+00, 9.15144169e+01],
       [9.80000000e-01, 2.94117647e-02, 0.00000000e+00, 9.15662992e+01],
       [9.70000000e-01, 0.00000000e+00, 0.00000000e+00, 9.15970573e+01]])

In [47]:
data = sort_by_distance(data,3)
data

array([[2.00000000e-02, 1.00000000e+00, 1.00000000e+00, 9.07980198e+01],
       [1.00000000e-02, 9.60784314e-01, 1.00000000e+00, 9.08384392e+01],
       [0.00000000e+00, 7.74509804e-01, 1.00000000e+00, 9.10230086e+01],
       [1.00000000e+00, 7.84313725e-02, 0.00000000e+00, 9.15144169e+01],
       [9.80000000e-01, 2.94117647e-02, 0.00000000e+00, 9.15662992e+01],
       [9.70000000e-01, 0.00000000e+00, 0.00000000e+00, 9.15970573e+01]])

In [48]:
mode_of_k_nearest(3,data,2)

1.0

*This is the our predicted class! :)*