# K Nearest Neighbors - A Supervised Machine Learning Algorithm

Implementing KNN classification Algorithm using base Python for the classic Iris Dataset.



#### Steps:

    1. Divide the dataset into train and test data.
    2. Calculate distance (Euclidean distance) between each new data point and all data points in training dataset.
    3. Sort the distances in ascending order and extract top k shortest distances.  
    4. Find the most frequentyly occurring label/class for the k shortest distance data points.
    5. Assign the most frequently occurring class label to the new data point.
    6. Calculate the model accuracy - number of correct match / total new input data points.
    
    
    
    
    

In [1]:
# importing dependencies here
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import operator

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# reading the classic iris dataset into a df
iris_df = pd.read_csv("iris_dataset.csv")

<IPython.core.display.Javascript object>

In [3]:
iris_df.count()

sepal_length    150
sepal_width     150
petal_length    150
petal_width     150
species         150
dtype: int64

<IPython.core.display.Javascript object>

In [4]:
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


<IPython.core.display.Javascript object>

In [5]:
# dividing the data into train and test
def divide_data(data):

    np.random.seed(400)
    data = data.apply(np.random.permutation, axis=0)

    train_data = data[0 : int(0.8 * 150)]
    test_data = data[int(0.8 * 150) :]

    #     print("TRAIN ", train_data)
    #     print("TEST", test_data)
    return train_data, test_data

<IPython.core.display.Javascript object>

In [6]:
# Euclidean distance calculation
def calc_distance(training_data, test_data_row, length):
    distance = 0
    for x in range(length):
        distance += (training_data[x] - test_data_row[x]) ** 2
    #         distance += np.square(training_data[x] - test_data_row[x])
    return np.sqrt(distance)


#     return distance

<IPython.core.display.Javascript object>

In [7]:
def knn(train_data, test_data_instance, k):

    distance_dict = {}
    length = test_data_instance.shape[0]

    # create dictionary of training row index and row distance from the new data point
    for i in range(len(train_data)):
        dist = calc_distance(train_data.iloc[i], test_data_instance, length)
        distance_dict[i] = dist  # dist[0]

    #     print("distance_dict", distance_dict)
    # create list of sorted tuples (index,dist) based on distance in ascending order
    sorted_distances = sorted(
        distance_dict.items(), key=operator.itemgetter(1)
    )  # dict.items() returns a list of (key,value) pairs

    # create list of top k number of distances and related row index (list k shortest distances)
    neighbors_index = []
    neighbors_dist = []
    for j in range(k):
        neighbors_index.append(sorted_distances[j][0])
        neighbors_dist.append(round(sorted_distances[j][1], 2))
    #     print("neighbors_index", neighbors_index)

    # counting most occurring label/class
    count = {}
    neigh_label = []

    for m in range(len(neighbors_index)):

        response = train_data.iloc[neighbors_index[m]][-1]
        #         print("RESPONSE", response)

        neigh_label.append(response)

        if response in count.keys():
            count[response] += 1
        else:
            count[response] = 1

    #     print("COUNT", count)
    # sorting to get label of most frequent class (in descending order)
    sortcount = sorted(count.items(), key=operator.itemgetter(1), reverse=True)

    #     print("neighbors_index", neighbors_index)
    #     print("neighbors_dist", neighbors_dist)
    #     print("SORT COUNT", sortcount)
    return (sortcount[0][0], neighbors_index, neighbors_dist, neigh_label)

<IPython.core.display.Javascript object>

In [8]:
###################################################################################
#                               MAIN SCRIPT                                       #
###################################################################################

train_data, test_data = divide_data(iris_df)

count = 0
result_labels = []
neigh_indices = []
neigh_distances = []
neigh_labels = []
actual_label = []
status = []

for i in range(len(test_data)):

    k = 4

    result_label, neigh_index, neigh_dist, neigh_label = knn(
        train_data, test_data.iloc[i][0:-1], k
    )

    result_labels.append(result_label)
    neigh_indices.append(neigh_index)
    neigh_distances.append(neigh_dist)
    neigh_labels.append(neigh_label)
    actual_label.append(test_data.iloc[i][-1])

    if result_label == test_data.iloc[i][-1]:
        count += 1
        status.append("pass")
    else:
        status.append("fail")

# calculating the accuracy
score = (count / test_data.shape[0]) * 100

# results dataframe for easy visualization
df = pd.DataFrame(
    {
        #                     "neighbor_index":neigh_indices,
        "neighbor_dist": neigh_distances,
        "neighbor_label": neigh_labels,
        "result_label": result_labels,
        "actual_label": actual_label,
        "result": status,
    }
)

print(df)
print("\nk: ", k)
print("Total Number of Input Data Points: ", test_data.shape[0])
print("Number of Passed test cases: ", count)
print(f"\nModel Accuracy: {score:.2f}%")

               neighbor_dist                                   neighbor_label  \
0   [0.51, 0.54, 0.55, 0.58]       [setosa, virginica, virginica, versicolor]   
1   [0.44, 0.62, 0.68, 0.85]          [virginica, setosa, setosa, versicolor]   
2    [0.5, 0.88, 0.96, 1.12]      [virginica, setosa, versicolor, versicolor]   
3   [0.35, 0.37, 0.56, 0.62]           [setosa, setosa, virginica, virginica]   
4   [0.44, 0.52, 0.62, 0.76]   [virginica, virginica, versicolor, versicolor]   
5   [0.26, 0.36, 0.46, 0.66]       [virginica, versicolor, virginica, setosa]   
6   [0.57, 0.87, 0.99, 1.02]      [versicolor, versicolor, virginica, setosa]   
7   [0.88, 0.91, 0.92, 0.92]              [virginica, setosa, setosa, setosa]   
8    [0.78, 0.93, 0.99, 1.0]           [virginica, virginica, setosa, setosa]   
9   [0.79, 0.81, 1.14, 1.19]      [setosa, versicolor, versicolor, virginica]   
10  [0.54, 1.35, 1.39, 1.49]      [versicolor, virginica, versicolor, setosa]   
11  [0.46, 0.52, 0.62, 0.66]

<IPython.core.display.Javascript object>