# K Nearest Neighbors - A Supervised Machine Learning Algorithm

Implementing the KNN classification algorithm using base Python for the classic Iris dataset.



#### Steps:

    1. Divide the dataset into train and test data.
    2. Calculate the distance (Euclidean distance) between each new data point and all data points in the training dataset.
    3. Sort the distances in ascending order and extract the k shortest distances.
    4. Find the most frequently occurring label/class among the k nearest data points.
    5. Assign the most frequently occurring class label to the new data point.
    6. Calculate the model accuracy - number of correct matches / total number of new input data points.
    
    
    
    
    

In [1]:
# importing dependencies here
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import operator

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Load the classic Iris dataset from a CSV file (path relative to the working
# directory) into a DataFrame that the following cells use.
iris_df = pd.read_csv("iris_dataset.csv")

<IPython.core.display.Javascript object>

In [3]:
# Number of non-null entries in each column (a quick check for missing values).
iris_df.count()

sepal_length    150
sepal_width     150
petal_length    150
petal_width     150
species         150
dtype: int64

<IPython.core.display.Javascript object>

In [4]:
# Preview the first rows of the dataset.
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


<IPython.core.display.Javascript object>

In [5]:
# dividing the data into train and test
def split_data(data, frac=0.8, random_state=200):
    """Split a DataFrame into a random training part and the remaining test part.

    Args:
        data: DataFrame to split. Its index is assumed to be unique, because
            the test part is obtained by dropping the sampled index labels.
        frac: fraction of the rows placed in the training part.
        random_state: seed for the sampling, so the split is reproducible.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames; both keep the
        index labels of ``data``.
    """
    # Sample from the argument that was passed in, not from a global frame.
    train_data = data.sample(frac=frac, random_state=random_state)
    # Everything that was not sampled for training becomes test data.
    test_data = data.drop(train_data.index)

    return train_data, test_data

<IPython.core.display.Javascript object>

In [6]:
# Euclidean distance calculation
def calc_distance(training_data, test_data_row, length):
    """Return the Euclidean distance between two rows over their first ``length`` values.

    Only the leading ``length`` entries of each argument are compared, so a
    training row may carry extra trailing entries (such as a class label).
    Entries are always taken by position; a pandas Series is never looked up
    through its index labels.

    Args:
        training_data: sequence, NumPy array or pandas Series of values.
        test_data_row: sequence, NumPy array or pandas Series of values.
        length: number of leading entries to compare.

    Returns:
        The distance as a NumPy float.

    Raises:
        IndexError: if either argument holds fewer than ``length`` entries.
    """
    train_values = _leading_values(training_data, length)
    test_values = _leading_values(test_data_row, length)
    return np.sqrt(np.sum((train_values - test_values) ** 2))


def _leading_values(row, length):
    """Return the first ``length`` entries of ``row``, by position, as a float array."""
    # pandas objects offer .iloc for purely positional access; indexing a
    # string-labelled Series with integer keys relies on a deprecated fallback.
    head = row.iloc[:length] if hasattr(row, "iloc") else row[:length]
    if len(head) != length:
        raise IndexError(
            f"expected at least {length} values, found {len(head)}"
        )
    return np.asarray(head, dtype=float)

<IPython.core.display.Javascript object>

In [7]:
def knn(train_data, test_data_instance, k):
    """Classify one data point by majority vote among its k nearest training rows.

    Args:
        train_data: DataFrame whose leading columns are the features (in the
            same order as ``test_data_instance``) and whose last column is
            the class label.
        test_data_instance: Series holding the feature values of the point
            to classify (without a label).
        k: number of nearest neighbours to consult.

    Returns:
        A tuple ``(label, neighbors_index, neighbors_dist, neigh_label)``:
        the predicted label, then three lists ordered nearest-first with the
        neighbours' row positions in ``train_data`` (positional, not index
        labels), their distances rounded to 2 decimals, and their labels.
        When several labels tie for the most votes, the one that occurs
        first among the nearest neighbours wins.

    Raises:
        ValueError: if ``k`` is not between 1 and ``len(train_data)``.
    """
    n_train = len(train_data)
    if not 1 <= k <= n_train:
        raise ValueError(f"k must be between 1 and {n_train}, got {k}")

    # number of feature values to compare; the label column is left out
    length = test_data_instance.shape[0]

    # (row position, distance to the new data point) for every training row
    distances = [
        (i, calc_distance(train_data.iloc[i], test_data_instance, length))
        for i in range(n_train)
    ]

    # the stable sort keeps equally distant rows in their training order
    nearest = sorted(distances, key=operator.itemgetter(1))[:k]
    neighbors_index = [position for position, _ in nearest]
    neighbors_dist = [round(dist, 2) for _, dist in nearest]

    # label of each neighbour, read positionally from the last column
    neigh_label = [train_data.iloc[position, -1] for position in neighbors_index]

    # count the votes per label; dict order is first-seen (nearest first)
    count = {}
    for response in neigh_label:
        count[response] = count.get(response, 0) + 1

    # max() returns the first label that reaches the highest count, matching
    # a stable descending sort of the counts
    winner = max(count, key=count.get)

    return (winner, neighbors_index, neighbors_dist, neigh_label)

<IPython.core.display.Javascript object>

In [8]:
###################################################################################
#                               MAIN SCRIPT                                       #
###################################################################################

# 80/20 split; renumber the test rows 0..n-1 so they can be walked by position
train_data, test_data = split_data(iris_df)
test_data = test_data.reset_index(drop=True)
print("TRAIN", train_data)
print("TEST", test_data)

# number of neighbours consulted for every prediction (fixed for the whole run)
k = 2

count = 0  # number of correctly classified test rows
result_labels = []
neigh_indices = []
neigh_distances = []
neigh_labels = []
actual_label = []
status = []

for i in range(len(test_data)):

    test_row = test_data.iloc[i]
    # explicit positional access: the label is the last column, features the rest
    expected_label = test_row.iloc[-1]

    result_label, neigh_index, neigh_dist, neigh_label = knn(
        train_data, test_row.iloc[:-1], k
    )

    result_labels.append(result_label)
    neigh_indices.append(neigh_index)
    neigh_distances.append(neigh_dist)
    neigh_labels.append(neigh_label)
    actual_label.append(expected_label)

    if result_label == expected_label:
        count += 1
        status.append("pass")
    else:
        status.append("fail")

# calculating the accuracy (reported as 0 when there is nothing to test)
total = test_data.shape[0]
score = (count / total) * 100 if total else 0.0

# results dataframe for easy visualization
df = pd.DataFrame(
    {
        "neighbor_dist": neigh_distances,
        "neighbor_label": neigh_labels,
        "result_label": result_labels,
        "actual_label": actual_label,
        "result": status,
    }
)

print(df)
print("\nk: ", k)
print("Total Number of Input Data Points: ", test_data.shape[0])
print("Number of Passed test cases: ", count)
print(f"\nModel Accuracy: {score:.2f}%")

TRAIN      sepal_length  sepal_width  petal_length  petal_width     species
84            5.4          3.0           4.5          1.5  versicolor
122           7.7          2.8           6.7          2.0   virginica
28            5.2          3.4           1.4          0.2      setosa
24            4.8          3.4           1.9          0.2      setosa
75            6.6          3.0           4.4          1.4  versicolor
..            ...          ...           ...          ...         ...
117           7.7          3.8           6.7          2.2   virginica
59            5.2          2.7           3.9          1.4  versicolor
45            4.8          3.0           1.4          0.3      setosa
32            5.2          4.1           1.5          0.1      setosa
31            5.4          3.4           1.5          0.4      setosa

[120 rows x 5 columns]
TEST     sepal_length  sepal_width  petal_length  petal_width     species
0            4.9          3.0           1.4          0.2

<IPython.core.display.Javascript object>

## Implementation using sklearn

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Reload the Iris data for the scikit-learn comparison.
df = pd.read_csv("iris_dataset.csv")

# Feature matrix: every column except the class label; target: the label.
feature_columns = [name for name in df.columns if name != "species"]
X = df[feature_columns].values
y = df["species"].values

print(X)
print(y)

# Hold out 20% of the rows for scoring. No random_state is passed, so the
# split (and therefore the printed score) can differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# NOTE(review): this rebinds the name `knn`, replacing the knn() function
# defined in an earlier cell; re-running that cell's caller afterwards will
# fail until the function cell is executed again.
knn = KNeighborsClassifier(n_neighbors=2, algorithm="brute")
knn.fit(X_train, y_train)

print("SCORE", knn.score(X_test, y_test))

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

<IPython.core.display.Javascript object>