# k-Nearest Neighbors

#### Loading data and related libraries

In [None]:
import pandas as pd
from plotnine import *
from sklearn.model_selection import train_test_split

# Read the iris measurements from a CSV file given by a path relative to the
# working directory. Later cells use its `petal_length`, `petal_width` and
# `species` columns.
iris = pd.read_csv("iris.csv")
# Show the first few rows as the cell's output.
iris.head()

#### Let's look at how the species are distributed against petal width and length

In [None]:
# Scatter plot of petal width against petal length, one colour per species.
# The parenthesised expression is the cell's last value, so the figure renders.
(
    ggplot(iris, aes(x="petal_length", y="petal_width", color="species"))
    + geom_point()
)

#### We see a clear separation between setosa and the other species, so it will be interesting to see how accurate the classifier is when the data point being tested lies in the top-right region of the plot

#### Let's separate out the petal length and width for our model

In [None]:
# Feature matrix: the two petal measurements, kept as a two-column DataFrame.
x = iris.loc[:, ['petal_length', 'petal_width']]
# Target vector: the species label of each row.
y = iris.loc[:, 'species']

#### We use scikit-learn's ready-made k-nearest-neighbors implementation for the model. The model is called a classifier because it predicts a categorical label (here, the species).

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Create a k-nearest-neighbors classifier with k = 1. It is only configured
# here; no data is attached until `fit` is called in a later cell.
neigh=KNeighborsClassifier(n_neighbors=1)

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, identical on each
# Restart & Run All; without it the reported accuracy changes between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# Sanity-check the partition sizes: (features, labels) for train, then test.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
# Fit on the training portion only; the fitted estimator is the cell's output.
neigh.fit(X_train, y_train)

In [None]:
# Species predicted by the classifier for each held-out row.
predicted_labels = neigh.predict(X_test)
print("predicted:", predicted_labels)

# With its default arguments, kneighbors() returns a (distances, indices) pair
# with one row per test point. The indices are row positions within the
# training data passed to fit(), not the DataFrame's index labels.
nearest = neigh.kneighbors(X_test)
print("neighbors", nearest)

#### We need to check accuracy of the classifier

In [None]:
from sklearn.metrics import accuracy_score
# Predict the species of every held-out row with the fitted classifier.
predictions = neigh.predict(X_test)
# Fraction of test rows whose predicted species equals the true species;
# as the last expression, it is displayed as the cell's output.
accuracy_score(y_true=y_test, y_pred=predictions)

#### The accuracy is very good, though usually not 100% (the exact value depends on which rows end up in the test set)

#### Let's try to see what happens to the accuracy score as we keep incrementing the value of k

In [None]:
# Split ONCE, with a fixed seed, so that every value of k is trained and scored
# on the same train/test partition. Re-splitting inside the loop would mix the
# effect of k with the randomness of the split and make the resulting curve
# different on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

accuracy = []  # test-set accuracy, one entry per k tried
k = []         # the neighbor counts tried, aligned index-for-index with `accuracy`
for i in range(1, 30):  # k = 1 .. 29
    # A fresh classifier per k; only n_neighbors varies between iterations.
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(X_train, y_train)
    predictions = neigh.predict(X_test)
    k.append(i)
    accuracy.append(accuracy_score(y_test, predictions))

In [None]:
# Collect the sweep results into a two-column frame: one row per k tried.
df = pd.DataFrame(
    {
        'k': k,
        'accuracy': accuracy,
    }
)
# Line plot of accuracy as a function of k; as the cell's last expression,
# the figure is rendered as its output.
(
    ggplot(df, aes(x='k', y='accuracy'))
    + geom_line()
)