# K-Nearest Neighbour Classifier

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import make_classification

# Synthetic two-class dataset: 1000 samples with 3 features, one of which is
# redundant. random_state is fixed so re-running the cell gives the same data.
X, y = make_classification(
    n_samples=1000,
    n_features=3,
    n_redundant=1,
    n_classes=2,
    random_state=999,
)


In [3]:
# Show the generated feature matrix as the cell output.
X

array([[-0.33504974,  0.02852654,  1.16193084],
       [-1.37746253, -0.4058213 ,  0.44359618],
       [-1.04520026, -0.72334759, -3.10470423],
       ...,
       [-0.75602574, -0.51816111, -2.20382324],
       [ 0.56066316, -0.07335845, -2.15660348],
       [-1.87521902, -1.11380394, -4.04620773]])

In [4]:
from sklearn.model_selection import GridSearchCV, train_test_split

# Hold out 35% of the samples as the test split; random_state fixes the
# shuffle so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=42,
)

### Read the docs for more about the parameters of the K-nearest-neighbours classifier

1. "p" param - the power parameter of the Minkowski distance used to measure the distance between 2 points
    if p = 1 -----> distance between 2 points is calculated using Manhattan distance
    if p = 2 -----> distance between 2 points is calculated using Euclidean distance

2. "algorithm" param - selects the algorithm used to compute the nearest neighbours
    possible values are - {kd_tree, ball_tree, brute, auto}  ----> read docs for more info

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Classifier that votes among the 5 nearest training points; the neighbour
# search algorithm is left on 'auto'.
knn_classifier = KNeighborsClassifier(algorithm='auto', n_neighbors=5)

# Train on the training split, then label the held-out test split.
# fit() returns the estimator itself, so the two steps can be chained.
y_pred = knn_classifier.fit(X_train, y_train).predict(X_test)

In [6]:
# Show the class labels predicted for the test split.
y_pred

array([1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,

## Performance Metrics

In [7]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compare the test-split predictions against the true labels.
#
# The classification report and the confusion matrix are multi-line values.
# Printing them on the same line as their label pushes the first row to the
# right (the label ends up on the report's header row and the first matrix
# row no longer lines up with the second), so each label gets its own line.
print("classification report")
print(classification_report(y_test, y_pred))

# Accuracy is a single number, so label and value fit on one line.
print("accuracy score", accuracy_score(y_test, y_pred))

print("confusion matrix")
print(confusion_matrix(y_test, y_pred))

classification report               precision    recall  f1-score   support

           0       0.90      0.92      0.91       177
           1       0.92      0.90      0.91       173

    accuracy                           0.91       350
   macro avg       0.91      0.91      0.91       350
weighted avg       0.91      0.91      0.91       350

accuracy score 0.9085714285714286
confusion matrix [[163  14]
 [ 18 155]]


### Performing GridSearchCV - hyperparameter tuning to find the best k value for the input data


GridSearchCV from the sklearn.model_selection library is a powerful method for hyperparameter tuning. 
It performs an exhaustive search over a specified parameter grid, evaluating each combination using cross-validation to find the optimal set of parameters for a given model. 
For the K-Nearest Neighbors (KNN) algorithm, the 'k' value (number of neighbors) is a key hyperparameter to tune.

In [9]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts to try: k = 1 through 10.
param_grid = {'n_neighbors': [*range(1, 11)]}

# Exhaustive search over the grid; cv=5 scores every candidate k with
# 5-fold cross-validation, using the training split only.
grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Report the neighbour count selected by the search.
best_k = grid_search.best_params_['n_neighbors']
print("Best k value:", best_k)


Best k value: 9
