# ReadMe
- This section introduces the k-nearest neighbors (kNN) algorithm
- kNN predicts the label of an unlabelled data point from the labels of its k nearest labelled data points

# Environment Set Up
- import the necessary libraries

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
pd.options.display.float_format = "{:,.2f}".format
from datasets import load_dataset
from numpy import ravel

  from .autonotebook import tqdm as notebook_tqdm


# Load Data
- we will use the social network ads dataset from https://huggingface.co/datasets/saifhmb/social-network-ads
- we will predict whether someone will purchase the product, using this very simple dataset

In [2]:
# Read the train split of the social network ads dataset from the Hugging Face Hub
df = pd.read_parquet(
    path="hf://datasets/saifhmb/social-network-ads/data/train-00000-of-00001.parquet",
)

# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


# 1. EDA

In [3]:
# Dimensions of the dataframe as a (rows, columns) tuple
df.shape

(400, 3)

In [4]:
# Data type of each column, to confirm every feature and the target are numeric
df.dtypes

Age                int64
EstimatedSalary    int64
Purchased          int64
dtype: object

In [5]:
# Count the missing (NaN) values in each column; all zeros means nothing needs imputing
df.isna().sum()

Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

# 2. kNN

## 2.a. Train Test Split and Scale

In [6]:
# Separate the target column (y) from the remaining feature columns (X)
y = df['Purchased'].values
X = df.drop(columns=['Purchased'])

# Hold out 30% of the rows as a test set; stratify on y so both splits keep
# the same class proportions, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Standardize the features: the scaler learns its statistics from the training
# rows only and the same transformation is then applied to the test rows
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

## 2.b. Choose K
- we will use cross-validation with a grid search to choose the optimal k
- the grid search is fit on the training data only

In [7]:
# Base estimator whose number of neighbors is tuned by the search
knn = KNeighborsClassifier()

# Candidate values of k: every integer from 1 to 19
k_range = [k for k in range(1, 20)]
param_grid = {'n_neighbors': k_range}

# 5-fold cross-validated grid search, scored by accuracy
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=5)

# Run the search on the training split only
grid_search = grid.fit(X_train, ravel(y_train))

# Show the parameter setting the search selected as best
print(grid_search.best_params_)

{'n_neighbors': 7}


## 2.c Implement kNN

In [8]:
# SOLUTION
# Build the model with the k chosen by the grid search instead of a hard-coded
# value, so this cell stays correct if the data, the seed or the grid changes
best_k = grid_search.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)

# Fit the model on the training split
knn.fit(X_train, ravel(y_train))

# Predict labels for the held-out test split
y_pred = knn.predict(X_test)

# Accuracy of the predictions against the true test labels
accuracy_score(y_test, y_pred)

0.925