# k-NN classifier

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


# Train the model

In [3]:
# Load the training data (outliers presumably already removed, judging by the
# file name) and separate the feature columns from the target column 'heard'.
data = pd.read_csv('siren_data_train_no_outliers.csv')

X = data.drop(columns=['heard'])
y = data['heard']

# Hold out 20% of the rows for validation; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline model: k-NN with k = 5, fitted on the training split only.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_val)

# cross_val_score clones `knn` and refits it on every fold of the full data set,
# so this estimate is independent of the train/validation split made above.
score = cross_val_score(knn, X, y, cv=5)
print("Cross validation score:", score.mean(), "Standard deviation:", score.std())

# Confusion matrix of the fitted baseline on the held-out validation set
# (rows: true labels, columns: predicted labels, with row/column totals).
pd.crosstab(y_val, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Cross validation score: 0.8844744611899232 Standard deviation: 0.003245861543286284


Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,150,86,236
1,38,819,857
All,188,905,1093


# Tune the model
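It's time for some hyperparameter tuning. For k-NN, the main hyperparameter is the number of neighbors, `n_neighbors`; other options, such as the weighting scheme and the distance metric, are left at their defaults here. We will use `GridSearchCV` to tune it.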

In [4]:
# Candidate neighbourhood sizes k = 2, ..., 99 (np.arange excludes the stop value).
param_grid = {'n_neighbors': np.arange(2, 100)}

# 10-fold cross-validated grid search on the training split. GridSearchCV clones
# `knn` for every fit and, with the default refit=True, retrains the best
# configuration on all of X_train afterwards.
# n_jobs=-1 runs the 98 x 10 fits in parallel; it does not change the results.
knn_cv = GridSearchCV(knn, param_grid, cv=10, n_jobs=-1)

knn_cv.fit(X_train, y_train)

print("Tuned hyperparameter k:", knn_cv.best_params_)
print("Tuned accuracy:", knn_cv.best_score_)
# best_score_ is a cross-validation mean over X_train; also score the refitted
# best model on the held-out validation set so that it can be compared with
# other models evaluated on X_val.
print("Validation accuracy:", knn_cv.score(X_val, y_val))

Tuned hyperparameter k: {'n_neighbors': 29}
Tuned accuracy: 0.8981430940734366


# Naive classifier

In [5]:
# Naive baseline: always predict the most frequent class of the training labels.
# The class is derived from y_train instead of being hard-coded, so the baseline
# stays correct if the class balance of the data changes.
majority_class = y_train.mode()[0]
y_pred_naive = np.full(len(y_val), majority_class)

# Accuracy of the constant prediction on the held-out validation set.
accuracy = accuracy_score(y_val, y_pred_naive)
print("Accuracy Score:", accuracy)

Accuracy Score: 0.7840805123513266
