# k-NN classifier

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


# Reading the data

In [59]:
# Load the training data from the CSV file and preview the first rows.
DATA_PATH = 'siren_data_train.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,near_fid,near_x,near_y,near_angle,heard,building,xcoor,ycoor,noise,in_vehicle,asleep,no_windows,age
0,2712,1998301.0,9011692.0,-171.588672,1,0,1999193.0,9011824,0,0,0,0,59
1,2721,1928907.0,8954624.0,-51.208102,1,0,1928298.0,8955382,0,0,0,0,29
2,297,2026384.0,8256164.0,39.018754,1,0,2025706.0,8255615,0,0,0,0,32
3,739,1743184.0,8052652.0,15.046022,1,0,1742935.0,8052585,0,0,0,0,36
4,1852,1350375.0,7909850.0,144.60317,1,0,1350807.0,7909543,0,0,0,0,55


# Preprocessing the data
Since the data consists of coordinates for the person and the nearest horn, we can replace these with just the distance to the nearest horn. We can also go ahead and drop the "near_fid" column, as it doesn't provide any information about the target variable "heard". The same goes for "near_angle", which we can drop since it is uncorrelated with the target, as shown in question 3 of the data analysis task. The "building" column is dropped as well.

In [60]:
# Euclidean distance between the point (xcoor, ycoor) and the point
# (near_x, near_y), computed for all rows at once. The columns are addressed
# by name rather than by position, so the result does not depend on the
# column order of the CSV.
horn_distance = np.hypot(
    data['near_x'].astype(float) - data['xcoor'].astype(float),
    data['near_y'].astype(float) - data['ycoor'].astype(float),
)

# Add the distance as a new feature column.
# NOTE: this cell rebinds `data`, so it is meant to run once after the loading
# cell; running it a second time fails because the coordinate columns are gone.
data = data.assign(distance=horn_distance)

# Drop the raw coordinates (now summarised by `distance`) together with the
# other columns that are not used as model inputs.
data = data.drop(['near_x', 'near_y', 'ycoor', 'xcoor', 'near_fid', 'near_angle','building'], axis=1)

data.head()

Unnamed: 0,heard,noise,in_vehicle,asleep,no_windows,age,distance
0,1,0,0,0,0,59,901.283517
1,1,0,0,0,0,29,972.00626
2,1,0,0,0,0,32,872.340924
3,1,0,0,0,0,36,257.804449
4,1,0,0,0,0,55,529.686791


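There are some outliers in the distance column; let's remove them. A row counts as an outlier when its distance is more than 2 standard deviations away from the mean distance (|z| > 2).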

In [61]:
# Removing outliers
# A row is an outlier when its distance is more than `threshold` standard
# deviations away from the mean distance.
threshold = 2

# nan_policy='omit' ignores missing distances when computing mean and std;
# with the default ('propagate') a single NaN would turn every z-score into NaN
# and silently disable the filter.
z = np.abs(stats.zscore(data['distance'], nan_policy='omit'))

# Filter with a boolean mask instead of data.drop(np.where(...)[0]):
# np.where returns row positions while drop() expects index labels, and the two
# only coincide while the index is the default 0..n-1 range.
is_outlier = np.asarray(z) > threshold
outlier_indices = np.flatnonzero(is_outlier)  # row positions of the outliers

# Keep the non-outlier rows, then discard rows with missing values
# (chained instead of inplace=True so the cell can be re-run safely).
data_no_outliers = data.loc[~is_outlier].dropna()

print("original shape: ", data.shape)
print("new shape: ", data_no_outliers.shape)

original shape:  (5710, 7)
new shape:  (5462, 7)


# Train the model

In [62]:
# Separate the target column from the feature columns.
y = data_no_outliers['heard']
X = data_no_outliers.drop(columns=['heard'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, y_train.shape)

# Baseline k-NN model with k = 5.
# NOTE(review): the features are on very different scales (distance is in the
# hundreds, the flag columns are 0/1), so distance will dominate the neighbour
# search -- consider standardising the features before fitting.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Accuracy on the data the model was fitted on.
y_hat = knn.predict(X_train)
train_accuracy = accuracy_score(y_train, y_hat)
print("Accuracy on training set:", train_accuracy)

# Accuracy on the held-out validation rows.
y_pred = knn.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy on validation set:", val_accuracy)

# 5-fold cross-validation over the full (outlier-free) data set.
score = cross_val_score(knn, X, y, cv=5)
cv_mean = score.mean()
cv_std = score.std()
print("Cross validation score:", cv_mean, "Standard deviation:", cv_std)

(4369, 6) (4369,)
Accuracy on training set: 0.9091325246051728
Accuracy on validation set: 0.8874656907593779
Cross validation score: 0.8844744611899232 Standard deviation: 0.003245861543286284


In [57]:
# Rank the columns by the absolute value of their correlation with the
# target column "heard" (strongest first).
correlation_matrix = data_no_outliers.corr()
target_correlations = correlation_matrix["heard"]
correlation_with_target = target_correlations.abs().sort_values(ascending=False)
print(correlation_with_target)

heard         1.000000
distance      0.647269
in_vehicle    0.415248
noise         0.385856
age           0.272923
no_windows    0.264627
asleep        0.154445
building      0.038410
Name: heard, dtype: float64


# Tune the model
It's time for some hyperparameter tuning. The main hyperparameter of k-NN is the number of neighbors k, and it is the only one we tune here. We will use GridSearchCV to search for the best value.

In [None]:
# Candidate values for k: every integer from 1 to 69.
param_grid = {'n_neighbors': np.arange(1, 70)}
knn_cv = GridSearchCV(knn, param_grid, cv=5)

# Search on the training split only. Fitting on the full X, y would let the
# validation rows influence the choice of k, leaving no untouched data to
# judge the tuned model on.
knn_cv.fit(X_train, y_train)
print("Tuned hyperparameter k:", knn_cv.best_params_)

# Mean cross-validated accuracy of the best k (computed within the training split).
print("Tuned accuracy:", knn_cv.best_score_)

# GridSearchCV refits the best configuration on the whole training split by
# default (refit=True), so knn_cv.predict uses that refitted model; this is
# the accuracy on rows the search never saw.
print("Tuned accuracy on validation set:", accuracy_score(y_val, knn_cv.predict(X_val)))

