In [22]:
# import the usual libraries
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("data/data.csv",names=["A"+str(x) for x in range(1,17,1)])
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [36]:
# lets encode all the worthy columns as categories
# operate on copy in case of mistakes
df2 = df.copy()
for col in df2.columns:
    if df2[col].dtype == 'object':
        df2[col] = df2[col].astype('category')
        df2[col] = df2[col].cat.codes
df2.rename(columns={'A16':'target'},inplace=True) # rename last column to target since it gound level truth
df2.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,2,156,0.0,2,1,13,8,1.25,1,1,1,0,0,68,0,0
1,1,328,4.46,2,1,11,4,3.04,1,1,6,0,0,11,560,0
2,1,89,0.5,2,1,11,4,1.5,1,0,0,0,0,96,824,0
3,2,125,1.54,2,1,13,8,3.75,1,1,5,1,0,31,3,0
4,2,43,5.625,2,1,13,8,1.71,1,0,0,0,2,37,0,0


In [37]:
# lets make X the feature matrix and y the target matrix
X = df2[df2.columns[:-1]]
y = df2['target']
#scale X's
X = StandardScaler().fit_transform(X)

Part 1: Using the KNN Classifier provided by the sklearn library

 1. Initialize the classifier with default value for n_neighbors
 2. Train the classifier
 3. Determine the recall score of the classifier

In [38]:
#initialise with defaults
knn = KNeighborsClassifier()
# fitting the model
knn.fit(X, y)
# Score the model
knn.score(X,y)

0.9

Part 2: Using the cross_val_score module provided by the sklearn library

 1. Perform 10 fold cross validation to obtain the optimal value to use for n_neighbor
 2. Retrain the classifier
 3. Determine the recall score of the classifier


In [39]:
# creating odd list of K for KNN
neighbors = filter(lambda x: x % 2 != 0, list(range(1,50)))
# empty dict that will hold cv scores
cv_scores = {}
# perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10)
    cv_scores[k] = scores.mean()
#find the best k by looking for the highest score
best_k = max(cv_scores, key=lambda k: cv_scores[k])
best_k

7

In [41]:
# rerunning KNN with new k
#initialise with defaults
knn_new = KNeighborsClassifier(n_neighbors=7)
# fitting the model
knn_new.fit(X, y)
# Score the model
knn_new.score(X,y)

0.8942028985507247