# K Nearest Neighbors Classifier

**Basic steps:**

1. Import the learning algorithm
2. Instantiate the model (choose hyper-parameters)
3. Learn the model
4. Predict the response

In [2]:
import pandas as pd


# Get Example Data

In [8]:
# target = InMichelin, whether or not a restaurant is in the Michelin guide

data = pd.read_csv("http://www.stat.tamu.edu/~sheather/book/docs/datasets/MichelinNY.csv", encoding = "ISO-8859-1")
data.head()

Unnamed: 0,InMichelin,Restaurant Name,Food,Decor,Service,Price
0,0,14 Wall Street,19,20,19,50
1,0,212,17,17,16,43
2,0,26 Seats,23,17,21,35
3,1,44,19,23,16,52
4,0,A,23,12,19,24


In [9]:
# Delete extra variable that is not continuous
data = data.loc[:, data.columns != 'Restaurant Name']

data.head()

Unnamed: 0,InMichelin,Food,Decor,Service,Price
0,0,19,20,19,50
1,0,17,17,16,43
2,0,23,17,21,35
3,1,19,23,16,52
4,0,23,12,19,24


# Change variable names to X, y to create train/test split

In [10]:
y = data['InMichelin']
X = data.loc[:, data.columns != 'InMichelin']

print(y[0:5])
X.head()

0    0
1    0
2    0
3    1
4    0
Name: InMichelin, dtype: int64


Unnamed: 0,Food,Decor,Service,Price
0,19,20,19,50
1,17,17,16,43
2,23,17,21,35
3,19,23,16,52
4,23,12,19,24


# Train test split

In [11]:
from sklearn.model_selection import train_test_split

# Use train_test_split(X,y) to create four new data sets, defaults to .75/.25 split
X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train.head()

Unnamed: 0,Food,Decor,Service,Price
109,20,14,16,50
88,26,13,14,13
65,18,17,15,37
31,18,17,19,40
71,24,13,22,25


### Train model with k=5

In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

#Print accuracy rounded to two digits to the right of decimal
print("accuracy: {:.2f}".format(knn.score(X_test, y_test)))

y_pred = knn.predict(X_test) # y_pred includes your predictions

accuracy: 0.88


### Train model with k=10

In [13]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

#Print accuracy rounded to two digits to the right of decimal
print("accuracy: {:.2f}".format(knn.score(X_test, y_test)))
y_pred = knn.predict(X_test)


accuracy: 0.88


In [14]:
y_pred # view predictions for test data

array([1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0],
      dtype=int64)

## Using Cross validation for model evaluation

In [1]:
#import cross validation functions from sk learn

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Set up function parameters for diff't cross validation strategies
kfold = KFold(n_splits=5)
skfold = StratifiedKFold(n_splits=5, shuffle=True)
rkf = RepeatedKFold(n_splits=5, n_repeats=10)

print("KFold:\n{}".format(
cross_val_score(KNeighborsClassifier(), X, y, cv=kfold)))

print("StratifiedKFold:\n{}".format(
cross_val_score(KNeighborsClassifier(n_neighbors=5), X, y, cv=skfold)))

print("RepeatedKFold:\n{}".format(
cross_val_score(KNeighborsClassifier(n_neighbors=5), X, y, cv=rkf)))


NameError: name 'KNeighborsClassifier' is not defined

## Tuning models with grid search

In [16]:
from sklearn.model_selection import GridSearchCV
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

#create dictionary data object with keys equal to parameter name 'n_neighbors' 
#for knn model and values equal to range of k values to create models for

param_grid = {'n_neighbors': np.arange(1, 15, 2)} #np.arange creates sequence of numbers for each k value

grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)

#use meta model methods to fit score and predict model:
grid.fit(X_train, y_train)

#extract best score and parameter by calling objects "best_score_" and "best_params_"
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(X_test, y_test)))


best mean cross-validation score: 0.829
best parameters: {'n_neighbors': 13}
test-set score: 0.732


In [15]:
# view data with complete tuning results
results = pd.DataFrame(grid.cv_results_)
print(results)


   mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0         0.0010           0.0006         0.731707          1.000000   
1         0.0008           0.0005         0.796748          0.866251   
2         0.0016           0.0000         0.788618          0.848167   
3         0.0017           0.0001         0.821138          0.836422   
4         0.0007           0.0006         0.804878          0.834709   
5         0.0011           0.0004         0.804878          0.819279   
6         0.0007           0.0006         0.788618          0.808411   

  param_n_neighbors                params  rank_test_score  split0_test_score  \
0                 1   {u'n_neighbors': 1}                7           0.769231   
1                 3   {u'n_neighbors': 3}                4           0.846154   
2                 5   {u'n_neighbors': 5}                5           0.923077   
3                 7   {u'n_neighbors': 7}                1           0.923077   
4                 