In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

Toy data for illustrative purposes

In [2]:
# Load the pre-split train and test sets from remote CSV files;
# index_col=0 uses the first CSV column as the row index rather than as a feature.
train_df = pd.read_csv("http://soph.info/metis/nyc18_ds15/wookiee-train.csv", index_col=0)
test_df = pd.read_csv("http://soph.info/metis/nyc18_ds15/wookiee-test.csv", index_col=0)

Set up for modeling: separate the feature columns from the `wookieecolor` target

In [3]:
# Features are every column except the label; the label column is the target.
x_train = train_df.drop(columns='wookieecolor')
y_train = train_df.loc[:, 'wookieecolor']

# Same split for the held-out data.
x_test = test_df.drop(columns='wookieecolor')
y_test = test_df.loc[:, 'wookieecolor']

## A crude class that carries out KNN

In [75]:
class KNN():
    """Crude brute-force K-nearest-neighbours classifier.

    Each test observation is labelled with the most frequent label among
    its K closest training observations, using Euclidean distance.

    Attributes:
        K: number of neighbours that vote on each prediction.
        pred: list of labels produced by the most recent ``predict`` call
            (empty until ``predict`` has been called).
    """

    def __init__(self, K=3):
        # default setting for K (number of neighbors)
        self.K = K

        # predicted labels from the most recent predict() call (initialize as empty)
        self.pred = []

    def predict(self, x_train, y_train, x_test):
        """Classify every row of ``x_test`` against the training data.

        Args:
            x_train: 2-D numeric training features (DataFrame or array-like),
                one row per observation.
            y_train: 1-D training labels, one per row of ``x_train``
                (Series or array-like). Labels are matched to rows by
                position, so the index of a Series does not matter.
            x_test: 2-D numeric test features with the same columns as
                ``x_train``.

        Returns:
            List of predicted labels, one per row of ``x_test``. The same
            list is stored on ``self.pred`` for use by ``score``.

        Raises:
            ValueError: if K is smaller than 1, the training set is empty,
                or the inputs have inconsistent shapes or feature columns.
        """
        if self.K < 1:
            raise ValueError("K must be at least 1")

        # When both inputs are DataFrames, line features up by column name so
        # that a different column order in x_test cannot scramble distances.
        if isinstance(x_train, pd.DataFrame) and isinstance(x_test, pd.DataFrame):
            if set(x_train.columns) != set(x_test.columns):
                raise ValueError("x_train and x_test must have the same feature columns")
            x_test = x_test.loc[:, x_train.columns]

        train_matrix = np.asarray(x_train, dtype=float)
        test_matrix = np.asarray(x_test, dtype=float)
        # Plain array => purely positional lookup of neighbour labels, which
        # stays correct even when y_train carries a non-default index.
        labels = np.asarray(y_train)

        if train_matrix.ndim != 2 or test_matrix.ndim != 2:
            raise ValueError("x_train and x_test must be 2-dimensional")
        if train_matrix.shape[0] == 0:
            raise ValueError("x_train must contain at least one observation")
        if train_matrix.shape[1] != test_matrix.shape[1]:
            raise ValueError("x_train and x_test must have the same number of features")
        if labels.ndim != 1 or labels.shape[0] != train_matrix.shape[0]:
            raise ValueError("y_train must hold exactly one label per row of x_train")

        # Build a fresh list so that repeated calls do not pile predictions
        # on top of those from an earlier call.
        predictions = []

        # for each test observation:
        # 1) get the distance to every train observation (vectorised over rows)
        # 2) label according to the most frequent label of the K closest neighbors
        for query in test_matrix:
            distances = np.sqrt(((train_matrix - query) ** 2).sum(axis=1))
            # Stable sort: equidistant neighbours keep their training order.
            nearest = np.argsort(distances, kind="stable")[:self.K]
            # mode() returns the tied labels sorted, so vote ties go to the
            # label that sorts first.
            predictions.append(pd.Series(labels[nearest]).mode().iloc[0])

        self.pred = predictions

        # return predictions for test data
        return self.pred

    def score(self, y):
        """Return the accuracy of the stored predictions against true labels.

        Args:
            y: true labels (Series, list or array), in the same order as the
                rows passed to the most recent ``predict`` call.

        Returns:
            Fraction of predictions equal to the corresponding true label.

        Raises:
            ValueError: if ``y`` does not have one label per stored prediction.
            ZeroDivisionError: if ``y`` is empty.
        """
        truth = np.asarray(y)
        guesses = np.asarray(self.pred)
        if truth.shape != guesses.shape:
            raise ValueError("y must have one label per prediction; call predict() first")

        # return accuracy score (fraction correctly predicted compared to true labels)
        return float(np.sum(guesses == truth)) / len(truth)

In [61]:
# Instantiate the hand-rolled classifier with 5 voting neighbours.
knn = KNN(K=5)

In [62]:
# Label every test row from its nearest training rows; the predictions are
# also kept on knn.pred, which knn.score() reads later.
y_pred = knn.predict(x_train, y_train, x_test)

In [63]:
# Display the predicted label for each test row.
y_pred

['chartreuse',
 'red',
 'red',
 'blue',
 'red',
 'white',
 'white',
 'red',
 'blue',
 'red',
 'white',
 'white',
 'white',
 'chartreuse',
 'red',
 'chartreuse',
 'red',
 'red',
 'white',
 'red',
 'red',
 'red',
 'chartreuse',
 'red',
 'white',
 'blue',
 'white',
 'white',
 'white',
 'red',
 'red',
 'chartreuse',
 'red',
 'blue',
 'red',
 'red',
 'white',
 'red',
 'red',
 'chartreuse',
 'red',
 'blue',
 'red',
 'red',
 'chartreuse',
 'red',
 'white',
 'red',
 'red',
 'chartreuse',
 'white',
 'red',
 'blue',
 'chartreuse',
 'white',
 'white',
 'white',
 'red',
 'red',
 'blue',
 'white',
 'red',
 'white',
 'red',
 'blue',
 'white',
 'white',
 'red',
 'blue',
 'white',
 'blue',
 'blue',
 'red',
 'blue',
 'white',
 'red',
 'blue',
 'white',
 'red',
 'red',
 'red',
 'white',
 'red',
 'red',
 'white',
 'red',
 'blue',
 'red',
 'blue',
 'red',
 'red',
 'white',
 'blue',
 'white',
 'red',
 'red',
 'white',
 'white',
 'white',
 'white',
 'white',
 'white',
 'white',
 'red',
 'red',
 'red',
 'red

In [64]:
# Fraction of the stored predictions that match the true test labels.
knn.score(y_test)

0.744

## Compare to the results from scikit-learn

In [65]:
from sklearn.neighbors import KNeighborsClassifier

In [66]:
# Reference implementation with the same number of neighbours as the KNN(K=5) model.
knn2 = KNeighborsClassifier(n_neighbors = 5)

In [67]:
# Fit the scikit-learn classifier on the training features and labels.
knn2.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [68]:
# Predicted labels from scikit-learn, for comparison with y_pred.
knn2.predict(x_test)

array(['chartreuse', 'red', 'red', 'blue', 'red', 'white', 'white', 'red',
       'blue', 'red', 'white', 'white', 'white', 'chartreuse', 'red',
       'chartreuse', 'red', 'red', 'white', 'red', 'red', 'red',
       'chartreuse', 'red', 'white', 'blue', 'white', 'white', 'white',
       'red', 'red', 'chartreuse', 'red', 'blue', 'red', 'red', 'white',
       'red', 'red', 'chartreuse', 'red', 'blue', 'red', 'red',
       'chartreuse', 'red', 'white', 'red', 'red', 'chartreuse', 'white',
       'red', 'blue', 'chartreuse', 'white', 'white', 'white', 'red',
       'red', 'blue', 'white', 'red', 'white', 'red', 'blue', 'white',
       'white', 'red', 'blue', 'white', 'blue', 'blue', 'red', 'blue',
       'white', 'red', 'blue', 'white', 'red', 'red', 'red', 'white',
       'red', 'red', 'white', 'red', 'blue', 'red', 'blue', 'red', 'red',
       'white', 'blue', 'white', 'red', 'red', 'white', 'white', 'white',
       'white', 'white', 'white', 'white', 'red', 'red', 'red', 'red',
      

In [69]:
# scikit-learn's accuracy on the test set, for comparison with knn.score(y_test).
knn2.score(x_test, y_test)

0.744