In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

Toy data for illustrative purposes

In [2]:
# Read the train and test splits (in that order); the first CSV column is the row index
train_df, test_df = (
    pd.read_csv(csv_url, index_col=0)
    for csv_url in (
        "http://soph.info/metis/nyc18_ds15/wookiee-train.csv",
        "http://soph.info/metis/nyc18_ds15/wookiee-test.csv",
    )
)

Set up for modeling

In [3]:
# Split each frame into its feature columns (X) and the 'wookieecolor' target (y)
X_train, y_train = train_df.drop(columns='wookieecolor'), train_df['wookieecolor']
X_test, y_test = test_df.drop(columns='wookieecolor'), test_df['wookieecolor']

## A crude class that carries out KNN

In [4]:
class KNN():
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    K : int, default 3
        Number of nearest training observations that vote on each label.

    Notes
    -----
    Features are expected as a pandas DataFrame and training labels as a
    pandas Series, because ``predict`` relies on ``.iloc`` and ``.mode()``.
    """

    def __init__(self, K=3):
        # default setting for K (number of neighbors)
        self.K = K

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features, one row per observation.
        y : pandas.Series
            Training labels, in the same row order as ``X``.
        """
        # fit step is simply loading the features and labels
        self.X = X
        self.y = y

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Observations to classify; columns are aligned with the training
            features by name when the distance is computed.

        Returns
        -------
        list
            One predicted label per row of ``X``. On a tied vote the first
            value returned by ``Series.mode()`` is used.

        Raises
        ------
        ValueError
            If ``K`` is smaller than 1.
        """
        if self.K < 1:
            raise ValueError("K must be at least 1, got {!r}".format(self.K))

        # initialize empty predicted label list
        pred = []

        # loop to carry out KNN classification of the test observations:
        # for each test observation:
        # 1) get distance to all training observations
        # 2) label according to most frequent label of K closest neighbors in training observations
        for row in range(len(X)):
            dists = np.sqrt(np.sum((X.iloc[row] - self.X)**2, axis=1))
            # argsort on the raw values yields *positions* of the closest
            # training rows, so the labels must be fetched with .iloc.
            # Label-based lookup (self.y[...]) is only correct when the
            # training index happens to be exactly 0..n-1.
            nearest = np.argsort(np.asarray(dists))[:self.K]
            pred.append(self.y.iloc[nearest].mode().values[0])

        # return predictions for test data
        return pred

    def score(self, X, y):
        """Return the accuracy of the predictions for ``X`` against ``y``.

        Parameters
        ----------
        X : pandas.DataFrame
            Observations to classify.
        y : sequence
            True labels, one per row of ``X`` (Series, array or list).

        Returns
        -------
        float
            Fraction of rows whose predicted label equals the true label.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have different lengths.
        """
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of observations")

        # compare as plain arrays so the result does not depend on y's index or container type
        matches = np.asarray(y) == np.asarray(self.predict(X))
        return int(np.sum(matches)) / len(y)

In [5]:
# Instantiate the hand-rolled classifier so that 5 neighbours vote on each label
knn = KNN(K=5)

In [6]:
# "Fitting" only stores the training features and labels on the instance
knn.fit(X_train, y_train)

In [7]:
# Predict a label for every test observation (returned as a Python list)
pred = knn.predict(X_test)

In [8]:
# Fraction of test observations whose predicted label matches the true label
knn.score(X_test, y_test)

0.744

## Compare to the results from scikit-learn

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
# scikit-learn reference model using the same neighbour count as the custom KNN
knn2 = KNeighborsClassifier(n_neighbors = 5)

In [11]:
# Train the reference model on the same training data as the custom classifier
knn2.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [12]:
# Predict with scikit-learn, then check element by element that every label
# matches the one produced by our own KNN implementation
y_pred = knn2.predict(X_test)
(y_pred == pred).all()

True

In [13]:
# Accuracy of the scikit-learn model on the same test set, for comparison
knn2.score(X_test, y_test)

0.744

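The predicted labels are identical for every test observation, so our crude KNN and scikit-learn's `KNeighborsClassifier` reach the same accuracy (0.744) on this test set.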