## Classification with KNN (K-Nearest Neighbors)


In [70]:
from ucimlrepo import fetch_ucirepo 
import numpy as np
import pandas as pd
data = fetch_ucirepo(id=602)
X = data.data.features
y = data.data.targets

classes = np.unique(y)

# NOTE: Modification just for our case, this dataset is too big for our KNN implementation
X = X[:int(len(X)*0.25)]
y = y[:int(len(y)*0.25)] 



def prep_data(X,y):
    X = X.copy()
    y = y.copy()
    
    # Encode targets
    for i, c in enumerate(classes):
        y[y == c] = i

    return X.values, y.values



def distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

def k_nearest_neighbors(X, x1, k=5):
    distances = []
    y_values = X[:,-1]
    for i in range(len(X)):
        d = distance(X[i][:-1], x1)
        distances.append((d, y_values[i]))
    distances.sort()
    return distances[:k]

train_set = {}

test_set = {}

X, y = prep_data(X, y)


# Merge X and y
X = np.column_stack((X, y))


np.random.shuffle(X)

test_size = 0.2
X_train, X_test = X[:-int(test_size*len(X))], X[-int(test_size*len(X)):]

correct = 0
total = 0
for x in X_test:
    
    x_true_class = x[-1]
    x_feature = x[:-1]
    knn = k_nearest_neighbors(X_train, x_feature, 5)
    votes = [i[-1] for i in knn]
    majority_vote = max(set(votes), key=votes.count)
    confidence = votes.count(majority_vote) / len(votes)
    
    if majority_vote == x_true_class:
        correct += 1
    total += 1
    
print(f"Accuracy: {correct/total}, Correct: {correct}, Total: {total}")

Accuracy: 0.9735294117647059, Correct: 662, Total: 680


## Class Implementation

In [67]:
class KNN:
    def __init__(self, k=3):
        self.k = k
    
    def distance(x1, x2):
        return np.sqrt(np.sum((x1 - x2)**2))

    def k_nearest_neighbors(X, x1, k=5):
        distances = []
        y_values = X[:,-1]
        for i in range(len(X)):
            d = distance(X[i][:-1], x1)
            distances.append((d, y_values[i]))
        distances.sort()
        return distances[:k]
    
    def predict(self, X, x):
        knn = k_nearest_neighbors(X, x, self.k)
        votes = [i[-1] for i in knn]
        majority_vote = max(set(votes), key=votes.count)
        confidence = votes.count(majority_vote) / len(votes)
        return majority_vote, confidence
    
    def fit_and_score(self, X):
        correct = 0
        total = 0
        X_train, X_test = X[:int(len(X)*0.8)], X[int(len(X)*0.8):]
        
        for x in X_test:
            vote, confidence = self.predict(X_train, x[:-1])
            if vote == x[-1]:
                correct += 1
            total += 1
            
        return correct/total
        


In [68]:
clf = KNN(k=3)

In [69]:
clf.fit_and_score(X)

0.986784140969163