# Intro

First ML project! This notebook builds a simple k-NN classifier from scratch and evaluates it on the classic Iris dataset.

# Model

## Minkowski distance between vectors and matrices

In [7]:
import numpy as np

In [8]:
def minkowski_vec(x1,x2,p=2.0,axis=None):
    """Minkowski distance of order ``p`` between ``x1`` and ``x2``.

    Evaluates ``sum(|x1 - x2| ** p) ** (1 / p)``. ``p=1`` gives the Manhattan
    distance, ``p=2`` the Euclidean distance, and ``p=inf`` the Chebyshev
    distance (largest absolute coordinate difference).

    Parameters
    ----------
    x1, x2 : array_like
        Operands; they are subtracted using NumPy broadcasting.
    p : float, default 2.0
        Order of the distance.
    axis : int or None, default None
        Axis over which the differences are reduced. ``None`` reduces over
        every element and yields a scalar.

    Returns
    -------
    scalar or ndarray
        The distance(s) left after reducing over ``axis``.
    """
    diff = np.abs(np.subtract(x1, x2))
    if p == np.inf:
        # The power formula degenerates for p = inf (inf ** 0), so use the
        # limit of the Minkowski distance directly: the largest difference.
        return np.max(diff, axis=axis)
    return np.sum(diff**p, axis=axis)**(1.0/p)

In [9]:
def minkowski_mat(x,Y,p=2.0):
    """Minkowski distance of order ``p`` from ``x`` to each row of ``Y``.

    Delegates to ``minkowski_vec`` with the reduction fixed to axis 1, so for
    a 2-D ``Y`` the result holds one distance per row.
    """
    return minkowski_vec(x, Y, p=p, axis=1)

## 1-NN

In [10]:
def nn(x, data, p=2):
    """Return the label of the row of ``data`` closest to ``x`` (1-NN).

    ``data`` stores one sample per row with the label in the last column;
    the remaining columns are compared to ``x`` with the Minkowski distance
    of order ``p``. When several rows are equally close, ``np.argmin`` picks
    the first one.
    """
    features, labels = data[:, :-1], data[:, -1]
    closest = np.argmin(minkowski_mat(x, features, p))
    return labels[closest]


## Running nn on the iris dataset

In [11]:
def split_data(data, train_fraction=0.7, max_train=100, rng=None):
    """Shuffle the rows of ``data`` and split them into train and test parts.

    Parameters
    ----------
    data : array_like
        Samples, one per row.
    train_fraction : float, default 0.7
        Share of the rows that goes to the training part (rounded down).
    max_train : int, default 100
        Upper bound on the number of training rows.
    rng : object with a ``permutation`` method, optional
        Source of randomness, e.g. ``np.random.RandomState(0)``. When omitted
        the global ``np.random`` state is used.

    Returns
    -------
    (ndarray, ndarray)
        ``(train, test)``: the first ``training_size`` shuffled rows and the
        remaining rows.
    """
    source = np.random if rng is None else rng
    training_size = min(max_train, int(len(data) * train_fraction))
    shuffled_data = source.permutation(data)  # shuffles along the first axis (rows)
    return shuffled_data[:training_size], shuffled_data[training_size:]

# Seed NumPy's global RNG so the shuffle in split_data (and every later random
# draw in this notebook) gives the same result on each Restart & Run All.
np.random.seed(0)

# Each row is one sample; the last column is the class label (see nn / knn).
iris = np.loadtxt("iris.txt")
train,test = split_data(iris)

def nn_performance(training_data,test_data):
    """Accuracy of the 1-NN classifier on ``test_data``.

    Every row of ``test_data`` (features followed by the true label in the
    last column) is classified against ``training_data`` with ``nn``; the
    result is the fraction of rows whose prediction equals the true label.
    """
    correct = sum(
        (1.0 for row in test_data if row[-1] == nn(row[:-1], training_data)),
        0.0,
    )
    return correct / len(test_data)

# Sanity checks (uncomment to run): scoring a set against itself is expected
# to give 1.0, because every point is its own nearest neighbour (barring
# duplicate feature rows that carry different labels).
# print(nn_performance(test,test))
# print(nn_performance(train,train))
print(nn_performance(train,test) * 100, "% Accuracy")


94.0 % Accuracy


In [12]:
def knn(x, data, k, p=2):
    """Classify ``x`` by majority vote among its ``k`` nearest rows of ``data``.

    Parameters
    ----------
    x : array_like
        Feature vector to classify.
    data : ndarray
        Samples, one per row, with the class label in the last column.
    k : int
        Number of neighbours that vote; must satisfy ``1 <= k <= len(data)``.
    p : float, default 2
        Order of the Minkowski distance.

    Returns
    -------
    The most frequent label among the ``k`` nearest rows. On a tie the
    smallest label wins, because ``np.unique`` returns the labels sorted and
    ``np.argmax`` picks the first maximum.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of rows.
    """
    n_samples = len(data)
    if not 1 <= k <= n_samples:
        # Without this check a negative k silently votes over an arbitrary
        # n-1 rows, and k = 0 or k > n fail with cryptic NumPy errors.
        raise ValueError(
            "k must be between 1 and the number of rows in data ({}), got {}".format(n_samples, k)
        )
    feats = data[:,:-1] # remove last column (which is y)
    dist = minkowski_mat(x, feats, p)
    # Passing every position 0..k-1 as kth places the k smallest distances at the front.
    indices = np.argpartition(dist,range(k))[:k] # find indices of k-NN
    neighbors = data[indices,-1] # labels of those neighbours
    classes,counts = np.unique(neighbors,return_counts=True) # frequency table for each class
    winning_index = np.argmax(counts) # class with the highest frequency
    return classes[winning_index]
    
def knn_performance(training_data,test_data,k):
    """Accuracy of the k-NN classifier on ``test_data``.

    Each row of ``test_data`` is split into its features and its true label
    (last column), classified against ``training_data`` with ``knn`` using
    ``k`` neighbours, and counted as a hit when the prediction matches.
    Returns hits divided by the number of test rows.
    """
    n_correct = 0.0
    for sample in test_data:
        features, label = sample[:-1], sample[-1]
        if label == knn(features, training_data, k):
            n_correct += 1
    return n_correct / len(test_data)

def find_k(training_data,test_data,possible_ks = None):
    """Search for the ``k`` that gives the best k-NN accuracy on ``test_data``.

    Parameters
    ----------
    training_data, test_data : ndarray
        Samples, one per row, with the class label in the last column.
    possible_ks : iterable of int, optional
        Candidate values of ``k``; non-positive entries are skipped. Defaults
        to every ``k`` from 1 to ``len(training_data) - 1``.

    Returns
    -------
    (best_k, best_performance)
        The first candidate reaching the highest accuracy, and that accuracy.
        ``(0, 0.0)`` when no candidate scores above zero.
    """
    # `is None` rather than `== None`: comparing a NumPy array of candidates
    # with `==` yields an array whose truth value is ambiguous.
    if possible_ks is None:
        possible_ks = range(1, len(training_data))
    best_k = 0
    best_performance = 0.0
    for k in possible_ks:
        if k <= 0: # k-NN needs at least one neighbour
            continue
        perf = knn_performance(training_data,test_data,k)
        if perf > best_performance: # strict comparison: the earliest best k wins ties
            best_k = k
            best_performance = perf
    return best_k,best_performance

# Best k when fitting on `train` and scoring on `test`.
# NOTE(review): k is selected on the same set that is used to report the
# accuracy, so the figure is optimistic; a separate validation split would
# be needed for an unbiased estimate.
train_k, train_perf = find_k(train, test)
print("Best k for training dataset:", train_k, "; Accuracy:", train_perf)

# Same search with the roles swapped: fit on `test`, score on `train`.
test_k, test_perf = find_k(test, train)
print("Best k for testing dataset:", test_k, "; Accuracy:", test_perf)


Best k for training dataset: 3 ; Accuracy: 0.96
Best k for testing dataset: 7 ; Accuracy: 0.99


## 100-NN

In [13]:
# k = 100 makes every training row vote (the training set has 100 rows here).
print(knn_performance(train, test, k=100))

0.28


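**Explanation:** Since the training dataset has 100 elements, the 100 nearest neighbors of any point are simply the entire training dataset. The prediction is therefore always the class that occurs most often in the training dataset, so the accuracy equals the share of that class in the test dataset.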

In [14]:
# Classify one randomly chosen test sample (label column stripped) with k = 100.
x = test[np.random.randint(len(test)), :-1]
predicted_class = knn(x, train, 100)
print("Predicted class:", predicted_class)

# Share of the test rows whose label equals that predicted class.
classes, counts = np.unique(test[:, -1], return_counts=True)
print(
    "Frequency:",
    counts[classes.tolist().index(predicted_class)] * 1.0 / len(test),
)
print("Which is the same as the 100-NN performance")

Predicted class: 1.0
Frequency: 0.28
Which is the same as the 100-NN performance
