In [1]:
import numpy as np

In [2]:
def distance_of_2_examples(u, v, p):
    """Minkowski (L-p) distance between two examples.

    Computes ``(sum(|u - v| ** p)) ** (1 / p)``, summing over axis 0.

    Parameters
    ----------
    u, v : array_like
        Feature vectors of the same (or broadcastable) shape. The feature
        dimension must be axis 0, because that is the axis being summed.
    p : int or float
        Order of the norm (1 = Manhattan, 2 = Euclidean). Must be non-zero
        because the result is raised to ``1 / p``.

    Returns
    -------
    Scalar for 1-D inputs; for higher-rank inputs, an array with axis 0
    reduced away.
    """
    # The absolute value is required for odd or fractional p: without it the
    # signed differences cancel (p=1) or the sum can turn negative and the
    # final root becomes NaN (p=3).
    abs_diff = np.abs(np.subtract(u, v))
    d_power = np.power(abs_diff, p)
    distance = np.power(np.sum(d_power, axis=0), 1/p)
    return distance

In [3]:
def distance_of_1set_1ex(set1, ex, p):
    """Distance from every row of ``set1`` to the single example ``ex``.

    Parameters
    ----------
    set1 : ndarray, shape (m, n_features)
        One example per row.
    ex : array_like, shape (n_features,)
        The example to compare against.
    p : int or float
        Order of the distance, forwarded to ``distance_of_2_examples``.

    Returns
    -------
    ndarray of float, shape (m, 1)
        ``result[i, 0]`` is the distance between ``set1[i]`` and ``ex``.
    """
    m = set1.shape[0]
    # distance_of_2_examples sums over axis 0, so put the features on axis 0:
    # one broadcast call then yields all m distances, replacing the per-row
    # Python loop.
    features_first = np.transpose(set1)      # (n_features, m)
    ex_column = np.reshape(ex, (-1, 1))      # (n_features, 1), broadcast over m
    d_set = distance_of_2_examples(features_first, ex_column, p)
    # reshape returns a new array (it is not in-place), so return its result.
    return np.asarray(d_set, dtype=float).reshape(m, 1)

In [4]:
#read dataset
def load_dataset(train_file='iris.trn', test_file='iris.tst', dims=4, classes=3):
    """Load the comma-separated train and test sets.

    Each row holds the feature values followed by the class label in the
    last column.

    Parameters
    ----------
    train_file, test_file : str, path, file object or list of str
        Anything ``np.loadtxt`` accepts. Defaults are the iris files that
        were previously hard-coded.
    dims : int
        Number of leading columns to use as features.
    classes : int
        Number of classes; returned unchanged for the caller's convenience.

    Returns
    -------
    train_x, train_y, test_x, test_y, classes
        ``*_x`` has shape (m, dims); ``*_y`` has shape (m, 1) and keeps the
        float dtype produced by ``np.loadtxt``.
    """
    # ndmin=2 keeps a single-row file two-dimensional, so the column slicing
    # below does not fail on a 1-D array.
    train_set = np.loadtxt(train_file, delimiter=',', ndmin=2)
    test_set = np.loadtxt(test_file, delimiter=',', ndmin=2)

    train_x, train_y = train_set[:, :dims], train_set[:, -1:]
    test_x, test_y = test_set[:, :dims], test_set[:, -1:]

    return train_x, train_y, test_x, test_y, classes

In [5]:
def top_k_idx(arr, k):
    """Return the indices of the ``k`` largest entries of a 1-D array.

    The returned indices are in no particular order (``np.argpartition``
    only guarantees the partition, not a sort).

    Parameters
    ----------
    arr : array_like, 1-D
    k : int
        Number of indices to return, ``0 <= k <= len(arr)``.

    Raises
    ------
    ValueError
        If ``k`` is negative (``np.argpartition`` itself raises ValueError
        when ``k`` exceeds the array length).
    """
    if k < 0:
        raise ValueError("k must be non-negative, got %r" % (k,))
    if k == 0:
        # -0 == 0, so the slice [-k:] below would return *every* index.
        return np.empty(0, dtype=np.intp)
    return np.argpartition(arr, -k)[-k:]

In [6]:
def kNN(train_x, train_y, test_x, class_num, distance_p, k):
    """Classify every row of ``test_x`` by majority vote of its k nearest
    training examples.

    Parameters
    ----------
    train_x : ndarray, shape (m_train, n_features)
    train_y : ndarray, shape (m_train, 1) or (m_train,)
        Class labels; they are cast to int and are expected to lie in
        ``0 .. class_num - 1`` (labels outside that range get no votes).
    test_x : ndarray, shape (m_test, n_features)
    class_num : int
        Number of classes.
    distance_p : int or float
        Order of the distance passed to ``distance_of_1set_1ex``.
    k : int
        Number of neighbours to vote, ``1 <= k <= m_train``.

    Returns
    -------
    ndarray of int, shape (m_test, 1)
        Predicted class per test example. When several classes tie for the
        most votes, the smallest class index wins.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1 .. m_train``.
    """
    m_train = train_x.shape[0]
    m_test = test_x.shape[0]
    if not 1 <= k <= m_train:
        raise ValueError(
            "k must be between 1 and the number of training examples (%d), got %r"
            % (m_train, k)
        )

    # argpartition requires kth < m_train; when k == m_train every training
    # example is a neighbour, so partitioning around the last position is enough.
    kth = min(k, m_train - 1)
    labels = np.asarray(train_y)
    class_ids = np.arange(class_num)
    predict = np.zeros((m_test, 1), dtype=int)

    for i in range(m_test):
        # distances from the i-th test example to every training example
        distances = distance_of_1set_1ex(train_x, test_x[i], distance_p).reshape(-1)

        # indices of the k smallest distances (unordered)
        nearest_idx = np.argpartition(distances, kth)[:k]

        # classes of those k neighbours
        neighbour_classes = labels[nearest_idx].reshape(-1).astype(int)

        # class_occurence[j]: number of neighbours belonging to class j
        class_occurence = np.sum(neighbour_classes[:, None] == class_ids, axis=0)

        # argmax always yields a single class, so a tied vote no longer fails
        # when the result is stored in predict[i].
        predict[i] = np.argmax(class_occurence)
    return predict
        
    

In [13]:
# Load the train/test split, then classify the test rows with k=1 and p=2.
train_x, train_y, test_x, test_y, classes = load_dataset()
predict = kNN(train_x, train_y, test_x, class_num=classes, distance_p=2, k=1)

In [14]:
#accuracy: percentage of test examples whose prediction equals the true label
isCorrect = (predict == test_y)
correct_cnt = np.count_nonzero(isCorrect)
total_cnt = len(isCorrect)
acc = correct_cnt / total_cnt * 100
print(acc)

94.0


In [9]:
#testing cell, meaningless
# Scratch check of argpartition and of per-value occurrence counting.
# NOTE: this cell rebinds the global name `predict`.
arr = np.array([1, 2, 3, 4, 5, 5])
print(arr)
idx = arr.argpartition(3)[:3]  # positions of the 3 smallest values (unordered)
print(arr[idx])
o = np.zeros((1, 6))
for i in range(o.shape[1]):
    o[0, i] = np.count_nonzero(arr == i)  # how many times the value i occurs
print(o[0])
predict = o[0].max()
print(predict)

[1 2 3 4 5 5]
[2 1 3]
[0. 1. 1. 1. 1. 2.]
2.0


In [17]:
#ex1
# Toy 2-D problem: ten labelled training points (five per class), four queries.
train_x = np.array([
    [0.376, 0.488],
    [0.312, 0.544],
    [0.298, 0.624],
    [0.394, 0.600],
    [0.506, 0.512],
    [0.488, 0.334],
    [0.478, 0.398],
    [0.606, 0.366],
    [0.428, 0.294],
    [0.542, 0.252],
])
# first five points belong to class 0, last five to class 1 (column vector)
train_y = np.array([0] * 5 + [1] * 5).reshape(-1, 1)

test_x = np.array([
    [0.550, 0.364],
    [0.558, 0.470],
    [0.456, 0.450],
    [0.450, 0.570],
])
#1.
# To inspect the raw distances of each query point to the training set:
# for q in test_x: print(distance_of_1set_1ex(train_x, q, 2))
predict = kNN(train_x, train_y, test_x, class_num=2, distance_p=2, k=3)
print(predict)

[[1]
 [1]
 [0]
 [0]]
