In [1]:
import numpy as np
import pandas as pd

In [2]:
def distance_of_2_examples(u, v, q=1):
    """Return the Minkowski distance of order ``q`` between two examples.

    The absolute element-wise differences are raised to the power ``q``,
    summed along axis 0 and the ``q``-th root of that sum is returned.
    ``q=1`` gives the Manhattan distance and ``q=2`` the Euclidean distance.

    Parameters
    ----------
    u, v : array-like
        The two examples to compare; they must be broadcastable.
    q : number, optional
        Order of the distance (default 1).

    Returns
    -------
    The distance; a scalar when ``u`` and ``v`` are 1-D.
    """
    gap = np.abs(np.asarray(u) - np.asarray(v))
    return np.sum(gap ** q, axis=0) ** (1 / q)

In [3]:
def distance_of_1set_1ex(set1, ex, q=1):
    """Return the Minkowski distance from every row of ``set1`` to ``ex``.

    Uses the same formula as ``distance_of_2_examples`` (sum of absolute
    differences raised to ``q``, then the ``q``-th root), but evaluates it
    for all rows at once through broadcasting instead of a Python loop.

    Parameters
    ----------
    set1 : array-like, shape (m, d)
        One example per row.
    ex : array-like, shape (d,)
        The single example every row is compared against.
    q : number, optional
        Order of the distance (default 1, i.e. Manhattan distance).

    Returns
    -------
    ndarray of float64, shape (m, 1)
        Entry ``[i, 0]`` is the distance between ``set1[i]`` and ``ex``.
        An empty ``set1`` yields an empty ``(0, 1)`` array.
    """
    set1 = np.asarray(set1)
    m = set1.shape[0]
    # Broadcasting subtracts `ex` from every row in one step.
    d_power = np.power(np.abs(set1 - np.asarray(ex)), q)
    # Sum over the feature axis (axis 1) so each row collapses to one distance.
    d_set = np.power(np.sum(d_power, axis=1), 1 / q)
    # reshape returns a new array, so its result has to be the value returned.
    return d_set.reshape(m, 1).astype(np.float64, copy=False)

In [4]:
#read dataset
def load_dataset(train_set_file_name, test_set_file_name):
    """Load comma-separated train/test files and split them into features and labels.

    Every row is one example: all columns but the last are features and the
    last column is the class label. Labels equal to -1 are rewritten to 0 so
    that a -1/1 labelling becomes 0/1.

    Parameters
    ----------
    train_set_file_name, test_set_file_name : str or path-like
        Files read with ``np.loadtxt(..., delimiter=',')``.

    Returns
    -------
    train_x : ndarray, shape (m_train, dims)
    train_y : ndarray, shape (m_train, 1)
    test_x : ndarray, shape (m_test, dims)
    test_y : ndarray, shape (m_test, 1)
    max_class : int
        Largest label value found in the training file (taken before the
        -1 -> 0 replacement).
    """
    # ndmin=2 keeps a one-row file as a (1, n_cols) matrix; without it loadtxt
    # returns a 1-D vector and the column slicing below raises.
    train_set = np.loadtxt(train_set_file_name, delimiter=',', ndmin=2)
    test_set = np.loadtxt(test_set_file_name, delimiter=',', ndmin=2)

    dims = train_set.shape[1] - 1  # number of columns - 1; the last column is the class label
    max_class = int(np.amax(train_set[:, dims]))  # largest label value

    # The number of feature columns is taken from the training file and
    # applied to both files.
    train_x = train_set[:, :dims]
    train_y = train_set[:, -1:]
    test_x = test_set[:, :dims]
    test_y = test_set[:, -1:]

    # A -1/1 labelling is mapped to 0/1 so labels can be used as class indices.
    train_y = np.where(train_y == -1.0, 0, train_y)
    test_y = np.where(test_y == -1.0, 0, test_y)

    return train_x, train_y, test_x, test_y, max_class

In [5]:
def top_k_idx(arr, k):
    """Return the indices of the ``k`` largest values of ``arr``.

    The indices come from ``np.argpartition`` and are therefore not sorted
    by value.

    Parameters
    ----------
    arr : array-like
        1-D collection of comparable values.
    k : int
        How many of the largest entries to select.

    Returns
    -------
    ndarray of int
        ``k`` indices into ``arr``; empty when ``k`` is 0.
    """
    if k == 0:
        # arr[-0:] is arr[0:], so without this guard k == 0 would return
        # every index instead of none.
        return np.empty(0, dtype=np.intp)
    return np.argpartition(arr, -k)[-k:]

In [6]:
def accuracy(predict, test_y):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before they are compared, so a ``(m,)`` vector
    and a ``(m, 1)`` column are compared element by element instead of being
    broadcast against each other into an ``(m, m)`` table.

    Parameters
    ----------
    predict : array-like
        Predicted labels, one per example.
    test_y : array-like
        True labels, one per example.

    Returns
    -------
    float
        Number of matching labels divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of labels.
    ZeroDivisionError
        If both inputs are empty.
    """
    predicted = np.asarray(predict).reshape(-1)
    actual = np.asarray(test_y).reshape(-1)
    if predicted.size != actual.size:
        raise ValueError(
            "predict and test_y must hold the same number of labels, got "
            + str(predicted.size) + " and " + str(actual.size)
        )

    correct_cnt = np.count_nonzero(predicted == actual)
    total_cnt = actual.size

    acc = correct_cnt/total_cnt

    return acc

In [7]:
def confusion_matrix(predict, test_y, classes):
    """Count how often each actual label was predicted as each label.

    Labels are taken to be the integers ``0 .. classes``. The returned float
    matrix has shape ``(classes + 1, classes + 1)``; entry ``[i][j]`` is the
    number of examples whose actual label is ``i`` and whose predicted label
    is ``j``.
    """
    n_labels = classes + 1
    counts = np.zeros((n_labels, n_labels))
    for actual in range(n_labels):
        # predictions made for the examples whose actual label is `actual`
        guesses = predict[np.where(test_y == actual)]
        for guessed in range(n_labels):
            counts[actual][guessed] = np.count_nonzero(guesses == guessed)
    return counts
    

In [8]:
def kNN(train_set_file_name, test_set_file_name, k, q=1, seed=None):
    """Classify every test example by majority vote among its k nearest training examples.

    The datasets are read with ``load_dataset``. Distances come from
    ``distance_of_1set_1ex``. Accuracy and the confusion matrix are computed
    with ``accuracy`` and ``confusion_matrix`` and printed.

    Parameters
    ----------
    train_set_file_name, test_set_file_name : str
        Dataset files handed to ``load_dataset``.
    k : int
        Number of neighbours that vote; must satisfy ``1 <= k <= m_train``.
    q : number, optional
        Order of the Minkowski distance passed to ``distance_of_1set_1ex``
        (default 1, the value that was previously hard-coded).
    seed : int or None, optional
        Seed for the random choice made when several classes tie for the
        most votes. ``None`` (default) keeps using NumPy's global random
        state, exactly as before.

    Returns
    -------
    ndarray of int, shape (m_test, 1)
        Predicted class of every test example.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training
        examples.
    """
    # get dataset
    train_x, train_y, test_x, test_y, max_class = load_dataset(train_set_file_name, test_set_file_name)

    # get dataset size
    m_train = train_x.shape[0]
    m_test = test_x.shape[0]

    if not 1 <= k <= m_train:
        raise ValueError(
            "k must be between 1 and the number of training examples ("
            + str(m_train) + "), got " + str(k)
        )

    # Labels as a flat integer vector so they can be compared with class ids.
    train_labels = train_y.reshape(-1).astype(int)
    class_ids = np.arange(max_class + 1)

    # Source of randomness for tie-breaking: the global NumPy state unless a
    # seed was requested.
    tie_breaker = np.random if seed is None else np.random.RandomState(seed)

    predict = np.zeros((m_test, 1), dtype=int)

    # Only one row of distances is alive at a time; the full
    # (m_test, m_train) matrix was never needed after the loop.
    for i in range(m_test):
        # distance of the ith test example to every training example
        distances = distance_of_1set_1ex(train_x, test_x[i], q).reshape(-1)

        # Partitioning at k - 1 puts the k smallest distances in the first k
        # slots (unsorted) and, unlike kth=k, also accepts k == m_train.
        nearest_idx = np.argpartition(distances, k - 1)[:k]

        # votes[j] = how many of the k neighbours belong to class j
        votes = np.sum(train_labels[nearest_idx][:, np.newaxis] == class_ids, axis=0)

        # predicted class is the one with the most votes
        winners = np.flatnonzero(votes == votes.max())
        if winners.size == 1:
            predict[i] = winners[0]
        else:
            # pick a random class when several classes have the same vote count
            predict[i] = tie_breaker.choice(winners)

    acc = accuracy(predict, test_y)
    confusion_arr = confusion_matrix(predict, test_y, max_class)
    print("Accuracy: " + str(acc*100) + "%")

    # confusion_arr[i][j] counts actual label i predicted as j, so the row
    # header is the actual value and the column header the predicted one.
    print("Confusion matrix (row header is actual val, col header is predicted val): ")
    label_list = [str(i) for i in range(max_class+1)]
    df = pd.DataFrame.from_records(data=confusion_arr, index=label_list)
    print(df)

    return predict
    

In [10]:
# Run kNN with the same k on every dataset; predictions are kept per dataset name.
k = 3
dataset_files = [
    ("iris", 'dataset/iris.trn', 'dataset/iris.tst'),
    ("fp", 'dataset/fp.trn', 'dataset/fp.tst'),
    ("letter", 'dataset/let.trn', 'dataset/let.tst'),
    ("optics", 'dataset/opt.trn', 'dataset/opt.tst'),
    ("leukemia", 'dataset/ALLAML.trn', 'dataset/ALLAML.tst'),
]

predict = {}
for position, (dataset_name, trn_path, tst_path) in enumerate(dataset_files):
    if position > 0:
        # separator line between the reports of two consecutive datasets
        print("==============")
    predict[dataset_name] = kNN(trn_path, tst_path, k)

Accuracy: 92.0%
Confusion matrix (col header is actual val, row header is predicted val): 
      0     1     2
0  17.0   0.0   0.0
1   0.0  15.0   0.0
2   0.0   4.0  14.0
Accuracy: 78.75%
Confusion matrix (col header is actual val, row header is predicted val): 
     0     1    2     3    4    5    6     7     8    9    10    11   12   13  \
0   0.0   0.0  0.0   0.0  0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0  0.0  0.0   
1   0.0  29.0  0.0   0.0  0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0  0.0  0.0   
2   0.0   0.0  4.0   0.0  0.0  0.0  0.0   0.0   0.0  0.0  0.0   0.0  0.0  0.0   
3   0.0   1.0  0.0  10.0  0.0  0.0  0.0   1.0   0.0  0.0  0.0   0.0  0.0  0.0   
4   0.0   1.0  0.0   0.0  4.0  0.0  0.0   1.0   0.0  0.0  0.0   0.0  1.0  0.0   
5   0.0   1.0  0.0   0.0  0.0  8.0  0.0   0.0   0.0  0.0  0.0   0.0  0.0  0.0   
6   0.0  10.0  0.0   0.0  0.0  0.0  3.0   0.0   0.0  0.0  0.0   1.0  0.0  0.0   
7   0.0   0.0  0.0   0.0  0.0  0.0  0.0  10.0   0.0  0.0  0.0   0.0  0.0  0.0   
8   0.0 

==============================
The cells below are scratch/testing cells; they are not part of the analysis.

In [None]:

# Scratch check on the iris data: which predictions were made for the test
# examples whose actual label is 2.
# Relies on `k` assigned in an earlier cell.
train_x, train_y, test_x, test_y, classes = load_dataset('dataset/iris.trn', 'dataset/iris.tst')
predict = kNN('dataset/iris.trn', 'dataset/iris.tst', k)
# test_y is a 2-D column, so np.where returns a (row indices, column indices) pair.
idx = np.where(test_y==2)
print(idx)
print(predict[idx])
# Number of actual-2 examples predicted as 1.
# NOTE(review): bare expression that is not the last line — if this is a single
# cell its value is not displayed; confirm whether a print was intended.
np.count_nonzero(predict[idx]==1)

np.arange(0, 5, 1)

In [None]:

# Scratch check of confusion_matrix and its DataFrame rendering.
# Uses `predict` and `test_y` left behind by a previous cell.
# col_title = [0, 1 , 2, 3]
row_title = label_list = [str(i) for i in range(3+1)]
# classes=3 makes confusion_matrix build a 4x4 table (labels 0..3).
matrix = confusion_matrix(predict, test_y, 3)
print(matrix)
df = pd.DataFrame.from_records(data=matrix, index =row_title)
print(df)
# print(col_title)
# print(confusion_matrix(predict, test_y, 3))

In [None]:
# Scratch experiment: select the 3 smallest values with argpartition and
# count how often each value 0..5 occurs in `arr`.
arr = np.array([1, 2, 3, 4, 5, 5])
print(arr)
# the first 3 positions after partitioning hold the 3 smallest values (unsorted)
idx = np.argpartition(arr, 3)[:3]
print(arr[idx])
# o[0][i] = number of entries of arr equal to i
o = np.zeros((1, 6))
for i in range(6):
    o[0][i] = np.sum(arr==i)
print(o[0])
# np.max yields the highest count itself, not the value that occurs most often.
# Note: this rebinds the global name `predict`.
predict = np.max(o[0])
print(predict)

In [None]:
#ex1.1 data
# Hand-written toy dataset: 10 training examples with 2 features each.
# Note: this rebinds the global names train_x, train_y and test_x.
train_x = np.array([[0.376000, 0.488000],
                  [0.312000, 0.544000],
                  [0.298000, 0.624000],
                  [0.394000, 0.600000],
                  [0.506000, 0.512000],
                  [0.488000, 0.334000],
                  [0.478000, 0.398000],
                  [0.606000, 0.366000],
                  [0.428000, 0.294000],
                  [0.542000, 0.252000]]
                  )
# One label per training row: the first five rows are class 0, the last five class 1.
train_y = np.array([[0], [0], [0], [0], [0], [1], [1], [1], [1], [1]])

# Four 2-feature points to classify.
test_x = np.array([[0.550000, 0.364000],
                  [0.558000, 0.470000],
                  [0.456000, 0.450000],
                  [0.450000, 0.570000]]
                  )