In [1]:
import math
import numpy as np
data = []

In [2]:
with open('haberman.data', 'r') as f:
    for line in f.readlines():
        atributes = line.strip('\n').split(',')
        data.append([int(x) for x in atributes])

In [3]:
(data)

[[30, 64, 1, 1],
 [30, 62, 3, 1],
 [30, 65, 0, 1],
 [31, 59, 2, 1],
 [31, 65, 4, 1],
 [33, 58, 10, 1],
 [33, 60, 0, 1],
 [34, 59, 0, 2],
 [34, 66, 9, 2],
 [34, 58, 30, 1],
 [34, 60, 1, 1],
 [34, 61, 10, 1],
 [34, 67, 7, 1],
 [34, 60, 0, 1],
 [35, 64, 13, 1],
 [35, 63, 0, 1],
 [36, 60, 1, 1],
 [36, 69, 0, 1],
 [37, 60, 0, 1],
 [37, 63, 0, 1],
 [37, 58, 0, 1],
 [37, 59, 6, 1],
 [37, 60, 15, 1],
 [37, 63, 0, 1],
 [38, 69, 21, 2],
 [38, 59, 2, 1],
 [38, 60, 0, 1],
 [38, 60, 0, 1],
 [38, 62, 3, 1],
 [38, 64, 1, 1],
 [38, 66, 0, 1],
 [38, 66, 11, 1],
 [38, 60, 1, 1],
 [38, 67, 5, 1],
 [39, 66, 0, 2],
 [39, 63, 0, 1],
 [39, 67, 0, 1],
 [39, 58, 0, 1],
 [39, 59, 2, 1],
 [39, 63, 4, 1],
 [40, 58, 2, 1],
 [40, 58, 0, 1],
 [40, 65, 0, 1],
 [41, 60, 23, 2],
 [41, 64, 0, 2],
 [41, 67, 0, 2],
 [41, 58, 0, 1],
 [41, 59, 8, 1],
 [41, 59, 0, 1],
 [41, 64, 0, 1],
 [41, 69, 8, 1],
 [41, 65, 0, 1],
 [41, 65, 0, 1],
 [42, 69, 1, 2],
 [42, 59, 0, 2],
 [42, 58, 0, 1],
 [42, 60, 1, 1],
 [42, 59, 2, 1],
 [42, 

In [4]:
def info_dataset(data, verbose=True):
    label1, label2 = 0, 0
    data_size = len(data)
    for datum in data:
        if datum[-1] == 1:
            label1 += 1
        else:
            label2 += 1
    if verbose:
        print('Total of samples: %d' % data_size)
        print('Total label 1: %d' % label1)
        print('Total label 2: %d' % label2)
    return [len(data), label1, label2]

In [5]:
info_dataset(data)

Total of samples: 306
Total label 1: 225
Total label 2: 81


[306, 225, 81]

In [6]:
p = 0.6
_, label1, label2 = info_dataset(data,False)

In [7]:
def standardization():
	v1 = [a[0] for a in data]
	v2 = [a[1] for a in data]
	v3 = [a[2] for a in data]

	mean_v1 = np.mean(v1)
	mean_v2 = np.mean(v2)
	mean_v3 = np.mean(v3)

	std_v1 = np.std(v1)
	std_v2 = np.std(v2)
	std_v3 = np.std(v3)

	for d in data:
		d[0] = (d[0]-mean_v1)/std_v1
		d[1] = (d[1]-mean_v2)/std_v2
		d[2] = (d[2]-mean_v3)/std_v3

In [8]:
standardization()

In [9]:
# train data / test data 나누기
train_set, test_set = [], []
max_label1, max_label2 = int(p * label1), int(p * label2)
total_label1, total_label2 = 0, 0
for sample in data:
    if (total_label1 + total_label2) < (max_label1 + max_label2):
        train_set.append(sample)
        if sample[-1] == 1 and total_label1 < max_label1:
            total_label1 += 1
        else:
            total_label2 += 1
    else:
        test_set.append(sample)

In [10]:
def euclidian_dist(p1, p2):
    dim, sum_ = len(p1), 0
    for index in range(dim - 1):
        sum_ += math.pow(p1[index] - p2[index], 2)
    return math.sqrt(sum_)

In [11]:
def manhattan_dist(p1,p2):
	dim, dist = len(p1),0
	for index in range(dim-1):
		dist += abs(p1[index] - p2[index])

	return dist

In [12]:
def knn(train_set, new_sample, K):
    dists, train_size = {}, len(train_set)
    
    for i in range(train_size):
        d = manhattan_dist(train_set[i], new_sample)
        dists[i] = d
    
    k_neighbors = sorted(dists, key=dists.get)[:K]
    
    qty_label1, qty_label2 = 0, 0
    for index in k_neighbors:
        if train_set[index][-1] == 1:
            qty_label1 += 1
        else:
            qty_label2 += 1
            
    if qty_label1 > qty_label2:
        return 1
    else:
        return 2

In [13]:
# correct 수
correct, K = 0, 15
tn, fp, fn, tp = 0, 0, 0, 0
for sample in test_set:
    label = knn(train_set, sample, K)
    if sample[-1] == label == 2:
        tn += 1
        correct += 1

    elif sample[-1] == label == 1:
        tp += 1
        correct += 1

    elif sample[-1] == 1 and label == 2:
        fn += 1

    elif sample[-1] == 2 and label == 1:
        fp += 1

n = tn + fp
p = fn + tp
n_p = tn + fn
p_p = fp + tp


In [14]:
print("Train set size: {}".format(len(train_set)))
print("Test set size: {}".format(len(test_set)))
print("Correct predicitons: {}".format(correct))
print("Accuracy: {}%".format(round(100 * correct / len(test_set),2)))
print(f"Presicion: {round(tp/p_p*100,2)}%")
print(f"Recall: {round(tp/p*100,2)}%")

Train set size: 183
Test set size: 123
Correct predicitons: 94
Accuracy: 76.42%
Presicion: 80.77%
Recall: 90.32%
