In [1]:
import pandas as pd

# Carregando o dataset
dataset = pd.read_csv("dataset/haberman/haberman.csv", header=None)

class_column = len (dataset.columns) - 1

# Checando os dados
print (dataset)

class_names = pd.unique(dataset[class_column])

# Descobrindo o número de instâncias por classes
for i in class_names:
    print( str(i) + ': ' + str(len (dataset.loc[dataset[class_column] == i])) )

      0   1   2  3
0    30  64   1  1
1    30  62   3  1
2    30  65   0  1
3    31  59   2  1
4    31  65   4  1
5    33  58  10  1
6    33  60   0  1
7    34  59   0  2
8    34  66   9  2
9    34  58  30  1
10   34  60   1  1
11   34  61  10  1
12   34  67   7  1
13   34  60   0  1
14   35  64  13  1
15   35  63   0  1
16   36  60   1  1
17   36  69   0  1
18   37  60   0  1
19   37  63   0  1
20   37  58   0  1
21   37  59   6  1
22   37  60  15  1
23   37  63   0  1
24   38  69  21  2
25   38  59   2  1
26   38  60   0  1
27   38  60   0  1
28   38  62   3  1
29   38  64   1  1
..   ..  ..  .. ..
276  67  66   0  1
277  67  61   0  1
278  67  65   0  1
279  68  67   0  1
280  68  68   0  1
281  69  67   8  2
282  69  60   0  1
283  69  65   0  1
284  69  66   0  1
285  70  58   0  2
286  70  58   4  2
287  70  66  14  1
288  70  67   0  1
289  70  68   0  1
290  70  59   8  1
291  70  63   0  1
292  71  68   2  1
293  72  63   0  2
294  72  58   0  1
295  72  64   0  1
296  72  67 

In [2]:
from math import sqrt

def euclidean(p, q):
    if len(p) != len (q):
        return -1
    
    local_sum = 0
    for i in range(0, len(p)):
        local_sum += pow(q[i] - p[i], 2)
    
    return sqrt (local_sum)

def manhattan(p, q):
    if len(p) != len (q):
        return -1
    
    local_sum = 0
    for i in range(0, len(p)):
        local_sum += abs(p[i] - q[i])
    
    return local_sum

In [3]:
import numpy as np

# Embaralhando os dados
dataset = dataset.iloc[np.random.permutation(len(dataset))]

# Separando o dataset por classes
survived = dataset.loc[dataset[class_column] == class_names[0]]
died     = dataset.loc[dataset[class_column] == class_names[1]]

min_class = min(len(survived), len(died))
#min_class = 60

blind = pd.concat([survived[min_class:len(survived)],
                       died[min_class:len(died)]])

survived = survived[0:min_class]
died     = died[0:min_class]

In [4]:
train_percentage = 0.8

# Obtendo os conjuntos de treino e de testes
trainset = pd.concat([    survived[0: int (len(survived) * train_percentage + 1)],
                              died[0: int (len(died)     * train_percentage + 1)]])

testset =  pd.concat([    survived[int (len(survived) * train_percentage + 1) : ],
                              died[int (len(died)     * train_percentage + 1) : ]])

In [5]:
from operator import itemgetter
from collections import Counter

# Criando a função de classificação
def knn(k, train, element, function):
    distance = []
    
    local_class_column = len (train.columns) - 1
    
    for _, row in train.iterrows():
        distance.append((function(row[0:local_class_column], element[0:local_class_column]), row[local_class_column]))
    
    distance.sort(key=itemgetter(0))
    distance = [classes[1] for classes in distance[0:k]]
    
    most_common = Counter(distance)
    #print("Classification: " + max(most_common, key=most_common.get) + ", " + element[4])
    return max(most_common, key=most_common.get)

# Função de avaliação de acurácia
def evaluate(k, train, test):
    acc = 0
    
    local_class_column = len (train.columns) - 1
    
    for _, row in test.iterrows():
        if( knn(k, train, row, euclidean) == row[local_class_column] ):
            acc += 1
    
    return acc / len(test)

# Descobrindo a acurácia para todas as configurações possíveis
def evaluate_by_config(train, test):
    for k in range(1,11):
        print("K = " + str(k) + ", acc = " + str(evaluate(k, train, test)))

In [6]:
# Checando a melhor configuração
evaluate_by_config(trainset, testset)

K = 1, acc = 0.5625
K = 2, acc = 0.5625
K = 3, acc = 0.59375
K = 4, acc = 0.59375
K = 5, acc = 0.625
K = 6, acc = 0.65625
K = 7, acc = 0.625
K = 8, acc = 0.59375
K = 9, acc = 0.59375
K = 10, acc = 0.5625


In [7]:
evaluate_by_config(trainset, blind)

K = 1, acc = 0.6180555555555556
K = 2, acc = 0.6180555555555556
K = 3, acc = 0.5972222222222222
K = 4, acc = 0.6180555555555556
K = 5, acc = 0.6319444444444444
K = 6, acc = 0.6597222222222222
K = 7, acc = 0.6388888888888888
K = 8, acc = 0.6527777777777778
K = 9, acc = 0.6527777777777778
K = 10, acc = 0.7013888888888888
