In [1]:
import pandas as pd

# Load the dataset; it is read without a header row, so pandas labels the
# columns with consecutive integers starting at 0
dataset = pd.read_csv("dataset/occupancy_detection/datatraining.txt", header=None)

# The class label is taken from the last column
class_column = dataset.shape[1] - 1

# Inspect the loaded data
print(dataset)

class_names = pd.unique(dataset[class_column])

# Report the number of instances found for each class
for label in class_names:
    instance_count = len(dataset.loc[dataset[class_column] == label])
    print(str(label) + ': ' + str(instance_count))

              0          1      2           3         4  5
0     23.180000  27.272000  426.0  721.250000  0.004793  1
1     23.150000  27.267500  429.5  714.000000  0.004783  1
2     23.150000  27.245000  426.0  713.500000  0.004779  1
3     23.150000  27.200000  426.0  708.250000  0.004772  1
4     23.100000  27.200000  426.0  704.500000  0.004757  1
5     23.100000  27.200000  419.0  701.000000  0.004757  1
6     23.100000  27.200000  419.0  701.666667  0.004757  1
7     23.100000  27.200000  419.0  699.000000  0.004757  1
8     23.100000  27.200000  419.0  689.333333  0.004757  1
9     23.075000  27.175000  419.0  688.000000  0.004745  1
10    23.075000  27.150000  419.0  690.250000  0.004741  1
11    23.100000  27.100000  419.0  691.000000  0.004739  1
12    23.100000  27.166667  419.0  683.500000  0.004751  1
13    23.050000  27.150000  419.0  687.500000  0.004734  1
14    23.000000  27.125000  419.0  686.000000  0.004715  1
15    23.000000  27.125000  418.5  680.500000  0.004715 

In [2]:
from math import sqrt

def euclidean(p, q):
    """Return the Euclidean distance between ``p`` and ``q``.

    Both arguments must support ``len()`` and lookup by the integers
    ``0 .. len - 1``. When their lengths differ no distance is computed and
    the sentinel ``-1`` is returned instead.
    """
    dimensions = len(p)
    if dimensions != len(q):
        return -1

    squared_total = 0
    for axis in range(dimensions):
        delta = q[axis] - p[axis]
        squared_total += delta ** 2

    return sqrt(squared_total)

def manhattan(p, q):
    """Return the Manhattan (sum of absolute differences) distance.

    Both arguments must support ``len()`` and lookup by the integers
    ``0 .. len - 1``. When their lengths differ no distance is computed and
    the sentinel ``-1`` is returned instead.
    """
    dimensions = len(p)
    if dimensions != len(q):
        return -1

    total = 0
    for axis in range(dimensions):
        gap = p[axis] - q[axis]
        total += abs(gap)

    return total

In [8]:
import numpy as np

# Fixed seed so that Restart & Run All reproduces the same shuffle, the same
# splits and therefore the same accuracies
SEED = 42
BLIND_PER_CLASS = 100  # rows per class held out as the blind set
POOL_PER_CLASS = 200   # rows per class kept for the train/test split

# Shuffle the rows. sort_index() first puts the rows back in index order
# (read_csv assigned the default 0..n-1 index), so re-running this cell
# reproduces the same permutation instead of shuffling an already shuffled frame.
shuffle_order = np.random.RandomState(SEED).permutation(len(dataset))
dataset = dataset.sort_index().iloc[shuffle_order]

# Split the shuffled data by class
occupied_all     = dataset.loc[dataset[class_column] == class_names[0]]
not_occupied_all = dataset.loc[dataset[class_column] == class_names[1]]

# Blind set: the last BLIND_PER_CLASS rows of each class
blind = pd.concat([    occupied_all.iloc[-BLIND_PER_CLASS:],
                   not_occupied_all.iloc[-BLIND_PER_CLASS:]])

# Train/test pool: drawn only from the rows that precede the blind rows, so the
# two can never overlap even when a class has fewer than
# BLIND_PER_CLASS + POOL_PER_CLASS rows (the pool simply gets smaller)
occupied     =     occupied_all.iloc[:-BLIND_PER_CLASS].iloc[:POOL_PER_CLASS]
not_occupied = not_occupied_all.iloc[:-BLIND_PER_CLASS].iloc[:POOL_PER_CLASS]

In [9]:
train_percentage = 0.8

# Position at which each class is cut into its training part and its test part.
# Note the "+ 1": the cut lands one row past floor(len * train_percentage).
occupied_cut     = int(len(occupied)     * train_percentage + 1)
not_occupied_cut = int(len(not_occupied) * train_percentage + 1)

# Training set: the rows of each class that come before its cut
trainset = pd.concat([occupied[:occupied_cut], not_occupied[:not_occupied_cut]])

# Test set: the remaining rows of each class
testset = pd.concat([occupied[occupied_cut:], not_occupied[not_occupied_cut:]])

In [10]:
from operator import itemgetter
from collections import Counter

# Classification function
def knn(k, train, element, function):
    """Classify ``element`` by majority vote among its ``k`` nearest training rows.

    Parameters
    ----------
    k : int
        Number of neighbours that vote; must be at least 1. If ``k`` exceeds
        the number of training rows, every row votes.
    train : pandas.DataFrame
        Training rows. The last column holds the class label; all preceding
        columns are passed to ``function`` as the row's features.
    element : sequence
        Sample to classify. Only its first ``len(train.columns) - 1`` entries
        (taken with a plain slice) are passed to ``function``.
    function : callable
        Distance function, called as ``function(row_features, element_features)``.
        Rows with the smallest results are the nearest neighbours.

    Returns
    -------
    The label with the most votes. On a tie, the tied label that appears first
    among the distance-sorted neighbours is returned.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``train`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got " + str(k))

    local_class_column = len(train.columns) - 1

    # The element's features do not change between rows, so slice them once
    element_features = element[0:local_class_column]

    # One (distance, label) pair per training row; .iloc keeps the access
    # positional regardless of how the columns are labelled
    scored = [
        (function(row.iloc[:local_class_column], element_features),
         row.iloc[local_class_column])
        for _, row in train.iterrows()
    ]
    if not scored:
        raise ValueError("train must contain at least one row")

    # Stable sort: rows at the same distance keep their order from ``train``
    scored.sort(key=itemgetter(0))

    votes = Counter(label for _, label in scored[0:k])
    return max(votes, key=votes.get)

# Accuracy evaluation function
def evaluate(k, train, test, function=None):
    """Return the fraction of rows in ``test`` that ``knn`` labels correctly.

    Parameters
    ----------
    k : int
        Number of neighbours forwarded to ``knn``.
    train : pandas.DataFrame
        Training rows forwarded to ``knn``; its column count also determines
        which position of each test row is read as the true label (the last one).
    test : pandas.DataFrame
        Rows to classify, laid out like ``train``.
    function : callable, optional
        Distance function forwarded to ``knn``. When omitted, ``euclidean`` is
        used, which was the only metric this function supported before.

    Returns
    -------
    float
        Number of correct predictions divided by ``len(test)``.

    Raises
    ------
    ZeroDivisionError
        If ``test`` has no rows.
    """
    if function is None:
        # Resolved at call time so existing three-argument calls keep working
        function = euclidean

    local_class_column = len(train.columns) - 1

    hits = 0
    for _, row in test.iterrows():
        if knn(k, train, row, function) == row.iloc[local_class_column]:
            hits += 1

    return hits / len(test)

# Accuracy for every candidate configuration
def evaluate_by_config(train, test, function=None, k_values=range(1, 11)):
    """Print the accuracy obtained on ``test`` for each ``k`` in ``k_values``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Forwarded unchanged to ``evaluate``.
    function : callable, optional
        Distance function forwarded to ``evaluate`` (which falls back to
        ``euclidean`` when it is ``None``).
    k_values : iterable of int, optional
        Values of ``k`` to try, in order. Defaults to 1 through 10.

    Returns
    -------
    None
        One line per ``k`` is written to standard output.
    """
    for k in k_values:
        print("K = " + str(k) + ", acc = " + str(evaluate(k, train, test, function)))

In [11]:
# Check which configuration works best: print the test-set accuracy for each k
evaluate_by_config(trainset, testset)

K = 1, acc = 1.0
K = 2, acc = 1.0
K = 3, acc = 1.0
K = 4, acc = 1.0
K = 5, acc = 1.0
K = 6, acc = 1.0
K = 7, acc = 1.0
K = 8, acc = 1.0
K = 9, acc = 0.9871794871794872
K = 10, acc = 0.9871794871794872


In [7]:
# Same sweep over k, scored against the blind rows that were set aside before
# the train/test split.
# NOTE(review): this cell's execution count (7) is lower than that of the cells
# above it (8-11), so the stored output may predate their latest run — re-run
# the notebook top to bottom to confirm the numbers.
evaluate_by_config(trainset, blind)

K = 1, acc = 0.99
K = 2, acc = 0.99
K = 3, acc = 0.995
K = 4, acc = 0.995
K = 5, acc = 0.995
K = 6, acc = 0.995
K = 7, acc = 0.995
K = 8, acc = 0.995
K = 9, acc = 0.995
K = 10, acc = 0.995
