In [1]:
import math
import re
import random


class Attribute(object):
    def __init__(self, name, domain):
        self.name = name
        self.domain = domain


def LOAD_ATTRIBUTES(path):
    attributes = list()
    with open(path) as f:
        for i in range(0, 96):
            f.readline()
        for i in range(96, 110):
            l = re.findall(r'[^:,\.\s]+', f.readline())
            if l[1:] == ['continuous']:
                attributes.append(Attribute(l[0], 'continuous'))
            else:
                attributes.append(Attribute(l[0], l[1:]))
    return attributes


def LOAD_TRAINING_EXAMPLES(path):
    missing_examples = list()
    counting_table = dict()
    counting_table_for_mode = dict()
    with open(path) as f:
        line = f.readline()
        while line != '\n':
            l = re.findall(r'[^,\s]+', line)
            for i in range(len(attributes)):
                name = attributes[i].name
                domain = attributes[i].domain
                if l[i] == '?':
                    missing_examples.append((l[-1], name))
                else:
                    if l[-1] not in counting_table:
                        counting_table[l[-1]] = dict()
                    if domain == 'continuous':
                        if name not in counting_table[l[-1]]:
                            counting_table[l[-1]][name] = list()
                        counting_table[l[-1]][name].append(float(l[i]))
                        if name not in counting_table_for_mode:
                            counting_table_for_mode[name] = list()
                        counting_table_for_mode[name].append(float(l[i]))
                    else:
                        if name not in counting_table[l[-1]]:
                            counting_table[l[-1]][name] = dict()
                            for value in domain:
                                counting_table[l[-1]][name][value] = 0
                        counting_table[l[-1]][name][l[i]] += 1
                        if name not in counting_table_for_mode:
                            counting_table_for_mode[name] = dict()
                            for value in domain:
                                counting_table_for_mode[name][value] = 0
                        counting_table_for_mode[name][l[i]] += 1
            line = f.readline()
    mode = dict()
    for attribute in attributes:
        name = attribute.name
        if attribute.domain == 'continuous':
            counting = counting_table_for_mode[name]
            mode[name] = sum(counting) / len(counting)
        else:
            mode[name] = max(counting_table_for_mode[name], key = lambda k: counting_table_for_mode[name][k])
    for label, name in missing_examples:
        if attribute.domain == 'continuous':
            counting_table[label][name].append(mode[name])
        else:
            counting_table[label][name][mode[name]] += 1
    counting_table_total = dict()
    mu = dict()
    sigma = dict()
    for attribute in attributes:
        name = attribute.name
        if attribute.domain == 'continuous':
            mu[name] = dict()
            sigma[name] = dict()
            for label in counting_table:
                counting = counting_table[label][name]
                n = len(counting)
                if label not in counting_table_total:
                    counting_table_total[label] = n
                mu[name][label] = sum(counting) / n
                sigma[name][label] = (sum([(t - mu[name][label]) ** 2 for t in counting]) / (n - 1)) ** 0.5
    return counting_table, counting_table_total, mu, sigma, mode


def NAIVE_BAYES_PREDICT(example_attributes, counting_table, counting_table_total, mu, sigma, mode, attributes):
    P_max = 0
    for label in counting_table:
        P = counting_table_total[label]
        for attribute in attributes:
            name = attribute.name
            value = example_attributes[name]
            if value == '?':
                value = mode[name]
            if attribute.domain == 'continuous':
                x = float(value)
                P *= math.exp(-(x - mu[name][label]) ** 2 / (2 * sigma[name][label] ** 2))\
                    / ((2 * math.pi) ** 0.5 * sigma[name][label])
            else:
                P *= counting_table[label][name][value] / counting_table_total[label]
        if P > P_max:
            P_max = P
            result = label
    return result


def NAIVE_BAYES_TESTING(path, counting_table, counting_table_total, mu, sigma, mode, attributes):
    TP = 0.0
    FP = 0.0
    TN = 0.0
    FN = 0.0
    positive = None
    with open(path) as f:
        f.readline()
        line = f.readline()
        while line != '\n':
            l = re.findall(r'[^,.\s]+', line)
            example_attributes = {attributes[i].name: l[:-1][i] for i in range(len(attributes))}
            if positive is None:
                positive = l[-1]
            classification = NAIVE_BAYES_PREDICT(example_attributes, counting_table, counting_table_total, mu, sigma, mode, attributes)
            if l[-1] == positive:
                if classification == positive:
                    TP += 1
                else:
                    FP += 1
            else:
                if classification != positive:
                    TN += 1
                else:
                    FN += 1
            line = f.readline()
    accuracy = (TP + TN) / (TP + FP + TN + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1_score = 2 * precision * recall / (precision + recall)
    print('Accuracy: ', accuracy)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('F1-score: ', f1_score)

In [2]:
attributes = LOAD_ATTRIBUTES("dataSet/adult.names")
counting_table, counting_table_total, mu, sigma, mode = LOAD_TRAINING_EXAMPLES("dataSet/adult.data")
NAIVE_BAYES_TESTING("dataSet/adult.test", counting_table, counting_table_total, mu, sigma, mode, attributes)

Accuracy:  0.8296173453719059
Precision:  0.9310012062726176
Recall:  0.8580004446750167
F1-score:  0.8930114162295588
