In [20]:
import csv
import random
import math
import sys
import numpy as np

In [2]:
def load_data(filename):
    lines = csv.reader(open(filename, "rb"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]

    return dataset

In [3]:
def separate_data(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)

    return separated

In [4]:
def split_data(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))

    return [trainSet, copy]

In [5]:
def mean(numbers):
    return sum(numbers) / float(len(numbers))

In [6]:
def standard_deviation(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)

    return math.sqrt(variance)

In [7]:
def summarize(dataset):
    summaries = [(mean(attribute), standard_deviation(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]

    return summaries

In [8]:
def summarize_by_class(dataset):
    separated = separate_data(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)

    return summaries

In [9]:
def calculate_prob(x, mean, stdev):
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))

    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

In [10]:
def calculate_class_prob(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculate_prob(x, mean, stdev)

    return probabilities

In [11]:
def predict(summaries, inputVector):
    probabilities = calculate_class_prob(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue

    return bestLabel

In [12]:
def get_predictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)

    return predictions

In [13]:
def get_accuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1

    return (correct / float(len(testSet))) * 100.0

def get_data_label(dataset):
    data = []
    label = []
    for x in dataset:
        data.append(x[:8])
        label.append(x[-1])

    return data, label


In [43]:
import pandas as pd
filename = 'data\heart.csv'
splitRatio = 0.8
dataset = pd.read_csv(filename).values
# dataset = load_data(filename)
# import pickle 

# # Đọc dữ liệu bệnh tim, phân loại nhị phân
# with open('preprocessing/heart/X.pkl', 'rb') as f:
#     X = pickle.load(f)
# with open('preprocessing/heart/y.pkl', 'rb') as f:
#     y = pickle.load(f)

# dataset = np.concatenate((X.values, y.values.reshape(-1, 1)), axis=1)
dataset 
trainingSet, testSet = split_data(dataset, splitRatio)
print(f'Data size {len(dataset)} \nTraining Size={len(trainingSet)} \nTest Size={len(testSet)}')

Data size 1025 
Training Size=820 
Test Size=205


In [44]:
summaries = summarize_by_class(trainingSet)
get_data_label(trainingSet)

([array([ 59.,   1.,   2., 150., 212.,   1.,   1., 157.]),
  array([ 45.,   0.,   1., 130., 234.,   0.,   0., 175.]),
  array([ 55.,   0.,   0., 128., 205.,   0.,   2., 130.]),
  array([ 44.,   0.,   2., 108., 141.,   0.,   1., 175.]),
  array([ 59.,   1.,   0., 138., 271.,   0.,   0., 182.]),
  array([ 53.,   0.,   0., 130., 264.,   0.,   0., 143.]),
  array([ 58.,   1.,   0., 125., 300.,   0.,   0., 171.]),
  array([ 61.,   0.,   0., 130., 330.,   0.,   0., 169.]),
  array([ 52.,   1.,   1., 120., 325.,   0.,   1., 172.]),
  array([ 55.,   0.,   0., 180., 327.,   0.,   2., 117.]),
  array([ 41.,   0.,   2., 112., 268.,   0.,   0., 172.]),
  array([ 59.,   1.,   2., 126., 218.,   1.,   1., 134.]),
  array([ 51.,   0.,   2., 130., 256.,   0.,   0., 149.]),
  array([ 43.,   1.,   0., 115., 303.,   0.,   1., 181.]),
  array([ 54.,   1.,   0., 110., 239.,   0.,   1., 126.]),
  array([ 52.,   1.,   0., 112., 230.,   0.,   1., 160.]),
  array([ 68.,   1.,   2., 180., 274.,   1.,   0., 150.]

In [45]:
# test model
predictions = get_predictions(summaries, testSet)
accuracy = get_accuracy(testSet, predictions)
print(f'Accuracy of my implement: {accuracy}%')

Accuracy of my implement: 82.92682926829268%


In [47]:
# Compare with sklearn
dataTrain, labelTrain = get_data_label(trainingSet)
dataTest, labelTest = get_data_label(testSet)

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
clfs = [GaussianNB(), BernoulliNB(), ComplementNB(), CategoricalNB()]
for clf in clfs:
    clf.fit(dataTrain, labelTrain)

    score = clf.score(dataTest, labelTest) * 100

    print(f'Accuracy of sklearn {clf}: {score}%')

Accuracy of sklearn GaussianNB(): 79.51219512195122%
Accuracy of sklearn BernoulliNB(): 76.09756097560975%
Accuracy of sklearn ComplementNB(): 72.1951219512195%
Accuracy of sklearn CategoricalNB(): 87.8048780487805%
