In [217]:
import csv
import random
import math
import sys
import numpy as np

In [218]:
def load_data(filename):
    lines = csv.reader(open(filename, "rb"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]

    return dataset

In [219]:
def separate_data(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)

    return separated

In [220]:
def split_data(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))

    return [trainSet, copy]

In [221]:
def mean(numbers):
    return sum(numbers) / float(len(numbers))

In [222]:
def standard_deviation(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)

    return math.sqrt(variance)

In [223]:
def summarize(dataset):
    summaries = [(mean(attribute), standard_deviation(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]

    return summaries

In [224]:
def summarize_by_class(dataset):
    separated = separate_data(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)

    return summaries

In [225]:
def calculate_prob(x, mean, stdev):
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))

    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

In [226]:
def calculate_class_prob(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        # print(len(classSummaries))
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            # print(mean, stdev)
            x = inputVector[i]
            if stdev != 0:
                probabilities[classValue] *= calculate_prob(x, mean, stdev)
        
    return probabilities

In [227]:
def predict(summaries, inputVector):
    probabilities = calculate_class_prob(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue

    return bestLabel

In [228]:
def get_predictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)

    return predictions

In [229]:
def get_accuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1

    return (correct / float(len(testSet))) * 100.0

def get_data_label(dataset):
    data = []
    label = []
    for x in dataset:
        data.append(x[:8])
        label.append(x[-1])

    return data, label


In [230]:
import pandas as pd
# filename = 'data\heart.csv'
splitRatio = 0.8
# dataset = pd.read_csv(filename).values
# dataset = load_data(filename)
import pickle 

# Đọc dữ liệu bệnh tim, phân loại nhị phân
with open('preprocessing/cirrhosis/X.pkl', 'rb') as f:
    X = pickle.load(f)
with open('preprocessing/cirrhosis/y.pkl', 'rb') as f:
    y = pickle.load(f)

dataset = np.concatenate((X.values, y.values.reshape(-1, 1)), axis=1)
dataset 
trainingSet, testSet = split_data(dataset, splitRatio)
print(f'Data size {len(dataset)} \nTraining Size={len(trainingSet)} \nTest Size={len(testSet)}')

Data size 412 
Training Size=329 
Test Size=83


In [231]:
dataTrain, labelTrain = get_data_label(trainingSet)
labelTrain

[2.0,
 3.0,
 3.0,
 0.0,
 1.0,
 1.0,
 3.0,
 2.0,
 1.0,
 2.0,
 3.0,
 2.0,
 2.0,
 0.0,
 3.0,
 1.0,
 2.0,
 2.0,
 2.0,
 1.0,
 2.0,
 0.0,
 3.0,
 0.0,
 1.0,
 0.0,
 3.0,
 3.0,
 1.0,
 3.0,
 0.0,
 2.0,
 1.0,
 3.0,
 3.0,
 1.0,
 1.0,
 2.0,
 1.0,
 3.0,
 3.0,
 1.0,
 1.0,
 3.0,
 2.0,
 1.0,
 2.0,
 2.0,
 2.0,
 1.0,
 3.0,
 1.0,
 3.0,
 0.0,
 3.0,
 3.0,
 3.0,
 0.0,
 2.0,
 1.0,
 3.0,
 3.0,
 2.0,
 1.0,
 3.0,
 3.0,
 2.0,
 3.0,
 3.0,
 3.0,
 2.0,
 3.0,
 3.0,
 3.0,
 0.0,
 3.0,
 1.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 1.0,
 1.0,
 2.0,
 3.0,
 2.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 3.0,
 3.0,
 3.0,
 2.0,
 2.0,
 2.0,
 3.0,
 2.0,
 2.0,
 2.0,
 3.0,
 2.0,
 3.0,
 2.0,
 1.0,
 3.0,
 1.0,
 2.0,
 2.0,
 3.0,
 3.0,
 3.0,
 3.0,
 2.0,
 2.0,
 3.0,
 3.0,
 1.0,
 2.0,
 2.0,
 3.0,
 2.0,
 3.0,
 2.0,
 3.0,
 1.0,
 0.0,
 3.0,
 1.0,
 2.0,
 3.0,
 1.0,
 2.0,
 3.0,
 0.0,
 3.0,
 2.0,
 1.0,
 2.0,
 3.0,
 2.0,
 2.0,
 3.0,
 2.0,
 3.0,
 2.0,
 2.0,
 1.0,
 2.0,
 3.0,
 2.0,
 3.0,
 3.0,
 1.0,
 1.0,
 1.0,
 2.0,
 1.0,
 3.0,
 3.0,
 1.0,
 2.0,
 3.0,
 1.0,
 3.0

In [232]:
summaries = summarize_by_class(trainingSet)
get_data_label(trainingSet)

([array([ 0.83136905, -0.61655607,  0.2523181 ,  0.42492495, -1.0308418 ,
         -0.60437573, -1.34663409,  0.11685337]),
  array([ 0.73601158, -0.05146197, -0.1510048 , -0.09463795,  0.39451412,
          0.01983422,  0.16162895,  0.0873027 ]),
  array([ 1.88030127, -0.59395231, -0.1510048 ,  1.36958476, -0.6182738 ,
         -0.70703123, -0.97108265,  0.0873027 ]),
  array([-2.12471266, -0.50353725, -0.66122633,  0.63747341, -0.27224902,
         -0.05562563, -0.50164335, -0.43168103]),
  array([-0.78970802, -0.57134854,  1.08943258,  0.40130845, -0.83121212,
         -0.33725644,  0.09297976, -0.35780434]),
  array([ 1.11744147, -0.48093348,  0.16504336,  0.23599299, -0.24563172,
          0.80162844,  0.65630691,  0.51024673]),
  array([-0.88506549,  0.51363213, -0.1510048 , -2.81053489,  0.39451412,
          0.01983422,  0.16162895,  0.0873027 ]),
  array([ 1.30815642, -0.54874478, -0.70253982,  0.87363836,  1.00538094,
          3.77799306,  0.10529623, -0.52402688]),
  array(

In [233]:
# test model
predictions = get_predictions(summaries, testSet)
accuracy = get_accuracy(testSet, predictions)
print(f'Accuracy of my implement: {accuracy}%')

Accuracy of my implement: 61.44578313253012%


In [234]:
# Compare with sklearn
dataTrain, labelTrain = get_data_label(trainingSet)
dataTest, labelTest = get_data_label(testSet)

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
clfs = [GaussianNB(), BernoulliNB()]
for clf in clfs:
    clf.fit(dataTrain, labelTrain)

    score = clf.score(dataTest, labelTest) * 100

    print(f'Accuracy of sklearn is {clf}: {score}%')

Accuracy of sklearn is GaussianNB(): 36.144578313253014%
Accuracy of sklearn is BernoulliNB(): 62.65060240963856%
