In [1]:
import csv
import random
import math
import sys

In [2]:
def load_data(filename):
    lines = csv.reader(open(filename, "rb"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]

    return dataset

In [3]:
def separate_data(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)

    return separated

In [4]:
def split_data(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))

    return [trainSet, copy]

In [5]:
def mean(numbers):
    return sum(numbers) / float(len(numbers))

In [6]:
def standard_deviation(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)

    return math.sqrt(variance)

In [7]:
def summarize(dataset):
    summaries = [(mean(attribute), standard_deviation(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]

    return summaries

In [8]:
def summarize_by_class(dataset):
    separated = separate_data(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)

    return summaries

In [9]:
def calculate_prob(x, mean, stdev):
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))

    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

In [10]:
def calculate_class_prob(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculate_prob(x, mean, stdev)

    return probabilities

In [11]:
def predict(summaries, inputVector):
    probabilities = calculate_class_prob(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue

    return bestLabel

In [12]:
def get_predictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)

    return predictions

In [13]:
def get_accuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1

    return (correct / float(len(testSet))) * 100.0

def get_data_label(dataset):
    data = []
    label = []
    for x in dataset:
        data.append(x[:8])
        label.append(x[-1])

    return data, label


In [23]:
import pandas as pd
filename = 'tieu_duong.csv'
splitRatio = 0.8
dataset = pd.read_csv(filename).values
#dataset = load_data(filename)
#dataset
trainingSet, testSet = split_data(dataset, splitRatio)
print(f'Data size {len(dataset)} \nTraining Size={len(trainingSet)} \nTest Size={len(testSet)}')

Data size 767 
Training Size=613 
Test Size=154


In [26]:
summaries = summarize_by_class(trainingSet)
get_data_label(trainingSet)

([array([  5.   , 117.   ,  86.   ,  30.   , 105.   ,  39.1  ,   0.251,
          42.   ]),
  array([ 13.   , 153.   ,  88.   ,  37.   , 140.   ,  40.6  ,   1.174,
          39.   ]),
  array([  6.   , 151.   ,  62.   ,  31.   , 120.   ,  35.5  ,   0.692,
          28.   ]),
  array([  3.   , 108.   ,  62.   ,  24.   ,   0.   ,  26.   ,   0.223,
          25.   ]),
  array([ 11.   , 143.   ,  94.   ,  33.   , 146.   ,  36.6  ,   0.254,
          51.   ]),
  array([  2.   , 129.   ,  74.   ,  26.   , 205.   ,  33.2  ,   0.591,
          25.   ]),
  array([  1.   , 112.   ,  80.   ,  45.   , 132.   ,  34.8  ,   0.217,
          24.   ]),
  array([  3.   , 107.   ,  62.   ,  13.   ,  48.   ,  22.9  ,   0.678,
          23.   ]),
  array([  6.   , 154.   ,  74.   ,  32.   , 193.   ,  29.3  ,   0.839,
          39.   ]),
  array([  3.   , 111.   ,  62.   ,   0.   ,   0.   ,  22.6  ,   0.142,
          21.   ]),
  array([ 5.   , 97.   , 76.   , 27.   ,  0.   , 35.6  ,  0.378, 52.   ]),
  arr

In [33]:
 # test model
predictions = get_predictions(summaries, testSet)
accuracy = get_accuracy(testSet, predictions)
print(f'Accuracy of my implement: {accuracy}%')

Accuracy of my implement: 74.02597402597402%


In [34]:
# Compare with sklearn
dataTrain, labelTrain = get_data_label(trainingSet)
dataTest, labelTest = get_data_label(testSet)

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(dataTrain, labelTrain)

score = clf.score(dataTest, labelTest) * 100

print(f'Accuracy of sklearn: {score}%')


Accuracy of sklearn: 74.02597402597402%
