In [28]:
# Naive Bayes On The Comb-Brush Dataset
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi

In [29]:
# Load a CSV file
def load_csv(filename):
    """Load a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty record; each entry is the list
        of string fields produced by ``csv.reader``.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation asks for; otherwise line breaks embedded in quoted
    # fields are rewritten by the text layer before the parser sees them.
    with open(filename, 'r', newline='') as file:
        # The reader yields an empty list for blank lines; those are dropped.
        return [row for row in reader(file) if row]

In [30]:
# Convert string column to float
def str_column_to_float(dataset, column):
    """Replace the string in one column of every row with its float value.

    The rows of ``dataset`` are modified in place; nothing is returned.
    Surrounding whitespace is stripped from each string before conversion.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

In [31]:
# Convert string column to integer
def str_column_to_int(dataset, column):
    """Encode the distinct values of one column as consecutive integers.

    Codes are handed out in order of first appearance in ``dataset`` (0 for
    the first distinct value, 1 for the next, ...), so the mapping is the
    same on every run for the same input. The rows are modified in place.

    Args:
        dataset: List of rows (each row a mutable sequence).
        column: Index of the column to encode.

    Returns:
        The dict mapping each original value to its integer code.
    """
    lookup = dict()
    # First pass: build the complete mapping before any row is touched.
    for row in dataset:
        # setdefault only stores the code the first time a value is seen;
        # len(lookup) is evaluated before the insert, giving the next free code.
        lookup.setdefault(row[column], len(lookup))
    # Second pass: overwrite each value with its code.
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [32]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition rows into ``n_folds`` folds of equal size.

    Each fold receives ``int(len(dataset) / n_folds)`` rows drawn without
    replacement via ``randrange``; rows left over when the length is not a
    multiple of ``n_folds`` are not placed in any fold. ``dataset`` itself is
    not modified (rows are drawn from a shallow copy).

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Draw one random position per slot and move that row into the fold.
        drawn = [remaining.pop(randrange(len(remaining))) for _ in range(per_fold)]
        folds.append(drawn)
    return folds

In [33]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage (0-100) of positions where the two sequences agree.

    Positions are compared by index over the length of ``actual``.
    """
    total = len(actual)
    hits = sum(1 for position in range(total) if actual[position] == predicted[position])
    return hits / float(total) * 100.0

In [34]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score an algorithm with k-fold cross-validation.

    Args:
        dataset: List of rows; the last element of each row is the label.
        algorithm: Callable invoked as ``algorithm(train_set, test_set, *args)``
            that returns one prediction per test row.
        n_folds: Number of folds passed to ``cross_validation_split``.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for held_out, fold in enumerate(folds):
        # Pick the training folds by position rather than list.remove(fold):
        # remove() matches by equality, so it can drop a different fold with
        # equal contents and raises for rows whose == is not a plain bool.
        # Flattening in one comprehension also avoids repeated list copies.
        train_set = [
            row
            for position, other in enumerate(folds)
            if position != held_out
            for row in other
        ]
        test_set = list()
        for row in fold:
            # Test rows are copies with the label blanked out so the
            # algorithm cannot read the answer.
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

In [43]:
from sklearn.model_selection import train_test_split

# NOTE(review): X and y are not assigned anywhere above this cell; the only
# assignments visible in this notebook are in the later "Compare Naive Bayes
# with sklearn" section, where the same split is performed again. This cell
# presumably only ran thanks to leftover kernel state and would fail on a
# fresh Restart & Run All -- confirm and consider removing it.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

print('X_train: ', X_train.shape, 'y_train: ',y_train.shape)
print('X_test: ', X_test.shape, 'y_test:', y_test.shape)

X_train:  (640, 13) y_train:  (640,)
X_test:  (160, 13) y_test: (160,)


In [35]:
# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    """Group rows by their class value (the last element of each row).

    Returns:
        A dict mapping each class value to the list of rows carrying it,
        with rows kept in their original order.
    """
    grouped = {}
    for record in dataset:
        # setdefault creates the bucket the first time a class is seen.
        grouped.setdefault(record[-1], []).append(record)
    return grouped

In [36]:
# Calculate the mean of a list of numbers
def mean(numbers):
    """Return the arithmetic mean of a sequence of numbers as a float."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [37]:
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    """Return the sample standard deviation (divisor ``n - 1``) of a sequence."""
    centre = mean(numbers)
    squared_deviations = [(value - centre) ** 2 for value in numbers]
    return sqrt(sum(squared_deviations) / float(len(numbers) - 1))

In [38]:
# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    """Return ``(mean, stdev, count)`` for every column except the last.

    ``zip(*dataset)`` transposes the rows into columns; statistics are
    computed for each column and the entry for the final column is then
    discarded.
    """
    per_column = []
    for values in zip(*dataset):
        per_column.append((mean(values), stdev(values), len(values)))
    # Drop the statistics of the last column.
    per_column.pop()
    return per_column


In [39]:
# Split dataset by class then calculate statistics for each column
def summarize_by_class(dataset):
    """Return per-class column statistics.

    The rows are grouped with ``separate_by_class`` and each group is passed
    to ``summarize_dataset``.

    Returns:
        A dict mapping each class value to that class's list of summaries.
    """
    return {
        label: summarize_dataset(members)
        for label, members in separate_by_class(dataset).items()
    }

In [40]:
# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density function at ``x``.

    Args:
        x: Value at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value. When ``stdev`` is 0 the distribution is treated as
        a point mass at ``mean``: 1.0 is returned if ``x == mean`` and 0.0
        otherwise, instead of raising ZeroDivisionError.
    """
    if stdev == 0:
        # A zero spread happens when every training value of a feature is
        # identical within a class; the formula below would divide by zero.
        return 1.0 if x == mean else 0.0
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent


In [41]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    """Score every class for one row.

    For each class the score starts at the class's share of the training
    rows (its row count divided by the total count over all classes, both
    read from the first column summary) and is multiplied by
    ``calculate_probability`` for each summarized column of ``row``.

    Returns:
        A dict mapping each class value to its (unnormalized) score.
    """
    total_rows = sum(per_class[0][2] for per_class in summaries.values())
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        score = class_summaries[0][2] / float(total_rows)
        for position, (mu, sigma, _count) in enumerate(class_summaries):
            score *= calculate_probability(row[position], mu, sigma)
        probabilities[class_value] = score
    return probabilities

In [42]:
# Predict the class for a given row
def predict(summaries, row):
    """Return the class value with the highest score for ``row``.

    Scores come from ``calculate_class_probabilities``. On ties the class
    encountered first wins; ``None`` is returned when there are no classes.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner, top = None, -1
    for label in scores:
        candidate = scores[label]
        # Skip this class unless nothing has been chosen yet or it beats the
        # current best score.
        if winner is not None and not candidate > top:
            continue
        winner, top = label, candidate
    return winner

In [43]:
# Naive Bayes Algorithm
def naive_bayes(train, test):
    """Fit per-class summaries on ``train`` and predict a class for each test row.

    Returns:
        A list with one predicted class value per row of ``test``, in order.
    """
    model = summarize_by_class(train)
    return [predict(model, sample) for sample in test]


In [44]:
# Test Naive Bayes on Comb-Brush Dataset
# Seed the random module so the randrange-based fold assignment in
# cross_validation_split is repeatable.
seed(1)
# NOTE(review): absolute, machine-specific path; a later cell reads
# 'data.csv' by relative path -- confirm whether both refer to the same file
# and prefer a single configurable location.
filename = r'C:\Users\KylieOng\PycharmProjects\VdKI\data.csv'
dataset = load_csv(filename)
# convert every column except the last one to floats (in place)
for i in range(len(dataset[0]) - 1):
    str_column_to_float(dataset, i)
# convert class column to integers (the returned lookup is not kept)
str_column_to_int(dataset, len(dataset[0]) - 1)
# evaluate algorithm
n_folds = 5
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
# scores are percentages, so their average is printed with a literal % sign
print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))

Scores: [55.625, 68.75, 64.375, 65.0, 61.875]
Mean Accuracy: 63.125%


### Compare Naive Bayes with sklearn

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import sklearn
import pandas as pd

# NOTE(review): `dataset` was a list of rows from load_csv in the earlier
# from-scratch section; here the same name is rebound to a DataFrame.
# NOTE(review): read_csv is called without a `header` argument, so pandas
# takes the column labels from the first line of the file. The recorded
# outputs show 799 rows and numeric-looking column labels in X_train.head(),
# which suggests the first data row is being consumed as the header --
# confirm against the file and pass header=None if it has no header row.
dataset = pd.read_csv('data.csv')

# Features are all columns but the last; the target is the last column.
X = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1]
print('X: ', X.shape, 'y: ',y.shape)

X:  (799, 13) y:  (799,)


In [52]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

print('X_train: ', X_train.shape, 'y_train: ',y_train.shape)
print('X_test: ', X_test.shape, 'y_test:', y_test.shape)

X_train:  (639, 13) y_train:  (639,)
X_test:  (160, 13) y_test: (160,)


In [53]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,676,3,0.077310469,0.038085472,0.052713795,1.149037993,1.318301222,0.492630199,0.681845496,3.399329001,0.722495348,53,901
264,640,2,0.059019,0.036752,0.048813,1.0141,1.142181,0.622703,0.827067,4.743048,0.752904,46,417
615,889,3,0.030738,0.025729,0.028323,1.17338,1.234285,0.837048,0.921429,0.556863,0.908423,45,513
329,1745,18,0.035993,0.028244,0.028244,1.800531,1.800531,0.784706,0.784706,1.0,1.0,100,909
342,1084,4,0.066544,0.044372,0.056316,1.585979,1.828082,0.666813,0.846292,0.317864,0.787923,100,509
394,537,2,0.064391,0.041027,0.053576,0.972852,1.094513,0.637154,0.832038,0.202755,0.765776,52,301


In [54]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit scikit-learn's Gaussian Naive Bayes on the training split.
bkbayes_clf = GaussianNB()

bkbayes_clf.fit(X_train, y_train)

# The recorded output (0.64375) is a fraction, whereas the from-scratch run
# above prints percentages. It also comes from a single train/test split
# rather than the 5-fold cross-validation used above, so the two numbers are
# only roughly comparable.
print('Accuracy Bayes:', bkbayes_clf.score(X_test, y_test))

Accuracy Bayes: 0.64375
