In [28]:
import numpy as np
import pandas as pd
from math import sqrt, exp, pow, pi
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Select the class 
def get_class(group, dataset):
        size = len(dataset[0]) - 1
        for i in range(len(dataset)):
            dataset[i][size] = 1 if dataset[i][size] == group else 0
        return dataset

# Normalize
def normalize(dataset):
        for i in range(dataset.shape[1]-1):
            max_ = max(dataset[:, i])
            min_ = min(dataset[:, i])
            for j in range(dataset.shape[0]):
                dataset[j, i] = (dataset[j, i] - min_) / (max_ - min_)
        return dataset

# Insert the bias
def insert_bias(data):
    group = []
    for i in range(len(data)):
        group.append(np.insert(data[i], 0, -1))
    group = np.asarray(group)
    return group

# Divide into train and test
def divide_samples(dataset):
    return train_test_split(dataset, test_size=0.20)

# Divide into x and y 
def split_samples(dataset, n_attributes):
    return np.array(dataset[:, 0:n_attributes-1]), np.array(dataset[:, -1])
 
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

def calculate_mean_variance(x_train, y_train):
    n_class = len(np.unique(y_train))
    mean = np.zeros((x_train.shape[1], n_class)).T
    variance = np.zeros((x_train.shape[1], n_class)).T
    theta = np.zeros((n_class, 1))
    
    for c in y_train:
        theta[int(c)] += 1
    theta = theta / x_train.shape[0]

    for c in range(int(n_class)):
        N_y = len(np.where(y_train == c)[0])
        for i in range(x_train.shape[1]):
            indices = np.where(y_train == c)[0]
            mean[c][i] = (1. / (1. * N_y)) * sum(x_train[:, i][indices])
            variance[c][i] = (1. / (1. * N_y)) * sum((x_train[:, i][indices] - mean[c][i]) ** 2)
            
    return mean, variance, theta
    
def prob_feature_class(mean, variance, theta, x):
    Proby_x = np.zeros((mean.shape[0], 1))
    for c in range(mean.shape[0]):
        prob_class = np.log(theta[c])
        for k in range(mean.shape[1]):
            x_i = x[k]
            num = (1. / sqrt(2. * pi * variance[c][k]))
            den = (-1. / 2.) * ((x_i - mean[c][k]) / sqrt(variance[c][k])) ** 2
            product = num * np.exp(den)
            Proby_x[c] += np.log(product)
        Proby_x[c] += prob_class
    return Proby_x
        
# Training
def train(train):
    x_train, y_train = split_samples(train, len(train[0]))
    return calculate_mean_variance(x_train, y_train)

# Test
def test(mean, variance, theta, test):
    x_test, y_test = split_samples(test, len(test[0]))
    count = 0
    for i in range(x_test.shape[0]):
        Proby_x = prob_feature_class(mean, variance, theta, x_test[i])
        y_output = np.argmax(Proby_x)
        count += 1 if y_output == y_test[i] else 0
    return count / (1.0 * i)

# Evaluate an algorithm using hold-out
def execute(realizations, dataset):
    print("### Naive Bayes Gaussian ###")
    print("PARÂMETROS: ")
    print("\t Total de realizações: ", realizations, "\n")
    rates = []
	
    for i in range(realizations):
        training_set, test_set = divide_samples(dataset)
        print("### REALIZAÇÃO ", (i+1), "###")
        print("### FASE DE training ###")
        mean, variance, theta = train(training_set)
        print("### FASE DE TESTES ###")
        accuracy = test(mean, variance, theta, test_set)
        print("Taxa de acerto: ", accuracy, "\n")
        rates.append(accuracy)

    rates = np.array(rates)
    print("Acurácia: ", rates.mean())
    print("Variância da Acurácia: ", rates.var())
    print("Desvio Padrão da Acurácia: ", rates.std())
    print("### FIM do Naive Bayes Gaussian ###")
    
# Main
dataset = np.array(pd.read_csv("base/column_2C.dat", delimiter=",", header=None))
dataset = get_class("NO", dataset)
dataset = normalize(dataset)
execute(1, dataset)

### Naive Bayes Gaussian ###
PARÂMETROS: 
	 Total de realizações:  1 

### REALIZAÇÃO  1 ###
### FASE DE training ###
### FASE DE TESTES ###
Taxa de acerto:  0.7704918032786885 

Acurácia:  0.7704918032786885
Variância da Acurácia:  0.0
Desvio Padrão da Acurácia:  0.0
### FIM do Naive Bayes Gaussian ###
