# Naive Bayes - Trabalho

## Questão 1

Implemente um classifacor Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizaremos um conjunto de dados referente a qualidade de carros, disponível no [UCI](https://archive.ics.uci.edu/ml/datasets/car+evaluation). Este dataset de carros possui as seguintes features e classe:

** Attributos **
1. buying: vhigh, high, med, low
2. maint: vhigh, high, med, low
3. doors: 2, 3, 4, 5, more
4. persons: 2, 4, more
5. lug_boot: small, med, big
6. safety: low, med, high

** Classes **
1. unacc, acc, good, vgood

## Questão 2
Crie uma versão de sua implementação usando as funções disponíveis na biblioteca SciKitLearn para o Naive Bayes ([veja aqui](http://scikit-learn.org/stable/modules/naive_bayes.html)) 

## Questão 3

Analise a acurácia dos dois algoritmos e discuta a sua solução.

In [1]:
#QUESTAO 1
import csv
import random
import math

In [2]:
def main(filename,splitRatio):
    dataset = carregar_e_tratar(filename)
    summaries = summarizeByClass(dataset)
    trainSet, testSet = splitDataset(dataset, splitRatio)
    predictions = getPredictions(summaries, testSet)
    return predictions, getAccuracy(testSet, predictions)

In [3]:
def carregar_e_tratar(filename):
    tratamento={"vhigh":4.0, "high":3.0, "med":2.0, "low":1.0,"small":1.0,"big":3.0,"5more":5.0,"more":5.0}
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        for x in range(len(dataset[i])-1):
            if dataset[i][x] in tratamento:
                dataset[i][x]=tratamento[dataset[i][x]]
            else:
                dataset[i][x]=float(dataset[i][x])
            
    return dataset

[[4.0, 4.0, 2.0, 2.0, 1.0, 1.0, 'unacc'], [4.0, 4.0, 2.0, 2.0, 1.0, 2.0, 'unacc'], [4.0, 4.0, 2.0, 2.0, 1.0, 3.0, 'unacc'], [4.0, 4.0, 2.0, 2.0, 2.0, 1.0, 'unacc'], [4.0, 4.0, 2.0, 2.0, 2.0, 2.0, 'unacc'], [4.0, 4.0, 2.0, 2.0, 2.0, 3.0, 'unacc'], [4.0, 4.0, 2.0, 2.0, 3.0, 1.0, 'unacc'], [4.0, 4.0, 2.0, 2.0, 3.0, 2.0, 'unacc'], [4.0, 4.0, 2.0, 2.0, 3.0, 3.0, 'unacc'], [4.0, 4.0, 2.0, 4.0, 1.0, 1.0, 'unacc'], [4.0, 4.0, 2.0, 4.0, 1.0, 2.0, 'unacc'], [4.0, 4.0, 2.0, 4.0, 1.0, 3.0, 'unacc'], [4.0, 4.0, 2.0, 4.0, 2.0, 1.0, 'unacc'], [4.0, 4.0, 2.0, 4.0, 2.0, 2.0, 'unacc'], [4.0, 4.0, 2.0, 4.0, 2.0, 3.0, 'unacc'], [4.0, 4.0, 2.0, 4.0, 3.0, 1.0, 'unacc'], [4.0, 4.0, 2.0, 4.0, 3.0, 2.0, 'unacc'], [4.0, 4.0, 2.0, 4.0, 3.0, 3.0, 'unacc'], [4.0, 4.0, 2.0, 5.0, 1.0, 1.0, 'unacc'], [4.0, 4.0, 2.0, 5.0, 1.0, 2.0, 'unacc'], [4.0, 4.0, 2.0, 5.0, 1.0, 3.0, 'unacc'], [4.0, 4.0, 2.0, 5.0, 2.0, 1.0, 'unacc'], [4.0, 4.0, 2.0, 5.0, 2.0, 2.0, 'unacc'], [4.0, 4.0, 2.0, 5.0, 2.0, 3.0, 'unacc'], [4.0, 4.0, 2.0,

In [4]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return trainSet, copy

In [5]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector[0:6])
    
    return separated

In [6]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))
 
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

In [7]:
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

In [8]:
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * math.pow(stdev, 2))) * exponent

In [9]:
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

In [10]:
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [11]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

In [12]:
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [13]:
#Classificando
print(main('carData.csv',0.7))

(['unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'acc', 'unacc', 'acc', 'acc', 'acc', 'acc', 'acc', 'un