# Naive Bayes Classifier

In [14]:
import random
import math
import numpy as np
import pandas as pd

In [3]:
def splitDataset(dataset, splitRatio):
    trainSize = int(dataset.shape[0] * splitRatio)
    indices = np.random.permutation(dataset.shape[0])
    training_idx, test_idx = indices[:trainSize], indices[trainSize:]
    training, test = dataset[training_idx,:], dataset[test_idx,:]
    return training, test

In [4]:
def separateByClass(dataset):
    return {
        1: dataset[np.where(dataset[:, -1]==1), :],
        0: dataset[np.where(dataset[:, -1]==0), :]
    }

In [5]:
def summarize(dataset):
    means = dataset.mean(axis = 1)[0][:-1]
    stds = dataset.std(axis = 1, ddof = 1)[0][:-1]
    return means, stds

In [6]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

In [7]:
def calculateProbability(x, mean, stdev):
    return np.exp(-(x-mean)**2/(2*stdev**2))/(np.sqrt(2*np.pi)*stdev)

In [8]:
def calculateClassProbabilities(summaries, inputVector):

    probabilities = {}
    
    for classValue, classSummaries in summaries.items():
        
        means = classSummaries[0]
        stds  = classSummaries[1]
        
        probabilities[classValue] = np.prod(calculateProbability(inputVector[:-1], means, stds))
        
    return probabilities

In [9]:
def predict(summaries, inputVector):

    probabilities = calculateClassProbabilities(summaries, inputVector)  

    bestLabel, bestProb = None, -1
    
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    
    return bestLabel

In [10]:
def getPredictions(summaries, testSet):
    predictions = []
    
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)

    return predictions

In [11]:
def getAccuracy(testSet, predictions):
    correct = 0
    
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [12]:
def main(dataset):
    splitRatio = 0.67
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    
    print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
    
    summaries = summarizeByClass(trainingSet)
    
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))

In [25]:
data = pd.read_excel('C:\\Users\\Galia\\desktop\\wine2.xlsx')
X = data[['Alcogol', 'OD OD280/OD315 of diluted wines', 'Site']][:100]
# print(X)
X = X.as_matrix()

In [26]:
main(X)

Split 100 rows into train=67 and test=33 rows
Accuracy: 57.57575757575758%


  
  ret, rcount, out=ret, casting='unsafe', subok=False)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret, rcount, out=ret, casting='unsafe', subok=False)
