In [1]:
# Import Libs
#conda install -c anaconda pandas 
import pandas as pd
import numpy as np
import random
import math
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from __future__ import division


#from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB



In [2]:
# Load the dataset. The file is tab-delimited and is read with header=None,
# so the column names are supplied explicitly here.
columnNames = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'WholeWeight',
    'ShuckedWeight',
    'VisceraWeight',
    'ShellWeight',
    'classlabel',
]
train = pd.read_csv(
    "abalone_dataset.txt",
    delimiter="\t",
    names=columnNames,
    header=None,
)


In [3]:
def p_x_given_y(x, mean_y, variance_y, min_variance=1e-9):
    """Evaluate the Gaussian probability density of ``x``.

    Parameters
    ----------
    x : number
        Observed feature value.
    mean_y : number
        Mean of the feature for the class being scored.
    variance_y : number
        Variance of the feature for the class being scored.
    min_variance : float, optional
        Lower bound applied to ``variance_y`` before it is used.

    Returns
    -------
    float
        Density of the normal distribution N(mean_y, variance_y) at ``x``.
    """
    # A feature that is constant within a class has zero variance, which would
    # otherwise divide by zero below (and a tiny negative variance would make
    # math.sqrt raise). Flooring keeps the density finite. A NaN variance fails
    # this comparison and is propagated unchanged.
    if variance_y < min_variance:
        variance_y = min_variance

    squared_distance = math.pow(x - mean_y, 2)
    exponent = math.exp(-(squared_distance / (2 * variance_y)))
    return (1 / (math.sqrt(2 * math.pi * variance_y))) * exponent

In [27]:
def accuracy(result):
    """Summarise classification results.

    ``result`` is indexed as ``result[i][0]`` (predicted label) and
    ``result[i][1]`` (expected label).

    Returns a tuple of: confusion matrix (expected labels passed first,
    predicted labels second), number of rows, number of correct predictions,
    number of misclassifications, and the accuracy as a percentage.
    """
    total = len(result)
    cmPredicted = [result[idx][0] for idx in range(total)]
    cmExpected = [result[idx][1] for idx in range(total)]

    # Count the rows whose predicted label matches the expected one.
    correct = sum(
        1 for predicted, expected in zip(cmPredicted, cmExpected)
        if predicted == expected
    )
    misclassification = total - correct

    cm = confusion_matrix(cmExpected, cmPredicted)
    percentage = correct / len(result) * 100
    return cm, total, correct, misclassification, percentage

In [5]:
def hotEncodeSex(data, test):
    """One-hot encode the 'Sex' column of the training and test frames.

    Both frames are encoded against the union of the categories found in
    either of them, so they always end up with the same indicator columns,
    in the same (sorted) order. A category that a frame never contains
    becomes an all-zero column instead of being absent.

    Parameters
    ----------
    data, test : pandas.DataFrame
        Frames that each contain a 'Sex' column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of ``data`` and ``test`` with 'Sex' dropped and one indicator
        column per category appended.
    """
    # Sorted so the column order matches what get_dummies produces on its own.
    categories = sorted(set(data['Sex'].dropna()) | set(test['Sex'].dropna()))

    def _encode(frame):
        # Encode one frame, padding categories it never saw with zeros.
        dummies = pd.get_dummies(frame['Sex'])
        for category in categories:
            if category not in dummies.columns:
                dummies[category] = 0
        return frame.drop('Sex', axis=1).join(dummies[categories])

    return _encode(data), _encode(test)
    

In [6]:
def preCalculations(data):
    """Compute class priors and per-class feature statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame with a 'classlabel' column holding the labels 1, 2, 3.

    Returns
    -------
    tuple
        ``(prior1, prior2, prior3, dataMean, dataVariance)`` where each prior
        is the fraction of rows carrying that label, and ``dataMean`` /
        ``dataVariance`` are frames indexed by class label holding the
        per-class mean and variance (pandas ``var()``) of every other column.
    """
    labels = data['classlabel']
    totalClassLabel = labels.count()

    # Each prior must be based on its own class count. Previously all three
    # were computed from the count of class 1.
    prior1 = (labels == 1).sum() / totalClassLabel
    prior2 = (labels == 2).sum() / totalClassLabel
    prior3 = (labels == 3).sum() / totalClassLabel

    groupedByClass = data.groupby('classlabel')
    dataMean = groupedByClass.mean()
    dataVariance = groupedByClass.var()

    return prior1, prior2, prior3, dataMean, dataVariance

In [7]:
def naiveBayes(attributes, data, test, expectedTest):
    """Train and evaluate a Gaussian Naive Bayes classifier.

    Parameters
    ----------
    attributes : list of str
        Column names (after one-hot encoding of 'Sex') used as features.
    data : pandas.DataFrame
        Training rows, including 'Sex' and 'classlabel'.
    test : pandas.DataFrame
        Rows to classify, including 'Sex'.
    expectedTest : pandas.DataFrame
        Frame whose 'classlabel' column holds the true label of each test
        row, in the same row order as ``test``.

    Returns
    -------
    tuple
        Whatever ``accuracy`` returns for the list of
        ``[predicted, expected]`` pairs: confusion matrix, total, number
        correct, number misclassified, and accuracy percentage.
    """
    data, test = hotEncodeSex(data, test)
    prior1, prior2, prior3, dataMean, dataVariance = preCalculations(data)
    classLabelTypes = [1, 2, 3]
    priors = dict(zip(classLabelTypes, [prior1, prior2, prior3]))

    # The per-class mean and variance do not depend on the test row, so look
    # them up once instead of once per row.
    gaussianParams = {
        label: [
            (attribute,
             dataMean.loc[label, attribute],
             dataVariance.loc[label, attribute])
            for attribute in attributes
        ]
        for label in classLabelTypes
    }

    output = []
    for i in range(len(test)):
        expectedResult = expectedTest['classlabel'].iloc[i]
        scores = []
        for label in classLabelTypes:
            # Prior times the product of the per-feature likelihoods.
            score = priors[label]
            for attribute, mean_y, variance_y in gaussianParams[label]:
                score = score * p_x_given_y(test[attribute].iloc[i], mean_y, variance_y)
            scores.append(score)
        predicted = classLabelTypes[scores.index(max(scores))]
        output.append([predicted, expectedResult])

    # Return the metrics. Previously they were computed and then discarded,
    # so every caller received None.
    return accuracy(output)

In [8]:
# Data Preparation for Cases
# Seed the shuffle so a fresh "Restart & Run All" produces the same splits.
SHUFFLE_SEED = 42
train = shuffle(train, random_state=SHUFFLE_SEED)

LABEL_COLUMN = 8            # positional index of 'classlabel'
basicColumns = [0, 1, 2]    # Sex, Length, Diameter
allColumns = list(range(8)) # every feature column (everything except the label)


def _splitCase(frame, trainSize, featureColumns):
    """Split ``frame`` into (training rows, test features, labelled test rows).

    The first ``trainSize`` rows are used for training and all remaining
    rows for testing, so no row is left out of both sets.
    """
    labelledColumns = featureColumns + [LABEL_COLUMN]
    trainingRows = frame.iloc[:trainSize, labelledColumns]
    testFeatures = frame.iloc[trainSize:, featureColumns]
    testLabelled = frame.iloc[trainSize:, labelledColumns]
    return trainingRows, testFeatures, testLabelled


# Cases 1-3: Sex, Length and Diameter only, with 100 / 1000 / 2000 training rows.
case1Data, case1Test, case1ExpectedTest = _splitCase(train, 100, basicColumns)
case2Data, case2Test, case2ExpectedTest = _splitCase(train, 1000, basicColumns)
case3Data, case3Test, case3ExpectedTest = _splitCase(train, 2000, basicColumns)

# Cases 4-6: all features, with 100 / 1000 / 2000 training rows.
case4Data, case4Test, case4ExpectedTest = _splitCase(train, 100, allColumns)
case5Data, case5Test, case5ExpectedTest = _splitCase(train, 1000, allColumns)
case6Data, case6Test, case6ExpectedTest = _splitCase(train, 2000, allColumns)

In [9]:
# Feature sets: the one-hot Sex indicators plus either two or all of the
# numeric measurements.
usedFeaturesType1 = ['F', 'M', 'I', 'Length', 'Diameter']
usedFeaturesType2 = usedFeaturesType1 + [
    'Height',
    'WholeWeight',
    'ShuckedWeight',
    'VisceraWeight',
    'ShellWeight',
]

# Cases 1-3 use the reduced feature set.
result = naiveBayes(usedFeaturesType1, case1Data, case1Test, case1ExpectedTest)
result2 = naiveBayes(usedFeaturesType1, case2Data, case2Test, case2ExpectedTest)
result3 = naiveBayes(usedFeaturesType1, case3Data, case3Test, case3ExpectedTest)

# Cases 4-6 use the full feature set.
result4 = naiveBayes(usedFeaturesType2, case4Data, case4Test, case4ExpectedTest)
result5 = naiveBayes(usedFeaturesType2, case5Data, case5Test, case5ExpectedTest)
result6 = naiveBayes(usedFeaturesType2, case6Data, case6Test, case6ExpectedTest)


In [33]:
for

[3, 3]