In [275]:
# Import Libs
#conda install -c anaconda pandas 
import pandas as pd
import numpy as np
import random
import math
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB



In [276]:
# Get CSV
columnNames=['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','classlabel'] 
train = pd.read_csv("abalone_dataset.txt",delimiter="\t", names=columnNames, header=None)


In [277]:
# Data Preparation for Cases
train = shuffle(train)
case1Data = train.iloc[:100, [0,1,2,8]]
case1Test = train.iloc[101:, [0,1,2]]
case1ExpectedTest = train.iloc[101:, [0,1,2,8]]
case2Data = train.iloc[:1000, [0,1,2,8]]
case2Test = train.iloc[1001:, [0,1,2]]
case2ExpectedTest = train.iloc[1001:, [0,1,2,8]]
case3Data = train.iloc[:2000, [0,1,2,8]]
case3Test = train.iloc[2001:, [0,1,2]]
case3ExpectedTest = train.iloc[2001:, [0,1,2,8]]
case4Data = train.iloc[:100, :]
case4Test = train.iloc[101:, :8]
case5Data = train.iloc[:1000, :]
case5Test = train.iloc[1001:, :8]
case6Data = train.iloc[:2000, :]
case6Test = train.iloc[2001:, :8]

In [278]:
def type1(data, test, expectedTest):
    # Hot Encode Sex for Data and Test
    sexHotEncodeData = pd.get_dummies(data['Sex'])
    data = data.drop('Sex', axis = 1)
    data = data.join(sexHotEncodeData)
    sexHotEncodeTest = pd.get_dummies(test['Sex'])
    test = test.drop('Sex', axis = 1)
    test = test.join(sexHotEncodeTest)
    
    
    numOf1 = data['classlabel'][data['classlabel'] == 1].count()
    numOf2 = data['classlabel'][data['classlabel'] == 2].count()
    numOf3 = data['classlabel'][data['classlabel'] == 3].count()
    
    totalClassLabel = data['classlabel'].count()
    
    prior1 = numOf1/totalClassLabel
    prior2 = numOf1/totalClassLabel
    prior3 = numOf1/totalClassLabel
    
    dataMean     = data.groupby('classlabel').mean()
    dataVariance = data.groupby('classlabel').var()
    

    
    #Means for 1,2,3
    oneFMean      = dataMean['F'][dataVariance.index == 1].values[0]
    oneIMean      = dataMean['I'][dataVariance.index == 1].values[0]
    oneMMean      = dataMean['M'][dataVariance.index == 1].values[0]
    oneLengthMean   = dataMean['Length'][dataVariance.index == 1].values[0]
    oneDiameterMean = dataMean['Diameter'][dataVariance.index == 1].values[0]
    
    twoFMean      = dataMean['F'][dataVariance.index == 2].values[0]
    twoIMean      = dataMean['I'][dataVariance.index == 2].values[0]
    twoMMean      = dataMean['M'][dataVariance.index == 2].values[0]
    twoLengthMean   = dataMean['Length'][dataVariance.index == 2].values[0]
    twoDiameterMean = dataMean['Diameter'][dataVariance.index == 2].values[0]
    
    threeFMean      = dataMean['F'][dataVariance.index == 3].values[0]
    threeIMean      = dataMean['I'][dataVariance.index == 3].values[0]
    threeMMean      = dataMean['M'][dataVariance.index == 3].values[0]
    threeLengthMean   = dataMean['Length'][dataVariance.index == 3].values[0]
    threeDiameterMean = dataMean['Diameter'][dataVariance.index == 3].values[0]
    
    #Variances for 1,2,3
    oneFVariance      = dataVariance['F'][dataVariance.index == 1].values[0]
    oneIVariance      = dataVariance['I'][dataVariance.index == 1].values[0]
    oneMVariance      = dataVariance['M'][dataVariance.index == 1].values[0]
    oneLengthVariance   = dataVariance['Length'][dataVariance.index == 1].values[0]
    oneDiameterVariance = dataVariance['Diameter'][dataVariance.index == 1].values[0]
    
    twoFVariance      = dataVariance['F'][dataVariance.index == 2].values[0]
    twoIVariance      = dataVariance['I'][dataVariance.index == 2].values[0]
    twoMVariance      = dataVariance['M'][dataVariance.index == 2].values[0]
    twoLengthVariance   = dataVariance['Length'][dataVariance.index == 2].values[0]
    twoDiameterVariance = dataVariance['Diameter'][dataVariance.index == 2].values[0]
    
    threeFVariance      = dataVariance['F'][dataVariance.index == 3].values[0]
    threeIVariance      = dataVariance['I'][dataVariance.index == 3].values[0]
    threeMVariance      = dataVariance['M'][dataVariance.index == 3].values[0]
    threeLengthVariance   = dataVariance['Length'][dataVariance.index == 3].values[0]
    threeDiameterVariance = dataVariance['Diameter'][dataVariance.index == 3].values[0]
    
    result = []
    
    for i in range(len(test)):
            probabilityOf1 = prior1 * \
            p_x_given_y(test['F'].iloc[i], oneFMean, oneFVariance) * \
            p_x_given_y(test['I'].iloc[i], oneIMean, oneIVariance) * \
            p_x_given_y(test['M'].iloc[i], oneMMean, oneMVariance) * \
            p_x_given_y(test['Length'].iloc[i], oneLengthMean, oneLengthVariance) * \
            p_x_given_y(test['Diameter'].iloc[i], oneDiameterMean, oneDiameterVariance)

            probabilityOf2 = prior2 * \
            p_x_given_y(test['F'].iloc[i], twoFMean, twoFVariance) * \
            p_x_given_y(test['I'].iloc[i], twoIMean, twoIVariance) * \
            p_x_given_y(test['M'].iloc[i], twoMMean, twoMVariance) * \
            p_x_given_y(test['Length'].iloc[i], twoLengthMean, twoLengthVariance) * \
            p_x_given_y(test['Diameter'].iloc[i], twoDiameterMean, twoDiameterVariance)

            probabilityOf3 = prior3 * \
            p_x_given_y(test['F'].iloc[i], threeFMean, threeFVariance) * \
            p_x_given_y(test['I'].iloc[i], threeIMean, threeIVariance) * \
            p_x_given_y(test['M'].iloc[i], threeMMean, threeMVariance) * \
            p_x_given_y(test['Length'].iloc[i], threeLengthMean, threeLengthVariance) * \
            p_x_given_y(test['Diameter'].iloc[i], threeDiameterMean, threeDiameterVariance)

            results = [probabilityOf1, probabilityOf2, probabilityOf3]
            minimumValue = max(results)
            if minimumValue == probabilityOf1:
                output = 1
            if minimumValue == probabilityOf2:
                output = 2
            if minimumValue == probabilityOf3:
                output = 3

            expectedResult = expectedTest['classlabel'].iloc[i]

            result.append([output, expectedResult])
    
    
    return result

In [279]:
def p_x_given_y(x, mean_y, variance_y):
    # Input the arguments into a probability density function
    exponent = math.exp(-(math.pow(x - mean_y, 2) / (2 * variance_y)))
    return (1 / (math.sqrt(2 * math.pi * variance_y))) * exponent

In [280]:
test = case1Test
sexHotEncodeTest = pd.get_dummies(test['Sex'])
test = test.drop('Sex', axis = 1)
test = test.join(sexHotEncodeTest)

In [281]:
result = type1(case1Data, case1Test, case1ExpectedTest)
result2 = type1(case2Data, case2Test, case2ExpectedTest)
result3 = type1(case3Data, case3Test, case3ExpectedTest)


In [282]:
def accuracy(result):
    true = 0
    for i in range(len(result)):
        if result[i][0] == result[i][1]:
            true += 1
    return true/len(result)

In [283]:
accuracy(result)

0.47178606476938173

In [284]:
accuracy(result2)

0.48268261964735515

In [285]:
accuracy(result3)

0.4774816176470588

Unnamed: 0,Sex,Length,Diameter,classlabel
1890,M,0.565,0.455,2
2203,M,0.615,0.480,3
1133,M,0.570,0.460,2
3834,M,0.335,0.260,1
246,I,0.320,0.245,3
2722,I,0.375,0.275,1
2919,F,0.600,0.480,2
32,M,0.665,0.525,3
1423,F,0.725,0.600,2
1321,I,0.560,0.425,2


In [286]:
gnb = GaussianNB()
used_features =[
    "F",
    "I",
    "M",
    "Length",
    "Diameter"
]
gnb.fit(
    case1Data[used_features].values,
    case1Data["classlabel"]
)
y_pred = gnb.predict(case1ExpectedTest[used_features])
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          case1ExpectedTest.shape[0],
          (case1ExpectedTest["classlabel"] != y_pred).sum(),
          100*(1-(case1ExpectedTest["classlabel"] != y_pred).sum()/case1ExpectedTest.shape[0])
))

KeyError: "['F' 'I' 'M'] not in index"

In [266]:
case1Data["Sex_cleaned"]=np.where(case1Data["Sex"]=="male",0,1)
case1Data

Unnamed: 0,Sex,Length,Diameter,classlabel,Sex_cleaned
1890,M,0.565,0.455,2,1
2203,M,0.615,0.480,3,1
1133,M,0.570,0.460,2,1
3834,M,0.335,0.260,1,1
246,I,0.320,0.245,3,1
2722,I,0.375,0.275,1,1
2919,F,0.600,0.480,2,1
32,M,0.665,0.525,3,1
1423,F,0.725,0.600,2,1
1321,I,0.560,0.425,2,1
