In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics 


fruit = pd.read_table("fruit_data_with_colors.txt")
fruit.head()


Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [2]:
X = fruit[["mass", "width", "height", "color_score"]]
Y = fruit["fruit_name"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)
scaler = StandardScaler()  
scaler.fit(X_train)
X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)
for x in range(1, 9):
    classifier = KNeighborsClassifier(n_neighbors=x)  
    classifier.fit(X_train, Y_train)
    Y_pred = classifier.predict(X_test)
    print("N neighbours = " + str(x))
    print(confusion_matrix(Y_test, Y_pred))
    print("kNN %):", metrics.accuracy_score(Y_test, Y_pred)*100)


N neighbours = 1
[[4 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [2 0 0 7]]
kNN %): 88.88888888888889
N neighbours = 2
[[4 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [2 1 0 6]]
kNN %): 83.33333333333334
N neighbours = 3
[[4 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [1 0 0 8]]
kNN %): 94.44444444444444
N neighbours = 4
[[4 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [2 1 0 6]]
kNN %): 83.33333333333334
N neighbours = 5
[[4 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [1 1 0 7]]
kNN %): 88.88888888888889
N neighbours = 6
[[4 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [2 0 0 7]]
kNN %): 88.88888888888889
N neighbours = 7
[[4 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [1 0 0 8]]
kNN %): 94.44444444444444
N neighbours = 8
[[4 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [3 0 0 6]]
kNN %): 83.33333333333334


In [3]:
gaussianClassifer = GaussianNB()
gaussianClassifer.fit(X_train, Y_train)
y_pred = gaussianClassifer.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print("kNN %):", metrics.accuracy_score(Y_test, Y_pred)*100)

[[4 0 0 0]
 [0 3 0 0]
 [0 0 2 0]
 [3 0 0 6]]
kNN %): 83.33333333333334


In [4]:
# find p(c)
labels = fruit["fruit_label"]
totalCount = [0, 0, 0, 0]
for x in labels:
    totalCount[x-1] += 1
    
total = totalCount[0] + totalCount[1] + totalCount[2] + totalCount[3]

probC = [0.0, 0.0, 0.0, 0.0]
for x in range(0, len(probC)):
    probC[x] = totalCount[x] / total

In [5]:
def myGaussian(X_train, X_test):
    # sort by fruit label
    X_train_sorted = X_train.groupby('fruit_label') 

    # group into classes
    [X_train_sorted.get_group(x) for x in X_train_sorted.groups]

    # put grouped classes into array 
    splitClasses = [0, 0, 0, 0]
    for i in range(1, 5):
        splitClasses[i-1] = X_train_sorted.get_group(i)   

    # setup variables for gaussian calculations
    columns = ["mass", "width", "height", "color_score"]
    classProb = [0.0, 0.0, 0.0, 0.0]
    gaus = 0
    mean = 0.0
    variance = 0.0
    classPredictions = []
    counter = 0
    # for each test data
    for index, w in X_test.iterrows():
        # for each class
        for x in range(0, 4):
            currentClass = splitClasses[x]
            productGaus = 1.0
            # for each column in the class
            for y in range (0, 4):
                # find mean & variance of each column of the class
                mean = currentClass[columns[y]].mean()
                variance = currentClass[columns[y]].var()
                # find gaus of each column using mean & variance
                gaus = (1 / math.sqrt(2 * math.pi * variance) * math.exp(-0.5 * math.pow((w[columns[y]] - mean), 2) / variance))
                productGaus = gaus * productGaus
            # calculate final probability of being in class
            classProb[x] = productGaus * probC[x]
        # choose correct class for test data here
        classPredictions.append(classProb.index(max(classProb)) + 1)
    return classPredictions

In [7]:
# get data from fruits
X = fruit[["fruit_label", "mass", "width", "height", "color_score"]]
Y = fruit["fruit_label"]
# split into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)
# call my gaussian function
y_pred_mine = myGaussian(X_train, X_test)

# remove fruit_label from X data
X_test = X_test[["mass", "width", "height", "color_score"]]
X_train = X_train[["mass", "width", "height", "color_score"]]

# rerun gaussian on new training data
gaussianClassifer = GaussianNB()
gaussianClassifer.fit(X_train, Y_train)
Y_pred = gaussianClassifer.predict(X_test)

# output my class predictions
print("My classes: \n" + str(y_pred_mine))
print(confusion_matrix(Y_test, y_pred_mine))
# output sklearn class predicitions
print("Sklearn classes: \n" + str(Y_pred))
print(confusion_matrix(Y_test, Y_pred))
# output actual classes
print("Actual classes: \n" + str(Y_test.values.tolist()))
# find and print accuracy of my implementation vs sklearn's
accuracy = metrics.accuracy_score(Y_pred, y_pred_mine)*100
print("Sklearn vs mine: " + str(accuracy))


My classes: 
[1, 3, 1, 1, 4, 3, 2, 3, 1, 4, 1, 3, 2, 1, 1, 4, 1, 4]
[[7 0 0 0]
 [0 2 0 0]
 [1 0 4 0]
 [0 0 0 4]]
Sklearn classes: 
[1 3 1 1 4 3 2 3 1 4 1 3 2 1 1 4 1 4]
[[7 0 0 0]
 [0 2 0 0]
 [1 0 4 0]
 [0 0 0 4]]
Actual classes: 
[1, 3, 1, 1, 4, 3, 2, 3, 1, 4, 1, 3, 2, 3, 1, 4, 1, 4]
Sklearn vs mine: 100.0
