In [None]:
#Imports

from sklearn import datasets
import numpy as np
import math
import random
import os


from PIL import Image
import glob
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

In [None]:
class MNISTData:
    """Loads the MNIST_Light images and offers a train/test split plus plots.

    Labels are not read from the files. They are generated on the assumption
    that the sorted file list contains ``samples_per_class`` images of class 0,
    followed by the same number for class 1, and so on.
    """

    def __init__(self, dir):
        """Read every image matched by a glob pattern.

        Args:
            dir: glob pattern handed to ``glob.glob`` (for example
                ``'.../MNIST_Light/*/*.png'``). The parameter name shadows the
                builtin but is kept so existing callers continue to work.
        """
        filelist = sorted(glob.glob(dir))
        self.x = np.array([self._read_image(fname) for fname in filelist])

        self.samples_per_class = 500
        self.number_of_classes = 10

        # One contiguous run of labels per class, sized by samples_per_class
        # rather than a second hard-coded 500.
        self.y = np.repeat(np.arange(self.number_of_classes, dtype=int),
                           self.samples_per_class)

    @staticmethod
    def _read_image(fname):
        """Return the pixels of one image file as an array, closing the file."""
        with Image.open(fname) as img:
            return np.array(img)

    def _ensure_split(self):
        """Create the train/test split if ``get_data`` has not been called yet."""
        if not hasattr(self, 'train_labels'):
            self.get_data()

    def _plot_column(self, images, idxs, cls, rows):
        """Draw ``images[idxs]`` top to bottom in column ``cls`` of the grid."""
        for i, idx in enumerate(idxs):
            plt.subplot(rows, self.number_of_classes, i * self.number_of_classes + cls + 1)
            plt.imshow(images[idx].astype('uint8'), cmap='gray')
            plt.axis('off')
            if i == 0:
                # Only the top image of a column carries the class title.
                plt.title(str(cls))

    def get_data(self):
        """Split into 70 % train / 30 % test and flatten and scale the images.

        The split uses ``random_state=42`` so it is the same on every call.
        ``train_test_split`` raises ``ValueError`` when the number of loaded
        images differs from the number of generated labels.

        Returns:
            ``(train_normalised, test_normalised, train_labels, test_labels)``
            where the feature arrays have one flattened image per row, divided
            by 255.0.
        """
        (self.train_features, self.test_features,
         self.train_labels, self.test_labels) = train_test_split(
            self.x, self.y, test_size=0.3, random_state=42)

        # Flatten by the actual sample count instead of fixed (3500, 400) /
        # (1500, 400) shapes, so other image sizes or file counts also work.
        self.train_normalised = self.train_features.reshape(len(self.train_features), -1) / 255.0
        self.test_normalised = self.test_features.reshape(len(self.test_features), -1) / 255.0

        return self.train_normalised, self.test_normalised, self.train_labels, self.test_labels

    def visualize_random(self):
        """Show up to eight randomly chosen training images for every class."""
        self._ensure_split()

        examples_per_class = 8
        for cls in range(self.number_of_classes):
            idxs = np.where(self.train_labels == cls)[0]
            # Never ask for more samples than the class has.
            idxs = np.random.choice(idxs, min(examples_per_class, len(idxs)), replace=False)
            self._plot_column(self.train_features, idxs, cls, examples_per_class)
        plt.show()

    def visualize_wrong_class(self, pred, examples_per_class):
        """Show test images that were wrongly predicted as each class.

        Args:
            pred: predicted labels, aligned with ``self.test_labels``.
            examples_per_class: maximum number of images shown per column.
        """
        self._ensure_split()

        pred = np.asarray(pred)
        for cls in range(self.number_of_classes):
            # Column ``cls`` holds images predicted as cls whose label differs.
            idxs = np.where((self.test_labels != cls) & (pred == cls))[0]
            if len(idxs) > examples_per_class:
                idxs = np.random.choice(idxs, examples_per_class, replace=False)
            self._plot_column(self.test_features, idxs, cls, examples_per_class)
        plt.show()

In [None]:
class NCC:
    """Nearest-centroid classifier: one mean vector per class, Euclidean distance."""

    def __init__(self):
        pass

    def fit(self, data, target, classes):
        """Compute the per-class mean of the training samples.

        ``target`` values are used directly as row indices into the mean
        table, which has ``len(classes)`` rows. The result is stored in
        ``self.means``.
        """
        n_classes = len(classes)
        sums = np.zeros((n_classes, len(data[0])))
        counts = np.zeros((n_classes, 1))

        # Accumulate sample by sample, each into the row of its label.
        for row, sample in enumerate(data):
            label = target[row]
            sums[label] += sample
            counts[label] += 1

        # (n_classes, 1) counts broadcast across every feature column.
        self.means = sums / counts

    def predict(self, x):
        """Return the index of the class mean closest to ``x``.

        Ties go to the lowest index. ``None`` is returned when no distance is
        smaller than infinity.
        """
        best_class, best_dist = None, math.inf
        for label, centroid in enumerate(self.means):
            gap = np.linalg.norm(centroid - x)
            if gap < best_dist:
                best_class, best_dist = label, gap
        return best_class

In [None]:
class NBC:
    """Naive Bayes classifier for discrete feature values.

    Every distinct value seen at a feature position is treated as its own
    category. Likelihoods use additive (Laplace) smoothing and are combined
    in log space, so a value that never occurred in training lowers a class
    score instead of setting every class score to zero.
    """

    def __init__(self, alpha=1.0):
        """Create an untrained classifier.

        Args:
            alpha: additive smoothing strength; must be positive.

        Raises:
            ValueError: if ``alpha`` is not positive.
        """
        if alpha <= 0:
            raise ValueError("alpha must be positive")
        self.alpha = alpha

    def fit(self, data, target, classes):
        """Count class frequencies and per-feature value frequencies.

        Args:
            data: sequence of equally long feature vectors.
            target: label of each row in ``data``; every label must be in
                ``classes`` (a ``KeyError`` is raised otherwise).
            classes: iterable of all class labels.

        Raises:
            ValueError: if ``data`` is empty or its length differs from
                ``target``.
        """
        if len(data) == 0:
            raise ValueError("NBC.fit needs at least one training sample")
        if len(data) != len(target):
            raise ValueError("data and target must have the same length")

        self.classes = classes
        # The total lives in its own attribute rather than under a magic
        # key in PY, so a real class label 10 can no longer collide with it.
        self.n_samples = len(target)
        self.PY = {c: 0 for c in classes}
        for t in target:
            self.PY[t] += 1

        # PXY[feature][value][class] -> number of training samples.
        self.PXY = {i: {} for i in range(len(data[0]))}
        for sample, t in zip(data, target):
            for i, pixel_val in enumerate(sample):
                per_value = self.PXY[i]
                if pixel_val not in per_value:
                    per_value[pixel_val] = {c: 0 for c in classes}
                per_value[pixel_val][t] += 1

    def predict(self, pic):
        """Return the most probable class label for one feature vector.

        Classes without training samples are never predicted. Ties are
        resolved in favour of the class listed first in ``classes``.
        """
        best_label = None
        best_score = -math.inf
        for c in self.classes:
            n_c = self.PY[c]
            if n_c == 0:
                continue
            score = math.log(n_c / self.n_samples)
            for i, pixel_val in enumerate(pic):
                per_value = self.PXY[i]
                counts = per_value.get(pixel_val)
                count = counts[c] if counts is not None else 0
                # One extra category is reserved for values never seen in
                # training, so an unseen value keeps a small non-zero mass.
                n_values = len(per_value) + 1
                score += math.log((count + self.alpha) / (n_c + self.alpha * n_values))
            if score > best_score:
                best_label, best_score = c, score
        return best_label
                    

In [None]:
class GBC:
    """Gaussian naive Bayes classifier with one mean and variance per feature and class."""

    def __init__(self, var_smoothing=0.01):
        """Create an untrained classifier.

        Args:
            var_smoothing: constant added to every variance so that features
                with zero spread do not produce a division by zero.
        """
        self.var_smoothing = var_smoothing

    def fit(self, data, target, classes):
        """Estimate class priors and per-feature Gaussian parameters.

        After fitting, ``self.means[c]`` and ``self.vars[c]`` are arrays with
        one entry per feature for every class that has training samples, and
        ``self.PY[c]`` is the number of samples of class ``c``.

        Args:
            data: 2-D array-like, one sample per row.
            target: label of each row of ``data``.
            classes: iterable of all class labels.

        Raises:
            ValueError: if ``data`` is empty or not 2-D, if its length differs
                from ``target``, or if ``target`` holds a label that is not in
                ``classes``.
        """
        data = np.asarray(data, dtype=float)
        target = np.asarray(target)
        if data.ndim != 2 or len(data) == 0:
            raise ValueError("GBC.fit needs a non-empty 2-D data array")
        if len(data) != len(target):
            raise ValueError("data and target must have the same length")

        self.classes = classes
        # The total lives in its own attribute rather than under a magic
        # key in PY, so a real class label 10 can no longer collide with it.
        self.n_samples = len(target)
        self.PY = {}
        self.means = {}
        self.vars = {}
        for c in classes:
            members = data[target == c]
            self.PY[c] = len(members)
            if len(members) == 0:
                # No statistics for an empty class; predict() skips it.
                continue
            # Statistics cover every feature column, not only the first 64.
            self.means[c] = members.mean(axis=0)
            self.vars[c] = members.var(axis=0) + self.var_smoothing

        if sum(self.PY.values()) != self.n_samples:
            raise ValueError("target contains labels that are not in classes")

    def predict(self, pic, debug=False):
        """Classify one feature vector.

        Scores are accumulated as log-densities so that many small factors
        cannot underflow to zero for every class.

        Args:
            pic: 1-D feature vector with the same length as the training rows.
            debug: when true, return the per-class likelihoods (without the
                class prior) instead of a prediction.

        Returns:
            The label of the best-scoring class, or with ``debug=True`` an
            array with one likelihood per entry of ``classes``.
        """
        pic = np.asarray(pic, dtype=float)
        log_scores = np.full(len(self.classes), -np.inf)
        for j, number in enumerate(self.classes):
            if self.PY[number] == 0:
                continue
            var = self.vars[number]
            log_like = -0.5 * np.sum(np.log(2 * np.pi * var)
                                     + (pic - self.means[number]) ** 2 / var)
            if debug:
                log_scores[j] = log_like
            else:
                log_scores[j] = log_like + math.log(self.PY[number] / self.n_samples)

        if debug:
            # Convert back to plain likelihoods; extreme values saturate to
            # 0 or inf without emitting warnings.
            with np.errstate(over='ignore', under='ignore'):
                return np.exp(log_scores)
        return self.classes[int(np.argmax(log_scores))]

In [None]:
def load_data(augment=False, normalize=False, seed=None):
    """Load the scikit-learn digits set and split it 70/30 into train and test.

    Args:
        augment: when true, reduce every pixel to three levels:
            0 for values below 5, 1 for values from 5 up to (not including)
            10, and 2 for values of 10 and above.
        normalize: when true, divide every pixel by 16. Applied after
            ``augment`` when both flags are set.
        seed: optional seed for the shuffle. ``None`` uses the global
            ``random`` state, as before.

    Returns:
        ``(train_data, train_target, test_data, test_target, target_names)``.
    """
    digits = datasets.load_digits()
    data = digits.data
    target = digits.target

    # Transform the whole array at once. Assigning to a loop variable, as
    # the previous per-pixel loops did, never changes the array itself.
    if augment:
        data = np.digitize(data, bins=[5, 10]).astype(float)
    if normalize:
        data = data / 16

    # Shuffle row indices instead of (row, label) pairs. The permutation is
    # the same for a given random state, and fancy indexing does the split.
    order = list(range(len(data)))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(order)

    split_at = int(len(order) * 0.7)
    train_idx = order[:split_at]
    test_idx = order[split_at:]

    return (data[train_idx], target[train_idx],
            data[test_idx], target[test_idx],
            digits.target_names)

In [None]:
def loadMNIST():
    """Load MNIST_Light from ``./data`` under the current working directory.

    Returns:
        ``(train_features, train_labels, test_features, test_labels,
        target_names)`` with ``target_names`` being the list 0..9.
    """
    pattern = os.getcwd() + '/data/MNIST_Light/*/*.png'
    # get_data() yields features first (train, test), then labels (train, test).
    train_x, test_x, train_y, test_y = MNISTData(pattern).get_data()
    return train_x, train_y, test_x, test_y, list(range(10))

In [None]:
# Create and evaluate the nearest-centroid model on the digits data

train_data, train_target, test_data, test_target, target_names = load_data()

model = NCC()
model.fit(train_data, train_target, target_names)

# Predict every held-out sample, then count how many match the true label.
y_pred = [model.predict(sample) for sample in test_data]
correct = sum(1 for guess, truth in zip(y_pred, test_target) if guess == truth)

report = metrics.classification_report(test_target, y_pred)
print("Classification report:\n%s\n" % report)
confusion = metrics.confusion_matrix(test_target, y_pred)
print("Confusion matrix:\n%s" % confusion)

In [None]:
# Create and evaluate the nearest-centroid model, requesting the augmented
# pixel value spectrum from load_data

train_data, train_target, test_data, test_target, target_names = load_data(augment=True)

model = NCC()
model.fit(train_data, train_target, target_names)

# Predict every held-out sample, then count how many match the true label.
y_pred = [model.predict(sample) for sample in test_data]
correct = sum(1 for guess, truth in zip(y_pred, test_target) if guess == truth)

report = metrics.classification_report(test_target, y_pred)
print("Classification report:\n%s\n" % report)
confusion = metrics.confusion_matrix(test_target, y_pred)
print("Confusion matrix:\n%s" % confusion)

In [None]:
#Create and evaluate model with MNIST_data

# The MNISTData object is created in this cell because the misclassification
# plot at the bottom needs the instance. The cell therefore runs on a fresh
# kernel without relying on a variable defined further down the notebook.
mnist = MNISTData(os.getcwd() + '/data/MNIST_Light/*/*.png')

# get_data() returns features first (train, test), then labels (train, test).
train_data, test_data, train_target, test_target = mnist.get_data()
target_names = list(range(10))

model = NCC()
model.fit(train_data, train_target, target_names)

correct = 0
y_pred = []
for sample, truth in zip(test_data, test_target):
    guess = model.predict(sample)
    y_pred.append(guess)
    if guess == truth:
        correct += 1

print("Accuracy: ", correct/(len(test_target)))

print("Classification report:\n%s\n"
  % (metrics.classification_report(test_target, y_pred)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(test_target, y_pred))

mnist.visualize_wrong_class(y_pred, 8)

In [None]:
#Main_MNIST.py
# Baseline: scikit-learn's GaussianNB on the MNIST_Light images.

from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

mnist = MNISTData(os.getcwd() + '/data/MNIST_Light/*/*.png')

# get_data() returns features first (train, test), then labels (train, test).
train_features, test_features, train_labels, test_labels = mnist.get_data()

mnist.visualize_random()

gnb = GaussianNB(var_smoothing=0.01)
gnb.fit(train_features, train_labels)
y_pred = gnb.predict(test_features)

report = metrics.classification_report(test_labels, y_pred)
print("Classification report SKLearn GNB:\n%s\n" % report)
confusion = metrics.confusion_matrix(test_labels, y_pred)
print("Confusion matrix SKLearn GNB:\n%s" % confusion)

# Show test images that ended up in the wrong class column.
mnist.visualize_wrong_class(y_pred, 8)



In [None]:
# Create and evaluate the NBC model on the digits data

train_data, train_target, test_data, test_target, target_names = load_data()

model = NBC()
model.fit(train_data, train_target, target_names)

# Predict every held-out sample, then count how many match the true label.
y_pred = [model.predict(sample) for sample in test_data]
correct = sum(1 for guess, truth in zip(y_pred, test_target) if guess == truth)

report = metrics.classification_report(test_target, y_pred)
print("Classification report:\n%s\n" % report)
confusion = metrics.confusion_matrix(test_target, y_pred)
print("Confusion matrix:\n%s" % confusion)

In [None]:
# Create and evaluate the NBC model, requesting the augmented pixel value
# spectrum from load_data

train_data, train_target, test_data, test_target, target_names = load_data(augment=True)

model = NBC()
model.fit(train_data, train_target, target_names)

# Predict every held-out sample, then count how many match the true label.
y_pred = [model.predict(sample) for sample in test_data]
correct = sum(1 for guess, truth in zip(y_pred, test_target) if guess == truth)

report = metrics.classification_report(test_target, y_pred)
print("Classification report:\n%s\n" % report)
confusion = metrics.confusion_matrix(test_target, y_pred)
print("Confusion matrix:\n%s" % confusion)

In [None]:
# Create and evaluate the GBC model on the digits data

train_data, train_target, test_data, test_target, target_names = load_data()

model = GBC()
model.fit(train_data, train_target, target_names)

# Predict every held-out sample, then count how many match the true label.
y_pred = [model.predict(sample) for sample in test_data]
correct = sum(1 for guess, truth in zip(y_pred, test_target) if guess == truth)

report = metrics.classification_report(test_target, y_pred)
print("Classification report:\n%s\n" % report)
confusion = metrics.confusion_matrix(test_target, y_pred)
print("Confusion matrix:\n%s" % confusion)

In [None]:
# Create and evaluate the GBC model, requesting the augmented pixel value
# spectrum from load_data

train_data, train_target, test_data, test_target, target_names = load_data(augment=True)

model = GBC()
model.fit(train_data, train_target, target_names)

# Predict every held-out sample, then count how many match the true label.
y_pred = [model.predict(sample) for sample in test_data]
correct = sum(1 for guess, truth in zip(y_pred, test_target) if guess == truth)

report = metrics.classification_report(test_target, y_pred)
print("Classification report:\n%s\n" % report)
confusion = metrics.confusion_matrix(test_target, y_pred)
print("Confusion matrix:\n%s" % confusion)