In [1]:
import itertools
import math
import struct

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing

# Notebook-wide pandas display limits: let large frames render in full
# instead of being truncated with ellipses.
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", 1000)
# Character width available before pandas wraps a wide frame.
pd.set_option("display.width", 1500)

In [2]:
def read_idx(filename):
    """Read an IDX-format file into a numpy array.

    The file starts with a 4-byte header (two zero bytes, one element
    type code, one dimension count) followed by one big-endian unsigned
    32-bit size per dimension and then the raw element data.

    Args:
        filename: path of the IDX file to read.

    Returns:
        A read-only ``numpy.ndarray`` whose shape comes from the header and
        whose dtype matches the header's type code (``uint8`` for 0x08).

    Raises:
        ValueError: if the two leading bytes are not zero, if the type code
            is unknown, or if the payload size does not match the shape.
        struct.error: if the file ends before the header is complete.
    """
    # Element type codes of the IDX format; multi-byte values are stored
    # most-significant byte first.
    idx_dtypes = {
        0x08: np.dtype(np.uint8),
        0x09: np.dtype(np.int8),
        0x0B: np.dtype(">i2"),
        0x0C: np.dtype(">i4"),
        0x0D: np.dtype(">f4"),
        0x0E: np.dtype(">f8"),
    }
    with open(filename, "rb") as f:
        zero, data_type, dims = struct.unpack(">HBB", f.read(4))
        if zero != 0:
            raise ValueError(
                "%r is not an IDX file: bad magic prefix 0x%04x" %
                (filename, zero))
        if data_type not in idx_dtypes:
            raise ValueError(
                "%r has unsupported IDX type code 0x%02x" %
                (filename, data_type))
        # All dimension sizes in a single unpack.
        shape = struct.unpack(">%dI" % dims, f.read(4 * dims))
        payload = np.frombuffer(f.read(), dtype=idx_dtypes[data_type])
        return payload.reshape(shape)


# Load the four IDX files from the current working directory.  The file
# names match the MNIST distribution (presumably 60000 training and 10000
# test images of 28x28 pixels - confirm against the actual files).
# Images are read from the idx3 files and labels from the idx1 files.
train_data = read_idx("./train-images.idx3-ubyte")
train_label = read_idx("./train-labels.idx1-ubyte")
test_data = read_idx("./t10k-images.idx3-ubyte")
test_label = read_idx("./t10k-labels.idx1-ubyte")

In [None]:
class NaiveBayes_Con():
    """Gaussian ("continuous") naive Bayes classifier over flattened images.

    Every pixel is modelled, independently per class, as a normal
    distribution whose mean and variance are estimated from the training
    images.  A test image is assigned the class that maximises

        log P(class) + sum over pixels of log N(pixel | mean, variance)

    Attributes set by the constructor:
        train_label: the labels passed in, stored unchanged.
        n_classes: number of classes; labels must lie in [0, n_classes).
        class_count: list with the number of training images per class.
        pixel_data: per class, an array of shape (n_features, n_samples),
            so ``pixel_data[c][p]`` holds every training value of pixel
            ``p`` for class ``c``.
        mean, variance: float arrays of shape (n_classes, n_features).
        log_prior: log of the empirical class frequencies (``-inf`` for a
            class that has no training image).
    """

    # Variance substituted for a pixel that is constant within a class, so
    # that the Gaussian density stays finite.
    VARIANCE_FLOOR = 10**-3

    def __init__(self, train_data, train_label, n_classes=10):
        """Fit the model.

        Args:
            train_data: array-like of images, first axis indexing samples.
            train_label: integer class label per image, in [0, n_classes).
            n_classes: number of classes to model (default 10).

        Raises:
            ValueError: on empty data, mismatched lengths or labels outside
                [0, n_classes).
        """
        self.train_label = train_label
        self.n_classes = n_classes
        self.__buildTrainingData(train_data)

    def log_gaussian(self, x, mean, sigma):
        """Return log N(x | mean, sigma), where ``sigma`` is the VARIANCE.

        Works element-wise on scalars or broadcastable arrays.  A variance
        of exactly zero is replaced by ``VARIANCE_FLOOR``.
        """
        sigma = np.where(np.asarray(sigma) == 0, self.VARIANCE_FLOOR, sigma)
        # Promote to float first: uint8 pixels would wrap around on subtract.
        deviation = np.asarray(x, dtype=np.float64) - mean
        return -0.5 * np.log(2 * np.pi * sigma) - deviation**2 / (2 * sigma)

    def __buildTrainingData(self, train_data):
        """Estimate per-class pixel means, variances and class priors."""
        features = self._as_matrix(train_data)
        labels = np.asarray(self.train_label).ravel()
        if features.shape[0] == 0:
            raise ValueError("training data is empty")
        if labels.shape[0] != features.shape[0]:
            raise ValueError("got %d training images but %d labels" %
                             (features.shape[0], labels.shape[0]))
        if labels.min() < 0 or labels.max() >= self.n_classes:
            raise ValueError("labels must lie in [0, %d)" % self.n_classes)

        n_features = features.shape[1]
        # Transposed so that pixel_data[c][p] lists one pixel's values.
        self.pixel_data = [
            features[labels == c].T for c in range(self.n_classes)
        ]
        self.class_count = [block.shape[1] for block in self.pixel_data]

        self.mean = np.zeros((self.n_classes, n_features))
        self.variance = np.zeros((self.n_classes, n_features))
        for c, block in enumerate(self.pixel_data):
            if block.shape[1]:  # skip classes without any sample
                self.mean[c] = block.mean(axis=1)
                self.variance[c] = block.var(axis=1)

        counts = np.array(self.class_count, dtype=np.float64)
        # Unseen classes get a log prior of -inf so they are never chosen.
        self.log_prior = np.full(self.n_classes, -np.inf)
        seen = counts > 0
        self.log_prior[seen] = np.log(counts[seen] / counts.sum())

    def predict(self, test_data, test_label):
        """Classify every image in ``test_data``.

        Args:
            test_data: array-like of images shaped like the training images.
            test_label: unused; kept so existing callers keep working.

        Returns:
            ``(posterior, predict)`` where ``posterior[i]`` is a list of
            ``n_classes`` unnormalised log posteriors for image ``i`` and
            ``predict[i]`` is the index of the largest one.

        Raises:
            ValueError: if the images do not have the training feature count.
        """
        samples = self._as_matrix(test_data).astype(np.float64)
        if samples.shape[0] == 0:
            return [], []
        if samples.shape[1] != self.mean.shape[1]:
            raise ValueError("expected %d features per image, got %d" %
                             (self.mean.shape[1], samples.shape[1]))

        scores = np.empty((samples.shape[0], self.n_classes))
        # One class at a time keeps the temporary at (n_samples, n_features).
        for c in range(self.n_classes):
            log_likelihood = self.log_gaussian(samples, self.mean[c],
                                               self.variance[c])
            scores[:, c] = self.log_prior[c] + log_likelihood.sum(axis=1)
        predict = scores.argmax(axis=1)
        return scores.tolist(), predict.tolist()

    def flatten(self, data):
        """Return ``data`` as a list of flat per-sample lists of values."""
        return self._as_matrix(data).tolist()

    @staticmethod
    def _as_matrix(data):
        """View ``data`` as a 2-D array of shape (n_samples, n_features)."""
        arr = np.asarray(data)
        # Explicit product instead of -1 so an empty batch reshapes cleanly.
        return arr.reshape(arr.shape[0], int(np.prod(arr.shape[1:])))

In [None]:
# Fit the continuous naive Bayes model: the constructor stores the labels
# and derives per-class pixel statistics from the training images.
result_con = NaiveBayes_Con(train_data, train_label)

In [None]:
# Score every test image.  predict() returns the per-image score lists and
# the per-image argmax; test_label is passed along but predict() never reads
# it.
con_post, con_pred = result_con.predict(test_data, test_label)

In [None]:
# Fraction of test images whose predicted label equals the true label,
# reported together with its complement (the error rate).
acc = metrics.accuracy_score(test_label, con_pred)
print("Accuracy: %f\nError rate: %f" % (acc, 1 - acc))

In [None]:
# Spot-check: dump the raw scores predict() produced for test image 87
# (index presumably arbitrary - confirm).
print(con_post[87])