In [135]:
import matplotlib.pyplot as plt
import numpy as np
import math
from pandas import crosstab

def safelog(x):
    return np.log(x + 1e-100)

In [136]:
data_set_images = np.genfromtxt("Naive_bayes_classifier_images.csv", delimiter = ",")
data_set_labels = np.genfromtxt("Naive_bayes_classifier_labels.csv", delimiter = "\n")

print(data_set_images.shape)
#print(data_set_labels.T)

(35000, 784)


In [149]:
# training dataset
train_set_x = data_set_images[0:30000,:]
train_set_y = data_set_labels[0:30000].astype(int)

#print(train_set_y.shape)
#testing dataset
test_set_x = data_set_images[30000:35000, :]
test_set_y = data_set_labels[30000:35000].astype(int)

#print(test_set_x.shape)

N = train_set_x.shape[0]
D = train_set_x.shape[1]
K = np.max(train_set_y)

30000


## Mean

In [138]:
sample_means = np.array((np.mean(train_set_x[train_set_y == 1], axis = 0),
                            np.mean(train_set_x[train_set_y == 2], axis = 0),
                            np.mean(train_set_x[train_set_y == 3], axis = 0),
                            np.mean(train_set_x[train_set_y == 4], axis = 0),
                            np.mean(train_set_x[train_set_y == 5], axis = 0))).T
print(sample_mean)                           

[[254.99866689 254.98416931 254.85619063 ... 254.67905349 254.87818697
  254.95934011]
 [254.99733333 254.99733333 254.9965     ... 254.96883333 254.99216667
  254.98866667]
 [254.99933333 254.99933333 254.99233333 ... 251.52483333 254.4725
  254.97483333]
 [254.99666611 254.98983164 254.91415236 ... 252.39473246 254.4415736
  254.93665611]
 [254.999      254.98433333 254.93783333 ... 250.673      253.23333333
  254.79083333]]


## Standard Deviation

In [139]:
sample_deviations = np.array((
                        np.std(train_set_x[train_set_y == 1], axis = 0),
                        np.std(train_set_x[train_set_y == 2], axis = 0),
                        np.std(train_set_x[train_set_y == 3], axis = 0),
                        np.std(train_set_x[train_set_y == 4], axis = 0),
                        np.std(train_set_x[train_set_y == 5], axis = 0))).T
print(sample_deviations)

[[ 0.09127736  0.2065419   0.05163547  0.18436076  0.04471018]
 [ 0.25609108  0.2065419   0.04081939  0.21617116  0.64582342]
 [ 1.31090756  0.2163818   0.16002465  1.81046936  3.03248555]
 ...
 [ 5.29826629  1.04076669 18.43665868 15.67799977 23.62576428]
 [ 3.9117332   0.47057267  6.7881694   6.34549162 13.9167006 ]
 [ 1.93959091  0.70062226  1.1061344   1.79971911  4.4727787 ]]


 ## Priors

In [140]:
class_priors = np.array([np.mean(train_set_y == (c + 1)) for c in range(K)])
print([np.mean(train_set_y == (c + 1)) for c in range(K)])

[0.2, 0.2, 0.2, 0.2, 0.2]


In [141]:
def score_function(x):
    constants = -D/2 * np.log(2 * math.pi)
    #sum_log_std = [np.sum(safelog(sample_deviations[:, c])) for c in range(K)]
    #sum_log_exp = [np.sum(((x - sample_means[:, c]) ** 2) / (2 * (sample_deviations[:, c] ** 2))) for c in range(K)]
    total = [
            - np.sum(safelog(sample_deviations[:, c]))
            - np.sum(((x - sample_means[:, c]) ** 2) / (2 * (sample_deviations[:, c] ** 2)))
            + np.log(class_priors[c]) + constants
        for c in range(K)
    
    ]
    return total
                     

In [142]:
train_scores_y = [score_function(x) for x in train_set_x]
#print(train_scores_y)
pred_y = np.argmax(train_scores_y, axis = 1) + 1
print(pred_y)

[3 1 2 ... 1 3 5]


In [143]:
print(crosstab(pred_y, train_set_y, rownames=["y_pred"], colnames=["y_truth"]))

y_truth     1     2     3     4     5
y_pred                               
1        3685    49     4   679     6
2        1430  5667  1140  1380   532
3         508   208  4670  2948   893
4         234    60   123   687   180
5         143    16    63   306  4389


In [147]:
test_scores_y = [score_function(x) for x in test_set_x]
#print(train_scores_y)
pred_test_y = np.argmax(test_scores_y, axis = 1) + 1
print(pred_test_y)

[1 2 5 ... 3 5 5]


In [148]:
print(crosstab(pred_test_y, test_set_y, rownames=["y_pred"], colnames=["y_truth"]))

y_truth    1    2    3    4    5
y_pred                          
1        597    6    0  114    1
2        237  955  188  267   81
3         92   25  785  462  167
4         34   11   16  109   29
5         40    3   11   48  722
