In [35]:
# Importing Modules
import getpass # For users
import h5py    # For data files input
import numpy as np
import time

In [36]:
# Setting up data input directory
user = getpass.getuser()
desktop_or_laptop = 'd' # 'l' # This is for Dan

if user == 'scgst':
    with h5py.File('images_training.h5','r') as H:
        X_train_raw = np.copy(H['data'])
    with h5py.File('labels_training.h5','r') as H:
        y_train_raw = np.copy(H['label'])
    with h5py.File('images_testing.h5','r') as H:
        X_test_raw = np.copy(H['data'])
    with h5py.File('labels_testing_2000.h5','r') as H:
        y_test_raw = np.copy(H['label']) 

In [37]:
# Transforming the feature inputs
X_train_reshape = X_train_raw.reshape(-1, 784)
X_test_reshape = X_test_raw.reshape(-1, 784)

# # check
# print(X_train_reshape.shape)
# print(X_test_reshape.shape)
# print(y_train_raw.shape)
# print(y_test_raw.shape)

In [None]:
# # Normalisation on features
# from sklearn.preprocessing import normalize
# X_train = X_train.astype('float32')
# X_train = normalize(X_train)

# X_test = X_test.astype('float32')
# X_test = normalize(X_test)

In [38]:
# Split Train into train + validation
from sklearn.model_selection import train_test_split

validate_percentage = 0.3
X_train_post_validate, X_validate, y_train_post_validate, y_validate = train_test_split(X_train_reshape, y_train_raw, test_size = test_percentage, random_state = 109)

# # check
# print(X_train.shape)
# print(X_validate.shape)
# print(y_train.shape)
# print(y_validate.shape)

In [39]:
# PCA
from sklearn.decomposition import PCA
component_nbr = 100

pca = PCA(n_components = component_nbr, svd_solver = 'randomized', whiten = True).fit(X_train_post_validate)

X_train_pca = pca.transform(X_train_post_validate)
X_validate_pca = pca.transform(X_validate)
X_test_pca = pca.transform(X_test_reshape)

In [40]:
# Align with array names
X_train = X_train_pca
X_validate = X_validate_pca
X_test = X_test_pca
y_train = y_train
y_validate = y_validate
y_test = y_test_raw

# print(X_train.shape)
# print(X_validate.shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_validate.shape)
# print(y_test.shape)

## Gaussian Naive Bayes - Homemade

In [42]:
# Split data by class
import numpy as np
def split_by_class(x, y, target_class):
    pos_of_class = np.where(y == target_class)[0]
    x_class = x[pos_of_class]
    return x_class

# # Check
# sum_list = 0
# for i in range(0, 10, 1):
#     print("------", i)
#     a = split_by_class(X_train, y_train, i).shape[0]
#     sum_list = sum_list + a
#     print(sum_list)
# 
# print(X_train.shape[0])

In [43]:
# Calculate mean and standard deviation for attributes by class
def attribute_mean_std_by_class(x, y, target_class):
    x_by_class = split_by_class(x, y, target_class)
    attribute_mean = np.mean(x_by_class, axis = 0)
    attribute_std = np.std(x_by_class, axis = 0)
    return attribute_mean, attribute_std

# # Check
# mean, std = attribute_mean_std_by_class(X_train, y_train, 0)
# print(mean.shape)
# print(std.shape)

In [44]:
def cal_class_prob(y, target_class):
    class_count = list(y).count(target_class)
    prob = class_count/len(y)
    return prob

# # Check
# prob_list = 0
# for i in range(0, 10, 1):
#     print("------", i)
#     a = cal_class_prob(y_train, i)
#     prob_list = prob_list + a
#     print(prob_list)

In [45]:
import math
import numpy as np

# for i in range(len(X_train)):
#     X = X_train[i]

i = 0
X = X_train[i]

for target_class in range(0, 10, 1):
    mean, std = attribute_mean_std_by_class(X_train, y_train, target_class)
    
    exponent = np.exp(-np.power(X-mean, 2)/(2*np.power(std, 2)))
    prob = exponent/(std*np.sqrt(2*math.pi))
    prod_prob = np.nanprod(prob)
    print(prod_prob)

1.074763483624114e-64
2.7295459881324038e-80
5.937413013289599e-59
1.0786571440076576e-69
1.9691722288271415e-62
1.0339771286029378e-71
3.7860511772261355e-61
5.3128135235363345e-89
8.424851945934824e-66
1.7910866827428203e-70


## Gaussian Naive Bayes - SKlearn

In [41]:
# Gaussian Naive Bayes
# Advantages v.s. Disadvantages https://www.datacamp.com/community/tutorials/naive-bayes-scikit-learn
# Pros v.s. Cons https://www.analyticsvidhya.com/blog/2017/09/naive-bayes-explained/
# https://blog.sicara.com/naive-bayes-classifier-sklearn-python-example-tips-42d100429e44
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# print(X_train.shape)
# print(X_validate.shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_validate.shape)
# print(y_test.shape)

gnb = GaussianNB()
gnb_fit = gnb.fit(X_train, y_train)
y_pred = gnb_fit.predict(X_validate)

print(gnb_pred)
print(metrics.confusion_matrix(y_validate, y_pred))
print(metrics.accuracy_score(y_validate, y_pred))

[0 2 8 ... 1 8 5]
[[207   0   7  22   1   0  23   0  18   0]
 [  2 305   3  15   0   0   5   0   6   0]
 [  2   0 217   2  38   5  35   0  19   0]
 [ 20   2   4 238   3   3  20   0   9   0]
 [  4   0  39  12 194   1  44   0  16   0]
 [  1   0   0   0   0 219   8  60  15   6]
 [ 48   1  31   6  18   3 149   0  34   0]
 [  0   0   0   0   0  13   2 262   0  17]
 [  8   0   2   2   2   7  12   7 248   0]
 [  1   0   0   0   0   4   2  19   3 249]]
0.7626666666666667
