In [1]:
import os
import cv2
import numpy as np
import csv
import math

In [2]:
def data_loader(path, data_size):
    
    file_list = os.listdir(path)
    feature_info = list()
    label_info = list()
    
    for idx in range(data_size):
        img_path = os.path.join(path, file_list[idx])
        
        feature = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        feature = list(map(float,feature.flatten()))
        label = int(file_list[idx][-5])
        
        feature_info.append(feature)
        label_info.append(label)
    
    return feature_info, label_info

In [3]:
def get_GaussianNBC(train_samples, train_labels):
    
    fashion_class_0_samples = []
    fashion_class_1_samples = []
    fashion_class_2_samples = []
    fashion_class_3_samples = []
    fashion_class_4_samples = []
    fashion_class_5_samples = []
    fashion_class_6_samples = []
    fashion_class_7_samples = []
    fashion_class_8_samples = []
    fashion_class_9_samples = []
    
    for k in range(len(train_samples)):
        sample = train_samples[k]
        label = train_labels[k]
        
        if label == 0:
            fashion_class_0_samples.append(sample)
        elif label == 1:
            fashion_class_1_samples.append(sample)
        elif label == 2:
            fashion_class_2_samples.append(sample)
        elif label == 3:
            fashion_class_3_samples.append(sample)
        elif label == 4:
            fashion_class_4_samples.append(sample)
        elif label == 5:
            fashion_class_5_samples.append(sample)
        elif label == 6:
            fashion_class_6_samples.append(sample)
        elif label == 7:
            fashion_class_7_samples.append(sample)
        elif label == 8:
            fashion_class_8_samples.append(sample)
        elif label == 9:
            fashion_class_9_samples.append(sample)
            
    samples_by_classes = [
        fashion_class_0_samples,
        fashion_class_1_samples,
        fashion_class_2_samples,
        fashion_class_3_samples,
        fashion_class_4_samples,
        fashion_class_5_samples,
        fashion_class_6_samples,
        fashion_class_7_samples,
        fashion_class_8_samples,
        fashion_class_9_samples
    ]
    
    numOf_classes = 10
    means_by_classes = []
    stdevs_by_classes = []
    
    for C in range(numOf_classes):
        means = []
        stdevs = []
        for features in zip(*samples_by_classes[C]):
            means.append(np.mean(features))
            stdevs.append(np.std(features))
        means_by_classes.append(means)
        stdevs_by_classes.append(stdevs)
    
    return means_by_classes, stdevs_by_classes

In [4]:
def Gaussian_PDF(x, mean, stdev):
    if stdev == 0.0:
        if x == mean:
            return 1.0
        else:
            return 0.0
    return (1/(math.sqrt(2 * math.pi)*stdev)) *(math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2)))))

In [5]:
def predict(means, stdevs, test_samples):
    
    pred_classes = []
    numOf_classes = 10
    numOf_features = 784
    
    for i in range(len(test_samples)):
        prob_by_classes = []
        for C in range(numOf_classes):
            prob = 1
            for j in range(numOf_features):
                mean = means[C][j]
                stdev = stdevs[C][j]
                x = test_samples[i][j]
                prob *= Gaussian_PDF(x, mean, stdev)
            prob_by_classes.append(prob)
        bestProb = -1
        for C in range(numOf_classes):
            if prob_by_classes[C] > bestProb:
                bestProb = prob_by_classes[C]
                pred_Label = C
        pred_classes.append(pred_Label)
        
    return pred_classes

In [6]:
def get_Acc(pred_classes_of_testset, gt_of_testset):
    accuracy = np.equal(pred_classes_of_testset, gt_of_testset)
    return list(accuracy).count(True)/len(accuracy)*100

### P1. 제공된 Fashion-MNIST 데이터 셋을 gaussian naive bayesian classifier를 통해 분류 수행

In [7]:
train_set_path = 'Fashion-MNIST/train'
test_set_path = 'Fashion-MNIST/test'

train_samples, train_labels = data_loader(train_set_path, 60000)
test_samples, test_labels = data_loader(test_set_path, 10000)

means_by_classes, stdevs_by_classes = get_GaussianNBC(train_samples, train_labels)
    
pred_classes = predict(means_by_classes, stdevs_by_classes, test_samples)
    
acc = get_Acc(pred_classes, test_labels)

print('Acc: %s' %acc)

Acc: 10.0


### P2. P1에서 만든 모델에 min-max 정규화 방법을 적용해 적용 전후의 성능 비교 분석 수행

In [8]:
train_samples_minmax = train_samples
test_samples_minmax = test_samples
    
for i in range(len(train_samples)):
    for j in range(784) :
        train_samples_minmax[i][j] = train_samples_minmax[i][j]/255
            
for i in range(len(test_samples)):
    for j in range(784) :
        test_samples_minmax[i][j] = test_samples_minmax[i][j]/255
    
means_by_classes_minmax, stdevs_by_classes_minmax = get_GaussianNBC(train_samples_minmax, train_labels)
    
pred_classes_minmax = predict(means_by_classes_minmax, stdevs_by_classes_minmax, test_samples_minmax)
    
acc_minmax = get_Acc(pred_classes_minmax, test_labels)
    
print('Before min-max normalization) Acc: %s' %acc)
print('After min-max normalization) Acc: %s' %acc_minmax)

  prob *= Gaussian_PDF(x, mean, stdev)
  prob *= Gaussian_PDF(x, mean, stdev)


Before min-max normalization) Acc: 10.0
After min-max normalization) Acc: 54.87


### P3. P1에서 만든 모델을 변경하여 온라인 학습을 수행할 수 있도록 하고, 기존 배치 학습 방법과 비교

In [9]:
def get_GaussianNBC_online(train_samples, train_labels):
    
    fashion_class_0_samples = []
    fashion_class_1_samples = []
    fashion_class_2_samples = []
    fashion_class_3_samples = []
    fashion_class_4_samples = []
    fashion_class_5_samples = []
    fashion_class_6_samples = []
    fashion_class_7_samples = []
    fashion_class_8_samples = []
    fashion_class_9_samples = []
    
    for k in range(len(train_samples)):
        sample = train_samples[k]
        label = train_labels[k]
        
        if label == 0:
            fashion_class_0_samples.append(sample)
        elif label == 1:
            fashion_class_1_samples.append(sample)
        elif label == 2:
            fashion_class_2_samples.append(sample)
        elif label == 3:
            fashion_class_3_samples.append(sample)
        elif label == 4:
            fashion_class_4_samples.append(sample)
        elif label == 5:
            fashion_class_5_samples.append(sample)
        elif label == 6:
            fashion_class_6_samples.append(sample)
        elif label == 7:
            fashion_class_7_samples.append(sample)
        elif label == 8:
            fashion_class_8_samples.append(sample)
        elif label == 9:
            fashion_class_9_samples.append(sample)
            
    samples_by_classes = [
        fashion_class_0_samples,
        fashion_class_1_samples,
        fashion_class_2_samples,
        fashion_class_3_samples,
        fashion_class_4_samples,
        fashion_class_5_samples,
        fashion_class_6_samples,
        fashion_class_7_samples,
        fashion_class_8_samples,
        fashion_class_9_samples
    ]
    
    numOf_classes = 10
    means_by_classes = []
    stdevs_by_classes = []
    
    for C in range(numOf_classes):
        means = []
        stdevs = []
        for features in zip(*samples_by_classes[C]):
            features = list(features)
            mean = features[0]
            var = 0
            for i in range(1, len(features)) :
                old_mean = mean;
                old_var = var;
                mean = old_mean + ((features[i]-old_mean)/(i+1))
                var = old_var + ((((features[i]-old_mean)*(features[i]-mean))-old_var)/(i+1))
                
            means.append(mean)
            stdevs.append(math.sqrt(var))
            
        means_by_classes.append(means)
        stdevs_by_classes.append(stdevs)
    
    return means_by_classes, stdevs_by_classes

In [10]:
means_by_classes_online, stdevs_by_classes_online = get_GaussianNBC_online(train_samples, train_labels)
    
pred_classes_online = predict(means_by_classes_online, stdevs_by_classes_online, test_samples)
    
acc_online = get_Acc(pred_classes_online, test_labels)
    
print('Online learning - before min-max normalization) Acc: %s' %acc_online)
     
means_by_classes_minmax_online, stdevs_by_classes_minmax_online = get_GaussianNBC_online(train_samples_minmax, train_labels)

pred_classes_minmax_online = predict(means_by_classes_minmax_online, stdevs_by_classes_minmax_online, test_samples_minmax)
    
acc_minmax_online = get_Acc(pred_classes_minmax_online, test_labels)

print('Online learning - after min-max normalization) Acc: %s' %acc_minmax_online)

Online learning - before min-max normalization) Acc: 54.87
Online learning - after min-max normalization) Acc: 54.87
