# 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 2. Define Functions

### 2.1 สร้าง Dictionary สำหรับนับจำนวนสมาชิก และ เก็บความน่าจะเป็น

In [2]:
def NB_create_DictStorage(Feature_Name, Feature_Value, All_Class):
    """Build one labelled, zero-filled table per feature.

    Each table is an object-dtype array with one header row and one header
    column: row 0 carries the class labels (from column 1 onward), column 0
    carries the feature's values (from row 1 onward), and every remaining
    cell, including the corner [0, 0], is 0.

    Parameters
    ----------
    Feature_Name : sequence
        Feature names; they become the keys of the returned dict.
    Feature_Value : sequence of sequences
        Feature_Value[d] lists the values of the d-th feature.
    All_Class : sequence
        Class labels.

    Returns
    -------
    dict
        Maps each feature name to an array of shape
        (len(Feature_Value[d]) + 1, len(All_Class) + 1).
    """
    class_total = len(All_Class)
    storage = {}
    for position, name in enumerate(Feature_Name):
        values = Feature_Value[position]
        value_total = len(values)
        table = np.zeros([value_total + 1, class_total + 1], dtype='object')
        # Header row: class labels, leaving the corner cell untouched.
        for col in range(1, class_total + 1):
            table[0, col] = All_Class[col - 1]
        # Header column: the values this feature can take.
        for row in range(1, value_total + 1):
            table[row, 0] = values[row - 1]
        storage[name] = table
    return storage

In [3]:
def NB_find_Feature_Value(X_Train, Feature_Name):
    """Collect the distinct values of each feature column.

    Parameters
    ----------
    X_Train : 2-D array
        Training samples; column d is read for the d-th feature.
    Feature_Name : sequence
        Feature names; only its length is used, to decide how many columns
        of X_Train to scan.

    Returns
    -------
    list of ndarray
        One np.unique(...) result per feature, in column order.
    """
    return [np.unique(X_Train[:, column]) for column in range(len(Feature_Name))]

### 2.2 นับจำนวนสมาชิกจาก Training Set เพื่อใส่ใน Count_Matrix

In [4]:
def NB_count_element(X_Train, Y_Train, Feature_Name, empty_Count):
    """Fill each feature table with (feature value, class) co-occurrence counts.

    For every feature d, cell [r, c] of empty_Count[Feature_Name[d]] receives
    the number of training rows whose d-th column equals the value in the
    table's header column (cell [r, 0]) and whose label equals the class in
    the table's header row (cell [0, c]).

    Parameters
    ----------
    X_Train : 2-D array
        Training samples; column d belongs to Feature_Name[d].
    Y_Train : 2-D array
        Training labels; only column 0 is read.
    Feature_Name : sequence
        Feature names, used as keys into empty_Count.
    empty_Count : dict
        Labelled tables to fill. They are modified in place.

    Returns
    -------
    dict
        The same empty_Count object, now holding the counts.
    """
    for column_index, name in enumerate(Feature_Name):
        table = empty_Count[name]
        row_total, col_total = table.shape
        for c in range(1, col_total):
            in_class = (Y_Train[:, 0] == table[0, c])
            for r in range(1, row_total):
                has_value = (X_Train[:, column_index] == table[r, 0])
                # Rows that match both the feature value and the class.
                table[r, c] = (has_value & in_class).sum()
    return empty_Count

### 2.3 คำนวณความน่าจะเป็นจาก Count_Matrix เพื่อใส่ใน Prob_Matrix

In [5]:
def NB_calculate_prob(empty_Prob, Count_Matrix, Count_All_Class, Feature_Name):
    """Turn per-feature count tables into per-class relative frequencies.

    For each feature, the body of the count table (everything except the
    header row and header column) is divided by Count_All_Class, which
    broadcasts along the class columns, and written into the body of the
    matching table in empty_Prob.

    Parameters
    ----------
    empty_Prob : dict
        Labelled tables to receive the results. They are modified in place.
    Count_Matrix : dict
        Labelled tables holding the counts; read only.
    Count_All_Class : array
        Number of training samples per class, in table column order.
    Feature_Name : sequence
        Feature names, used as keys into both dicts.

    Returns
    -------
    dict
        The same empty_Prob object, now holding the ratios.
    """
    for name in Feature_Name:
        counts = Count_Matrix[name][1:, 1:]
        empty_Prob[name][1:, 1:] = counts/Count_All_Class
    return empty_Prob

### 2.4 เรียนรู้

In [6]:
def NBDD_fit(X_Train, Y_Train, Feature_Name, Feature_Value, All_Class, Count_All_Class):
    """Fit the model: count co-occurrences, then convert them to ratios.

    Two independent sets of labelled tables are created with
    NB_create_DictStorage, so the counts and the probabilities never share
    storage. The first set is filled by NB_count_element, the second by
    NB_calculate_prob.

    Parameters
    ----------
    X_Train, Y_Train : 2-D arrays
        Training samples and their labels.
    Feature_Name : sequence
        Feature names (keys of the returned dicts).
    Feature_Value : sequence of sequences
        Values of each feature, in Feature_Name order.
    All_Class : sequence
        Class labels.
    Count_All_Class : array
        Number of training samples per class, in All_Class order.

    Returns
    -------
    tuple of (dict, dict)
        (Count_Matrix, Prob_Matrix), each keyed by feature name.
    """
    def blank_tables():
        # A fresh set of labelled tables on every call.
        return NB_create_DictStorage(Feature_Name, Feature_Value, All_Class)

    Count_Matrix = NB_count_element(X_Train, Y_Train, Feature_Name, blank_tables())
    Prob_Matrix = NB_calculate_prob(blank_tables(), Count_Matrix, Count_All_Class, Feature_Name)
    return Count_Matrix, Prob_Matrix

### 2.5 พยากรณ์

In [7]:
def NBDD_predict(X_Test, Prob_Matrix, Feature_Name, Feature_Value, All_Class, Count_All_Class, top = 1):
    """Rank the classes for every test sample with discrete Naive Bayes.

    For each sample the score of a class is its prior (its share of
    Count_All_Class) multiplied by the stored per-feature ratios of the
    sample's feature values for that class. Classes are ordered from the
    highest to the lowest score.

    Parameters
    ----------
    X_Test : 2-D array or sequence of rows
        Samples to classify; element d of a row belongs to Feature_Name[d].
    Prob_Matrix : dict
        Labelled tables keyed by feature name. Row r + 1 of a table must
        correspond to Feature_Value[d][r], and columns 1.. to All_Class.
    Feature_Name : sequence
        Feature names, used as keys into Prob_Matrix.
    Feature_Value : sequence of sequences
        Known values of each feature, in table row order.
    All_Class : sequence
        Class labels, in table column order.
    Count_All_Class : sequence of numbers
        Number of training samples per class, in All_Class order.
    top : int, optional
        How many of the best-ranked classes to keep per sample (default 1).

    Returns
    -------
    ndarray
        One row per sample holding its `top` best classes, best first.
        An empty X_Test yields an array with zero rows.

    Raises
    ------
    IndexError
        If a sample contains a feature value that is absent from
        Feature_Value, i.e. one the model has no statistics for.
    """
    # Accept plain lists as well as arrays for the class metadata.
    All_Class = np.asarray(All_Class)
    Count_All_Class = np.asarray(Count_All_Class)
    prob_class = Count_All_Class/(Count_All_Class.sum())
    Yhat = []
    D = len(Feature_Name)
    C = len(All_Class)
    for x_test in X_Test:
        prob_fgc = np.zeros([D, C])
        for d in range(D):
            fn = Feature_Name[d]
            matches = np.argwhere(np.asarray(Feature_Value[d]) == x_test[d]).ravel()
            if matches.size == 0:
                # Same exception type as the bare [0] lookup would raise,
                # but naming the offending feature and value.
                raise IndexError(
                    f"value {x_test[d]!r} of feature {fn!r} was not seen when "
                    f"the model was fitted"
                )
            r = matches[0]
            # +1 skips the header row of the table.
            prob_fgc[d, :] = Prob_Matrix[fn][r+1, 1:]
        prob_afgc = prob_fgc.prod(axis=0)
        prob_nb = prob_class*prob_afgc
        sorted_class = All_Class[prob_nb.argsort()[::-1]]
        Yhat.append(sorted_class)
    if not Yhat:
        # np.array([]) is 1-D and cannot be sliced with [:, :top]; build the
        # empty 2-D result explicitly so the same slicing still applies.
        return np.empty((0, C), dtype=All_Class.dtype)[:, :top]
    return np.array(Yhat)[:, :top]

In [8]:
def find_error_classification(Y, Yhat):
    """Return the percentage of samples whose prediction differs from the label.

    Parameters
    ----------
    Y : array-like
        True labels; its first dimension is the number of samples N.
    Yhat : array-like
        Predicted labels.

    Returns
    -------
    float
        100 * (number of mismatches) / N.

    Notes
    -----
    When Y and Yhat hold the same number of labels they are compared
    position by position after flattening. Without this, comparing an
    (N, 1) column against an (N,) vector would broadcast to an N x N
    matrix and yield a meaningless count. When the sizes differ, the
    arrays are compared with ordinary NumPy broadcasting.
    """
    Y = np.asarray(Y)
    Yhat = np.asarray(Yhat)
    N = Y.shape[0]
    if Y.size == Yhat.size:
        mismatches = (Y.reshape(-1) != Yhat.reshape(-1)).sum()
    else:
        mismatches = (Y != Yhat).sum()
    error = (100/N)*mismatches
    return error

# 3. Read Data & Prepare Data

In [9]:
# Load the labelled dataset; the file name is resolved relative to the
# notebook's working directory.
Data = pd.read_excel('Men_or_Women.xlsx')

In [10]:
Data

Unnamed: 0,hair_style,sound_style,has_earring,Class
0,short,low-pitched,yes,men
1,long,high-pitched,yes,women
2,long,low-pitched,yes,women
3,long,low-pitched,no,men
4,long,high-pitched,yes,women
...,...,...,...,...
2032,long,high-pitched,yes,women
2033,long,high-pitched,yes,women
2034,long,low-pitched,yes,women
2035,short,low-pitched,no,men


In [11]:
# Work on the underlying NumPy array instead of the DataFrame.
DataMatrix = Data.values

In [12]:
DataMatrix.shape

(2037, 4)

In [13]:
# Number of feature columns: every column except the last one, which is
# split off as the label column in the next cell.
D = DataMatrix.shape[1] - 1

In [14]:
# First D columns are the features; the remaining column is the label.
# Slicing with D: (rather than indexing with D) keeps Y two-dimensional.
X = DataMatrix[:, :D]
Y = DataMatrix[:, D:]

In [15]:
# Row bounds of the training slice. The negative end index leaves the last
# 150 rows out of training; they become the test set below.
start_train = 0
end_train = -150
# Bounds for an optional validation split (currently disabled).
# end_valid = -50
# end_test = -1

In [16]:
# Training set: rows from start_train up to (but excluding) end_train.
X_Train = X[start_train:end_train, :]
Y_Train = Y[start_train:end_train, :]

# Optional validation split (currently disabled).
# X_Valid = X[end_train:end_valid, :]
# Y_Valid = Y[end_train:end_valid, :]

# Test set: every row from end_train to the end of the data.
X_Test = X[end_train:, :]
Y_Test = Y[end_train:, :]

In [25]:
X_Train[0]

array(['short', 'low-pitched', 'yes'], dtype=object)

In [26]:
X_Test[0]

array(['long', 'high-pitched', 'yes'], dtype=object)

# 4. Create Model

In [18]:
# Feature names are all column headers except the last (label) column.
Feature_Name = np.array(Data.columns[:-1])
# Distinct values of each feature, taken from the training rows only.
Feature_Value = NB_find_Feature_Value(X_Train, Feature_Name)
# Class labels and how many training rows carry each of them.
All_Class, Count_All_Class = np.unique(Y_Train[:, :], return_counts = True)

In [27]:
print(Feature_Name)

['hair_style' 'sound_style' 'has_earring']


In [28]:
print(Feature_Value)

[array(['long', 'short'], dtype=object), array(['high-pitched', 'low-pitched'], dtype=object), array(['no', 'yes'], dtype=object)]


In [29]:
print(All_Class)

['men' 'women']


In [30]:
print(Count_All_Class)

[897 990]


In [19]:
# Fit on the training rows: per-feature count tables and the ratio tables
# derived from them.
Count_Matrix, Prob_Matrix = NBDD_fit(X_Train, Y_Train, Feature_Name, Feature_Value, All_Class, Count_All_Class)

# 5. Make Predictions

In [20]:
# Keep only the single best-ranked class for each test row.
Yhat_Test = NBDD_predict(X_Test, Prob_Matrix, Feature_Name, Feature_Value, All_Class, Count_All_Class, top=1)

In [31]:
Y_Test

array([['women'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['women'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['women'],
       ['women'],
       ['women'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['women'],
       ['men'],
       ['women'],
       ['women'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
       [

In [23]:
Yhat_Test

array([['women'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['women'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['women'],
       ['women'],
       ['women'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['women'],
       ['women'],
       ['women'],
       ['women'],
       ['women'],
       ['men'],
       ['men'],
       ['women'],
       ['men'],
       ['men'],
       ['men'],
      

In [21]:
# Misclassification rate on the test rows, in percent.
error = find_error_classification(Y_Test, Yhat_Test)

In [22]:
error

2.6666666666666665