In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
class GNB:
    """Minimal Gaussian naive Bayes classifier for a single test sample.

    ``fit`` estimates, for every class, the mean and variance of each feature
    (via ``DataFrame.groupby``).  ``predict`` evaluates the Gaussian density of
    each feature value under each class, multiplies the densities together
    with the class prior and returns the class with the largest posterior.

    The class prior is always uniform (``1 / n_class``): ``fit`` overwrites any
    ``prior`` passed to the constructor and ignores class frequencies.
    """

    def __init__(self, prior=None, n_class=None, mean=None, variance=None, classes=None):
        """Store the (optional) initial state; every field is replaced by ``fit``."""
        self.prior = prior
        self.n_class = n_class
        self.mean = mean
        self.variance = variance
        self.classes = classes

    def fit(self, x, y):
        """Estimate per-class feature means and variances.

        Parameters
        ----------
        x : pandas.DataFrame
            Training features, one column per feature.
        y : array-like
            Class label for every row of ``x``; used as the group key.

        Returns
        -------
        GNB
            ``self``, so calls can be chained.
        """
        self.x = x
        self.y = y
        grouped = x.groupby(by=y)
        self.mean = grouped.mean()
        # pandas' groupby ``var`` uses its default ddof (sample variance).
        self.variance = grouped.var()
        # np.unique returns the labels sorted, matching groupby's sorted keys,
        # so row k of mean/variance belongs to classes[k].
        self.classes = np.unique(y)
        self.n_class = len(self.classes)
        self.prior = 1/self.n_class
        return self

    def mean_var(self):
        """Return the fitted statistics as an ``(n_class * n_features, 2)`` array.

        Each row is ``[mean, variance]``; rows are ordered class by class and,
        within a class, feature by feature.

        The result is deliberately NOT stored on ``self.mean_var``: doing so
        would replace this method with a plain list on the instance and make
        every later call (e.g. a second ``predict``) fail with
        ``TypeError: 'list' object is not callable``.
        """
        means = np.asarray(self.mean)
        variances = np.asarray(self.variance)
        # ravel() walks the 2-D tables row-major: class first, then feature.
        return np.column_stack([means.ravel(), variances.ravel()])

    def split_by_class(self):
        """Split the ``mean_var`` table into one ``(n_features, 2)`` array per class."""
        return np.vsplit(self.mean_var(), self.n_class)

    # gaussian naive bayes probability equation
    def gnb(self, x_value, x_mean, x_var):
        """
        GNB = Gaussian naive bayes
        x_value = x from test sample
        x_mean = mean of the feature for the class
        x_var = variance of the feature for the class

        Returns the Gaussian density of ``x_value`` under N(x_mean, x_var).
        The three arguments are also recorded on the instance (last call wins).
        """
        self.x_value = x_value
        self.x_mean = x_mean
        self.x_var = x_var

        # first part of the equation: normalisation constant
        eq_1 = 1/(np.sqrt(2 * np.pi * x_var))

        # second part of the equation: exp(-(x - mean)^2 / (2 * var))
        denominator = 2 * x_var
        numerator = (x_value - x_mean) ** 2
        expo = np.exp(-(numerator/denominator))

        probability = eq_1 * expo

        return probability

    def predict(self, x_test):
        """Predict the class of one sample.

        Parameters
        ----------
        x_test : sequence of float
            One value per feature, in the column order used for ``fit``.
            Lists, NumPy arrays and pandas Series are read positionally.

        Returns
        -------
        The element of ``self.classes`` with the highest posterior.

        Raises
        ------
        ValueError
            If ``x_test`` does not contain exactly one value per feature.
        """
        # Positional, flat view of the sample so label-indexed inputs
        # (e.g. a pandas Series) are not looked up by label.
        sample = np.asarray(x_test, dtype=float).ravel()

        summary_data_by_class = self.split_by_class()
        n_features = len(summary_data_by_class[0])
        if sample.size != n_features:
            raise ValueError(
                "x_test has %d value(s) but the model was fitted on %d feature(s)"
                % (sample.size, n_features)
            )

        # prior * product of the per-feature densities, one entry per class
        final_probabilities = []
        for class_summary in summary_data_by_class:
            likelihoods = [
                self.gnb(value, feature_mean, feature_var)
                for value, (feature_mean, feature_var) in zip(sample, class_summary)
            ]
            final_probabilities.append(np.prod(likelihoods) * self.prior)

        # normalise so the class posteriors sum to one
        evidence = np.sum(final_probabilities)
        prob_of_class = [[p / evidence] for p in final_probabilities]
        # Kept from the original: the posteriors are printed on every call
        # (the notebook's recorded output shows this line).
        print(prob_of_class)

        # argmax returns the first maximum, i.e. ties go to the earlier class
        prob_index = int(np.argmax([p[0] for p in prob_of_class]))
        prediction = self.classes[prob_index]

        return prediction

In [42]:
def main():
    """Fit the GNB classifier on the Iris data and classify one hand-picked sample.

    Reads ``Iris.csv`` from the working directory, holds out 20% of the rows
    (``random_state=42``), fits on the remaining rows and prints the predicted
    species for a single measurement vector.
    """
    feature_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]

    iris_data = pd.read_csv("Iris.csv")
    X = iris_data[feature_columns]
    # flat 1-D array of species labels
    y = iris_data["Species"].to_numpy()

    # Only the training split is used below; the held-out part is not evaluated.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    classifier = GNB()
    classifier.fit(X_train, y_train)

    # Other samples to try:
    #   [6.1, 2.8, 4.7, 1.2]  -> iris-versicolor
    #   [7.2, 3.6, 6.1, 2.5]  -> iris-virginica
    sample = [5.7, 3.8, 1.7, 0.3]  # iris-setosa
    result = classifier.predict(sample)
    print(result)
    
# Run the demo: prints the class posteriors followed by the predicted species.
main()

[[0.9999999999998984], [1.0155078797792485e-13], [2.158248447741561e-20]]
Iris-setosa
