In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import clone

In [2]:
class NaiveBayes(object):
    
    def __init__(self):
        self.probability_dict = {}
        self.features_dict = {}

    def __total_count(self, cross_table):
        total_count = dict()
        for column in cross_table.columns:
            for row in cross_table.iterrows():
                total_count[row[0]] = 0
    
        for column in cross_table.columns:
            for row in cross_table.iterrows():
                total_count[row[0]] += cross_table[column][row[0]]
                
        return total_count

    def __init_probability_dict(self, y):
        df = pd.DataFrame(y)
        
        for label in set(y):
            self.probability_dict[label] = {}
            self.probability_dict[label]["P"] = df.value_counts()[label] / df.size
    
    def fit(self, X, y):
        self.__init_probability_dict(y)
        
        dfx = pd.DataFrame(X)

        for idx, feature in enumerate(dfx):
            self.features_dict[idx] = feature
            join_data = pd.DataFrame({"variable":dfx[feature], "label":y})
            cross_table = pd.crosstab(join_data.variable, join_data.label)

            total_count = self.__total_count(cross_table)

            #print(feature)
            
            for column in cross_table.columns:
                for row in cross_table.iterrows():
                    self.probability_dict[column][f"{feature}_{row[0]}"] = cross_table[column][row[0]] / total_count[row[0]]

        return self.probability_dict
        
    
    def predict(self, X):
        predict_result = []
        
        for item in X:
            max_probability = 0
            predict_key = "";
            for key, value in self.probability_dict.items():
                probability = self.probability_dict[key]["P"]
                for idx, feature in enumerate(item):
                    probability *= value[f"{self.features_dict[idx]}_{feature}"]

                if max_probability < probability:
                    max_probability = probability
                    predict_key = key

            predict_result.append(predict_key)

        return predict_result

    def get_params(self, deep=False):
        return {}
        

In [3]:
def cross_validation(df, clf):
    clone_classifier = clone(clf)
    df_train, df_test = train_test_split(data, test_size=0.3, random_state=77)

    y_train = df_train["class"].to_numpy()
    X_train = df_train.drop("class", axis=1).to_numpy()
    
    y_test = df_test["class"].to_numpy()
    X_test = df_test.drop("class", axis=1).to_numpy()
    clone_classifier.fit(X_train, y_train)
    labels_predict = clone_classifier.predict(X_test)
    n_correct = sum(labels_predict == y_test)
    return n_correct / len(labels_predict)

In [4]:
data = pd.read_csv('data/mushrooms.csv')
y_play_badminton = data['class']

x_play_badminton = data.drop("class", axis=1)

clf_naive_bayes = NaiveBayes()
results = clf_naive_bayes.fit(x_play_badminton, y_play_badminton)
print(results)

cross_val = cross_validation(data, clf_naive_bayes)
print(f"Accuracy = {cross_val}")

{'p': {'P': 0.48202855736090594, 'cap-shape_b': 0.10619469026548672, 'cap-shape_c': 1.0, 'cap-shape_f': 0.4936548223350254, 'cap-shape_k': 0.7246376811594203, 'cap-shape_s': 0.0, 'cap-shape_x': 0.4671772428884026, 'cap-surface_f': 0.3275862068965517, 'cap-surface_g': 1.0, 'cap-surface_s': 0.5524256651017214, 'cap-surface_y': 0.5363748458692972, 'cap-color_b': 0.7142857142857143, 'cap-color_c': 0.2727272727272727, 'cap-color_e': 0.584, 'cap-color_g': 0.4391304347826087, 'cap-color_n': 0.44658493870402804, 'cap-color_p': 0.6111111111111112, 'cap-color_r': 0.0, 'cap-color_u': 0.0, 'cap-color_w': 0.3076923076923077, 'cap-color_y': 0.6268656716417911, 'bruises_f': 0.6933445661331087, 'bruises_t': 0.1848341232227488, 'odor_a': 0.0, 'odor_c': 1.0, 'odor_f': 1.0, 'odor_l': 0.0, 'odor_m': 1.0, 'odor_n': 0.034013605442176874, 'odor_p': 1.0, 'odor_s': 1.0, 'odor_y': 1.0, 'gill-attachment_a': 0.08571428571428572, 'gill-attachment_f': 0.4925448572150619, 'gill-spacing_c': 0.5584263065179096, 'gill-