In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import clone

In [2]:
class NaiveBayes(object):
    
    def __init__(self):
        self.probability_dict = {}
        self.features_dict = {}

    def __total_count(self, cross_table):
        total_count = dict()
        for column in cross_table.columns:
            for row in cross_table.iterrows():
                total_count[row[0]] = 0
    
        for column in cross_table.columns:
            for row in cross_table.iterrows():
                total_count[row[0]] += cross_table[column][row[0]]
                
        return total_count

    def __init_probability_dict(self, y):
        df = pd.DataFrame(y)
        
        for label in set(y):
            self.probability_dict[label] = {}
            self.probability_dict[label]["P"] = df.value_counts()[label] / df.size
    
    def fit(self, X, y):
        self.__init_probability_dict(y)
        
        dfx = pd.DataFrame(X)

        for idx, feature in enumerate(dfx):
            self.features_dict[idx] = feature
            join_data = pd.DataFrame({"variable":dfx[feature], "label":y})
            cross_table = pd.crosstab(join_data.variable, join_data.label)

            total_count = self.__total_count(cross_table)

            #print(feature)
            
            for column in cross_table.columns:
                for row in cross_table.iterrows():
                    self.probability_dict[column][f"{feature}_{row[0]}"] = cross_table[column][row[0]] / total_count[row[0]]

        return self.probability_dict
        
    
    def predict(self, X):
        predict_result = []
        
        for item in X:
            max_probability = 0
            predict_key = "";
            for key, value in self.probability_dict.items():
                probability = self.probability_dict[key]["P"]
                for idx, feature in enumerate(item):
                    probability *= value[f"{self.features_dict[idx]}_{feature}"]

                if max_probability < probability:
                    max_probability = probability
                    predict_key = key

            predict_result.append(predict_key)

        return predict_result

    def get_params(self, deep=False):
        return {}
        

In [3]:
def cross_validation(df, clf):
    clone_classifier = clone(clf)
    df_train, df_test = train_test_split(data, test_size=0.3, random_state=77)

    y_train = df_train["class"].to_numpy()
    X_train = df_train.drop("class", axis=1).to_numpy()
    
    y_test = df_test["class"].to_numpy()
    X_test = df_test.drop("class", axis=1).to_numpy()
    clone_classifier.fit(X_train, y_train)
    labels_predict = clone_classifier.predict(X_test)
    n_correct = sum(labels_predict == y_test)
    return n_correct / len(labels_predict)

In [4]:
data = pd.read_csv('data/mushrooms.csv')
y_play_badminton = data['class']

x_play_badminton = data.drop("class", axis=1)

clf_naive_bayes = NaiveBayes()
results = clf_naive_bayes.fit(x_play_badminton, y_play_badminton)
print(results)

cross_val = cross_validation(data, clf_naive_bayes)
print(f"Accuracy = {cross_val}")

{'Yes': {'P': 0.3333333333333333, 'Outlook_Overcast': 0.5, 'Outlook_Rain': 0.0, 'Outlook_Sunny': 0.5, 'Temperature_Cool': 0.3333333333333333, 'Temperature_Hot': 0.3333333333333333, 'Temperature_Mild': 0.3333333333333333, 'Humidity_High': 0.3333333333333333, 'Humidity_Normal': 0.3333333333333333, 'Wind_Strong': 0.0, 'Wind_Weak': 0.6666666666666666}, 'No': {'P': 0.6666666666666666, 'Outlook_Overcast': 0.5, 'Outlook_Rain': 1.0, 'Outlook_Sunny': 0.5, 'Temperature_Cool': 0.6666666666666666, 'Temperature_Hot': 0.6666666666666666, 'Temperature_Mild': 0.6666666666666666, 'Humidity_High': 0.6666666666666666, 'Humidity_Normal': 0.6666666666666666, 'Wind_Strong': 1.0, 'Wind_Weak': 0.3333333333333333}}
Accuracy = 0.7272727272727273
