In [1]:
import numpy as np

class NaiveBayesClassifier:
    """Bernoulli naive Bayes classifier for binary (0/1) feature vectors.

    Attributes (all ``None`` until :meth:`fit` has been called):
        classes: Sorted array of the distinct labels seen during training.
        class_priors: ``class_priors[k]`` is the fraction of training samples
            labelled ``classes[k]``.
        cond_probs: ``cond_probs[k, f]`` is the mean of feature ``f`` over the
            training samples labelled ``classes[k]``; for 0/1 features this
            is the estimate of P(x_f = 1 | classes[k]).
    """

    def __init__(self):
        self.classes = None
        self.class_priors = None
        self.cond_probs = None

    def fit(self, X, y):
        """Estimate class priors and per-feature conditional probabilities.

        Args:
            X: Array-like of shape (num_samples, num_features). Features are
                assumed to be binary (0 or 1); no validation is performed.
            y: Array-like of shape (num_samples,) holding one label per row
                of ``X``. Labels may be any values ``np.unique`` can sort;
                they do not have to be the integers 0..K-1.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Work from the labels actually present instead of assuming they are
        # 0..K-1; row k of every fitted array corresponds to self.classes[k].
        self.classes = np.unique(y)
        num_classes = self.classes.shape[0]
        num_features = X.shape[1]

        self.class_priors = np.zeros(num_classes)
        self.cond_probs = np.zeros((num_classes, num_features))
        for k, label in enumerate(self.classes):
            members = y == label
            # Class prior: share of the training set carrying this label.
            self.class_priors[k] = np.mean(members)
            # Conditional probability: per-feature mean within the class.
            self.cond_probs[k] = X[members].mean(axis=0)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        The unnormalised posterior of each class is evaluated in log space,
        so a long product of probabilities cannot underflow to zero.

        Args:
            X: Array-like of shape (num_samples, num_features). An entry
                equal to 1 contributes P(x_f = 1 | class), an entry equal to
                0 contributes 1 - P(x_f = 1 | class); any other value is
                ignored.

        Returns:
            Array of shape (num_samples,) with the predicted labels, taken
            from ``self.classes``. Ties go to the class with the lowest index.
        """
        X = np.asarray(X)
        num_classes = self.class_priors.shape[0]

        is_one = X == 1
        is_zero = X == 0

        # log(0) = -inf is the correct score for an impossible event, so the
        # divide-by-zero warning is silenced rather than treated as an error.
        with np.errstate(divide="ignore"):
            log_prior = np.log(self.class_priors)
            log_p_one = np.log(self.cond_probs)
            log_p_zero = np.log(1 - self.cond_probs)

        log_posterior = np.empty((X.shape[0], num_classes))
        for k in range(num_classes):
            # np.where (not a matrix product) keeps unselected -inf terms out
            # of the sum; 0 * -inf would otherwise produce NaN.
            log_posterior[:, k] = (
                np.where(is_one, log_p_one[k], 0.0).sum(axis=1)
                + np.where(is_zero, log_p_zero[k], 0.0).sum(axis=1)
                + log_prior[k]
            )

        best = np.argmax(log_posterior, axis=1)
        if self.classes is None:
            # Parameters were assigned by hand without fit(): fall back to
            # returning class indices as floats.
            return best.astype(float)
        return self.classes[best]

In [2]:
# Generate synthetic data
SEED = 42  # fixed so that Restart & Run All reproduces the same data and accuracy
rng = np.random.default_rng(SEED)
num_samples = 1000

# Ten uniformly random 0/1 features per sample
features = rng.integers(0, 2, size=(num_samples, 10))

# A sample is labelled 1 when at least three of its first five features are 1
# and at most two of its last five features are 1; otherwise it is labelled 0.
first_half_ones = features[:, :5].sum(axis=1)
second_half_ones = features[:, 5:].sum(axis=1)
labels = ((first_half_ones >= 3) & (second_half_ones <= 2)).astype(float)

# 80/20 train/test split (rows are already in random order)
split = int(0.8 * num_samples)
X_train, y_train = features[:split], labels[:split]
X_test, y_test = features[split:], labels[split:]

# Create the classifier instance
clf = NaiveBayesClassifier()

# Train the model
clf.fit(X_train, y_train)

# Apply the model to the test data
y_pred = clf.predict(X_test)

# Compute accuracy: fraction of test samples predicted correctly
accuracy = np.mean(y_pred == y_test)
print("Accuracy:", accuracy)

Accuracy: 0.88


In [3]:
# \mu_{1i} = p(x_i=1|C_1), where C_1 is the class with label 1 (row 1 of cond_probs)
# Given the labelling rule used to build the data, the first five features
# should have a high probability of being 1 in class C_1,
# and the last five features should have a high probability of being 0.
clf.cond_probs[1]

array([0.67171717, 0.7020202 , 0.66161616, 0.67171717, 0.69191919,
       0.35353535, 0.24747475, 0.31818182, 0.33838384, 0.25757576])

In [4]:
# \mu_{2i} = p(x_i=1|C_2), where C_2 is the class with label 0 (row 0 of cond_probs)
clf.cond_probs[0]

array([0.44352159, 0.43189369, 0.43023256, 0.44019934, 0.41694352,
       0.56312292, 0.55149502, 0.57475083, 0.57807309, 0.56478405])

In [5]:
# Class priors in index order: [P(C_2), P(C_1)], i.e. [P(label 0), P(label 1)]
clf.class_priors

array([0.7525, 0.2475])