$$y=\arg \max_{c_k} P(Y=c_k|X) $$

$$P(Y|X)=\frac{P(X,Y)}{P(X)}=\frac{P(X|Y)P(Y)}{P(X)}$$

对于每一个 $c_k$，$P(X)$ 都一样，所以分母 $P(X)$ 可以去掉。

朴素贝叶斯的基本假设是条件独立性，即:

$$\begin{align}
P(X=x|Y=c_k)&=P(X^{(1)}=x^{(1)},X^{(2)}=x^{(2)},\cdots,X^{(n)}=x^{(n)}|Y=c_k)\\
&=\prod_{j=1}^n P(X^{(j)}=x^{(j)}|Y=c_k)\\
\end{align}$$

特征的概率假设为高斯分布，其中 $\mu_{j,c_k}$ 和 $\sigma_{j,c_k}$ 分别是类别 $c_k$ 中第 $j$ 个特征的均值和标准差，则

$$P(X^{(j)}=x^{(j)}|Y=c_k) = \frac{1}{\sqrt{2\pi}\,\sigma_{j,c_k}} \exp \left(- \frac{(x^{(j)} - \mu_{j,c_k})^2}{2\sigma_{j,c_k}^2}\right)$$

最终

$$\begin{align}
y&=\arg \max_{c_k} P(Y=c_k|X)\\
&=\arg \max_{c_k} P(Y=c_k) \prod_{j=1}^n P(X^{(j)}=x^{(j)}|Y=c_k)\\
&=\arg \max_{c_k} P(Y=c_k) \prod_{j=1}^n \frac{1}{\sqrt{2\pi}\,\sigma_{j,c_k}} \exp \left(- \frac{(x^{(j)} - \mu_{j,c_k})^2}{2\sigma_{j,c_k}^2}\right)\\
\end{align}$$

In [1]:
import math
from collections import defaultdict
from collections import namedtuple

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Parameters of one univariate Gaussian: the mean and standard deviation of a
# single feature within a single class.
GD = namedtuple("GD", ["mean", "std"])


class GussianNaiveBayes:
    """Gaussian naive Bayes classifier.

    Every feature is modelled, independently per class, as a univariate
    Gaussian.  Prediction picks the class maximising
    ``P(Y=c) * prod_j P(X_j = x_j | Y=c)``.

    Attributes:
        model: ``{label: [GD(mean, std), ...]}`` with one ``GD`` per feature,
            in feature order.
        priors: ``{label: fraction of training samples carrying that label}``.
    """

    def __init__(self):
        self.model = {}
        self.priors = {}

    def fit(self, X, y):
        """Estimate per-class feature Gaussians and class priors.

        Any parameters from a previous call are discarded, so labels that are
        absent from the new data do not linger in the model.

        Args:
            X: Iterable of feature vectors (one sequence of numbers per sample).
            y: Iterable of class labels, paired with ``X`` by position.
        """
        # Group samples by label: {label: [[feature1, feature2, ...], ...]}
        grouped = defaultdict(list)
        for features, label in zip(X, y):
            grouped[label].append(features)

        # Count the samples that were actually grouped (zip stops at the
        # shorter of X and y), so the priors always sum to 1.
        total = sum(len(rows) for rows in grouped.values())

        model = {}
        priors = {}
        for label, rows in grouped.items():
            # Transposing turns the list of samples into one row per feature.
            model[label] = [
                GD(self.mean(column), self.std(column))
                for column in np.transpose(rows)
            ]
            priors[label] = len(rows) / float(total)

        self.model = model
        self.priors = priors

    def predict(self, x):
        """Return the label with the largest prior-weighted likelihood for ``x``.

        Labels without a stored prior (for example when ``model`` was filled
        in by hand) are weighted by 1.0, i.e. ranked by likelihood alone.

        Raises:
            IndexError: If the model contains no classes.
        """
        posteriors = {
            label: self.priors.get(label, 1.0) * likelihood
            for label, likelihood in self.calculate_probabilities(x).items()
        }
        ranked = sorted(posteriors.items(), key=lambda item: item[-1])
        return ranked[-1][0]

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted as ``y_test``.

        Raises:
            ZeroDivisionError: If ``X_test`` is empty.
        """
        right = sum(1 for x, yi in zip(X_test, y_test) if self.predict(x) == yi)
        return right / float(len(X_test))

    def calculate_probabilities(self, x):
        """Return ``{label: prod_j P(x_j | label)}`` for the feature vector ``x``.

        The values are class-conditional likelihoods; the class prior is not
        included here.
        """
        probabilities = {}
        for label, gds in self.model.items():
            likelihood = 1
            for i, gd in enumerate(gds):
                likelihood *= self.gaussian_probability(x[i], gd.mean, gd.std)
            probabilities[label] = likelihood
        return probabilities

    def gaussian_probability(self, xi, mean, std):
        """Return the Gaussian density N(mean, std**2) evaluated at ``xi``.

        Raises:
            ZeroDivisionError: If ``std`` is zero.
        """
        coefficient = 1 / (math.sqrt(2 * np.pi) * std)
        exponent = -((xi - mean) ** 2) / (2 * (std ** 2))
        return coefficient * math.exp(exponent)

    def mean(self, x):
        """Return the arithmetic mean of the values in ``x``."""
        return sum(x) / float(len(x))

    def std(self, x):
        """Return the population standard deviation (divisor ``len(x)``) of ``x``."""
        # Compute the mean once instead of once per element.
        center = self.mean(x)
        return math.sqrt(sum(pow(xi - center, 2) for xi in x) / float(len(x)))


def create_data():
    """Load iris and return the (data, target) rows whose target is below 2."""
    dataset = datasets.load_iris()
    keep = dataset.target < 2
    return dataset.data[keep], dataset.target[keep]


# Demo: build the dataset and split it into train/test parts; random_state is
# fixed so the split is the same on every run.
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1)

# Fit the classifier on the training part, then print the learned per-class
# GD(mean, std) parameters and the score on the held-out part.
model = GussianNaiveBayes()
model.fit(X_train, y_train)
print(model.model)
print(f'Score:{model.score(X_test, y_test)}')

{0: [GD(mean=4.955, std=0.35422450508117026), GD(mean=3.370000000000001, std=0.36891733491393436), GD(mean=1.4575, std=0.1744813743641424), GD(mean=0.24500000000000002, std=0.10712142642814276)], 1: [GD(mean=5.931428571428571, std=0.5181127448022588), GD(mean=2.7742857142857145, std=0.29889592755707467), GD(mean=4.282857142857144, std=0.47718562682564064), GD(mean=1.3485714285714285, std=0.18726822409341648)]}
Score:1.0
