In [3]:
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

## 机器学习基础

### 最大似然估计与贝叶斯统计

In [10]:
# MAP 最大后验估计

class NativeBayes:
    """Gaussian naive Bayes classifier that predicts the maximum a posteriori class.

    For every class, each feature is summarised by the mean and variance of the
    training values of that feature within the class. A sample is scored per
    class as ``prior * product of per-feature Gaussian densities`` (features are
    treated as conditionally independent given the class), and the class with
    the highest score is returned.
    """

    # Added to the variance terms so that a zero variance never divides by zero.
    _EPS = 1e-4

    def __init__(self) -> None:
        # parameters[i][j] is {"mean": ..., "var": ...} for feature j within classes[i].
        self.parameters = []
        # Training labels, kept so the class priors can be computed at predict time.
        self.y = None
        # Unique training labels, in the order returned by np.unique.
        self.classes = None

    def fit(self, X, y):
        """Estimate the per-class mean and variance of every feature.

        Calling ``fit`` again replaces the previously learned statistics
        instead of mixing them with the new ones.

        Args:
            X: Array-like of shape (n_samples, n_features) holding numeric features.
            y: Array-like of length n_samples holding the class label of each row.

        Raises:
            ValueError: If ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.y = y
        self.classes = np.unique(y)
        # Rebuild from scratch: appending to the old list would leave stale
        # statistics from an earlier fit in front of the new ones.
        self.parameters = [
            [{"mean": col.mean(), "var": col.var()} for col in X[y == c].T]
            for c in self.classes
        ]

    def _calculate_prior(self, c):
        """Return the fraction of training labels equal to ``c``."""
        return np.mean(self.y == c)

    def _calculate_likelihood(self, mean, var, X):
        """Return the (eps-smoothed) Gaussian density of the scalar ``X``."""
        eps = self._EPS
        coeff = 1.0 / math.sqrt(2.0 * math.pi * var + eps)
        exponent = math.exp(-(math.pow(X - mean, 2) / (2 * var + eps)))
        return coeff * exponent

    def _calculate_log_likelihood(self, mean, var, X):
        """Return the natural log of ``_calculate_likelihood`` without underflow."""
        eps = self._EPS
        return (-0.5 * math.log(2.0 * math.pi * var + eps)
                - math.pow(X - mean, 2) / (2 * var + eps))

    def _calculate_probabilities(self, X):
        """Score one sample against every class.

        Args:
            X: Iterable with the feature values of a single sample.

        Returns:
            tuple: ``(label, probabilities)`` where ``label`` is the class with
            the highest posterior score and ``probabilities`` is the list of
            unnormalised ``prior * likelihood`` values, one per class.
        """
        probabilities = []
        log_posteriors = []
        for i, c in enumerate(self.classes):
            prior = self._calculate_prior(c)
            likelihood = 1.0
            log_posterior = math.log(prior) if prior > 0 else -math.inf
            for feature, params in zip(X, self.parameters[i]):
                # Independence assumption:
                # p(x1, x2, x3 | y) = p(x1 | y) * p(x2 | y) * p(x3 | y)
                likelihood *= self._calculate_likelihood(params["mean"], params["var"], feature)
                log_posterior += self._calculate_log_likelihood(params["mean"], params["var"], feature)
            probabilities.append(prior * likelihood)
            log_posteriors.append(log_posterior)

        # Decide in log space: the plain products can all underflow to 0.0, in
        # which case argmax over them would always pick the first class.
        return self.classes[np.argmax(log_posteriors)], probabilities

    def predict(self, X):
        """Return a list with the predicted class label of every row of ``X``."""
        return [self._calculate_probabilities(sample)[0] for sample in np.asarray(X)]

    def score(self, X, y):
        """Return the accuracy (fraction of correct predictions) on ``X`` / ``y``."""
        y_pred = np.asarray(self.predict(X))
        # Compare arrays element-wise; comparing two plain lists yields one bool.
        return np.mean(y_pred == np.asarray(y))

In [4]:
def create_data():
    """Build the feature matrix and label vector used in this notebook.

    Loads the iris dataset with ``load_iris``, puts the four measurements and
    the target into one table, and keeps only the first 100 rows
    (presumably the first two iris classes -- verify against the dataset order).

    Returns:
        tuple: ``(features, labels)`` -- every column but the last, and the last
        (label) column, of those 100 rows as NumPy arrays.
    """
    iris = load_iris()
    column_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
    table = pd.DataFrame(iris.data, columns=column_names)
    table['label'] = iris.target
    first_hundred = np.array(table.iloc[:100, :])
    features = first_hundred[:, :-1]
    labels = first_hundred[:, -1]
    return features, labels

X, y = create_data()
# Fixed random_state so the 70/30 train/test split (and everything derived from
# it) is identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
print(X_train[0], y_train[0])

[5.2 3.5 1.5 0.2] 0.0


In [17]:
# Train the hand-written classifier on the training split and print its
# accuracy on the held-out test split.
model = NativeBayes()
model.fit(X=X_train, y=y_train)
print(model.score(X=X_test, y=y_test))

1.0


In [15]:
# Baseline: scikit-learn's Gaussian naive Bayes (GaussianNB), trained and scored
# on the same split for comparison with the hand-written classifier above.
# GaussianNB is imported in the notebook's import cell.

skl_model = GaussianNB()
skl_model.fit(X_train, y_train)
print(skl_model.score(X_test, y_test))

1.0
