In [95]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import math

In [96]:
def create_data():
    """Load the iris dataset and return its features and labels.

    As a side effect, prints the feature table (one column per entry in
    ``iris.feature_names``) so the notebook shows a preview of the data.

    Returns:
        tuple: ``(iris.data, iris.target)`` exactly as exposed by the object
        returned from ``load_iris()``.
    """
    iris = load_iris()  # iris flower dataset
    # The frame is built only for the printed preview; the returned values
    # come straight from `iris`, so no label column or column renaming is
    # needed on the frame.
    print(pd.DataFrame(iris.data, columns=iris.feature_names))
    return iris.data, iris.target

In [97]:
# Load the features/labels and hold out 30% of the samples for evaluation.
X, y = create_data()
# A fixed random_state makes the shuffle (and therefore every result computed
# from this split) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[150 rows x 4 columns]


In [98]:
class NaiveBayes:
    """Gaussian naive Bayes classifier for continuous features.

    ``fit`` records, for every class label, the (mean, standard deviation) of
    each feature column together with the label's share of the training
    samples. ``predict`` returns the label whose prior multiplied by the
    product of per-feature Gaussian densities is largest.
    """

    # Smallest standard deviation used as a divisor in the density formula.
    # A feature that is constant within a class has a standard deviation of 0
    # (or a rounding-noise value close to it), which would otherwise raise
    # ZeroDivisionError.
    MIN_STDEV = 1e-9

    def __init__(self):
        self.model = None   # {label: [(mean, stdev), ...]} once fitted
        self.priors = None  # {label: fraction of training samples} once fitted

    @staticmethod
    def mean(X):
        """Return the arithmetic mean of the values in ``X``."""
        return sum(X) / float(len(X))

    def stdev(self, X):
        """Return the population standard deviation of ``X`` (divides by ``len(X)``)."""
        avg = self.mean(X)
        return math.sqrt(sum(pow(x - avg, 2) for x in X) / float(len(X)))

    def gaussian_probability(self, x, mean, stdev):
        """Return the normal probability density with the given mean/stdev at ``x``.

        A ``stdev`` below ``MIN_STDEV`` is raised to ``MIN_STDEV`` so that a
        zero standard deviation yields a very narrow peak instead of a
        division by zero.
        """
        stdev = max(stdev, self.MIN_STDEV)
        exponent = math.exp(-(math.pow(x - mean, 2) /
                              (2 * math.pow(stdev, 2))))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

    def summarize(self, train_data):
        """Return ``[(mean, stdev), ...]`` with one pair per feature column.

        ``zip(*train_data)`` transposes the rows so that each tuple holds all
        values of a single feature.
        """
        return [(self.mean(column), self.stdev(column))
                for column in zip(*train_data)]

    def fit(self, X, y):
        """Estimate per-class feature statistics and class priors.

        Args:
            X: iterable of feature rows.
            y: iterable of class labels, paired with ``X`` row by row.
        """
        labels = list(set(y))  # every distinct class label in y
        data = {label: [] for label in labels}  # label -> rows with that label
        for row, label in zip(X, y):
            data[label].append(row)
        self.model = {
            label: self.summarize(rows) for label, rows in data.items()
        }
        # Class prior P(label): share of the training rows carrying the label.
        total = sum(len(rows) for rows in data.values())
        self.priors = (
            {label: len(rows) / float(total) for label, rows in data.items()}
            if total else {}
        )

    def calculate_probabilities(self, input_data):
        """Return ``{label: unnormalised posterior}`` for one sample.

        Each value is the class prior multiplied by the Gaussian density of
        every feature under that class (the naive independence assumption).
        """
        # An instance whose `model` was assigned directly has no recorded
        # priors; every class then starts from a weight of 1.
        priors = getattr(self, "priors", None) or {}
        probabilities = {}
        for label, summaries in self.model.items():
            probabilities[label] = priors.get(label, 1)
            for i, (mean, stdev) in enumerate(summaries):
                # Gaussian likelihood of a continuous attribute.
                probabilities[label] *= self.gaussian_probability(
                    input_data[i], mean, stdev)
        return probabilities

    def predict(self, X_test):
        """Return the class label with the largest posterior for one sample."""
        # Stable ascending sort: on an exact tie the label that appears last
        # in the model dictionary is the one returned.
        ranked = sorted(
            self.calculate_probabilities(X_test).items(),
            key=lambda item: item[-1])
        return ranked[-1][0]

    def score(self, X_test, y_test):
        """Return the accuracy: the fraction of samples predicted correctly."""
        right = 0
        for sample, expected in zip(X_test, y_test):
            if self.predict(sample) == expected:
                right += 1

        return right / float(len(X_test))


In [99]:
# Create the classifier and fit it on the training split.
model = NaiveBayes()
model.fit(X_train, y_train)

In [100]:
# Classify a single hand-written sample of four feature values.
print(model.predict([4.4,  3.2,  1.3,  0.2]))

0


In [101]:
# Fraction of held-out samples whose predicted label equals the true label.
model.score(X_test, y_test)

0.9333333333333333