In [1]:
#朴素贝叶斯算法
import numpy as np
import pandas as pd

class NaiveBayes(object):
    """Naive Bayes classifier for two classes labelled 0 and 1.

    Every probability is estimated as the raw relative frequency of exact
    value matches in the training data. There is no smoothing, so a feature
    value that never occurs inside a class gives that class a score of 0.
    """

    def __init__(self, X_train, y_train):
        """Store the training data and estimate the class priors.

        Args:
            X_train: pandas DataFrame of training features, one row per sample.
            y_train: 0/1 labels aligned with the rows of ``X_train`` (a Series,
                a single-column DataFrame or any array-like).
        """
        self.X_train = X_train  # training features
        self.y_train = y_train  # training labels
        # Labels are 0/1, so their mean is the share of class 1, i.e. P(label=1).
        positive_share = np.mean(np.asarray(y_train))
        self.P_label = {1: positive_share, 0: 1 - positive_share}

    def getFrequency(self, data, feature, value):
        """Return the fraction of rows in ``data`` whose ``feature`` equals ``value``.

        Args:
            data: DataFrame to count in (the whole training set or one class).
            feature: column name.
            value: value to match exactly.

        Returns:
            Relative frequency in [0, 1]; 0.0 when ``data`` has no rows.
        """
        total = len(data)
        if total == 0:
            # An empty class subset would otherwise raise ZeroDivisionError.
            return 0.0
        num = len(data[data[feature] == value])  # number of matching rows
        return num / total

    def predict(self, X_test):
        """Predict a class for every row of ``X_test``.

        Args:
            X_test: DataFrame with the same feature columns as the training set.

        Returns:
            A list with one single-element list per sample, e.g. ``[[1], [0]]``.
            The same list is also stored on ``self.prediction``.
        """
        self.prediction = []  # predicted classes

        # The class subsets do not depend on the sample or the feature, so they
        # are built once. Flattening the labels makes the row mask 1-D even
        # when y_train is a single-column DataFrame.
        labels = np.asarray(self.y_train).ravel()
        data0 = self.X_train[labels == 0]  # training rows of class 0
        data1 = self.X_train[labels == 1]  # training rows of class 1

        for i in range(len(X_test)):
            x = X_test.iloc[i]  # i-th sample
            # Unnormalised posterior: P(label) * product of P(feature | label).
            # The evidence term P(features) is deliberately not divided out:
            # it is the same for both classes, so it cannot change the
            # comparison, and it is 0 for any value unseen in training, which
            # previously turned both scores into nan.
            score0 = self.P_label[0]
            score1 = self.P_label[1]
            for feature in X_test.columns:
                score0 *= self.getFrequency(data0, feature, x[feature])
                score1 *= self.getFrequency(data1, feature, x[feature])

            # Pick the more probable class; ties go to class 1.
            self.prediction.append([1 if score1 >= score0 else 0])
        return self.prediction

In [2]:
#加入拉普拉斯平滑
import numpy as np
import pandas as pd

class NaiveBayes(object):
    """Two-class (labels 0/1) naive Bayes classifier with Laplace smoothing.

    Conditional probabilities are smoothed relative frequencies of exact
    value matches, so a value that is missing from a class still receives a
    small non-zero probability.
    """

    def __init__(self, X_train, y_train, alpha=1.0):
        """Store the training data, the class priors and the smoothing strength.

        Args:
            X_train: pandas DataFrame of training features, one row per sample.
            y_train: 0/1 labels aligned with the rows of ``X_train`` (a Series,
                a single-column DataFrame or any array-like).
            alpha: additive smoothing parameter; 1.0 is classic Laplace
                smoothing. Use a positive value so denominators stay non-zero.
        """
        self.X_train = X_train  # training features
        self.y_train = y_train  # training labels
        # Labels are 0/1, so their mean is the share of class 1, i.e. P(label=1).
        positive_share = np.mean(np.asarray(y_train))
        self.P_label = {1: positive_share, 0: 1 - positive_share}
        self.alpha = alpha  # Laplace smoothing parameter

    def getFrequency(self, data, feature, value):
        """Return the smoothed share of rows in ``data`` with ``feature == value``.

        Computes ``(count + alpha) / (len(data) + alpha * K)`` where ``K`` is
        the number of distinct values ``feature`` takes in the whole training
        set.

        Args:
            data: DataFrame to count in (the whole training set or one class).
            feature: column name.
            value: value to match exactly.

        Returns:
            The smoothed relative frequency as a float.
        """
        num = len(data[data[feature] == value])  # rows matching the value
        # K must come from the full training set, not from ``data``: counting
        # categories inside a class subset gives each class a different K and
        # makes the smoothed probabilities sum to more than 1.
        n_categories = len(self.X_train[feature].unique())
        return (num + self.alpha) / (len(data) + self.alpha * n_categories)

    def predict(self, X_test):
        """Predict a class for every row of ``X_test``.

        Args:
            X_test: DataFrame with the same feature columns as the training set.

        Returns:
            A list with one single-element list per sample, e.g. ``[[1], [0]]``.
            The same list is also stored on ``self.prediction``.
        """
        self.prediction = []  # predicted classes

        # Class subsets are loop-invariant, so build them once. Flattening the
        # labels keeps the row mask 1-D when y_train is a one-column DataFrame.
        labels = np.asarray(self.y_train).ravel()
        data0 = self.X_train[labels == 0]  # training rows of class 0
        data1 = self.X_train[labels == 1]  # training rows of class 1

        for i in range(len(X_test)):
            x = X_test.iloc[i]  # i-th sample
            # Unnormalised posterior: P(label) * product of P(feature | label).
            # Dividing both scores by the same P(features) cannot change which
            # one is larger, so the evidence term is not computed.
            score0 = self.P_label[0]
            score1 = self.P_label[1]
            for feature in X_test.columns:
                score0 *= self.getFrequency(data0, feature, x[feature])
                score1 *= self.getFrequency(data1, feature, x[feature])

            # Pick the more probable class; ties go to class 1.
            self.prediction.append([1 if score1 >= score0 else 0])
        return self.prediction


In [13]:
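# Load the data: the CSV loader on the next line is disabled; the iris dataset bundled with scikit-learn is used instead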
# data_df = pd.read_csv('data/data239141/iris_data.csv')
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the data
X, y = load_iris(return_X_y=True)
# Keep only the first 100 rows so the task is binary.
# NOTE(review): presumably these rows hold only labels 0 and 1 - confirm.
X, y = pd.DataFrame(X[:100]), pd.DataFrame(y[:100])

# Split into training and test sets (70% / 30%, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = NaiveBayes(X_train, y_train)    # "training" only stores the data and the class priors
y_pre = model.predict(X_test)           # predict
# accuracy_score expects (y_true, y_pred) in that order.
print(accuracy_score(y_test, y_pre))    # accuracy on the test set

0.9666666666666667


**作业任务：**
本次实验基于西瓜数据集，学习贝叶斯分类的概念与原理，熟练掌握贝叶斯分类技巧，并运用贝叶斯分类方法，根据西瓜的多个属性对其品质好坏进行正确分类。
![](https://ai-studio-static-online.cdn.bcebos.com/f1c4372cc4774429a000c3dd422797b88612a344b41a4d63b0decc58f7a59e93)
![](https://ai-studio-static-online.cdn.bcebos.com/b65dca4af5d545babb8a88224330ccc2675fa756e8e144f982753e136f36582e)


In [3]:
import numpy as np
import pandas as pd

class NaiveBayes(object):
    """Naive Bayes classifier for two classes labelled 0 and 1.

    Probabilities are raw relative frequencies of exact value matches in the
    training data (no smoothing). Numeric columns are matched by exact
    equality as well, so a numeric value that does not occur verbatim in the
    training set scores 0 for both classes.
    """

    def __init__(self, X_train, y_train):
        """Store the training data and estimate the class priors.

        Args:
            X_train: pandas DataFrame of training features, one row per sample.
            y_train: 0/1 labels aligned with the rows of ``X_train`` (a Series,
                a single-column DataFrame or any array-like).
        """
        self.X_train = X_train  # training features
        self.y_train = y_train  # training labels
        # Labels are 0/1, so their mean is the share of class 1, i.e. P(label=1).
        share_of_ones = np.mean(np.asarray(y_train))
        self.P_label = {1: share_of_ones, 0: 1 - share_of_ones}

    def getFrequency(self, data, feature, value):
        """Return the fraction of rows in ``data`` whose ``feature`` equals ``value``.

        Args:
            data: DataFrame to count in (the whole training set or one class).
            feature: column name.
            value: value to match exactly.

        Returns:
            Relative frequency in [0, 1]; 0.0 when ``data`` has no rows.
        """
        n_rows = len(data)
        if n_rows == 0:
            # Avoid ZeroDivisionError when a class has no training rows.
            return 0.0
        n_hits = len(data[data[feature] == value])
        return n_hits / n_rows

    def predict(self, X_test):
        """Predict a class for every row of ``X_test``.

        Args:
            X_test: DataFrame with the same feature columns as the training set.

        Returns:
            A list with one single-element list per sample, e.g. ``[[1]]``.
            The same list is also stored on ``self.prediction``.
        """
        self.prediction = []  # predicted classes

        # Split the training rows by class once instead of once per feature.
        # ravel() keeps the mask 1-D when y_train is a one-column DataFrame.
        labels = np.asarray(self.y_train).ravel()
        by_class = {
            0: self.X_train[labels == 0],
            1: self.X_train[labels == 1],
        }

        for i in range(len(X_test)):
            x = X_test.iloc[i]  # i-th sample
            # Unnormalised posterior per class: prior times the product of
            # P(feature | class). The common denominator P(features) is not
            # applied: it never changes which class wins, and it is 0 for a
            # value unseen in training, which used to make both scores nan.
            score = {0: self.P_label[0], 1: self.P_label[1]}
            for feature in X_test.columns:
                for label in (0, 1):
                    score[label] *= self.getFrequency(by_class[label], feature, x[feature])

            # Pick the more probable class; ties go to class 1.
            self.prediction.append([1 if score[1] >= score[0] else 0])
        return self.prediction

# Hand-entered training set: one list per column, 17 samples in total.
# Each list is wrapped after the 8th entry, where the label switches from 1 to 0.
# NOTE(review): the 12th '含糖量' value is 0.097, while the widely quoted
# watermelon dataset 3.0 lists 0.099 for that row - confirm against the
# assignment table.
train_data = {
    '色泽': ['青绿', '乌黑', '乌黑', '青绿', '浅白', '青绿', '乌黑', '乌黑',
           '乌黑', '青绿', '浅白', '浅白', '青绿', '浅白', '乌黑', '浅白', '青绿'],
    '根蒂': ['蜷缩', '蜷缩', '蜷缩', '蜷缩', '蜷缩', '稍蜷', '稍蜷', '稍蜷',
           '稍蜷', '硬挺', '硬挺', '蜷缩', '稍蜷', '稍蜷', '稍蜷', '蜷缩', '蜷缩'],
    '敲声': ['浊响', '沉闷', '浊响', '沉闷', '浊响', '浊响', '浊响', '浊响',
           '沉闷', '清脆', '清脆', '浊响', '浊响', '沉闷', '浊响', '浊响', '沉闷'],
    '纹理': ['清晰', '清晰', '清晰', '清晰', '清晰', '清晰', '稍糊', '清晰',
           '稍糊', '清晰', '模糊', '模糊', '稍糊', '稍糊', '清晰', '模糊', '稍糊'],
    '脐部': ['凹陷', '凹陷', '凹陷', '凹陷', '凹陷', '稍凹', '稍凹', '稍凹',
           '稍凹', '平坦', '平坦', '平坦', '凹陷', '凹陷', '稍凹', '平坦', '稍凹'],
    '触感': ['硬滑', '硬滑', '硬滑', '硬滑', '硬滑', '软粘', '软粘', '硬滑',
           '硬滑', '软粘', '硬滑', '软粘', '硬滑', '硬滑', '软粘', '硬滑', '硬滑'],
    '密度': [0.697, 0.774, 0.634, 0.608, 0.556, 0.403, 0.481, 0.437,
           0.666, 0.243, 0.245, 0.343, 0.639, 0.657, 0.360, 0.593, 0.719],
    '含糖量': [0.460, 0.376, 0.264, 0.318, 0.215, 0.237, 0.149, 0.211,
            0.091, 0.267, 0.057, 0.097, 0.161, 0.198, 0.370, 0.042, 0.103],
    '好瓜': [1, 1, 1, 1, 1, 1, 1, 1,
           0, 0, 0, 0, 0, 0, 0, 0, 0],
}
train_df = pd.DataFrame(data=train_data)

# Hand-entered test set: a single sample, stored column-wise as one-element lists.
test_data = {
    column: [value]
    for column, value in [
        ('色泽', '青绿'),
        ('根蒂', '蜷缩'),
        ('敲声', '浊响'),
        ('纹理', '清晰'),
        ('脐部', '凹陷'),
        ('触感', '硬滑'),
        ('密度', 0.697),
        ('含糖量', 0.460),
    ]
}
test_df = pd.DataFrame(data=test_data)

# Separate the label column from the feature columns, then build the model.
y_train = train_df['好瓜']
X_train = train_df.drop(columns='好瓜')
model = NaiveBayes(X_train, y_train)

# Classify the hand-entered test sample.
X_test = test_df
y_pre = model.predict(X_test)

print("预测结果:", y_pre)
print("预测结果解释: 1表示好瓜，0表示坏瓜")

预测结果: [[1]]
预测结果解释: 1表示好瓜，0表示坏瓜
