In [27]:
from sklearn import datasets
import numpy as np
import math
import random 

In [28]:
def load_iris(ratio=0.8, seed=None):
    """Load the iris dataset and split it randomly into train and validation parts.

    Args:
        ratio: Fraction of the samples assigned to the training split. The
            training size is ``ceil(num_samples * ratio)``.
        seed: Optional integer seed for the shuffle. With the default ``None``
            the global ``np.random`` state is used, as before; pass a value to
            make the split reproducible across kernel restarts.

    Returns:
        ``(train_features, train_target), (valid_features, valid_target)``.
    """
    # return_X_y must be passed by keyword: recent scikit-learn releases
    # reject it as a positional argument.
    features, target = datasets.load_iris(return_X_y=True)

    num_samples = len(target)
    num_train = math.ceil(num_samples * ratio)

    # Shuffle the sample indices so both splits contain a mix of all classes.
    rng = np.random if seed is None else np.random.RandomState(seed)
    idx = rng.permutation(np.arange(num_samples))
    traindata = features[idx[:num_train]], target[idx[:num_train]]
    validdata = features[idx[num_train:]], target[idx[num_train:]]

    return traindata, validdata

# 作业三

## 二、朴素贝叶斯分类器

要求：

* 用朴素贝叶斯构造一个iris数据集的分类器
* 在尽量不修改代码结构的前提下完成工作

ETA：1-5 hours

## 定义模型

In [29]:
# 只需要修改这一部分 -- 代码量在40行以内
class NaiveBayes:
    """Naive Bayes classifier for discrete-valued features with additive smoothing.

    Every distinct value observed in a feature column during ``fit`` is treated
    as one category of that feature. Probabilities are estimated from counts
    with additive (Laplace) smoothing controlled by ``smooth`` (lambda).

    Attributes set by ``fit``:
        num_features: number of feature columns.
        target_labels: sorted unique class labels c_k.
        features_labels: features_labels[j][l] is the l-th distinct value
            a_{jl} of feature j.
        prior_prob: prior_prob[k] is the smoothed estimate of P(Y = c_k).
        conditional_prob: conditional_prob[k][j][l] is the smoothed estimate
            of P(X^{(j)} = a_{jl} | Y = c_k).
    """

    def __init__(self, smooth = 1):
        self.smooth = smooth # lambda
        self.conditional_prob = None # conditional probabilities
        self.prior_prob = None # prior probabilities


    def __call__(self, features):
        """Alias for ``predict``."""
        return self.predict(features)


    def fit(self, features, target):
        """Estimate every prior and conditional probability needed by ``predict``.

        Args:
            features: 2-D array-like of shape (num_samples, num_features).
            target: 1-D array-like with one class label per sample.

        Raises:
            ValueError: if ``features`` is not 2-D, or ``target`` is empty or
                does not have one entry per row of ``features``.
        """
        features = np.asarray(features)
        target = np.asarray(target)
        if features.ndim != 2:
            raise ValueError("features should be 2 dimensional: (num_samples, num_features)")
        if len(target) == 0 or len(target) != features.shape[0]:
            raise ValueError("target should be non-empty and have one label per row of features")

        self.num_features = features.shape[-1] # number of features
        self.target_labels = np.unique(target) # possible class labels: c_k

        # features_labels[j][l] is the l-th possible value of feature j: a_{jl}
        self.features_labels = [np.unique(features[:, i]) for i in range(features.shape[-1])]
        # Per-class sample counts; needed at prediction time to score feature
        # values that never occurred in the training data.
        self._class_counts = self._count_classes(target)
        self.prior_prob = self._prior_prob(target)
        self.conditional_prob = self._conditional_prob(features, target)


    def predict(self, features):
        """Predict the class label of one sample (1-D) or a batch of samples (2-D).

        Returns:
            1-D array of predicted labels, one per sample (length 1 for a
            single 1-D sample).

        Raises:
            ValueError: if ``features`` is neither 1- nor 2-dimensional.
        """
        features = np.asarray(features)
        # Single sample.
        if features.ndim == 1:
            return np.array([self._predict_single(features)])
        # Batch of samples.
        elif features.ndim == 2:
            return np.array([self._predict_single(row) for row in features])
        else:
            raise ValueError("Unsupported features size, should be 1 or 2 dimensional")


    def _class_indices(self, target):
        """Map each label in ``target`` to its position in ``self.target_labels``."""
        # target_labels comes from np.unique and is therefore sorted, so
        # searchsorted returns the exact index of every label present in it.
        return np.searchsorted(self.target_labels, np.asarray(target))


    def _count_classes(self, target):
        """Return the number of samples of each class, ordered like ``target_labels``."""
        return np.bincount(self._class_indices(target), minlength=len(self.target_labels))


    def _predict_single(self, feature):
        """Return the label maximising P(Y = c_k) * prod_j P(X^{(j)} = x_j | Y = c_k)."""
        feature = np.asarray(feature)
        scores = np.array(self.prior_prob, dtype=float)
        for j, value in enumerate(feature):
            known_values = self.features_labels[j]
            hit = np.flatnonzero(known_values == value)
            for k in range(len(scores)):
                if hit.size:
                    scores[k] *= self.conditional_prob[k][j][hit[0]]
                else:
                    # Value never seen during fit: use the smoothed estimate
                    # of a zero-count value instead of failing or guessing.
                    scores[k] *= self.smooth / (
                        self._class_counts[k] + len(known_values) * self.smooth)
        if np.any(scores > 0):
            best = int(np.argmax(scores))
        else:
            # Every class scored 0 (possible only when smooth == 0): fall back
            # deterministically to the most probable class a priori.
            best = int(np.argmax(self.prior_prob))
        return self.target_labels[best]


    def _prior_prob(self, target):
        """Estimate P(Y = c_k) as (N_k + lambda) / (N + K * lambda) (eq. 4.11).

        Returns:
            List of K floats ordered like ``self.target_labels``.
        """
        N = len(target)
        K = len(self.target_labels)
        counts = self._count_classes(target)
        return ((counts + self.smooth) / (N + K * self.smooth)).tolist()


    def _conditional_prob(self, features, target):
        """Estimate P(X^{(j)} = a_{jl} | Y = c_k) with additive smoothing (eq. 4.10).

        The estimate is (N_{kjl} + lambda) / (N_k + S_j * lambda), where
        N_{kjl} counts the class-k samples whose feature j equals a_{jl} and
        S_j is the number of distinct values of feature j.

        Returns:
            prob[k][j] is a 1-D float array of length S_j, so prob holds
            K * sum_j S_j numbers in total.
        """
        features = np.asarray(features)
        class_idx = self._class_indices(target)
        class_counts = np.bincount(class_idx, minlength=len(self.target_labels))

        prob = []
        for k in range(len(self.target_labels)):
            rows = features[class_idx == k]
            per_feature = []
            for j, values in enumerate(self.features_labels):
                # counts[l] = number of class-k rows whose feature j equals values[l]
                counts = (rows[:, j][:, None] == values[None, :]).sum(axis=0)
                per_feature.append(
                    (counts + self.smooth) / (class_counts[k] + len(values) * self.smooth))
            prob.append(per_feature)
        return prob

读取数据

In [30]:
# Split iris into training and validation sets using load_iris's default
# ratio (0.8); each split is a (features, target) pair.
(X_train, Y_train), (X_valid, Y_valid) = load_iris()

创建模型并拟合数据

In [31]:
# Build the classifier with its default smoothing and estimate the
# probabilities from the training split.
model = NaiveBayes()
model.fit(X_train, Y_train)

预测结果

In [32]:
# Predict the validation split and report the fraction of correct labels.
Y_pred = model.predict(X_valid)
accuracy = np.mean(Y_pred == Y_valid)
print(f"accuracy: {accuracy:.4f}")

accuracy: 0.8000


