In [2]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm_notebook as tqdm

# 利用朴素贝叶斯法进行手写数字集的模型建立
### 准确率大概在80%左右

In [3]:
## 载入数据 取1000个训练样本
trainDataPd = pd.read_csv("trainData.csv")
trainDataPd.iloc[:,:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59997,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
trainData = trainDataPd.iloc[:,:]
labels,imgdata = trainData.iloc[:,0],trainData.iloc[:,1:]
imgdata= imgdata.to_numpy()

## 计算p(y|x) = $\frac{p(x,y)}{p(x)}$
### 同时由于特征为多维特征,朴素贝叶斯有一个比较强的假设==>即输入特征之间相对独立(独立性假设)，即$p(x=x_0,x_1,x_2....|y=y_k)=p(x=x_0|y=y_k)*p(x=x_1|y=y_k)*p(x=x_2|y=y_k)....$

#### 其中分母p(x)可按照全概率公式拆解为 $\sum_{k=0}^{m}p(y=y_k)*p(x=x_0,x_1,x_2....|y=y_k)$
#### 于是最后的计算变成了 p(y|x) = argmax $\frac{p(x=x_0,x_1,x_2....|y=y_k)=p(x=x_0|y=y_k)*p(x=x_1|y=y_k)*p(x=x_2|y=y_k)....}{\sum_{k=0}^{m}p(y=y_k)*p(x=x_0,x_1,x_2....|y=y_k}$
#### 由于分母不变,求解可转换为 p(y|x) = argmax ${p(x=x_0,x_1,x_2....|y=y_k)=p(x=x_0|y=y_k)*p(x=x_1|y=y_k)*p(x=x_2|y=y_k)....}$

#### p(y|x) = argmax ${p(x=x_0,x_1,x_2....|y=y_k)}$ = argmax $ {{p(y=y_k)*\prod_{i=0}^n p(x=x_i|y=y_k)}}$

#### 其中$p(x=x_i|y=y_k)=\frac{\sum_{j=1}^{n}I(x=x_i,y=y_k)}{\sum_{j=1}^{n}I(y=y_k)}$

In [5]:
## 计算每个类别的总数,然后计算每个特征的条件概率值
Lambda = 0.00001  ##防止有些地方的概率值为0导致后续的概率计算被消除,当特征总数过大时，lambda的值尽量取小，减少影响
YlabelCnt = np.array([np.sum(labels==i) for i in range(10)])
py=(YlabelCnt+1)/(YlabelCnt.sum()+Lambda*YlabelCnt.shape[0]) # 获得p(y=y_k)的概率值
YlabelCnt,py

(array([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949],
       dtype=int64),
 array([0.09873333, 0.11238333, 0.09931667, 0.1022    , 0.09738333,
        0.09036667, 0.09865   , 0.10443333, 0.09753333, 0.09916667]))

In [6]:
px_y = [imgdata[labels==index] for index,IY in enumerate(YlabelCnt)]## 获得y=yk所在的图像的所有特征
px_y[0].shape

(5923, 784)

In [7]:
Lambda = 0.00001 ##防止有些地方的概率值为0导致后续的概率计算被消除,当特征总数过大时，lambda的值尽量取小，减少影响
pxi_y = []
for pxy in px_y:
    cnt= pxy.shape[0]
    pxiykCnt = np.zeros([2,784])
    for i in range(784):
        pxiykCnt[0][i]+=(np.sum(pxy[:,i]==0)+Lambda)/(pxy.shape[0]+Lambda*784)# 加入拉普拉斯平滑
        pxiykCnt[1][i]+=(np.sum(pxy[:,i]==1)+Lambda)/(pxy.shape[0]+Lambda*784)# 加入拉普拉斯平滑
    pxi_y.append(pxiykCnt)
pxi_y## 获得每个p(xi=cj|y=yk)的条件概率值 

[array([[9.99998678e-01, 9.99998678e-01, 9.99998678e-01, ...,
         9.99998678e-01, 9.99998678e-01, 9.99998678e-01],
        [1.68833138e-09, 1.68833138e-09, 1.68833138e-09, ...,
         1.68833138e-09, 1.68833138e-09, 1.68833138e-09]]),
 array([[9.99998839e-01, 9.99998839e-01, 9.99998839e-01, ...,
         9.99998839e-01, 9.99998839e-01, 9.99998839e-01],
        [1.48323767e-09, 1.48323767e-09, 1.48323767e-09, ...,
         1.48323767e-09, 1.48323767e-09, 1.48323767e-09]]),
 array([[9.99998686e-01, 9.99998686e-01, 9.99998686e-01, ...,
         9.99998686e-01, 9.99998686e-01, 9.99998686e-01],
        [1.67841337e-09, 1.67841337e-09, 1.67841337e-09, ...,
         1.67841337e-09, 1.67841337e-09, 1.67841337e-09]]),
 array([[9.99998723e-01, 9.99998723e-01, 9.99998723e-01, ...,
         9.99998723e-01, 9.99998723e-01, 9.99998723e-01],
        [1.63105321e-09, 1.63105321e-09, 1.63105321e-09, ...,
         1.63105321e-09, 1.63105321e-09, 1.63105321e-09]]),
 array([[9.99998660e-01, 9.99998

In [8]:
pxi_y[0][0],py[0]## 抽取一个观察统计的概率情况,p(xi=0,y=0) p(y=0)

(array([0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99999868,
        0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99999868,
        0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99999868,
        0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99999868,
        0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99999868,
        0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99999868,
        0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99999868,
        0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99982984,
        0.99982984, 0.99982984, 0.99999868, 0.99982984, 0.99982984,
        0.99982984, 0.99999868, 0.99999868, 0.99999868, 0.99999868,
        0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99999868,
        0.99999868, 0.99999868, 0.99999868, 0.99999868, 0.99982984,
        0.99982984, 0.99999868, 0.99982984, 0.99966101, 0.99982984,
        0.99999868, 0.99999868, 0.99982984, 0.99966101, 0.99932335,
        0.99966101, 0.99966101, 0.99932335, 0.99

In [9]:
## 获得模型 p(xi=cj,y=y_k)，即pxi_y
# 加载测试集用于测试模型
testData = pd.read_csv("testData.csv")
Testlabel = testData.iloc[:2000,0] # 取2000个测试样本点
testVector = testData.iloc[:2000,1:].to_numpy()

###### 给定p(y|x) =  argmax ${p(x=x_0,x_1,x_2....|y=y_k)= argmax \prod_{i=0}^n p(x=x_i|y=y_k)}$

In [10]:
# 对于每个样本点进行测试
labelsPre=[]
with tqdm(total=testVector.shape[0]) as bar:
    for vetor in testVector:
        inference=[]
        for cls,yk in enumerate(py):
            p=1.0
            for index,point in enumerate(vetor):# 从第1个特征向量到第784个特征向量分别验证
                p*=np.double(pxi_y[cls][int(point)][index]) # p(y=yk,xi=c) # 由于数值太小,计算器容易数值溢出，因此乘以10进行适当的放大
            inference.append(p*yk)
        label = np.argmax(np.array(inference))
        labelsPre.append(label)
        bar.update(1)
        bar.set_postfix({"labels:":labelsPre})

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm(total=testVector.shape[0]) as bar:


  0%|          | 0/2000 [00:00<?, ?it/s]

In [11]:
labelsPre=np.array(labelsPre).reshape(1,-1)
labelsPre.shape

(1, 2000)

In [12]:
Testlabel=Testlabel.to_numpy().reshape(1,-1)

In [13]:
np.sum(Testlabel==labelsPre)/Testlabel.shape[1] *100 ### 预测正确率 # 样本集的增大也会提升准确率

79.5