# ch4 朴素贝叶斯

贝叶斯准则:
$$
P(B|A)=\frac{P(A|B)*P(B)}{P(A)}
$$


## 文本分类

## 4-1 准备数据：从文本构建词向量

考虑文档中出现的所有单词，构建词汇表，然后将每篇文档转换为词汇表上的向量。

In [1]:
def loadData():
    """
    Build the small hand-written corpus used by the first half of the notebook.

    :return: (dataList, labelList) -- dataList is a list of six tokenized
             posts (each a list of words) and labelList is the 0/1 class of
             each post, in the same order.
    """
    # Each sample is (class label, post text); splitting on whitespace
    # yields the token list for that post.
    samples = (
        (0, 'my dog has flea problems help please'),
        (1, 'maybe not take him to dog park stupid'),
        (0, 'my dalmation is so cute I love him'),
        (1, 'stop posting stupid worthless garbage'),
        (0, 'mr licks ate my steak how to stop him'),
        (1, 'quit buying worthless dog food stupid'),
    )
    dataList = [sentence.split() for _, sentence in samples]
    labelList = [label for label, _ in samples]
    return dataList, labelList

In [2]:
def createVocabList(dataSet):
    """
    Collect the vocabulary of a tokenized corpus.

    :param dataSet: iterable of documents, each an iterable of words
    :return: list of the distinct words across all documents, sorted ascending
    """
    uniqueWords = set()
    for tokens in dataSet:
        # Merge this document's words into the running vocabulary.
        uniqueWords.update(tokens)
    return sorted(uniqueWords)

## 4-2 创建词集向量（set-of-words：只记录单词是否出现，不统计次数）

In [3]:
def setOfWords2Vec(vocabList,textLine):
    """
    Encode a document as a set-of-words (presence/absence) vector.

    :param vocabList: list of vocabulary words; position i of the result
                      corresponds to vocabList[i]
    :param textLine: iterable of words making up one document
    :return: list of len(vocabList) ints, 1 where the vocabulary word occurs
             in textLine at least once and 0 elsewhere; words that are not in
             the vocabulary are ignored
    """
    # Build the word -> index lookup once so each token costs O(1) instead of
    # two linear scans of the vocabulary. setdefault keeps the first position
    # of a repeated vocabulary word, matching list.index().
    position = {}
    for idx, word in enumerate(vocabList):
        position.setdefault(word, idx)

    retVec = [0] * len(vocabList)
    for word in textLine:
        idx = position.get(word)
        if idx is not None:
            retVec[idx] = 1
    return retVec

**测试**

In [4]:
# Load the toy corpus, build its sorted vocabulary, and preview the first five entries.
dataList,dataLabel=loadData()
vocalList=createVocabList(dataList)
vocalList[:5]

['I', 'ate', 'buying', 'cute', 'dalmation']

# Encode the first post as a 0/1 vector over the vocabulary.
setOfWords2Vec(vocalList,dataList[0])

## 4-3 计算先验概率与条件概率

In [5]:
import numpy as np

def getProb(trainData, trainCategory):
    """
    Estimate the naive Bayes parameters for a two-class problem.

    :param trainData: 2-D document/word matrix (one row per document, one
                      column per vocabulary word); array or list of lists
    :param trainCategory: class of each document; entries equal to 1 are
                          counted as class 1, everything else as class 0
    :return: (pVec, Py) where pVec is a (2, numWords) array of
             log P(word | class) with row 0 for class 0 and row 1 for class 1,
             and Py is sum(trainCategory) / number of documents, i.e. the
             prior of class 1 when the labels are 0/1
    :raises ValueError: if trainData is empty or not 2-D, or if the number of
                        labels differs from the number of documents
    """
    data = np.asarray(trainData)
    labels = np.asarray(trainCategory).ravel()
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError('trainData must be a non-empty 2-D document/word matrix')
    if labels.shape[0] != data.shape[0]:
        raise ValueError('trainCategory must contain exactly one label per document')

    numTrainDocs, numWords = data.shape
    classNum = 2

    # Prior of class 1.
    Py = np.sum(labels) / float(numTrainDocs)

    isClass1 = labels == 1

    # Per-class word counts. Counts start at 1 and the denominators below at
    # 2.0, so a word never seen in a class still gets a non-zero probability
    # and the log stays finite.
    Px_y = np.ones((classNum, numWords))
    Px_y[1] += data[isClass1].sum(axis=0)
    Px_y[0] += data[~isClass1].sum(axis=0)

    # Per-class total number of word occurrences (column vector so that the
    # division below broadcasts across the words of each class).
    p_sum = np.full((classNum, 1), 2.0)
    p_sum[1] += data[isClass1].sum()
    p_sum[0] += data[~isClass1].sum()

    # Work in log space so that classification can add terms instead of
    # multiplying many small probabilities.
    pVec = np.log(Px_y / p_sum)

    return pVec, Py

## 4-4 分类

In [6]:
def classify(textVec, pVec, pClass1):
    """
    Pick the more probable of the two classes for one document vector.

    :param textVec: word vector of the document (array aligned with the
                    columns of pVec)
    :param pVec: per-class log word probabilities; pVec[0] is class 0 and
                 pVec[1] is class 1
    :param pClass1: prior probability of class 1
    :return: 1 if the class-1 score is strictly greater than the class-0
             score, otherwise 0 (ties go to class 0)
    """
    # Each score is the log of the numerator of Bayes' rule:
    # sum of log P(w|c) over the words present, plus log P(c).
    logPrior0 = np.log(1.0 - pClass1)
    logPrior1 = np.log(pClass1)
    score0 = np.sum(textVec * pVec[0]) + logPrior0
    score1 = np.sum(textVec * pVec[1]) + logPrior1
    return 1 if score1 > score0 else 0

## 4-5 测试

In [7]:
def testing():
    """
    End-to-end check of the classifier on the toy corpus.

    Loads the toy posts, builds the vocabulary and the 0/1 training matrix,
    fits the model with getProb, prints the fitted parameters, and prints the
    predicted class of two hand-written test entries.

    Note: this expects the two-value loadData(); a later cell in this
    notebook redefines loadData with a four-value return, so run this before
    that cell.
    """
    # 1. Load the toy corpus and derive its vocabulary.
    posts, postClasses = loadData()
    vocabulary = createVocabList(posts)

    # 2. One 0/1 row per post, one column per vocabulary word.
    trainMat = [setOfWords2Vec(vocabulary, post) for post in posts]

    # 3. Fit the model.
    pV, pAb = getProb(np.array(trainMat), np.array(postClasses))
    print('p0V',pV)
    print('pAb',pAb)

    # 4. Classify the two sample entries.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocabulary, testEntry))
        print (testEntry, 'classified as: ', classify(thisDoc, pV, pAb))
testing()

p0V [[-2.56494936 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -2.56494936
  -2.56494936 -3.25809654 -3.25809654 -2.56494936 -2.56494936 -2.15948425
  -2.56494936 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936
  -1.87180218 -3.25809654 -3.25809654 -2.56494936 -3.25809654 -2.56494936
  -3.25809654 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -3.25809654
  -2.56494936 -3.25809654]
 [-3.04452244 -3.04452244 -2.35137526 -3.04452244 -3.04452244 -1.94591015
  -3.04452244 -2.35137526 -2.35137526 -3.04452244 -3.04452244 -2.35137526
  -3.04452244 -3.04452244 -3.04452244 -3.04452244 -2.35137526 -3.04452244
  -3.04452244 -2.35137526 -2.35137526 -3.04452244 -2.35137526 -3.04452244
  -2.35137526 -3.04452244 -3.04452244 -2.35137526 -1.65822808 -2.35137526
  -2.35137526 -1.94591015]]
pAb 0.5
['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


## 4-6 使用朴素贝叶斯进行邮件分类

将文本文件解析成词条向量

In [8]:
def textSplit(text):
    """
    Tokenize raw text into lowercase words.

    :param text: string to tokenize
    :return: list of the pieces obtained by splitting on every non-word
             character, lowercased, keeping only pieces longer than two
             characters (order and duplicates are preserved)
    """
    import re
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    reg = re.compile(r'\W')
    return [word.lower() for word in reg.split(text) if len(word)>2]
# f=open('./email/ham/1.txt').read()
f='this book is the best book on python.'
d=textSplit(f)
d

['this', 'book', 'the', 'best', 'book', 'python']

测试

In [9]:
def loadData(rate=0.7, numFiles=22, emailDir='./email'):
    """
    Load the email corpus and split it into a training and a test part.

    Files <emailDir>/spam/<i>.txt and <emailDir>/ham/<i>.txt are read for
    i = 1..numFiles, tokenized with textSplit, and appended in the order
    spam, ham, spam, ham, ... with label 1 for spam and 0 for ham. The first
    int(rate * total) documents form the training set and the rest the test
    set; no shuffling is done.

    Note: this redefines the earlier toy-corpus loadData() in this notebook
    and returns four values instead of two.

    :param rate: fraction of the documents used for training
    :param numFiles: number of numbered files to read from each folder
    :param emailDir: directory containing the 'spam' and 'ham' folders
    :return: (trainDataArr, trainLabelArr, testDataArr, testLabelArr)
    """
    dataList=[]
    labelList=[]
    for i in range(1, numFiles + 1):
        for folder, label in (('spam', 1), ('ham', 0)):
            # 'with' closes the file handle as soon as it has been read.
            with open('%s/%s/%d.txt' % (emailDir, folder, i)) as emailFile:
                dataList.append(textSplit(emailFile.read()))
            labelList.append(label)

    # Single split point shared by the data and the labels.
    splitAt = int(rate*len(dataList))
    trainDataArr=dataList[:splitAt]
    trainLabelArr=labelList[:splitAt]
    testDataArr=dataList[splitAt:]
    testLabelArr=labelList[splitAt:]

    return trainDataArr,trainLabelArr,testDataArr,testLabelArr
    

In [15]:
def data2Mat(trainDataArr,trainLabelArr):
    """
    Turn tokenized documents into a 0/1 word matrix.

    :param trainDataArr: list of tokenized documents
    :param trainLabelArr: label of each document, indexable by position
    :return: (dataMat, labels, vocabList) -- dataMat has one set-of-words
             row per document, labels is a new list with the label of each
             document, and vocabList is the vocabulary built from
             trainDataArr that defines the columns of dataMat
    """
    # The vocabulary is derived from these documents only.
    vocabList = createVocabList(trainDataArr)
    dataMat = [setOfWords2Vec(vocabList, doc) for doc in trainDataArr]
    labels = [trainLabelArr[i] for i in range(len(trainDataArr))]
    return dataMat,labels,vocabList

In [16]:
def modelTest():
    """
    Train the classifier on the email training split and report how it does
    on the test split.

    Prints the number of misclassified test emails, the size of the test set,
    and the resulting accuracy. Returns nothing.
    """
    # Load the pre-split corpus.
    trainDataArr,trainLabelArr,testDataArr,testLabelArr=loadData()

    # Vectorize the training documents; the vocabulary comes from them alone.
    trainDataMat,trainLabels ,vocabList= data2Mat(trainDataArr,trainLabelArr)

    pV, pSpam = getProb(np.array(trainDataMat), np.array(trainLabels))

    # Count the test documents whose predicted class differs from the label.
    errorCount = sum(
        1
        for i, doc in enumerate(testDataArr)
        if classify(np.array(setOfWords2Vec(vocabList, doc)), pV, pSpam) != testLabelArr[i]
    )
    testSize = len(testLabelArr)
    print ('the errorCount is: ', errorCount)
    print ('the testSet length is :', testSize)
    print ('the accu is: :', 1.0-float(errorCount)/testSize)

In [17]:
# Run the email experiment: prints error count, test-set size and accuracy.
modelTest()

the errorCount is:  1
the testSet length is : 14
the accu is: : 0.9285714285714286
