# 问题

给定一封邮件，判定它是否属于垃圾邮件。用D表示这封邮件，注意D由N个单词组成。我们用y+表示垃圾邮件，y-表示正常邮件。

问题的数学表达为：

$P(y+|D) = \frac{P(y+)*P(D|y+)}{P(D)}$

$P(y-|D) = \frac{P(y-)*P(D|y-)}{P(D)}$

$P(y+),P(y-)$表示先验概率，只需统计邮件库里垃圾邮件和正常邮件各自所占的比例即可。

D里面有N个单词 $d_1,d_2,...,d_n$

$P(D|y+)=P(d_1,d_2,...,d_n|y+)$表示在垃圾邮件中出现与这封邮件一模一样的邮件的概率有多大

$P(d_1,d_2,...,d_n|y+)$可以扩展为$P(d_1|y+)P(d_2|d_1,y+)P(d_3|d_1,d_2,y+)...$

假设在给定类别的条件下，各个单词$d_i$之间相互条件独立(朴素贝叶斯假设特征之间相互独立，互不影响)，

扩展可以简化为$P(d_1|y+)P(d_2|y+)P(d_3|y+)...$

对于$P(d_1|y+)P(d_2|y+)P(d_3|y+)...$，只需要统计每个单词$d_i$在垃圾邮件中出现的频率

在程序代码实现时：

考虑到仅仅是比较$P(y+|D)$和$P(y-|D)$的大小

对概率P的等式两边都套用log，且分母${P(D)}$是常数，不用考虑

$\log P(y+|D)$等效于$\log P(y+) + \log P(d_1|y+) + \log P(d_2|y+)+...+\log P(d_n|y+)$

$\log P(y-|D)$等效于$\log P(y-) + \log P(d_1|y-) + \log P(d_2|y-)+...+\log P(d_n|y-)$

# 代码实现

In [18]:
import numpy as np
import re
import random

def textParse(input_string):
    """Split raw text into lowercase word tokens longer than two characters.

    The text is split on runs of non-word characters. Tokens of length two
    or less (including the empty strings that splitting produces at the
    edges of the text) are dropped.

    Args:
        input_string: The raw text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance.
    """
    listofTokens = re.split(r"\W+",input_string)
    # Filter on the length of each token, not on the length of the token list;
    # the latter is the same for every token and never removes short words.
    return [token.lower() for token in listofTokens if len(token) > 2]
    
def createVocablist(doclist):
    """Return a list of every distinct token that occurs in any document.

    Args:
        doclist: An iterable of documents, each an iterable of tokens.

    Returns:
        A list with one entry per distinct token; the order is the
        iteration order of the underlying set and is not specified.
    """
    # Union all documents into one set in a single call.
    return list(set().union(*doclist))
    
def setOfWord2Vec(vocablist,inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocablist: The vocabulary, a sequence of words; position ``i`` of
            the result corresponds to ``vocablist[i]``.
        inputSet: The words of the document to encode.

    Returns:
        A list of ``len(vocablist)`` integers where entry ``i`` is 1 if
        ``vocablist[i]`` occurs in ``inputSet`` and 0 otherwise. Words that
        are not in the vocabulary are ignored. If a word appears more than
        once in ``vocablist``, only its first position is set.
    """
    # Build the word -> first index map once so each lookup is a constant-time
    # dict access instead of two linear scans of the vocabulary.
    position = {}
    for idx, word in enumerate(vocablist):
        position.setdefault(word, idx)

    returnVec = [0]*len(vocablist)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] = 1
    return returnVec
    
def trainNB(trainSet,trainClass):
    """Estimate smoothed per-word log-probabilities for both classes.

    Args:
        trainSet: Sequence of document vectors, one row per training
            document and one entry per vocabulary word.
        trainClass: Sequence of labels aligned with ``trainSet``; a label
            equal to 1 is treated as spam, anything else as ham.

    Returns:
        A tuple ``(p0Vec, p1Vec, p1)``: the log-probability vector for
        ham, the log-probability vector for spam, and the prior
        probability of spam (sum of the labels divided by the number of
        training documents).
    """
    doc_count = len(trainSet)
    vocab_size = len(trainSet[0])
    spam_prior = sum(trainClass)/float(doc_count)  # prior probability of spam

    # Counts start at 1 rather than 0 so that a word never seen in a class
    # does not get probability 0 and wipe out the whole product.
    word_counts = {0: np.ones(vocab_size), 1: np.ones(vocab_size)}
    # Smoothing: the denominators start at 2 (the number of classes) instead of 0.
    totals = {0: 2, 1: 2}

    for idx, row in enumerate(trainSet):
        label = 1 if trainClass[idx] == 1 else 0
        word_counts[label] += row
        totals[label] += sum(row)   # total of the word entries seen in this class

    # The probabilities can be very small, so return their logarithms.
    spam_log_probs = np.log(word_counts[1]/totals[1])
    ham_log_probs = np.log(word_counts[0]/totals[0])
    return ham_log_probs,spam_log_probs,spam_prior
   
def classifyNB(wordVec,p0Vec,p1Vec,p1Class):
    """Pick the class with the larger log-posterior score for one document.

    Args:
        wordVec: The document vector to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        p1Class: Prior probability of class 1.

    Returns:
        0 if the class-0 score is strictly greater than the class-1 score,
        otherwise 1 (ties go to class 1).
    """
    # Work in log space: log prior plus the sum of the selected word log-probabilities.
    spam_score = np.log(p1Class)+sum(wordVec*p1Vec)
    ham_score = np.log(1-p1Class)+sum(wordVec*p0Vec)
    return 0 if ham_score > spam_score else 1
    
def spam():
    """Train and evaluate the naive Bayes spam filter on the email corpus.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` and
    ``email/ham/1.txt`` .. ``email/ham/25.txt``, holds out 10 randomly
    chosen documents as the test set, trains on the remaining ones and
    prints how many held-out documents were misclassified.
    """
    def read_tokens(path):
        # The context manager closes the file even if reading or parsing fails.
        with open(path,'r',encoding='utf-8') as handle:
            return textParse(handle.read())

    doclist = []
    classlist = []
    for i in range(1,26):
        doclist.append(read_tokens(f'email/spam/{i}.txt'))
        classlist.append(1)  # 1 marks spam

        doclist.append(read_tokens(f'email/ham/{i}.txt'))
        classlist.append(0)  # 0 marks ham (normal mail)

    vocablist = createVocablist(doclist)
    # Derive the index range from the documents actually loaded.
    trainSet = list(range(len(doclist)))
    testSet = []
    for _ in range(10):
        # randrange never returns its upper bound, so the index is always valid;
        # the chosen document moves from the training set to the test set.
        testSet.append(trainSet.pop(random.randrange(len(trainSet))))

    trainMat = []
    trainClass = []
    for docIndex in trainSet:
        trainMat.append(setOfWord2Vec(vocablist,doclist[docIndex]))
        trainClass.append(classlist[docIndex])
    p0Vec,p1Vec,p1 = trainNB(np.array(trainMat),np.array(trainClass))

    errorCount = 0
    for docIndex in testSet:
        wordVec = setOfWord2Vec(vocablist,doclist[docIndex])
        if classifyNB(np.array(wordVec),p0Vec,p1Vec,p1) != classlist[docIndex]:
            errorCount += 1
    print(f"当前10个测试样本，错了{errorCount}个")
    
# Run the spam-filter experiment only when this file is executed as a script.
if __name__ == "__main__":
    spam()

当前10个测试样本，错了0个
