In [42]:
import numpy as np
from math import log

In [19]:
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec
                 
def createVocabList(dataSet):  # 获取所有文档不相同的次数
    vocabSet = set()  #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document) #union of the two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):  # 统计文档中各词出现情况
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print ("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [20]:
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print(myVocabList)

['problems', 'love', 'flea', 'posting', 'stop', 'how', 'steak', 'not', 'worthless', 'take', 'ate', 'buying', 'mr', 'my', 'garbage', 'is', 'licks', 'please', 'park', 'has', 'cute', 'dog', 'maybe', 'quit', 'him', 'I', 'food', 'dalmation', 'so', 'stupid', 'to', 'help']


In [21]:
setOfWords2Vec(myVocabList, listOPosts[0])

[1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

In [32]:
def trainNBO(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = np.zeros(numWords)
    p1Num = np.zeros(numWords)
    p0Denom = p1Denom = 0.0  # 分母
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:  # 条件概率 记录情况为垃圾文档时各个属性的累加和
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])  # 记录垃圾邮件的总词数
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num / p1Denom  # 由于是累加和所以要除以总词量求各个属性在垃圾邮件的概率
    p0Vect = p0Num / p0Denom
    return p0Vect , p1Vect, pAbusive

In [33]:
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = list()
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0v, p1v,pAb = trainNBO(trainMat, listClasses)
print(p0v, p1v, pAb)

[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0]
[0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]
[0.04166667 0.04166667 0.04166667 0.         0.04166667 0.04166667
 0.04166667 0.         0.         0.         0.04166667 0.
 0.04166667 0.125      0.         0.04166667 0.04166667 0.04166667
 0.         0.04166667 0.04166667 0.04166667 0.         0.
 0.08333333 0.04166667 0.         0.04166667 0.04166667 0.
 0.04166667 0.04166667] [0.         0.         0.         0.05263158 0.05263158 0.
 0.         0.05263158 0.10526316 0.05263158 0.         0.05263158
 0.         0.         0.05263158 0.         0.         0.
 0.05263158 0.         0.         0.10526316 0.05263158 0.05263158
 0.05263158 0.         0.05263158 0.         0.         0.15789474
 0.05263158 0.        ] 0.5


# 拉普拉斯修正
由于连乘时如果有一项为0那么结果为零，显然不合理，为了避免这种情况采用“拉普拉斯修正”
具体来说，令N表示训练集D中可能的类数，$N_{i}$表示第$i$个属性可能的取值。

那么对于先验概率: 
$\hat{P}(c) = \frac{\left |D_{c} \right| + 1}{\left |D \right| + N}$

对于后验概率: $\hat{P}(x_{i}|c) = \frac{\left |D_{c,x_{i}} \right| + 1}{\left |D \right| + N_{i}}$

# 计算下溢

计算时如果存在多个非常小的数会导致无法得到正确结果，这是由于浮点数性质导致的（可以测试计算多个很小的浮点数看看结果是否正常）。

由于采用贝叶斯假设所以各个属性完全独立，此时可以采用两边同时取对数： 

$ln(f(x_{1},x_{2} ... x_{i}|c)) = \sum_{1}^{i} ln(f(x_{i}|c))$

采用自然对数处理则不会有任何损失，由于变换后为复合函数，其中$ln(x)$的导数严格大于0不影响原函数的单调性，仍可以在原极值点处取得极值点，所以不对结果照成影响。

In [37]:
# 根据现实情况修改分类器
def trainNBO(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = np.ones(numWords)  # 对应于拉普拉斯修正的分母，初始值为1
    p1Num = np.ones(numWords)
    p0Denom = p1Denom = 2.0  # 拉普拉斯修正的分母，由于属性的取值只能为0或1两种所以初始值为2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:  # 条件概率 记录情况为垃圾文档时各个属性的累加和
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])  # 记录垃圾邮件的总词数
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num / p1Denom  # 由于是累加和所以要除以总词量求各个属性在垃圾邮件的概率
    p0Vect = p0Num / p0Denom
    return p0Vect , p1Vect, pAbusive

In [45]:
"""
由于我们已经知道了垃圾邮箱和非垃圾邮箱下所有属性取值的概率，当给定我们一个样本时由于原始矩阵中不存在的属性值为0
所以与条件概率矩阵相乘后就求得

"""
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    print(vec2Classify)
    print(p1Vec)
    print(vec2Classify * p1Vec)
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)    #element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else: 
        return 0

def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNBO(np.array(trainMat),np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print (testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print (testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
testingNB()

[0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
[0.04761905 0.04761905 0.04761905 0.0952381  0.0952381  0.04761905
 0.04761905 0.0952381  0.14285714 0.0952381  0.04761905 0.0952381
 0.04761905 0.04761905 0.0952381  0.04761905 0.04761905 0.04761905
 0.0952381  0.04761905 0.04761905 0.14285714 0.0952381  0.0952381
 0.0952381  0.04761905 0.0952381  0.04761905 0.04761905 0.19047619
 0.0952381  0.04761905]
[0.         0.04761905 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.04761905 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.04761905 0.         0.
 0.         0.        ]
['love', 'my', 'dalmation'] classified as:  0
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
[0.04761905 0.04761905 0.04761905 0.0952381  0.0952381  0.04761905
 0.04761905 0.0952381  0.14285714 0.0952381  0.04761905 0.0952381
 0.04761905 0