In [1]:
import numpy as np
from math import log

In [2]:
def loadDataSet():
    """Return the toy training corpus used throughout the notebook.

    Returns:
        (postingList, classVec): ``postingList`` is a list of six tokenised
        posts (each a list of words); ``classVec`` holds the matching labels,
        where 1 marks an abusive post and 0 a normal one.
    """
    # Each entry pairs a label with the tokens of one post.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]  # 1 is abusive, 0 not
    return postingList, classVec
                 
def createVocabList(dataSet):
    """Return the distinct words that occur in any document of ``dataSet``.

    Args:
        dataSet: iterable of documents, each an iterable of words.

    Returns:
        A list with one entry per distinct word. The list is built from a
        set, so its order is arbitrary.
    """
    vocabulary = set()
    for doc in dataSet:
        vocabulary.update(doc)  # in-place union with this document's words
    return list(vocabulary)

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList`` (set-of-words model).

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the word occurs in
        ``inputSet`` and 0 elsewhere. Words missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    presence = [0] * len(vocabList)
    for token in inputSet:
        try:
            slot = vocabList.index(token)
        except ValueError:
            # Unknown word: report it and move on.
            print ("the word: %s is not in my Vocabulary!" % token)
            continue
        presence[slot] = 1
    return presence

In [3]:
# Load the toy corpus and its labels, build the vocabulary from every post,
# and print the vocabulary.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print(myVocabList)

['garbage', 'love', 'flea', 'licks', 'ate', 'maybe', 'is', 'him', 'my', 'to', 'worthless', 'steak', 'park', 'stop', 'problems', 'cute', 'dalmation', 'mr', 'posting', 'stupid', 'I', 'has', 'quit', 'take', 'not', 'how', 'please', 'dog', 'help', 'food', 'so', 'buying']


In [4]:
# Encode the first post as a 0/1 presence vector over myVocabList; as the
# cell's last expression, the resulting list is displayed as the cell output.
setOfWords2Vec(myVocabList, listOPosts[0])

[0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0]

In [5]:
def trainNBO(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters without any smoothing.

    Args:
        trainMatrix: sequence of documents, each a word vector of equal
            length (one entry per vocabulary word).
        trainCategory: sequence of labels aligned with ``trainMatrix``;
            documents labelled 1 form class 1, all others form class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive): for each class, the per-word count
        divided by the total word count of that class, plus the fraction of
        documents labelled 1. A word never seen in a class gets probability 0.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)
    # Per-class accumulators, indexed by class label (0 or 1).
    wordCounts = [np.zeros(vocabSize), np.zeros(vocabSize)]
    wordTotals = [0.0, 0.0]
    for row in range(docCount):
        label = 1 if trainCategory[row] == 1 else 0
        wordCounts[label] += trainMatrix[row]          # per-word occurrences in this class
        wordTotals[label] += np.sum(trainMatrix[row])  # total words seen in this class
    # Normalise the counts by the class's total word count.
    p1Vect = wordCounts[1] / wordTotals[1]
    p0Vect = wordCounts[0] / wordTotals[0]
    return p0Vect, p1Vect, pAbusive

In [7]:
# Rebuild the corpus and vocabulary so this cell does not rely on earlier cells' state.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
# Training matrix: one 0/1 word vector per post, in corpus order.
trainMat = list()
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
# Per-class word probabilities (class 0, class 1) and the prior of class 1.
p0v, p1v,pAb = trainNBO(trainMat, listClasses)

# 拉普拉斯修正
朴素贝叶斯需要把各属性的条件概率连乘；只要其中某一项为0（即某个属性值在训练集中从未与该类同时出现），整个乘积就为0，其他属性携带的信息全部被抹去，这显然不合理。为了避免这种情况，采用“拉普拉斯修正”。
具体来说，令$N$表示训练集$D$中可能的类别数，$N_{i}$表示第$i$个属性可能的取值数。

那么对于先验概率: 
$\hat{P}(c) = \frac{\left |D_{c} \right| + 1}{\left |D \right| + N}$

对于类条件概率: $\hat{P}(x_{i}|c) = \frac{\left |D_{c,x_{i}} \right| + 1}{\left |D_{c} \right| + N_{i}}$，其中$D_{c}$表示$D$中第$c$类样本组成的集合，$D_{c,x_{i}}$表示$D_{c}$中第$i$个属性取值为$x_{i}$的样本组成的集合。

# 计算下溢

计算时如果存在多个非常小的数会导致无法得到正确结果，这是由于浮点数性质导致的（可以测试计算多个很小的浮点数看看结果是否正常）。

由于朴素贝叶斯假设各个属性在给定类别的条件下相互独立，联合条件概率可以写成各属性条件概率的连乘，此时可以对两边同时取对数，把连乘变成求和： 

$\ln f(x_{1},x_{2}, \dots, x_{n}|c) = \sum_{i=1}^{n} \ln f(x_{i}|c)$

取自然对数不会影响分类结果：$\ln(x)$在定义域内导数严格大于0，是单调递增函数，因此取对数后的函数与原函数同增同减，并在相同的点取得极值；比较各类别的对数概率与比较原概率得到的结论一致，所以不会对结果造成影响。

In [8]:
# Classifier training adapted for practical use: Laplace-smoothed counts.
def trainNBO(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters with Laplace-style smoothing.

    Args:
        trainMatrix: sequence of documents, each a word vector of equal
            length (one entry per vocabulary word).
        trainCategory: sequence of labels aligned with ``trainMatrix``;
            documents labelled 1 form class 1, all others form class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive): smoothed per-word probabilities for
        class 0 and class 1 (plain probabilities, not logarithms), plus the
        fraction of documents labelled 1.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)
    # Smoothing: every word count (numerator) starts at 1 and every class
    # total (denominator) starts at 2, so no word gets probability zero.
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    wordTotals = [2.0, 2.0]
    for row in range(docCount):
        label = 1 if trainCategory[row] == 1 else 0
        wordCounts[label] += trainMatrix[row]          # per-word occurrences in this class
        wordTotals[label] += np.sum(trainMatrix[row])  # total words seen in this class
    # Normalise the smoothed counts by the smoothed class totals.
    p1Vect = wordCounts[1] / wordTotals[1]
    p0Vect = wordCounts[0] / wordTotals[0]
    return p0Vect, p1Vect, pAbusive

In [9]:
# Training gives the probability of every word under each class. To score a
# document we pick out the conditional probabilities of the words it contains
# and multiply them together with the class prior. Only words that are present
# are considered (set-of-words model); absent words contribute nothing.
# Multiplying many small probabilities underflows, so the product is evaluated
# as a sum of logarithms instead.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one document vector with a two-class naive Bayes model.

    Args:
        vec2Classify: word vector of the document (0 for absent words,
            a positive count/flag for present ones).
        p0Vec: per-word conditional probabilities for class 0
            (plain probabilities, not logarithms).
        p1Vec: per-word conditional probabilities for class 1
            (plain probabilities, not logarithms).
        pClass1: prior probability of class 1; must lie strictly between
            0 and 1 because its logarithm (and that of ``1 - pClass1``) is taken.

    Returns:
        1 if the log-score of class 1 is strictly greater than that of
        class 0, otherwise 0.
    """
    features = np.asarray(vec2Classify)
    present = features > 0  # restrict to words that occur in the document
    weights = features[present]
    # A present word with probability 0 yields log(0) = -inf, which correctly
    # rules the class out; silence numpy's divide warning for that case.
    with np.errstate(divide="ignore"):
        logLik1 = np.sum(weights * np.log(np.asarray(p1Vec, dtype=float)[present]))
        logLik0 = np.sum(weights * np.log(np.asarray(p0Vec, dtype=float)[present]))
    p1 = logLik1 + log(pClass1)
    p0 = logLik0 + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def testingNB():
    """Train on the toy corpus and print the predicted class of two sample posts."""
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)  # every distinct word in the corpus
    # Training matrix: one word vector per post.
    trainMat = [setOfWords2Vec(vocabulary, post) for post in posts]
    # Per-class word probabilities and the prior of class 1.
    p0V, p1V, pAb = trainNBO(np.array(trainMat), np.array(labels))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocabulary, testEntry))  # encode the sample post
        print (testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


# 文档词袋模型
目前为止，我们将每个词的出现与否作为一个特征，称为词集模型。
如果一个词在文档中出现不止一次，其出现次数可能携带着“该词是否出现”这一特征所无法表达的信息；把出现次数也记录下来的模型称为词袋模型。
在词袋中，每个单词可以出现多次，而在词集中每个词只能出现一次。
下面对我们的标准化函数进行修改：

In [None]:
def bagOfWordsVecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over ``vocabList`` (bag-of-words model).

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document; repeated words
            are counted each time they occur.

    Returns:
        A list of ``len(vocabList)`` ints holding how many times each
        vocabulary word occurs in ``inputSet``. Words that are not in the
        vocabulary are ignored.
    """
    returnVec = [0] * len(vocabList)
    # Iterate over the document's words (not the vocabulary) so that the
    # counts reflect the document.
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

In [31]:
# 测试邮件分类
import re

# Read one sample ham e-mail; the corpus files are ISO-8859-1 encoded.
# The context manager closes the file handle as soon as the text is read.
with open("email/ham/6.txt", "r", encoding="ISO-8859-1") as emailFile:
    emailText = emailFile.read()

# Split the text into tokens with a regular expression (this is tokenisation,
# unrelated to model regularisation). The separator must match at least one
# non-word character: a pattern that can match the empty string triggers a
# FutureWarning on older Pythons and splits between every character on 3.7+.
regEx = re.compile(r"\W+")

listOfTokens = regEx.split(emailText)

print(listOfTokens)

['Hello', 'Since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least', 'one', 'Google', 'Groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message', 'pages', 'or', 'files', 'we', 'are', 'writing', 'to', 'inform', 'you', 'that', 'we', 'will', 'no', 'longer', 'be', 'supporting', 'these', 'features', 'starting', 'February', '2011', 'We', 'made', 'this', 'decision', 'so', 'that', 'we', 'can', 'focus', 'on', 'improving', 'the', 'core', 'functionalities', 'of', 'Google', 'Groups', 'mailing', 'lists', 'and', 'forum', 'discussions', 'Instead', 'of', 'these', 'features', 'we', 'encourage', 'you', 'to', 'use', 'products', 'that', 'are', 'designed', 'specifically', 'for', 'file', 'storage', 'and', 'page', 'creation', 'such', 'as', 'Google', 'Docs', 'and', 'Google', 'Sites', 'For', 'example', 'you', 'can', 'easily', 'create', 'your', 'pages', 'on', 'Google', 'Sites', 'and', 'share', 'the', 'site', 'http', 'www', 'google', 'com', 'support', 'sites', 'bin', 'answer', 'py', 'hl', 'en',

  # Remove the CWD from sys.path while we load stuff.


In [32]:
# 使用朴素贝叶斯进行交叉验证

def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of lower-cased tokens. The text is split on runs of non-word
        characters, and tokens of one or two characters (for example "I" or
        "am", as well as empty strings) are dropped.
    """
    import re
    # Split on one or more non-word characters; a pattern that can match the
    # empty string would split between every character on Python 3.7+.
    listOfTokens = re.split(r"\W+", bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        