# 概述

贝叶斯决策理论  
核心思想：选择具有最高概率的决策。  
朴素贝叶斯的两个假设：特征之间相互独立（“朴素”的含义）；每个特征同等重要。

条件概率 ： $p(X|Y) = p(XY) / p(Y)$  
贝叶斯准则 ： 已知$p(x|c)$，要求$p(c|x)$  
$$p(c|x) = \frac{p(x|c)p(c)}{p(x)}$$

给定某个点(x, y)，则该数据点来自类别$c_i$的概率表示为$p(c_i|x, y)$  
应用贝叶斯准则得到：  
$$p(c_i|x, y) = \frac{p(x, y|c_i)p(c_i)}{p(x, y)}$$

# 使用python进行文本分类

## 准备数据：从文本中构建词向量

In [5]:
# 词表到向量的转换函数
def loadDataSet():
    """Return the toy message-board corpus together with its labels.

    Returns:
        A pair ``(postingList, classVec)``: six tokenized posts (each a list
        of words) and a parallel list of labels, where 1 marks an abusive
        post and 0 a normal one.
    """
    # Each entry pairs the label with the post written out as one sentence;
    # splitting on whitespace yields the token lists.
    labelled_posts = [
        (0, 'my dog has flea problems help please'),
        (1, 'maybe not take him to dog park stupid'),
        (0, 'my dalmation is so cute I love him'),
        (1, 'stop posting stupid worthless garbage'),
        (0, 'mr licks ate my steak how to stop him'),
        (1, 'quit buying worthless dog food stupid'),
    ]
    postingList = [sentence.split() for _, sentence in labelled_posts]
    classVec = [label for label, _ in labelled_posts]
    return postingList, classVec

def createVocabList(dataSet):
    """Return a list of the distinct tokens that occur in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding every token exactly once. The order comes from set
        iteration and is therefore unspecified.
    """
    # One union over all documents instead of growing the set document by document.
    return list(set().union(*dataSet))

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: sequence of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary word occurs
        in the document, 0 elsewhere. Words missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    # Build the word -> position map once, so each lookup is O(1) instead of
    # scanning the vocabulary twice per word (`in` followed by `.index`).
    # setdefault keeps the first position, matching list.index().
    position_of = {}
    for position, term in enumerate(vocabList):
        position_of.setdefault(term, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = position_of.get(word)
        if position is None:
            print(" the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1  # mark presence only; repeats do not accumulate
    return returnVec

# Demo cell: load the toy corpus, build its vocabulary and display the
# set-of-words vector of the first post.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
setOfWords2Vec(myVocabList, listOPosts[0])

[0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

## 训练算法：从词向量计算概率

$$p(c_i|w) = \frac{p(w|c_i)p(c_i)}{p(w)}$$  
$p(c_i)$：类别i出现的概率  
$p(w|c_i)$：将向量$w$展开为各个特征，可以写作$p(w_0,w_1,w_2,w_3,...,w_n|c_i)$；此外假设所有词在给定类别的条件下相互独立（条件独立性假设），则  
$p(w|c_i)=p(w_0,w_1,w_2,w_3,...,w_n|c_i)=p(w_0|c_i)p(w_1|c_i)p(w_2|c_i)p(w_3|c_i)...p(w_n|c_i)$

In [11]:
import numpy as np
# 朴素贝叶斯分类器训练函数
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from word vectors (no smoothing, no logs).

    Args:
        trainMatrix: sequence of equal-length word vectors, one per document.
        trainCategory: label of each document; a label equal to 1 puts the
            document in class 1, any other label in class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)`` where ``pXVect[j]`` is the number of
        occurrences of word j in class X divided by the total word count of
        class X, and ``pAbusive`` is the sum of the labels divided by the
        number of documents (the class-1 prior when labels are 0/1).
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(doc_total)

    # Index 0 accumulates class 0, index 1 accumulates class 1.
    word_totals = [np.zeros(vocab_size), np.zeros(vocab_size)]
    token_totals = [0, 0]
    for row_idx in range(doc_total):
        bucket = 1 if trainCategory[row_idx] == 1 else 0
        word_totals[bucket] += trainMatrix[row_idx]  # per-word counts
        token_totals[bucket] += sum(trainMatrix[row_idx])  # words seen in this class

    # Relative frequency of every word within each class; words never seen
    # in a class end up with probability 0.
    p1Vect = word_totals[1] / token_totals[1]
    p0Vect = word_totals[0] / token_totals[0]
    return p0Vect, p1Vect, pAbusive

listOPosts, listClasses = loadDataSet() # load the sample posts and their labels
myVocabList = createVocabList(listOPosts) # every distinct word that appears in the posts
trainMat = [] # set-of-words vectors of all posts, used as training samples
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

In [19]:
p1V

array([0.        , 0.05263158, 0.        , 0.05263158, 0.05263158,
       0.        , 0.        , 0.05263158, 0.05263158, 0.05263158,
       0.05263158, 0.05263158, 0.        , 0.        , 0.10526316,
       0.        , 0.        , 0.05263158, 0.05263158, 0.        ,
       0.        , 0.        , 0.        , 0.15789474, 0.        ,
       0.        , 0.        , 0.05263158, 0.10526316, 0.        ,
       0.        , 0.05263158])

## 测试算法：根据现实情况修改分类器

In [79]:
# 优化1：若某个概率值为0，会导致最后的乘积也为0；因此将分子（词计数）初始化为1，分母初始化为2
# 优化2：太多很小的数相乘会导致下溢出，通过对乘积取自然对数解决
# ln(a*b)=ln(a)+ln(b)，且f(x)与ln(f(x))同增同减，在相同点取到极值

import numpy as np
# 朴素贝叶斯分类器训练函数
def trainNB0(trainMatrix, trainCategory):
    """Estimate smoothed, log-scale naive Bayes parameters from word vectors.

    Args:
        trainMatrix: sequence of equal-length word vectors, one per document.
        trainCategory: label of each document; a label equal to 1 puts the
            document in class 1, any other label in class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)`` where ``pXVect[j]`` is the natural log
        of (1 + occurrences of word j in class X) / (2 + total word count of
        class X), and ``pAbusive`` is the sum of the labels divided by the
        number of documents (the class-1 prior when labels are 0/1).
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(doc_total)

    # Start every word count at 1 and every class total at 2 so that no word
    # gets probability 0, which would zero out the whole product.
    # Index 0 accumulates class 0, index 1 accumulates class 1.
    word_totals = [np.ones(vocab_size), np.ones(vocab_size)]
    token_totals = [2, 2]
    for row_idx in range(doc_total):
        bucket = 1 if trainCategory[row_idx] == 1 else 0
        word_totals[bucket] += trainMatrix[row_idx]  # per-word counts
        token_totals[bucket] += sum(trainMatrix[row_idx])  # words seen in this class

    # Store log-probabilities: multiplying many small numbers underflows,
    # while adding their logs does not.
    p1Vect = np.log(word_totals[1] / token_totals[1])
    p0Vect = np.log(word_totals[0] / token_totals[0])
    return p0Vect, p1Vect, pAbusive

listOPosts, listClasses = loadDataSet() # load the sample posts and their labels
myVocabList = createVocabList(listOPosts) # every distinct word that appears in the posts
trainMat = [] # set-of-words vectors of all posts, used as training samples
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

In [25]:
# 朴素贝叶斯分类函数
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    """Pick the class with the larger log-posterior score for one document.

    Args:
        vec2Classify: word vector of the document (numpy array).
        p0Vect: per-word log-probabilities for class 0.
        p1Vect: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0.
    """
    # Working in log space turns the product of word probabilities into a
    # sum; the element-wise product keeps only the words present in the document.
    score_class1 = sum(vec2Classify * p1Vect) + np.log(pClass1)
    score_class0 = sum(vec2Classify * p0Vect) + np.log(1 - pClass1)
    return 1 if score_class1 > score_class0 else 0
    
def testingNB():
    """Train on the toy corpus and print the predicted class of two sample posts."""
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)
    # Every post becomes a set-of-words vector over the shared vocabulary.
    training_vectors = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(training_vectors, labels)

    samples = (['love', 'my', 'dalmation'], ['stupid', 'garbage'])
    for testEntry in samples:
        thisDoc = np.array(setOfWords2Vec(vocabulary, testEntry))
        print(testEntry, "classified as : ", classifyNB(thisDoc, p0V, p1V, pAb))

# Run the toy-corpus demo; it prints the predicted class of each sample post.
testingNB()

['love', 'my', 'dalmation'] classified as :  0
['stupid', 'garbage'] classified as :  1


## 准备数据：文档词袋模型

上述将每个词是否出现作为特征，可以描述为词集模型  
词袋模型则记录每个词在文档中出现的次数（一个词可能在文档中出现不止一次）

In [27]:
# 朴素贝叶斯词袋模型
def bagOfWords2Vec(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: sequence of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints giving how many times each
        vocabulary word occurs in the document. Words missing from the
        vocabulary are silently ignored.
    """
    # Build the word -> position map once, so each lookup is O(1) instead of
    # scanning the vocabulary twice per word (`in` followed by `.index`).
    # setdefault keeps the first position, matching list.index().
    position_of = {}
    for position, term in enumerate(vocabList):
        position_of.setdefault(term, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = position_of.get(word)
        if position is not None:
            returnVec[position] += 1  # count every occurrence
    return returnVec

# 示例：使用朴素贝叶斯过滤垃圾邮件

## 测试算法：使用朴素贝叶斯进行交叉验证

In [102]:
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of the lower-cased pieces obtained by splitting on runs of
        non-word characters, keeping only pieces with more than 2 characters.
    """
    import re
    pieces = re.split(r'\W+', bigString)
    # Short pieces (including the empty strings split() can produce at the
    # edges) are dropped before lower-casing.
    long_enough = filter(lambda piece: len(piece) > 2, pieces)
    return [piece.lower() for piece in long_enough]

def spamTest():
    """Hold-out test of the naive Bayes spam filter on the ./email corpus.

    Reads the numbered spam and ham files, holds out 10 randomly chosen
    documents for testing, trains on the rest using set-of-words vectors and
    prints every misclassified document followed by the error rate. The
    split is random, so the printed error rate varies between runs.
    """
    # Load the samples. Files 6, 17 and 23 are skipped; the original note
    # only mentions a problem with file 6, the reason for the other two is
    # not recorded.
    docList = []
    classList = []
    for i in [j for j in range(1, 26) if j not in [6, 17, 23]]:
        with open(r'./email/spam/%d.txt' % i) as file1:
            docList.append(textParse(file1.read()))
        classList.append(1)  # spam is class 1
        with open(r'./email/ham/%d.txt' % i) as file0:
            docList.append(textParse(file0.read()))
        classList.append(0)  # ham is class 0
    vocabList = createVocabList(docList)  # union of all words

    # Random hold-out split. The index range is derived from the number of
    # documents actually loaded rather than a hard-coded count, so it stays
    # correct if the list of skipped files changes.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Train the model on the remaining documents.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # Evaluate on the held-out documents.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print('classification error', docList[docIndex])
    print("the error rate is :", errorCount/len(testSet))
    
# Run the hold-out spam test once; the split is random, so the error rate
# varies between runs.
spamTest()

classification error ['experience', 'with', 'biggerpenis', 'today', 'grow', 'inches', 'more', 'the', 'safest', 'most', 'effective', 'methods', 'of_penisen1argement', 'save', 'your', 'time', 'and', 'money', 'bettererections', 'with', 'effective', 'ma1eenhancement', 'products', 'ma1eenhancement', 'supplement', 'trusted', 'millions', 'buy', 'today']
the error rate is : 0.1


# 示例：使用朴素贝叶斯分类器从个人广告中获取区域倾向

## 收集数据：导入RSS源

In [111]:
# RSS源分类器及高频词去除函数
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words with their counts.

    Args:
        vocabList: iterable of vocabulary words (hashable).
        fullText: iterable of all tokens from all documents.
        topN: how many entries to return; defaults to 30.

    Returns:
        A list of at most ``topN`` ``(word, count)`` tuples sorted by
        descending count. Words with equal counts keep their ``vocabList``
        order.
    """
    import operator
    from collections import Counter

    # Count every token in one pass instead of calling list.count() once
    # per vocabulary word.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    # dict.items() replaces the Python-2-only iteritems(), which raises
    # AttributeError on Python 3.
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]

# 直接复制
def localWords(feed1, feed0):
    """Train and test a naive Bayes classifier that tells two RSS feeds apart.

    Args:
        feed1: parsed feed whose entries are labelled class 1 (NY); must
            support ``feed['entries'][i]['summary']``.
        feed0: parsed feed whose entries are labelled class 0, same layout.

    Returns:
        ``(vocabList, p0V, p1V)``: the vocabulary with the most frequent
        words removed, and the per-word log-probability vectors of class 0
        and class 1 returned by ``trainNB0``.

    The same number of entries is taken from each feed. 20 randomly chosen
    documents are held out for testing, so the feeds must provide more than
    20 documents in total. The error rate on the held-out documents is
    printed.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    # Read the samples, alternating one class-1 and one class-0 entry.
    for i in range(minLen):
        for label, feed in ((1, feed1), (0, feed0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)
    # Drop the most frequent words. calcMostFreq yields (word, count) pairs,
    # so the word itself (not the pair) has to be matched against the vocabulary.
    frequentWords = {pairW[0] for pairW in calcMostFreq(vocabList, fullText)}
    vocabList = [word for word in vocabList if word not in frequentWords]

    # Random hold-out split; a real list is needed because entries are deleted.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(20):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Train on the remaining documents using bag-of-words vectors.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # Classify the held-out documents.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ',float(errorCount)/len(testSet))
    return vocabList, p0V, p1V

def getTopWords(ny,sf):
    """Print the most indicative words of each feed, most probable first.

    Args:
        ny: parsed feed passed to ``localWords`` as class 1.
        sf: parsed feed passed to ``localWords`` as class 0.

    Trains via ``localWords`` and, for each class, prints a banner followed
    by every vocabulary word whose log-probability exceeds -6.0, ordered by
    descending log-probability.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    cutoff = -6.0

    def _ranked(log_probs):
        # Keep (word, log-probability) pairs above the cutoff, best first.
        kept = [(vocabList[idx], log_probs[idx])
                for idx in range(len(p0V)) if log_probs[idx] > cutoff]
        kept.sort(key=lambda pair: pair[1], reverse=True)
        return kept

    sections = (
        ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**", p0V),
        ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**", p1V),
    )
    for banner, log_probs in sections:
        ranked = _ranked(log_probs)
        print(banner)
        for word, _ in ranked:
            print(word)