# 使用朴素贝叶斯过滤垃圾邮件

**说明:**

将 `email` 文件夹放在当前目录下。

In [28]:
import numpy as np
import pandas as pd
import re

In [37]:
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet.union(set(document))
    return list(vocabSet)

def setOfWord2Vec(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:print('no such word')
    return returnVec
    

In [8]:
# 计算单个类别的词频率
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2
    p1Denom = 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)
    p0Vect = np.log(p0Num/p0Denom)
    return p0Vect,p1Vect,pAbusive

In [65]:
# 朴素贝叶斯分类函数

def classifyNB(vec,p0Vec,p1Vec,pClass):
    p1 = sum(vec*p1Vec)+np.log(pClass)
    p0 = sum(vec*p0Vec)+np.log(1-pClass)
    return 1 if p1>p0 else 0

In [10]:
def testingNB():
    return

In [11]:
# 朴素贝叶斯词袋模型

def bagOfWord2Vec(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

In [None]:
# 文件解析

def textParse(bigString):
    import re 
    listOfTokens = re.split(r'\W*',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok)>2]

def spamTest():
    

In [63]:
#贝叶斯算法实例：过滤垃圾邮件

#处理数据长字符串
#1 对长字符串进行分割，分隔符为除单词和数字之外的任意符号串
#2 将分割后的字符串中所有的大些字母变成小写lower(),并且只
#保留单词长度大于3的单词
def testParse(bigString):
    import re
    listOfTokens=re.split(r'\W',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok)>2]

def spamTest():
    #新建三个列表
    docList=[];classList=[];fullTest=[]
    #i 由1到26
    for i in range(1,26):
        #打开并读取指定目录下的本文中的长字符串，并进行处理返回
        wordList=testParse(open('email/spam/%d.txt' %i).read())
        #将得到的字符串列表添加到docList
        docList.append(wordList)
        #将字符串列表中的元素添加到fullTest
        fullTest.extend(wordList)
        #类列表添加标签1
        classList.append(1)
        #打开并取得另外一个类别为0的文件，然后进行处理
        wordList=testParse(open('email/ham/%d.txt' %i).read())
        docList.append(wordList)
        fullTest.extend(wordList)
        classList.append(0)
   
    #将所有邮件中出现的字符串构建成字符串列表
    vocabList=createVocabList(docList)
    #构建一个大小为50的整数列表和一个空列表
    trainingSet=list(range(50));testSet=[]
    #随机选取1~50中的10个数，作为索引，构建测试集
    for i in range(10):
        #随机选取1~50中的一个整型数
        randIndex=int(np.random.uniform(0,len(trainingSet)))
        #将选出的数的列表索引值添加到testSet列表中
        testSet.append(trainingSet[randIndex])
        #从整数列表中删除选出的数，防止下次再次选出
        #同时将剩下的作为训练集
        del(trainingSet[randIndex])
    #新建两个列表
    trainMat=[];trainClasses=[]
    #遍历训练集中的每个字符串列表
    for docIndex in trainingSet:
        #将字符串列表转为词条向量，然后添加到训练矩阵中
        trainMat.append(setOfWord2Vec(vocabList,fullTest[docIndex]))
        #将该邮件的类标签存入训练类标签列表中
        trainClasses.append(classList[docIndex])
    #计算贝叶斯函数需要的概率值并返回
    p0V,p1V,pSpam=trainNB0(np.array(trainMat),np.array(trainClasses))
    errorCount=0
    #遍历测试集中的字符串列表
    for docIndex in testSet:
        #同样将测试集中的字符串列表转为词条向量
        wordVector=setOfWord2Vec(vocabList,docList[docIndex])
        #对测试集中字符串向量进行预测分类，分类结果不等于实际结果
        if classifyNB(np.array(wordVector),p0V,p1V,pSpam)!=classList[docIndex]:
            errorCount+=1
        print('the error rate is:',float(errorCount)/len(testSet))

In [66]:
spamTest()

no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word
no such word

  app.launch_new_instance()


In [23]:
open('email/ham/%d.txt' %5).read()

'There was a guy at the gas station who told me that if I knew Mandarin\nand Python I could get a job with the FBI.'

In [53]:
a = 'Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY'

In [56]:
re.split(r'\W',a)

['Incredib1e',
 'gains',
 'in',
 'length',
 'of',
 '3',
 '4',
 'inches',
 'to',
 'yourPenis',
 '',
 'PERMANANTLY']