<b>朴素贝叶斯</b><br/>
<b>优点：</b>在数据较少的情况下仍然有效，可以处理多类别问题。<br/>
<b>缺点：</b>对于输入数据的准备方式较为敏感。<br/>
<b>适用数据类型：</b>标称型数据。<br/>

<b>示例1：文本分类  侮辱性文字和正常言论</b>

In [1]:
from numpy import *

In [2]:
#词表到向量的转换函数
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec

def createVocabList(dataSet):
    vocabSet = set([])  #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document) #union of the two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print ("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [3]:
dataSet, classVec = loadDataSet()

In [4]:
vocabSet = createVocabList(dataSet)

In [5]:
print(vocabSet)

['take', 'buying', 'food', 'has', 'to', 'so', 'quit', 'how', 'love', 'please', 'worthless', 'stop', 'posting', 'steak', 'help', 'is', 'ate', 'dalmation', 'garbage', 'not', 'licks', 'him', 'stupid', 'I', 'maybe', 'problems', 'dog', 'flea', 'park', 'cute', 'mr', 'my']


In [6]:
Vec = setOfWords2Vec(vocabList=vocabSet, inputSet=classVec)

the word: 0 is not in my Vocabulary!
the word: 1 is not in my Vocabulary!
the word: 0 is not in my Vocabulary!
the word: 1 is not in my Vocabulary!
the word: 0 is not in my Vocabulary!
the word: 1 is not in my Vocabulary!


In [7]:
len(vocabSet),len(Vec)

(32, 32)

In [8]:
print(Vec)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [9]:
#朴素贝叶斯分类器训练函数   训练文档矩阵、训练类别标签向量
# p0Vect - 侮辱类的条件概率数组    p1Vect - 非侮辱类的条件概率数组   pAbusive - 文档属于侮辱类的概率
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    #若其中一个概率值为0，那么最后的乘积也为0.为降低此种影响，一般会采用初始化分子分母不等于0
    #在此函数当中，分子初始化为1，分母初始化为2
    p0Num = ones(numWords); p1Num = ones(numWords)      #change to ones() 
    p0Denom = 2.0; p1Denom = 2.0                        
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    #当概率非常小的时候，相乘会造成下溢出，也就是相乘之后的结果约等于0（四舍五入）
    #解决此种情况，一般会采用对数
    p1Vect = log(p1Num/p1Denom)          #change to log()
    p0Vect = log(p0Num/p0Denom)          #change to log()
    return p0Vect,p1Vect,pAbusive

In [14]:
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print('myVocabList:\n', myVocabList)
trainMat = []
for postinDoc in postingList:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

myVocabList:
 ['take', 'buying', 'food', 'has', 'to', 'so', 'quit', 'how', 'love', 'please', 'worthless', 'stop', 'posting', 'steak', 'help', 'is', 'ate', 'dalmation', 'garbage', 'not', 'licks', 'him', 'stupid', 'I', 'maybe', 'problems', 'dog', 'flea', 'park', 'cute', 'mr', 'my']


In [22]:
print(trainMat)
print(listOPosts)

[[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1], [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], [0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]]
[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]


In [17]:
p0V,p1V ,pAb = trainNB0(trainMat,listClasses)

In [18]:
print(p0V)

[-3.25809654 -3.25809654 -3.25809654 -2.56494936 -2.56494936 -2.56494936
 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -2.56494936
 -3.25809654 -3.25809654 -2.56494936 -2.15948425 -3.25809654 -2.56494936
 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -2.56494936 -1.87180218]


In [19]:
print(p1V)

[-2.35137526 -2.35137526 -2.35137526 -3.04452244 -2.35137526 -3.04452244
 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -1.94591015 -2.35137526
 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -3.04452244 -3.04452244
 -2.35137526 -2.35137526 -3.04452244 -2.35137526 -1.65822808 -3.04452244
 -2.35137526 -3.04452244 -1.94591015 -3.04452244 -2.35137526 -3.04452244
 -3.04452244 -3.04452244]


In [20]:
print(pAb)

0.5


In [21]:
print(listClasses)

[0, 1, 0, 1, 0, 1]


In [24]:
print(setOfWords2Vec(myVocabList,listOPosts[0]))

[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1]


In [25]:
#朴素贝叶斯分类函数
""" vec2Classify - 待分类的词条数组
    p0Vec - 侮辱类的条件概率数组
    p1Vec -非侮辱类的条件概率数组
    pClass1 - 文档属于侮辱类的概率
"""   
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)    #element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1  #属于侮辱类
    else: 
        return 0  #属于非侮辱类

In [27]:
#测试朴素贝叶斯分类器
def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print (testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print (testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))

In [29]:
testEntry = ['love', 'my', 'dalmation']                                 #测试样本1
thisDoc = array(setOfWords2Vec(myVocabList, testEntry))              #测试样本向量化
if classifyNB(thisDoc,p0V,p1V,pAb):
    print(testEntry,'属于侮辱类')                                        #执行分类并打印分类结果
else:
    print(testEntry,'属于非侮辱类')                                       #执行分类并打印分类结果
testEntry = ['stupid', 'garbage']                                       #测试样本2

thisDoc = array(setOfWords2Vec(myVocabList, testEntry))              #测试样本向量化
if classifyNB(thisDoc,p0V,p1V,pAb):
    print(testEntry,'属于侮辱类')                                        #执行分类并打印分类结果
else:
    print(testEntry,'属于非侮辱类')

['love', 'my', 'dalmation'] 属于非侮辱类
['stupid', 'garbage'] 属于侮辱类


In [109]:
#采用scikit-learn
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split

In [110]:
X_train,X_test,y_train,y_test = train_test_split(trainMat,listClasses,test_size=0.2, random_state=0)

In [111]:
clf = MultinomialNB().fit(X_train,y_train)

In [112]:
clf.predict(X_test)

array([1, 0])

In [113]:
print(y_test)

[1, 0]


In [114]:
print(clf.score(X_test,y_test))

1.0


<b>文档词袋模型</b><br />
在上面我们将每个词的出现与否作为一个特征，这可以描述为词集模型（set-of-words model）。如果一个词在文档中出现不止一次，这可能意味着该词是否出现在文档中所不能表达的某种信息，这种方法被称为词袋模型（bag-of-words model）。在词袋中，每个单词可以出现多次，而在词集中，每个词只能出现一次。

In [30]:
#朴素贝叶斯词袋模型
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

<b>示例2：使用朴素贝叶斯过滤垃圾邮件</b>

In [31]:
#准备数据 切分文本
import re
def textParse(bigString):    #input is big string, #output is word list
    listOfTokens = re.split(r'\W*', bigString)  #匹配任何非单词字符。等价于 '[^A-Za-z0-9_]'  ‘*’匹配0个或多个的表达式
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

In [32]:
emailText = open('email/ham/6.txt').read()
listOfTokens = textParse(emailText)
print(listOfTokens)

['hello', 'since', 'you', 'are', 'owner', 'least', 'one', 'google', 'groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message', 'pages', 'files', 'are', 'writing', 'inform', 'you', 'that', 'will', 'longer', 'supporting', 'these', 'features', 'starting', 'february', '2011', 'made', 'this', 'decision', 'that', 'can', 'focus', 'improving', 'the', 'core', 'functionalities', 'google', 'groups', 'mailing', 'lists', 'and', 'forum', 'discussions', 'instead', 'these', 'features', 'encourage', 'you', 'use', 'products', 'that', 'are', 'designed', 'specifically', 'for', 'file', 'storage', 'and', 'page', 'creation', 'such', 'google', 'docs', 'and', 'google', 'sites', 'for', 'example', 'you', 'can', 'easily', 'create', 'your', 'pages', 'google', 'sites', 'and', 'share', 'the', 'site', 'http', 'www', 'google', 'com', 'support', 'sites', 'bin', 'answer', 'answer', '174623', 'with', 'the', 'members', 'your', 'group', 'you', 'can', 'also', 'store', 'your', 'files', 'the', 'site', 'atta

  return _compile(pattern, flags).split(string, maxsplit)


In [33]:
print(emailText)

Hello,

Since you are an owner of at least one Google Groups group that uses the customized welcome message, pages or files, we are writing to inform you that we will no longer be supporting these features starting February 2011. We made this decision so that we can focus on improving the core functionalities of Google Groups -- mailing lists and forum discussions.  Instead of these features, we encourage you to use products that are designed specifically for file storage and page creation, such as Google Docs and Google Sites.

For example, you can easily create your pages on Google Sites and share the site (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=174623) with the members of your group. You can also store your files on the site by attaching files to pages (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=90563) on the site. If you抮e just looking for a place to upload your files so that your group members can download them, we suggest you try Google

In [59]:
#对贝叶斯垃圾邮件分类器进行自动化处理
#将文件解析为词列表，构建测试集和训练集，分类
def spamTest():
    docList=[]; classList = []; fullText =[]
    for i in list(range(1,26)):
        wordList = textParse(open('email/spam/%d.txt' % i,errors="ignore").read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i,errors="ignore").read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)#create vocabulary
    trainingSet = list(range(50)); testSet=[]           #create test set
    for i in list(range(10)):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
            print ("classification error",docList[docIndex])
    print ('the error rate is: ',float(errorCount)/len(testSet))
    #return vocabList,fullText

In [64]:
spamTest()  #因为是随机选择训练集，所以每次执行的结果会是不一样的

classification error ['scifinance', 'now', 'automatically', 'generates', 'gpu', 'enabled', 'pricing', 'risk', 'model', 'source', 'code', 'that', 'runs', '300x', 'faster', 'than', 'serial', 'code', 'using', 'new', 'nvidia', 'fermi', 'class', 'tesla', 'series', 'gpu', 'scifinance', 'derivatives', 'pricing', 'and', 'risk', 'model', 'development', 'tool', 'that', 'automatically', 'generates', 'and', 'gpu', 'enabled', 'source', 'code', 'from', 'concise', 'high', 'level', 'model', 'specifications', 'parallel', 'computing', 'cuda', 'programming', 'expertise', 'required', 'scifinance', 'automatic', 'gpu', 'enabled', 'monte', 'carlo', 'pricing', 'model', 'source', 'code', 'generation', 'capabilities', 'have', 'been', 'significantly', 'extended', 'the', 'latest', 'release', 'this', 'includes']
the error rate is:  0.1


  return _compile(pattern, flags).split(string, maxsplit)


In [66]:
spamTest()

the error rate is:  0.0


  return _compile(pattern, flags).split(string, maxsplit)


<b>示例3：使用朴素贝叶斯分类器从个人广告中获取区域倾向</b>

In [68]:
#收集数据 ：导入RSS数据源，用feedparser,采用Craigslist上的个人广告

In [85]:
import feedparser
import operator

In [77]:
ny = feedparser.parse('https://www.nasa.gov/rss/dyn/image_of_the_day.rss')
#https://www.nasa.gov/rss/dyn/image_of_the_day.rss
#https://sports.yahoo.com/nba/teams/hou/rss.xml

In [78]:
type(ny)

feedparser.FeedParserDict

In [79]:
ny.keys()

dict_keys(['feed', 'entries', 'status', 'bozo', 'headers', 'version', 'namespaces', 'href', 'encoding'])

In [81]:
len(ny['entries'])

60

In [84]:
print(ny['entries'][0]['summary'])

On July 24, 1969, the Apollo 11 crew splashed down in the Pacific Ocean.


In [103]:
def calcMostFreq(vocabList,fullText):
    #计算出现的频率
    freqDict = {}
    for token in vocabList:
        freqDict[token]=fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True) 
    return sortedFreq[:30] 

def localWords(feed1,feed0):
    #RSS源分类器
    docList=[]; classList = []; fullText =[]
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in list(range(minLen)):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1) #NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)#create vocabulary
    top30Words = calcMostFreq(vocabList,fullText)   #remove top 30 words
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = list(range(2*minLen)); testSet=[]           #create test set
    for i in list(range(20)):
        randIndex = int(random.uniform(0,len(trainingSet)))
        if randIndex >= minLen:
            break
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print ('the error rate is: ',float(errorCount),len(testSet))
    return vocabList,p0V,p1V

In [86]:
sf = feedparser.parse('https://sports.yahoo.com/nba/teams/hou/rss.xml')

In [98]:
print(sf.keys()),len(sf['entries']),len(ny['entries'])

dict_keys(['etag', 'feed', 'entries', 'status', 'bozo', 'headers', 'version', 'namespaces', 'href', 'encoding'])


(None, 3, 60)

In [108]:
vocabList, pSF, pNY = localWords(ny,sf)

the error rate is:  0.0 0


  return _compile(pattern, flags).split(string, maxsplit)


In [116]:
#分析数据：显示地域相关的用词
def getTopWords(ny,sf):
    import operator
    vocabList,p0V,p1V=localWords(ny,sf)
    topNY=[]; topSF=[]
    for i in range(len(p0V)):
        if p0V[i] > -6.0 : topSF.append((vocabList[i],p0V[i]))
        if p1V[i] > -6.0 : topNY.append((vocabList[i],p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print (item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print (item[0])

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-116-694885892861>, line 12)