In [36]:
import numpy as np

Define our training set documents for our Naive Bayes classifier.

**docs** contains the documents. **labels** are the classes - True=Pro Government, False=Anti Government

**unknowns** are the documents we want to classify

In [37]:
docs = ["surplus good economy jobs", 
        "good government listens",
        "best budget investments",
        "corrupt highest levels",
        "resign crooks",
        "government good jobs friends"]

labels=[True, True, True, False, False, False]

unknowns = ["This corrupt corrupt government should resign", "Good economy is generating new jobs and a budget surplus"]

Tokenise the document corpus...

In [38]:
words = [docs[i].split() for i in range(0, len(docs))]
words

[['surplus', 'good', 'economy', 'jobs'],
 ['good', 'government', 'listens'],
 ['best', 'budget', 'investments'],
 ['corrupt', 'highest', 'levels'],
 ['resign', 'crooks'],
 ['government', 'good', 'jobs', 'friends']]

... and generate a Vocabulary vector and also calculate word counts split by class.

In [39]:
V = []
VCntPro=[]
VCntAnti=[]

for d in range(0, len(words)):
    for w in range(0, len(words[d])):
        word = words[d][w]
        if word in V:
            ndx = V.index(word)
            if labels[d]:
                VCntPro[ndx] = VCntPro[ndx]+1
            else:
                VCntAnti[ndx] = VCntAnti[ndx]+1
        else:
            V.append(word)
            if labels[d]:
                VCntPro.append(1)
                VCntAnti.append(0)
            else:
                VCntAnti.append(1)
                VCntPro.append(0)
        
        
print "Vocabulary:", V
print "Counts(Pro Government):", VCntPro
print "Counts(Anti Government):", VCntAnti

Vocabulary: ['surplus', 'good', 'economy', 'jobs', 'government', 'listens', 'best', 'budget', 'investments', 'corrupt', 'highest', 'levels', 'resign', 'crooks', 'friends']
Counts(Pro Government): [1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
Counts(Anti Government): [0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]


Calculate term frequencies per corpus document

In [40]:
tf = np.zeros([len(words), len(V)])

for d in range(0, len(words)):
    for w in range(0, len(words[d])):
        word = words[d][w]
        ndx = V.index(word)
        tf[d, ndx] = tf[d, ndx]+1
        
tf

array([[ 1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,
         0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
         1.,  0.],
       [ 0.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.]])

Calculate probabilities for all the terms

In [41]:
PPro = [float(VCntPro[i]+1)/(sum(VCntPro)+len(V)) for i in range(0,len(V))]
PAnti = [float(VCntAnti[i]+1)/(sum(VCntAnti)+len(V)) for i in range(0,len(V))]

print "Term probability(Pro Government):", PPro
print "Term probability(Anti Government):", PAnti

Term probability(Pro Government): [0.08, 0.12, 0.08, 0.08, 0.08, 0.08, 0.08, 0.08, 0.08, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04]
Term probability(Anti Government): [0.041666666666666664, 0.08333333333333333, 0.041666666666666664, 0.08333333333333333, 0.08333333333333333, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.041666666666666664, 0.08333333333333333, 0.08333333333333333, 0.08333333333333333, 0.08333333333333333, 0.08333333333333333, 0.08333333333333333]


Calculate global class probablities

In [42]:
sumPro = sum(labels)
sumAnti = len(labels)-sum(labels)

PProG = float(sumPro)/(sumPro+sumAnti)
PAntiG = float(sumAnti)/(sumPro+sumAnti)

print PProG
print PAntiG

0.5
0.5


Attempt to classify our unknown documents...first generate term vector.

In [43]:
unknownWords = [unknowns[i].split() for i in range(0, len(unknowns))]

termVecs = np.zeros([len(unknownWords), len(V)])
for d in range(0, len(unknownWords)):
    for w in range(0, len(unknownWords[d])):
        word = unknownWords[d][w]
        if word in V:
            print word, "*" 
            ndx = V.index(word)
            termVecs[d,ndx]+=1
        else:
            print word


print unknownWords
print termVecs

This
corrupt *
corrupt *
government *
should
resign *
Good
economy *
is
generating
new
jobs *
and
a
budget *
surplus *
[['This', 'corrupt', 'corrupt', 'government', 'should', 'resign'], ['Good', 'economy', 'is', 'generating', 'new', 'jobs', 'and', 'a', 'budget', 'surplus']]
[[ 0.  0.  0.  0.  1.  0.  0.  0.  0.  2.  0.  0.  1.  0.  0.]
 [ 1.  0.  1.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]]


Calculate probabilities using Bayes' algorithm for both Pro and Anti classes for the unknowns and compare... 

In [44]:
predictions = np.zeros([len(unknowns)])
for d in range(0, len(termVecs)):
    PProW = float(PProG)
    PAntiW = float(PAntiG)
    for ndx in range(0, len(termVecs[d])):
        if termVecs[d,ndx]>0:
            PProW = PProW * termVecs[d,ndx]*PPro[ndx]
            PAntiW = PAntiW * termVecs[d,ndx]*PAnti[ndx]
        #print V[ndx], "pro=", PProW, ", Anti=", PAntiW
    predictions[d] = PProW>PAntiW
    print "P(Pro)=", PProW, "P(Anti)=", PAntiW, " => ", PProW>PAntiW
    
print "Predictions=",predictions

P(Pro)= 0.000128 P(Anti)= 0.000578703703704  =>  False
P(Pro)= 2.048e-05 P(Anti)= 3.01408179012e-06  =>  True
Predictions= [ 0.  1.]


In [46]:
predictions = np.zeros([len(unknowns)])
for d in range(0, len(unknownWords)):
    PProW = float(PProG)
    PAntiW = float(PAntiG)
    for w in range(0, len(unknownWords[d])):
        word = unknownWords[d][w]
        if word in V:
            print word, "*" 
            ndx = V.index(word)
            PProW = PProW * PPro[ndx]
            PAntiW = PAntiW * PAnti[ndx]
        else:
            print word
    predictions[d] = PProW>PAntiW
    print "P(Pro)=", PProW, "P(Anti)=", PAntiW, " => ", PProW>PAntiW
    
print "Predictions=",predictions

This
corrupt *
corrupt *
government *
should
resign *
P(Pro)= 2.56e-06 P(Anti)= 2.4112654321e-05  =>  False
Good
economy *
is
generating
new
jobs *
and
a
budget *
surplus *
P(Pro)= 2.048e-05 P(Anti)= 3.01408179012e-06  =>  True
Predictions= [ 0.  1.]


In [54]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(tf, labels)

print clf.predict(termVecs)
print clf.predict_proba(termVecs)
print clf.predict_log_proba(termVecs)

[False  True]
[[ 0.90402155  0.09597845]
 [ 0.12829111  0.87170889]]
[[-0.10090208 -2.3436316 ]
 [-2.05345331 -0.13729975]]
