In [12]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import numpy as np
import pandas as pd
posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
        ['i', 'want', 'to', 'hear', 'stories', 'about', 'your', 'dogs'],
        ['you', 'shut', 'up', 'you', 'idiot']
    ]
class_vec = [0, 1, 0, 1, 0, 1, 0, 1]

test_set = [
    ['love', 'my', 'dalmation'],
    ['stupid', 'garbage'],
    ['stop', 'posting', 'about', 'your', 'idiot', 'dog'],
    # ['some', 'text', 'not', 'important'],
]

### 计数 + 多项式朴素贝叶斯

In [21]:
def createVocabList(data_set):
    vocab_set = set([])
    for document in data_set:
        vocab_set = vocab_set | set(document)
    return list(vocab_set)

In [22]:
from collections import Counter
def setOfWords2Count(vocab_list, input_set):
    cnt = Counter(input_set)
    return [cnt[x] for x in vocab_list]

In [23]:
vocab_list = createVocabList(posting_list)
vocab_cnt = list(map(lambda post: setOfWords2Count(vocab_list, post), posting_list))
X = np.array(vocab_cnt)
Y = np.array(class_vec)
df = pd.DataFrame(X, columns=vocab_list, index=[Y, [' '.join(post) for post in posting_list]]).sort_index()
# df['is_offensive'] = Y
df

Unnamed: 0,Unnamed: 1,please,you,I,has,to,hear,ate,your,dalmation,not,...,garbage,dog,love,cute,food,i,posting,licks,park,problems
0,i want to hear stories about your dogs,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
0,mr licks ate my steak how to stop him,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
0,my dalmation is so cute I love him,0,0,1,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
0,my dog has flea problems help please,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,maybe not take him to dog park stupid,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
1,quit buying worthless dog food stupid,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,stop posting stupid worthless garbage,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,you shut up you idiot,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
print('class 0 word count:', df.loc[0,].sum().sum())
print('class 1 word count:', df.loc[1,].sum().sum())
alpha = 1.0
Pk0 = (df.loc[0,].sum() + alpha)/(df.loc[0,].sum().sum() + alpha * len(df.columns))
Pk1 = (df.loc[1,].sum() + alpha)/(df.loc[1,].sum().sum() + alpha * len(df.columns))

P = pd.DataFrame([Pk0, Pk1], index=['Pk0', 'Pk1'])
print(P.sum(axis=1))
P

class 0 word count: 32
class 1 word count: 24
Pk0    1.0
Pk1    1.0
dtype: float64


Unnamed: 0,please,you,I,has,to,hear,ate,your,dalmation,not,...,garbage,dog,love,cute,food,i,posting,licks,park,problems
Pk0,0.026667,0.013333,0.026667,0.026667,0.04,0.026667,0.026667,0.026667,0.026667,0.013333,...,0.013333,0.026667,0.026667,0.026667,0.013333,0.026667,0.013333,0.026667,0.013333,0.026667
Pk1,0.014925,0.044776,0.014925,0.014925,0.029851,0.014925,0.014925,0.014925,0.014925,0.029851,...,0.029851,0.044776,0.014925,0.014925,0.029851,0.014925,0.029851,0.014925,0.029851,0.014925


In [25]:
clf = MultinomialNB()
clf.fit(X, Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
test = np.array(list(map(lambda post: setOfWords2Count(vocab_list, post), test_set)))
clf.predict(test)

array([0, 1, 1])

### 词向量 + 伯努利朴素贝叶斯

In [27]:
def setOfWords2Vec(vocab_list, input_set):
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        assert word in vocab_list, "the word: %s is not in my Vocabulary!" % (word,)
        return_vec[vocab_list.index(word)] = 1
    return return_vec

In [28]:
vocab_list = createVocabList(posting_list)
vocab_vec = list(map(lambda post: setOfWords2Vec(vocab_list, post), posting_list))
X = np.array(vocab_vec)
Y = np.array(class_vec)
df = pd.DataFrame(X, columns=vocab_list, index=[Y, [' '.join(post) for post in posting_list]]).sort_index()
df

Unnamed: 0,Unnamed: 1,please,you,I,has,to,hear,ate,your,dalmation,not,...,garbage,dog,love,cute,food,i,posting,licks,park,problems
0,i want to hear stories about your dogs,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
0,mr licks ate my steak how to stop him,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
0,my dalmation is so cute I love him,0,0,1,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
0,my dog has flea problems help please,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,maybe not take him to dog park stupid,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
1,quit buying worthless dog food stupid,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,stop posting stupid worthless garbage,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,you shut up you idiot,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
alpha = 1
Pk0 = (df.loc[0,].sum() + alpha)/(len(df.loc[0,]) + alpha * 2)
Pk1 = (df.loc[1,].sum() + alpha)/(len(df.loc[1,]) + alpha * 2)
pd.DataFrame([Pk0, Pk1])

Unnamed: 0,please,you,I,has,to,hear,ate,your,dalmation,not,...,garbage,dog,love,cute,food,i,posting,licks,park,problems
0,0.333333,0.166667,0.333333,0.333333,0.5,0.333333,0.333333,0.333333,0.333333,0.166667,...,0.166667,0.333333,0.333333,0.333333,0.166667,0.333333,0.166667,0.333333,0.166667,0.333333
1,0.166667,0.333333,0.166667,0.166667,0.333333,0.166667,0.166667,0.166667,0.166667,0.333333,...,0.333333,0.5,0.166667,0.166667,0.333333,0.166667,0.333333,0.166667,0.333333,0.166667


In [33]:
clf = BernoulliNB()
clf.fit(X, Y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [34]:
test = np.array(list(map(lambda post: setOfWords2Vec(vocab_list, post), test_set)))
clf.predict(test)

array([0, 1, 1])