In [2]:
import numpy as np
import re

In [3]:
def load_data_set():
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']
    ]
    class_vec = [0, 1, 0, 1, 0, 1]  # 1 代表侮辱性文字；0代表正常言论
    return posting_list, class_vec


def create_vocab_list(data_set):
    """
    返回不重复词的列表
    """
    vocab_set = set([])
    for document in data_set:
        vocab_set = vocab_set | set(document)  # 求并集
    return list(vocab_set)


def set_of_words2vec(vocab_list, input_set):
    """
    将文档转换为词向量
    """
    return_vec = [0]*len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return return_vec

list_posts, list_classes = load_data_set()
my_vocab_list = create_vocab_list(list_posts)

In [4]:
set_of_words2vec(my_vocab_list, list_posts[0])

[1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0]

In [5]:
"""
求某个文档d是某个类别c的概率：
p(c|d) = p(d|c)p(c) / p(d)

文档d可以表示为词向量(w1,w2,...,wn)，p(d|c) = p(w1,w2,...,wn|c)，
朴素贝叶斯假设所有词均独立，即p(d|c) = p(w1,w2,...,wn|c)=p(w1|c)p(w2|c)...p(wn|c)

根据训练样本可以求p(c)、p(w1|c)、p(w2|c)、p(wn|c)

"""

'\n求某个文档d是某个类别c的概率：\np(c|d) = p(d|c)p(c) / p(d)\n\n文档d可以表示为词向量(w1,w2,...,wn)，p(d|c) = p(w1,w2,...,wn|c)，\n朴素贝叶斯假设所有词均独立，即p(d|c) = p(w1,w2,...,wn|c)=p(w1|c)p(w2|c)...p(wn|c)\n\n根据训练样本可以求p(c)、p(w1|c)、p(w2|c)、p(wn|c)\n\n'

In [10]:
def train_nb0(train_matrix, train_category):
    """
    朴素贝叶斯分类器训练函数
    
    Parameters
        train_matrix：由词向量组成的list
        train_category：List，每个词向量的类别标签
        
    Return
        p0_vect：每个词属于类别0（正常言论）的概率
        p1_vect：每个词属于类别1（侮辱性言论）的概率
        p_abusive：文档属于侮辱性文档的概率
    """
    num_train_docs = len(train_matrix)  # 文档数量
    num_words = len(train_matrix[0])  # 词向量长度
    p_abusive = sum(train_category) / float(num_train_docs)  # 文档属于侮辱性文档的概率
    # p0_num = np.zeros(num_words)  # 每个词在正常言论中出现的次数
    # p1_num = np.zeros(num_words)  # 每个词在侮辱性言论中出现的次数
    p0_num = np.ones(num_words)  # 用ones代替zeros，降低概率为0导致乘积为0带来的影响
    p1_num = np.ones(num_words)
    # p0_denom = 0.0  # 正常言论中所有单词出现次数的总和
    # p1_denom = 0.0  # 侮辱性言中所有单词出现次数的总和
    p0_denom = 2.0  # 用2代替0
    p1_denom = 2.0

    for i in range(num_train_docs):
        if train_category[i] == 1:
            p1_num += train_matrix[i]
            p1_denom += sum(train_matrix[i])
        else:
            p0_num += train_matrix[i]
            p0_denom += sum(train_matrix[i])

    # p1_vect = p1_num / p1_denom
    # p0_vect = p0_num / p0_denom
    p1_vect = np.log(p1_num / p1_denom)  # 取对数。当概率很小时相乘会很小，甚至接近于0，取对数可缓解这种情况
    p0_vect = np.log(p0_num / p0_denom)

    return p0_vect, p1_vect, p_abusive

# 获得文档的词向量
train_mat = []
for post_in_doc in list_posts:
    train_mat.append(set_of_words2vec(my_vocab_list, post_in_doc))

# 求每个词属于哪个类别的概率
p0_v, p1_v, p_ab = train_nb0(train_mat, list_classes)

In [11]:
print(p_ab)
print(p0_v)
print(p1_v)
print(my_vocab_list)

0.5
[-1.87180218 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -2.56494936
 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -3.25809654 -2.56494936
 -2.56494936 -2.56494936 -2.15948425 -3.25809654 -3.25809654 -2.56494936
 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -3.25809654 -2.56494936 -2.56494936 -3.25809654 -3.25809654 -2.56494936
 -3.25809654 -2.56494936]
[-3.04452244 -3.04452244 -3.04452244 -2.35137526 -3.04452244 -3.04452244
 -3.04452244 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -3.04452244
 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -1.65822808 -1.94591015
 -2.35137526 -2.35137526 -3.04452244 -3.04452244 -1.94591015 -3.04452244
 -2.35137526 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -2.35137526
 -2.35137526 -3.04452244]
['my', 'mr', 'ate', 'posting', 'is', 'problems', 'dalmation', 'I', 'so', 'garbage', 'not', 'steak', 'licks', 'how', 'him', 'park', 'stupid', 'dog', 'buying', 'to', 'help', 'please', 'worthless', 'love', 'quit', 'ha

In [12]:
def classify_nb(vec2classify, p0_vec, p1_vec, p_class1):
    """
    朴素贝叶斯分类函数
    
    Parameters
        vec2classify：需要分类的词向量
        p0_vec：每个词属于类别0（正常言论）的概率
        p1_vect：每个词属于类别1（侮辱性言论）的概率
        p_class1：文档属于侮辱性文档的概率
    """
    p1 = sum(vec2classify * p1_vec) + log(p_class1)  # log(ab) = log(a) + log(b)
    p0 = sum(vec2classify * p0_vec) + log(1.0 - p_class1)

    if p1 > p0:
        return 1
    else:
        return 0

test_list = [
    ['love', 'my', 'dalmation'],
    ['stupid', 'garbage']
]

for test_doc in test_list:
    test_class = classify_nb(set_of_words2vec(my_vocab_list, test_doc), p0_v, p1_v, p_ab)
    print("%s classified as : %s" % (test_doc, test_class))

['love', 'my', 'dalmation'] classified as : 0
['stupid', 'garbage'] classified as : 1


In [20]:
def text_parse(big_string):
    list_of_tokens = re.split(r'\W*', big_string)
    return [tok.lower() for tok in list_of_tokens if len(tok) > 2]

def spam_test():
    doc_list = []; class_list = []; full_text = []
    for i in range(1,26):
        word_list = text_parse(open('data/email/spam/%s.txt' % i,encoding='utf-8').read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        word_list = text_parse(open('data/email/ham/%d.txt' % i,encoding='utf-8').read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    
    vocab_list = create_vocab_list(doc_list)
    training_set = list(range(50)); test_set = []
    for i in range(10):
        rand_index = int(np.random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del training_set[rand_index]

    train_mat = []; train_classes = []
    for doc_index in training_set:
        train_mat.append(set_of_words2vec(vocab_list, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])

    p0_v, p1_v, p_spam = train_nb0(np.array(train_mat), np.array(train_classes))

    error_count = 0

    for doc_index in test_set:
        word_vector = set_of_words2vec(vocab_list, doc_list[doc_index])
        if classify_nb(np.array(word_vector), p0_v, p1_v, p_spam) != class_list[doc_index]:
            error_count += 1
    print("the error rate is: ", float(error_count) / len(test_set))

spam_test()

the error rate is:  0.0


  return _compile(pattern, flags).split(string, maxsplit)
