In [1]:
def load_dataset():
    """词表到向量的转换函数"""
    posting_list=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    class_vec = [0,1,0,1,0,1]    # 1代表侮辱性文字，0代表正常言论
    return posting_list, class_vec

In [2]:
def create_vocab_list(dataset):
    """创建单词去重列表"""
    vocab_set = set()
    for document in dataset:
        vocab_set = vocab_set | set(document)
    return list(vocab_set)

In [3]:
def set_of_words_vec(vocab_list, input_set):
    """获得单词向量"""
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] = 1
        else:
            print("the word %s is not in my Vocabulary" % word)
    return return_vec

In [4]:
list_of_posts, list_classes = load_dataset()
my_vocablist = create_vocab_list(list_of_posts)
print(my_vocablist)

['park', 'love', 'is', 'dog', 'worthless', 'ate', 'please', 'so', 'stupid', 'posting', 'mr', 'maybe', 'my', 'buying', 'has', 'dalmation', 'help', 'I', 'stop', 'cute', 'how', 'quit', 'food', 'garbage', 'flea', 'take', 'not', 'steak', 'to', 'him', 'licks', 'problems']


In [5]:
print(set_of_words_vec(my_vocablist, list_of_posts[0]))
print(set_of_words_vec(my_vocablist, list_of_posts[3]))

[0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]


In [6]:
import numpy as np 

def train_NB0(train_matrix, train_category):
    """朴素贝叶斯分类器训练函数"""
    num_train_docs = len(train_matrix)
    num_words = len(train_matrix[0])    # 词汇表所含单词数
    p_abusive = sum(train_category) / float(num_train_docs) # 是侮辱性文档的概率
    p0_num = np.ones(num_words)
    p1_num = np.ones(num_words) # 这里不初始化np.zero()是考虑在朴素贝叶斯中计算乘积一个为0导致结果为0
    p0_denom = 2.0  # 类似，这里不初始化为0
    p1_denom = 2.0  # 初始化概率

    for i in range(num_train_docs):
        if train_category[i] == 1:
            p1_num += train_matrix[i]
            p1_denom += sum(train_matrix[i])    # 向量相加
        else:
            p0_num += train_matrix[i]
            p0_denom += sum(train_matrix[i])    # 对于两类文档，分别加上某个文档词汇表中出现的词的计数
    p1_vect = np.log(p1_num / p1_denom)     # 使用log防止下溢
    p0_vect = np.log(p0_num / p0_denom)
    return p0_vect, p1_vect, p_abusive

In [7]:
list_of_posts, list_classes = load_dataset()

In [8]:
my_vocablist = create_vocab_list(list_of_posts)

In [9]:
train_mat = [] 
for postin_doc in list_of_posts:
    train_mat.append(set_of_words_vec(my_vocablist, postin_doc))

In [13]:
print(train_mat)

[[0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0], [0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [14]:
p0_v, p1_v, p_ab = train_NB0(train_mat, list_classes)

In [15]:
p_ab

0.5

In [16]:
p0_v

array([-3.25809654, -2.56494936, -2.56494936, -2.56494936, -3.25809654,
       -2.56494936, -2.56494936, -2.56494936, -3.25809654, -3.25809654,
       -2.56494936, -3.25809654, -1.87180218, -3.25809654, -2.56494936,
       -2.56494936, -2.56494936, -2.56494936, -2.56494936, -2.56494936,
       -2.56494936, -3.25809654, -3.25809654, -3.25809654, -2.56494936,
       -3.25809654, -3.25809654, -2.56494936, -2.56494936, -2.15948425,
       -2.56494936, -2.56494936])

In [17]:
p1_v

array([-2.35137526, -3.04452244, -3.04452244, -1.94591015, -1.94591015,
       -3.04452244, -3.04452244, -3.04452244, -1.65822808, -2.35137526,
       -3.04452244, -2.35137526, -3.04452244, -2.35137526, -3.04452244,
       -3.04452244, -3.04452244, -3.04452244, -2.35137526, -3.04452244,
       -3.04452244, -2.35137526, -2.35137526, -2.35137526, -3.04452244,
       -2.35137526, -2.35137526, -3.04452244, -2.35137526, -2.35137526,
       -3.04452244, -3.04452244])

In [18]:
def classify_NB(vec2classify, p0_vec, p1_vec, p_class1):
    """朴素贝叶斯分类函数"""
    p1 = np.sum(vec2classify * p1_vec) + np.log(p_class1)   # 对应元素相乘
    p0 = np.sum(vec2classify * p0_vec) + np.log(1.0 - p_class1)
    if p1 > p0:
        return 1 
    else:
        return 0

def testing_NB():
    """
    其实就是把上面的代码搞到了一起
    用两个非常简单的例子测试一下
    """
    list_of_posts, list_classes = load_dataset()
    my_vocablist = create_vocab_list(list_of_posts)
    train_mat = []
    for postin_doc in list_of_posts:
        train_mat.append(set_of_words_vec(my_vocablist, postin_doc))
    
    test_entry = ['love', 'my', 'dalmation']
    this_doc = np.array(set_of_words_vec(my_vocablist, test_entry))
    print(test_entry, 'classified as: ', classify_NB(this_doc, p0_v, p1_v, p_ab))

    test_entry = ['stupid', 'garbage']
    this_doc = np.array(set_of_words_vec(my_vocablist, test_entry))
    print(test_entry, 'classified as: ', classify_NB(this_doc, p0_v, p1_v, p_ab))

In [19]:
testing_NB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


In [20]:
def bag_of_words2vec_mn(vocab_list, input_set):
    """
    朴素贝叶斯词袋模型
    不仅关注某个词是否出现，还关心出现的频率
    """
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] += 1
    return return_vec

In [22]:
my_sent = 'This book is the best book for Python or M.L. I have ever laid eyes upon.'

In [23]:
my_sent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'for',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [24]:
import re 
reg_ex = re.compile(r'\W')
list_of_tokens = reg_ex.split(my_sent)
print(list_of_tokens)

['This', 'book', 'is', 'the', 'best', 'book', 'for', 'Python', 'or', 'M', 'L', '', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']


In [25]:
[tok for tok in list_of_tokens if tok]

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'for',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [26]:
[tok.lower() for tok in list_of_tokens if tok]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'for',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [27]:
with open('email/ham/6.txt', encoding='gbk') as f:
    email_text = f.read()
list_of_tokens = reg_ex.split(email_text)

In [28]:
def text_parse(big_string):
    import re 
    list_of_tokens = re.split(r'\W', big_string)
    return [tok.lower() for tok in list_of_tokens if tok]

def spam_test():
    doc_list = []
    class_list = []
    full_text = []
    for i in range(1, 26):
        with open('email/spam/%d.txt' % i, encoding='ISO-8859-1') as f1:
            word_list = text_parse(f1.read())
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(1)
        with open('email/ham/%d.txt' % i, encoding='ISO-8859-1') as f2:
            word_list = text_parse(f2.read())
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(0)
    
    vocab_list = create_vocab_list(doc_list)
    test_set = list(np.random.choice(50,10,replace=False))  # 从50个中不重复随机选择10个
    training_set = [i for i in range(50) if i not in test_set]  # 剩下的作为训练集

    train_mat = []; train_class = []
    for doc_index in training_set:
        train_mat.append(set_of_words_vec(vocab_list, doc_list[doc_index]))
        train_class.append(class_list[doc_index])
    p0_v, p1_v, p_spam = train_NB0(np.array(train_mat), np.array(train_class))
    error_count = 0
    for doc_index in test_set:
        word_vector = set_of_words_vec(vocab_list, doc_list[doc_index])
        if classify_NB(np.array(word_vector), p0_v, p1_v, p_spam) !=\
            class_list[doc_index]:
            error_count += 1
    print("the error rate is %f" % (float(error_count) / len(test_set))) 

spam_test()

the error rate is 0.000000
