In [17]:
import numpy as np
import re
import random
import feedparser

In [2]:
def gen_data_set():
    posting_list =[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    class_vec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return posting_list, class_vec

In [3]:
# 根据已有数据集，生成一个词汇表
def gen_voc_unique_list(data_set):
    voc_set = set(())
    for post in data_set:
        voc_set = voc_set | set(post)
    return list(voc_set)

# 将一条留言根据词汇表转成一个行向量，0 代表这个词没有出现，1 代表出现
def gen_post_vec_by_voc_unique_list(post, voc_set):
    post_vec = [0] * len(voc_set)
    for word in post:
        if word in voc_set:
            post_vec[voc_set.index(word)] = 1
    return post_vec

In [4]:
data_set, class_vec = gen_data_set()
print(data_set)
voc_list = gen_voc_unique_list(data_set)
print(voc_list)
# gen_post_vec_by_voc_unique_list(data_set[0], voc_list)

[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
['him', 'mr', 'stupid', 'flea', 'so', 'my', 'please', 'not', 'park', 'stop', 'quit', 'problems', 'has', 'how', 'I', 'take', 'licks', 'ate', 'worthless', 'love', 'help', 'dog', 'posting', 'dalmation', 'garbage', 'buying', 'food', 'cute', 'maybe', 'to', 'is', 'steak']


In [5]:
def post_2_vec(data_set, voc_unique):
    train_set = []
    for data in data_set:
        train_set.append(gen_post_vec_by_voc_unique_list(data, voc_unique))
    return train_set

post_vec = post_2_vec(data_set, voc_list)
print(post_vec)

[[0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], [1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]]


In [6]:
def calc_pro(post_vec, class_vec):
    # 根据贝叶斯公式，计算类别的概率,p(w1,w2,w3, ...| ci)
    post_num         = len(post_vec)
    word_num_in_post = len(post_vec[0])
    p_abusive         = sum(class_vec) / post_num
    p_abusive_num     = np.zeros(word_num_in_post)
    p_abusive_total   = 0.0
    p_not_abusive_num = np.zeros(word_num_in_post)
    p_not_abusive_total = 0.0
#     print(p_abusive)
    
    for i in range(post_num):
        # 侮辱留言
        if class_vec[i] == 1:
            # 若标记为侮辱留言，那么要把其中出现在词汇表的词都算作侮辱性词汇来计算概率
            p_abusive_num += post_vec[i] # 一个向量中，某些单词出现了，要增加次数
            # 计算侮辱性词汇出现总数
            p_abusive_total += sum(post_vec[i])
        # 非侮辱留言
        else:
            p_not_abusive_num += post_vec[i]
            # 认为出现的都是非侮辱性词汇
            p_not_abusive_total += sum(post_vec[i])
    return p_abusive_num / p_abusive_total, p_not_abusive_num / p_not_abusive_total, p_abusive
    
# 可以看到，在给定类别 ci(侮辱与否) 的情况下，p(w0,w1,w2,...|ci)
calc_pro(post_vec, class_vec)

(array([0.05263158, 0.        , 0.15789474, 0.        , 0.        ,
        0.        , 0.        , 0.05263158, 0.05263158, 0.05263158,
        0.05263158, 0.        , 0.        , 0.        , 0.        ,
        0.05263158, 0.        , 0.        , 0.10526316, 0.        ,
        0.        , 0.10526316, 0.05263158, 0.        , 0.05263158,
        0.05263158, 0.05263158, 0.        , 0.05263158, 0.05263158,
        0.        , 0.        ]),
 array([0.08333333, 0.04166667, 0.        , 0.04166667, 0.04166667,
        0.125     , 0.04166667, 0.        , 0.        , 0.04166667,
        0.        , 0.04166667, 0.04166667, 0.04166667, 0.04166667,
        0.        , 0.04166667, 0.04166667, 0.        , 0.04166667,
        0.04166667, 0.04166667, 0.        , 0.04166667, 0.        ,
        0.        , 0.        , 0.04166667, 0.        , 0.04166667,
        0.04166667, 0.04166667]),
 0.5)

# 目前两个问题

1. 存在 p(wi|ci) = 0 情况，最终 p(w|c) 的概率也会变成 0
2. 由于概率可能都很小，计算 p(w|c) 的过程中，会造成下溢出变成 0。

针对第一种情况，默认每个词都会出现一次，且起码每个分类的词总数为 2.0。

针对第二种情况，将 p(w1,..|ci) 用自然对数计算，保留精度的同时，还能进行 ln(a * b) -> ln(a) + ln(b) 计算。

In [7]:
def calc_pro_1(post_vec, class_vec):
    # 根据贝叶斯公式，计算类别的概率,p(w1,w2,w3, ...| ci)
    post_num         = len(post_vec)
    word_num_in_post = len(post_vec[0])
    p_abusive         = sum(class_vec) / post_num
    p_abusive_num     = np.ones(word_num_in_post) # 默认都会有一次
    p_abusive_total   = 2.0 # 
    p_not_abusive_num = np.ones(word_num_in_post) # 默认都有一次
    p_not_abusive_total = 2.0 #
    
    for i in range(post_num):
        # 侮辱留言
        if class_vec[i] == 1:
            # 若标记为侮辱留言，那么要把其中出现在词汇表的词都算作侮辱性词汇来计算概率
            p_abusive_num += post_vec[i] # 一个向量中，某些单词出现了，要增加次数
            # 计算侮辱性词汇出现总数
            p_abusive_total += sum(post_vec[i])
        # 非侮辱留言
        else:
            p_not_abusive_num += post_vec[i]
            # 认为出现的都是非侮辱性词汇
            p_not_abusive_total += sum(post_vec[i])
    return np.log(p_abusive_num / p_abusive_total), np.log(p_not_abusive_num / p_not_abusive_total), p_abusive

In [8]:
p_abusive, p_not_abusive, p_class_abusive = calc_pro_1(post_vec, class_vec)

以上只是计算了各留言的各词的概率。但是利用贝叶斯分类的目的还没有达成。

In [9]:
# test_post = ['love', 'my', 'dalmation']，需要转化成 post_vec 形式
# 
def classify(test_post, p_abusive, p_not_abusive, p_class_abusive):
    # sum(p_abusive * test_post) 将 post 中，有出现在词汇表的词都计算进来
    p_test_abusive     = sum(p_abusive * test_post) + np.log(p_class_abusive)
    p_test_not_abusive = sum(p_not_abusive * test_post) + np.log(1.0 - p_class_abusive)
    
    if p_test_abusive > p_test_not_abusive:
        return 1
    else:
        return 0

    
# gen_post_vec_by_voc_unique_list(['love', 'my', 'dalmation'], voc_list)
test_post = gen_post_vec_by_voc_unique_list(['love', 'my', 'dalmation'], voc_list)

print(classify(test_post, p_abusive, p_not_abusive, p_class_abusive))

test_post = gen_post_vec_by_voc_unique_list(['stupid', 'garbage'], voc_list)
print(classify(test_post, p_abusive, p_not_abusive, p_class_abusive))

0
1


以上的 `voc_unique_list` 是词集模型，以一个单词是否出现作为一个特征，但是如果这个词其实是在相同或不同的句子中多次出现，这个模型就不能体现出更多的信息。

所以搞了一个词袋模型，其实就是每个特征记录出现的次数。

In [10]:
# 将一条留言根据词汇表转成一个行向量，记录出现次数
def gen_bag_post_vec_by_voc_unique_list(post, voc_set):
    post_vec = [0] * len(voc_set)
    for word in post:
        if word in voc_set:
            post_vec[voc_set.index(word)] += 1
    return post_vec

In [21]:
def text_parse(big_string):
    # 词条叫一个 token 可能是单词，可能是 url
    tokens_list = re.split(r'\W*', big_string)
    # 都要小写，只是这个应用需要, 同时去除空字符
    return [ tok.lower() for tok in tokens_list if len(tok) > 2 ]

# with open('./email/spam/6.txt', 'r') as f:
#     t = text_parse(f.read())
def spam():
    
    full_text_vec = []
    class_vec     = []
    doc_list      = []
    # spam 和 ham 目录都有 25 封邮件
    for i in range(1, 26):
        # 注意一下编码，真的操
        with open('./email/ham/{}.txt'.format(i), 'r', encoding='cp1252') as f:
            file_content_vec = text_parse(f.read())
            full_text_vec.append(file_content_vec)
            doc_list.append(file_content_vec)
            class_vec.append(0) # 非垃圾邮件
        with open('./email/spam/{}.txt'.format(i), 'r', encoding='cp1252') as f:
            file_content_vec = text_parse(f.read())
            full_text_vec.append(file_content_vec)
            doc_list.append(file_content_vec)
            class_vec.append(1) # 垃圾邮件
    
#     print(class_vec)
    # 用 doc list 做词汇表
    voc_list = gen_voc_unique_list(doc_list)
#     print(len(voc_list))
    
    x_train = []
    y_train = []
    
    x_test = []
    y_test = []
    # 将数据集分割成测试集和训练集，10 个测试集 留出法
    rand_list = list(range(50))
    random.shuffle(rand_list)
    for index in rand_list[:10]:
        x_test.append(gen_post_vec_by_voc_unique_list(doc_list[index], voc_list))
        y_test.append(class_vec[index])
        
    for index in rand_list[10:]:
        x_train.append(gen_post_vec_by_voc_unique_list(doc_list[index], voc_list))
        y_train.append(class_vec[index]) 
        
    p_1, p_0, p_spam = calc_pro_1(x_train, y_train) # 训练出模型(就是各个概率计算出来，让贝叶斯分类完整)
    
    correct = 0.0
    wrong   = 0.0
    for i in range(10):
        result = classify(x_test[i], p_1, p_0, p_spam)
#         print(result, y_test[i])
        if y_test[i] == result:
            correct += 1
        else:
            wrong += 1
    print(wrong / 10.0)
    
    

for i in range(10):
    spam()

  return _compile(pattern, flags).split(string, maxsplit)


0.0
0.1
0.0
0.0
0.2
0.1
0.0
0.1
0.0
0.2


In [22]:
# feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
# feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')