In [1]:
import numpy as np
import re

In [2]:
def create_vocab_list(data_set):
    """
    Build the vocabulary: every distinct token found in any document.

    Parameters
        data_set: iterable of documents, each an iterable of hashable tokens

    Return
        list holding each distinct token once; the order is whatever set
        iteration yields and should not be relied upon
    """
    unique_tokens = set()
    for tokens in data_set:
        unique_tokens.update(tokens)  # in-place union with this document
    return list(unique_tokens)


def set_of_words2vec(vocab_list, input_set):
    """
    Convert a document into a binary word vector (set-of-words model).

    Parameters
        vocab_list: list of vocabulary words; position i of the result
            corresponds to vocab_list[i]
        input_set: iterable of the document's tokens

    Return
        list of 0/1 with len(vocab_list) entries; an entry is 1 when the
        corresponding vocabulary word occurs at least once in the document.
        Tokens that are not in the vocabulary are ignored.
    """
    # Map each word to its first position once, so every lookup is O(1)
    # instead of the two linear scans done by `in` + `list.index`.
    position = {}
    for idx, word in enumerate(vocab_list):
        position.setdefault(word, idx)

    return_vec = [0] * len(vocab_list)
    for word in input_set:
        idx = position.get(word)
        if idx is not None:
            return_vec[idx] = 1
    return return_vec

def bag_of_words2vec(vocab_list, input_set):
    """
    Convert a document into a word-count vector (bag-of-words model).

    Parameters
        vocab_list: list of vocabulary words; position i of the result
            corresponds to vocab_list[i]
        input_set: iterable of the document's tokens (repeats are counted)

    Return
        list of ints with len(vocab_list) entries; each entry is the number
        of times the corresponding vocabulary word occurs in the document.
        Tokens that are not in the vocabulary are ignored.
    """
    # Map each word to its first position once, so every lookup is O(1)
    # instead of the two linear scans done by `in` + `list.index`.
    position = {}
    for idx, word in enumerate(vocab_list):
        position.setdefault(word, idx)

    return_vec = [0] * len(vocab_list)
    for word in input_set:
        idx = position.get(word)
        if idx is not None:
            return_vec[idx] += 1
    return return_vec



In [3]:
def load_data_set():
    """
    Return the small hand-labelled sample corpus.

    Return
        posting_list: list of six tokenised posts (each a list of words)
        class_vec: list of labels aligned with posting_list;
            1 = abusive post, 0 = normal post
    """
    # Each sample is kept next to its label so the two cannot drift apart.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    posts = [words for _, words in labelled_posts]
    labels = [label for label, _ in labelled_posts]
    return posts, labels

list_posts, list_classes = load_data_set()  # sample posts and the class label of each post
my_vocab_list = create_vocab_list(list_posts)  # vocabulary built from all sample posts
set_of_words2vec(my_vocab_list, list_posts[0])  # word vector of the first post (shown as the cell output)

[0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0]

In [4]:
"""
求某个文档d是某个类别c的概率：
p(c|d) = p(d|c)p(c) / p(d)

文档d可以表示为词向量(w1,w2,...,wn)，p(d|c) = p(w1,w2,...,wn|c)，
朴素贝叶斯假设所有词均独立，即p(d|c) = p(w1,w2,...,wn|c)=p(w1|c)p(w2|c)...p(wn|c)

根据训练样本可以求p(c)、p(w1|c)、p(w2|c)、p(wn|c)

"""

'\n求某个文档d是某个类别c的概率：\np(c|d) = p(d|c)p(c) / p(d)\n\n文档d可以表示为词向量(w1,w2,...,wn)，p(d|c) = p(w1,w2,...,wn|c)，\n朴素贝叶斯假设所有词均独立，即p(d|c) = p(w1,w2,...,wn|c)=p(w1|c)p(w2|c)...p(wn|c)\n\n根据训练样本可以求p(c)、p(w1|c)、p(w2|c)、p(wn|c)\n\n'

In [5]:
def train_nb0(train_matrix, train_category):
    """
    Train the naive Bayes classifier.

    Parameters
        train_matrix: sequence of word vectors, one per document
        train_category: sequence of class labels, one per word vector
            (1 = abusive; any other value is treated as class 0)

    Return
        p0_vect: per-word log-probabilities for class 0 (normal posts)
        p1_vect: per-word log-probabilities for class 1 (abusive posts)
        p_abusive: sum of the labels divided by the number of documents,
            i.e. the fraction of class-1 documents when labels are 0/1
    """
    doc_count = len(train_matrix)
    vocab_size = len(train_matrix[0])
    p_abusive = sum(train_category) / float(doc_count)

    # Accumulators indexed by class (0 or 1). Word counts start at 1 and the
    # totals at 2.0 so that a word never seen in a class does not get
    # probability zero.
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    totals = [2.0, 2.0]

    for doc_idx in range(doc_count):
        row = train_matrix[doc_idx]
        cls = 1 if train_category[doc_idx] == 1 else 0
        word_counts[cls] += row
        totals[cls] += sum(row)

    # Work in log space: multiplying many small probabilities underflows,
    # whereas adding their logarithms does not.
    p0_vect = np.log(word_counts[0] / totals[0])
    p1_vect = np.log(word_counts[1] / totals[1])

    return p0_vect, p1_vect, p_abusive

# Turn every sample post into a set-of-words vector
train_mat = [set_of_words2vec(my_vocab_list, post) for post in list_posts]

# Train: estimate the per-word log-probabilities of each class and the class-1 prior
p0_v, p1_v, p_ab = train_nb0(train_mat, list_classes)

In [6]:
# Inspect the trained model
print(p_ab) # prior of the abusive class (sum of labels / number of training posts)
print(p0_v) # per-word log-probabilities for class 0 (normal posts)
print(p1_v) # per-word log-probabilities for class 1 (abusive posts)
print(my_vocab_list) # vocabulary; entry i corresponds to position i of the vectors above

0.5
[-2.56494936 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -3.25809654 -2.56494936 -3.25809654 -1.87180218 -2.56494936 -2.56494936
 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -2.56494936 -3.25809654 -3.25809654 -3.25809654 -3.25809654 -2.56494936
 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -3.25809654
 -3.25809654 -2.15948425]
[-2.35137526 -3.04452244 -3.04452244 -3.04452244 -2.35137526 -3.04452244
 -2.35137526 -3.04452244 -2.35137526 -3.04452244 -3.04452244 -3.04452244
 -3.04452244 -2.35137526 -1.94591015 -3.04452244 -1.94591015 -3.04452244
 -3.04452244 -1.65822808 -2.35137526 -2.35137526 -2.35137526 -3.04452244
 -3.04452244 -3.04452244 -2.35137526 -3.04452244 -3.04452244 -2.35137526
 -2.35137526 -2.35137526]
['stop', 'help', 'has', 'how', 'buying', 'so', 'garbage', 'dalmation', 'maybe', 'my', 'I', 'love', 'steak', 'to', 'dog', 'flea', 'worthless', 'licks', 'mr', 'stupid', 'posting', 'quit', 'food', 'please', 'is', 'pro

In [7]:
def classify_nb(vec2classify, p0_vec, p1_vec, p_class1):
    """
    Classify a word vector with the trained naive Bayes model.

    Parameters
        vec2classify: word vector of the document to classify
        p0_vec: per-word log-probabilities for class 0 (normal posts)
        p1_vec: per-word log-probabilities for class 1 (abusive posts)
        p_class1: prior probability of class 1

    Return
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0)
    """
    # log(a * b) = log(a) + log(b): the product of word likelihoods and the
    # prior becomes a sum of logarithms.
    weighted_1 = vec2classify * p1_vec
    weighted_0 = vec2classify * p0_vec
    log_score_1 = sum(weighted_1) + np.log(p_class1)
    log_score_0 = sum(weighted_0) + np.log(1.0 - p_class1)

    return 1 if log_score_1 > log_score_0 else 0

# Two hand-written posts used to sanity-check the trained classifier
test_list = [
    ['love', 'my', 'dalmation'],
    ['stupid', 'garbage']
]

# Classify each test post with the trained model and print the predicted class
for test_doc in test_list:
    test_class = classify_nb(set_of_words2vec(my_vocab_list, test_doc), p0_v, p1_v, p_ab)
    print("%s classified as : %s" % (test_doc, test_class))

['love', 'my', 'dalmation'] classified as : 0
['stupid', 'garbage'] classified as : 1


In [11]:
# 示例：垃圾邮件分类

def text_parse(big_string):
    """
    Tokenise a text into lower-case words.

    The text is split on runs of non-word characters; pieces of two
    characters or fewer are discarded and the rest are lower-cased.
    """
    kept_tokens = []
    for piece in re.split(r'\W+', big_string):
        if len(piece) > 2:
            kept_tokens.append(piece.lower())
    return kept_tokens

def spam_test():
    """
    Hold-out test of the naive Bayes classifier on the e-mail corpus.

    Reads data/email/spam/1.txt .. 25.txt (label 1) and
    data/email/ham/1.txt .. 25.txt (label 0), randomly holds out 10 messages,
    trains on the remaining ones using set-of-words vectors and prints the
    error rate measured on the held-out messages.

    The hold-out selection draws from NumPy's global random state, so the
    printed error rate varies between runs unless that state is seeded first.
    """
    # Read and tokenise the messages, alternating spam/ham so that
    # doc_list[k] and class_list[k] always describe the same message.
    sources = (('data/email/spam/%s.txt', 1), ('data/email/ham/%d.txt', 0))
    doc_list = []
    class_list = []
    for i in range(1, 26):
        for path_template, label in sources:
            # `with` closes the file handle as soon as the text has been read
            with open(path_template % i, encoding='utf-8') as email_file:
                doc_list.append(text_parse(email_file.read()))
            class_list.append(label)

    # Vocabulary over the whole corpus
    vocab_list = create_vocab_list(doc_list)

    # Split: move 10 randomly chosen document indices into the test set
    training_set = list(range(len(doc_list)))
    test_set = []
    for _ in range(10):
        rand_index = int(np.random.uniform(0, len(training_set)))
        test_set.append(training_set.pop(rand_index))

    # Train on the remaining documents
    train_mat = []
    train_classes = []
    for doc_index in training_set:
        train_mat.append(set_of_words2vec(vocab_list, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])
    p0_v, p1_v, p_spam = train_nb0(np.array(train_mat), np.array(train_classes))

    # Evaluate on the held-out documents
    error_count = 0
    for doc_index in test_set:
        word_vector = set_of_words2vec(vocab_list, doc_list[doc_index])
        if classify_nb(np.array(word_vector), p0_v, p1_v, p_spam) != class_list[doc_index]:
            error_count += 1
    print("the error rate is: ", float(error_count) / len(test_set))

# Run the spam experiment once; the held-out messages are chosen at random,
# so the printed error rate can differ from run to run.
spam_test()

the error rate is:  0.0


In [8]:
# Example: finding region-specific word usage

# NOTE(review): this import sits mid-notebook; notebook convention is to keep
# all imports in the first cell.
import feedparser

# Parse the two RSS feeds that are compared below: `ny` is the New York feed,
# `sf` the SF Bay feed.
# NOTE(review): presumably this needs network access and these URLs may no
# longer be served — confirm before relying on a re-run.
ny=feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf=feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')

In [12]:
# 示例：发现地域相关的用词

def clac_most_freq(vocab_list, full_text, top_n=30):
    """
    Return the most frequent vocabulary words in a corpus.

    Parameters
        vocab_list: list of vocabulary words to rank
        full_text: flat list of every token in the corpus
        top_n: how many (word, count) pairs to return; defaults to 30

    Return
        list of at most top_n (word, count) tuples sorted by count, highest
        first. Words with equal counts keep their vocab_list order.
    """
    from collections import Counter

    # One pass over the corpus instead of one full_text.count() scan per
    # vocabulary word.
    occurrences = Counter(full_text)
    freq_dict = {token: occurrences[token] for token in vocab_list}
    sorted_freq = sorted(freq_dict.items(), key=lambda pair: pair[1], reverse=True)
    return sorted_freq[:top_n]


def local_words(feed1, feed0):
    """
    Train and evaluate a naive Bayes classifier that tells two feeds apart.

    Parameters
        feed1: parsed feed whose entries are labelled class 1
        feed0: parsed feed whose entries are labelled class 0
        Both must support feed['entries'][i]['summary'].

    Return
        vocal_list: vocabulary with the most frequent words removed
        p0_v: per-word log-probabilities for class 0 (feed0)
        p1_v: per-word log-probabilities for class 1 (feed1)

    Raises
        ValueError: if either feed has no entries, so there is nothing to
            train or test on

    Prints the error rate on a random hold-out set. The hold-out selection
    draws from NumPy's global random state, so results vary between runs
    unless that state is seeded first.
    """
    # Read the RSS content, using the same number of entries from each feed
    # and alternating feed1/feed0 so documents and labels stay aligned.
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    if min_len == 0:
        raise ValueError('both feeds must contain at least one entry')
    doc_list = []
    class_list = []
    full_text = []
    for i in range(min_len):
        for feed, label in ((feed1, 1), (feed0, 0)):
            word_list = text_parse(feed['entries'][i]['summary'])
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(label)

    # Vocabulary over all documents
    vocal_list = create_vocab_list(doc_list)

    # Drop the most frequent words returned by clac_most_freq
    top30_words = clac_most_freq(vocal_list, full_text)
    frequent_words = {word for word, _count in top30_words}
    vocal_list = [word for word in vocal_list if word not in frequent_words]

    # Split: hold out up to 20 random documents, but always leave at least
    # one document to train on (previously fewer than 21 documents crashed).
    training_set = list(range(2 * min_len))
    test_set = []
    num_test = min(20, len(training_set) - 1)
    for _ in range(num_test):
        rand_indx = int(np.random.uniform(0, len(training_set)))
        test_set.append(training_set.pop(rand_indx))

    # Train on the remaining documents using bag-of-words vectors
    train_mat = []
    train_classes = []
    for doc_index in training_set:
        train_mat.append(bag_of_words2vec(vocal_list, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])
    p0_v, p1_v, p_class1 = train_nb0(np.array(train_mat), np.array(train_classes))

    # Evaluate on the held-out documents
    error_count = 0
    for doc_index in test_set:
        word_vector = bag_of_words2vec(vocal_list, doc_list[doc_index])
        if classify_nb(np.array(word_vector), p0_v, p1_v, p_class1) != class_list[doc_index]:
            error_count += 1
    print('the error rate is: ', float(error_count)/len(test_set))
    return vocal_list, p0_v, p1_v

In [14]:
def get_top_words(ny, sf, threshold=-5.0):
    """
    Print the most characteristic words of each feed.

    Trains a classifier via local_words(ny, sf), where `ny` is class 1 and
    `sf` is class 0, then prints for each class the words whose
    log-probability exceeds `threshold`, most probable first.

    Parameters
        ny: parsed feed treated as class 1
        sf: parsed feed treated as class 0
        threshold: minimum log-probability a word needs to be listed;
            defaults to -5.0
    """
    vocab_list, p0_v, p1_v = local_words(ny, sf)

    def ranked_words(log_probs):
        """Return (word, log-prob) pairs above the threshold, highest first."""
        selected = [
            (vocab_list[i], log_probs[i])
            for i in range(len(log_probs))
            if log_probs[i] > threshold
        ]
        return sorted(selected, key=lambda pair: pair[1], reverse=True)

    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in ranked_words(p0_v):
        print(item[0])

    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in ranked_words(p1_v):
        print(item[0])
        
# Train on the two feeds and list each region's characteristic words; the
# hold-out split is random, so the error rate and word lists vary per run.
get_top_words(ny,sf)

the error rate is:  0.35
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
find
things
about
area
meet
here
just
married
get
friendship
prefer
now
what
happy
relationship
forward
being
new
employed
little
enjoy
all
tamil
wish
know
daddy
hang
helper
love
young
mind
email
texting
open
horny
than
there
lingerie
seeking
years
believes
are
click
bay
ever
don
NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**
host
that
come
place
massage
best
handsome
day
9832
nice
now
over
stress
from
cool
beautiful
reply
available
9745
professional
relaxing
want
also
travel
pic
420
full
candles
give
seaching
wants
buddy
two
nothing
gentlemen
waiting
let
great
fun
weed
sitting
here
panties
awesome
secret
girl
music
mature
four
details
five
guys
make
about
built
got
three
one
technique
lesbian
kept
female
body
scented
lotions
list
smoke
oral
ready
