# 垃圾邮件分类

数据来源:

- [enron_email](http://csmining.org/index.php/enron-spam-datasets.html)
- [ling_email](http://csmining.org/index.php/ling-spam-datasets.html)
- [CSDMC2010_email](http://csmining.org/index.php/spam-email-datasets-.html)

In [1]:
import os
import sys
import re
import random
import chardet

stop_word_list = [
    "a","about","above","after","again","against","all","am","an","and","any","are",
    "aren't","as","at","be","because","been","before","being","below","between","both",
    "but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't",
    "doing","don't","down","during","each","few","for","from","further","had","hadn't",
    "has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here",
    "here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm",
    "i've","if","in","into","is","isn't","it","it's","its","itself","let's","me","more",
    "most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or",
    "other","ought","our","ours","ourselves","out","over","own","same","shan't","she",
    "she'd","she'll","she's","should","shouldn't","so","some","such","than","that",
    "that's","the","their","theirs","them","themselves","then","there","there's","these",
    "they","they'd","they'll","they're","they've","this","those","through","to","too",
    "under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were",
    "weren't","what","what's","when","when's","where","where's","which","while","who",
    "who's","whom","why","why's","with","won't","would","wouldn't","you","you'd","you'll",
    "you're","you've","your","yours","yourself","yourselves","re","subject"
]

stop_words = dict(zip(stop_word_list, [1] * len(stop_word_list)))

## 基础函数

In [2]:
def extract_words(text, _stop_words):
    """ 提取邮件中的单词（重复单词只提取一次）

    :param text: 邮件正文
    :param _stop_words: 停用词
    :return: 单词列表
    """
    _words = re.findall('[a-zA-Z\d]{2,}', text)
    _word_set = []
    for w in _words:
        if w in _stop_words.keys() or re.search('\d', w):
            continue
        _word_set.append(w.lower())
    _word_set = list(set(_word_set))

    return _word_set
    
    
def load_email(filename, _stop_words):
    """ 根据文件名加载邮件
    
    :param filename: 文件名
    :param _stop_words: 停用词
    :return: 邮件（用一个单词列表表示）
    """
    with open(filename, "rb") as _fp:
        ec = chardet.detect(_fp.read())['encoding']
    with open(filename, "r", encoding=ec) as _fp:
        _email = extract_words(_fp.read(), _stop_words)
    return _email

    
def load_data_set(data_name):
    """ 加载数据

    :param data_name: 数据名：enron_email, ling_bare_email, CSDMC2010_email
    :return: 正常邮件集，垃圾邮件集
    """
    _emails_ham = {}; _emails_spam = {}
    for _fn in  os.listdir('./data/%s/ham' % data_name):
        if _fn == '.DS_Store':
            continue

        _email_list = load_email('./data/%s/ham/%s' % (data_name, _fn), stop_words)
        if len(_email_list) > 5:
            _emails_ham[_fn] = _email_list

    for _fn in  os.listdir('./data/%s/spam' % data_name):
        if _fn == '.DS_Store':
            continue

        _email_list = load_email('./data/%s/spam/%s' % (data_name, _fn), stop_words)
        if len(_email_list) > 5:
            _emails_spam[_fn] = _email_list

    return _emails_ham, _emails_spam

## 加载enron_email数据作为训练集

In [3]:
# 加载enron_email数据（较慢，可能需要1分钟）
enron_hams, enron_spams = load_data_set('enron_email')
print(len(enron_hams), len(enron_spams))

16436 16816


## 训练

enron_email 数据中取2000个作为测试集，不参与训练

In [4]:
def word_count(emails, test_fns=[]):
    """ 统计每个单词在邮件中出现的次数

    :param emails: 所有邮件
    :param test_fns: 测试用例的名字
    :return: 单词出现的次数
    """
    _word_count = {}
    for _fn in emails:
        if _fn in test_fns.keys():
            continue  # 排除测试用例
        for w in emails[_fn]:
            _word_count[w] = _word_count.get(w, 0) + 1
    return _word_count


def calc(email, _p_y1, _p_xi_y0, _p_xi_y1, _p_xi_y0_not_appear, _p_xi_y1_not_appear, is_use_not_appear=True):
    """ 计算一个邮件是否是垃圾邮件
    
    :param email: 邮件内容，由单词组成的list
    :param _p_y1: 样本中正常邮件所占的比率
    :param _p_xi_y0: 每个单词在垃圾邮件中出现的概率
    :param _p_xi_y1: 每个单词在正常邮件中出现的概率
    :param _p_xi_y0_not_appear: 垃圾邮件中未出现过的词，给一个很小的概率
    :param _p_xi_y1_not_appear: 正常邮件中未出现过的词，给一个很小的概率
    :param is_use_not_appear: 是否使用未出现过的词，True表示使用
    :return: 正常邮件与垃圾邮件概率的比值，大于1表示分类结果是正常邮件
    """
    rate = (_p_y1) / (1.0 - _p_y1)
    for w in email:
        if is_use_not_appear == False and (w not in _p_xi_y1 or w not in _p_xi_y0):
            continue
        rate *= _p_xi_y1.get(w, _p_xi_y0_not_appear) / _p_xi_y0.get(w, _p_xi_y1_not_appear)
        
    return rate


# 随机选取测试用例
test_spam_count = 2000; test_ham_count = 2000
enron_test_ham_fns = dict(zip(random.sample(enron_hams.keys(), test_ham_count), [1] * 2000))
enron_test_spam_fns = dict(zip(random.sample(enron_spams.keys(), test_ham_count), [1] * 2000))

# 统计每个单词分别在正常邮件、垃圾邮件中出现的次数
enron_ham_word_count = word_count(enron_hams, enron_test_ham_fns)
enron_spam_word_count = word_count(enron_spams, enron_test_spam_fns)
print(len(enron_ham_word_count), len(enron_spam_word_count))

# 计算概率
p_xi_y0 = {}; p_xi_y1 ={}
for w in enron_ham_word_count:
    p_xi_y1[w] = (enron_ham_word_count[w] + 1.0) / (len(enron_hams) + 2.0)  # 单词xi在正常邮件中出现的概率
for w in enron_spam_word_count:
    p_xi_y0[w] = (enron_spam_word_count[w] + 1.0) / (len(enron_spams) + 2.0)  # 单词xi在垃圾邮件中出现的概率
p_y1 = float(len(enron_hams)) / float(len(enron_hams) + len(enron_spams))  # 正常邮件的概率
p_xi_y1_not_appear = 1.0 / (2.0 + len(enron_spams))  # 拉普拉斯平滑，给未见过的词一个很小的概率
p_xi_y0_not_appear = 1.0 / (2.0 + len(enron_hams))  # 拉普拉斯平滑，给未见过的词一个很小的概率

print(p_y1)

48326 109119
0.4942860579814748


## 测试

### enron_email 测试集测试

In [5]:
# 测试
err_ham_count = 0; err_spam_count = 0
for fn in enron_test_ham_fns:
    rate = calc(enron_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate < 1:
        err_ham_count += 1

for fn in enron_test_spam_fns:
    rate = calc(enron_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate >= 1:
        err_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_ham_count, test_ham_count, 100.0 * err_ham_count / test_ham_count,\
       err_spam_count, test_spam_count, 100.0 * err_spam_count / test_spam_count))

# 忽略没有出现过的单词
err_ham_count = 0; err_spam_count = 0
for fn in enron_test_ham_fns:
    rate = calc(enron_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate < 1:
        err_ham_count += 1

for fn in enron_test_spam_fns:
    rate = calc(enron_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate >= 1:
        err_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_ham_count, test_ham_count, 100.0 * err_ham_count / test_ham_count,\
       err_spam_count, test_spam_count, 100.0 * err_spam_count / test_spam_count))

error ham: 14/2000 (0.70%); error spam: 31/2000 (1.55%)
error ham: 18/2000 (0.90%); error spam: 56/2000 (2.80%)


### ling_email 数据作为测试集

In [6]:
# 加载ling_email数据
ling_hams, ling_spams = load_data_set('ling_email')
print(len(ling_hams), len(ling_spams))

2410 481


In [7]:
# 测试ling_email数据
err_ling_ham_count = 0; err_ling_spam_count = 0
for fn in ling_hams:
    rate = calc(ling_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate < 1:
        err_ling_ham_count += 1

for fn in ling_spams:
    rate = calc(ling_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate >= 1:
        err_ling_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_ling_ham_count, len(ling_hams), 100.0 * err_ling_ham_count / len(ling_hams),\
       err_ling_spam_count, len(ling_spams), 100.0 * err_ling_spam_count / len(ling_spams)))

# 忽略没有出现过的单词
err_ling_ham_count = 0; err_ling_spam_count = 0
for fn in ling_hams:
    rate = calc(ling_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate < 1:
        err_ling_ham_count += 1

for fn in ling_spams:
    rate = calc(ling_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate >= 1:
        err_ling_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_ling_ham_count, len(ling_hams), 100.0 * err_ling_ham_count / len(ling_hams),\
       err_ling_spam_count, len(ling_spams), 100.0 * err_ling_spam_count / len(ling_spams)))

error ham: 574/2410 (23.82%); error spam: 13/481 (2.70%)
error ham: 283/2410 (11.74%); error spam: 14/481 (2.91%)


### CSDMC2010_email 数据作为测试集

可能正常邮件中有大量的单词在样本中没有出现过，导致正确率不高

In [8]:
# 加载CSDMC2010_email数据
csdmc_hams, csdmc_spams = load_data_set('CSDMC2010_email')
print(len(csdmc_hams), len(csdmc_spams))

2948 1376


In [9]:
# 测试CSDMC2010_email数据
err_csdmc_ham_count = 0; err_csdmc_spam_count = 0
for fn in csdmc_hams:
    rate = calc(csdmc_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate < 1:
        err_csdmc_ham_count += 1

for fn in csdmc_spams:
    rate = calc(csdmc_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate >= 1:
        err_csdmc_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_csdmc_ham_count, len(csdmc_hams), 100.0 * err_csdmc_ham_count / len(csdmc_hams),\
       err_csdmc_spam_count, len(csdmc_spams), 100.0 * err_csdmc_spam_count / len(csdmc_spams)))

# 忽略没有出现过的单词
err_csdmc_ham_count = 0; err_csdmc_spam_count = 0
for fn in csdmc_hams:
    rate = calc(csdmc_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate < 1:
        err_csdmc_ham_count += 1

for fn in csdmc_spams:
    rate = calc(csdmc_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate >= 1:
        err_csdmc_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_csdmc_ham_count, len(csdmc_hams), 100.0 * err_csdmc_ham_count / len(csdmc_hams),\
       err_csdmc_spam_count, len(csdmc_spams), 100.0 * err_csdmc_spam_count / len(csdmc_spams)))

error ham: 2186/2948 (74.15%); error spam: 2/1376 (0.15%)
error ham: 1761/2948 (59.74%); error spam: 14/1376 (1.02%)
