# 垃圾邮件分类

数据来源:

- [enron_email](http://csmining.org/index.php/enron-spam-datasets.html)
- [ling_email](http://csmining.org/index.php/ling-spam-datasets.html)
- [CSDMC2010_email](http://csmining.org/index.php/spam-email-datasets-.html)

In [1]:
import os
import sys
import re
import random
import chardet

stop_words = {
    "a": 1,"about": 1,"above": 1,"after": 1,"again": 1,"against": 1,"all": 1,"am": 1,"an": 1,"and": 1,
    "any": 1,"are": 1,"aren't": 1,"as": 1,"at": 1,"be": 1,"because": 1,"been": 1,"before": 1,"being": 1,
    "below": 1,"between": 1,"both": 1,"but": 1,"by": 1,"can't": 1,"cannot": 1,"could": 1,"couldn't": 1,
    "did": 1,"didn't": 1,"do": 1,"does": 1,"doesn't": 1,"doing": 1,"don't": 1,"down": 1,"during": 1,
    "each": 1,"few": 1,"for": 1,"from": 1,"further": 1,"had": 1,"hadn't": 1,"has": 1,"hasn't": 1,
    "have": 1,"haven't": 1,"having": 1,"he": 1,"he'd": 1,"he'll": 1,"he's": 1,"her": 1,"here": 1,
    "here's": 1,"hers": 1,"herself": 1,"him": 1,"himself": 1,"his": 1,"how": 1,"how's": 1,"i": 1,
    "i'd": 1,"i'll": 1,"i'm": 1,"i've": 1,"if": 1,"in": 1,"into": 1,"is": 1,"isn't": 1,"it": 1,
    "it's": 1,"its": 1,"itself": 1,"let's": 1,"me": 1,"more": 1,"most": 1,"mustn't": 1,"my": 1,
    "myself": 1,"no": 1,"nor": 1,"not": 1,"of": 1,"off": 1,"on": 1,"once": 1,"only": 1,"or": 1,
    "other": 1,"ought": 1,"our": 1,"ours": 1,"ourselves": 1,"out": 1,"over": 1,"own": 1,"same": 1,
    "shan't": 1,"she": 1,"she'd": 1,"she'll": 1,"she's": 1,"should": 1,"shouldn't": 1,"so": 1,"some": 1,
    "such": 1,"than": 1,"that": 1,"that's": 1,"the": 1,"their": 1,"theirs": 1,"them": 1,"themselves": 1,
    "then": 1,"there": 1,"there's": 1,"these": 1,"they": 1,"they'd": 1,"they'll": 1,"they're": 1,
    "they've": 1,"this": 1,"those": 1,"through": 1,"to": 1,"too": 1,"under": 1,"until": 1,"up": 1,
    "very": 1,"was": 1,"wasn't": 1,"we": 1,"we'd": 1,"we'll": 1,"we're": 1,"we've": 1,"were": 1,
    "weren't": 1,"what": 1,"what's": 1,"when": 1,"when's": 1,"where": 1,"where's": 1,"which": 1,
    "while": 1,"who": 1,"who's": 1,"whom": 1,"why": 1,"why's": 1,"with": 1,"won't": 1,"would": 1,
    "wouldn't": 1,"you": 1,"you'd": 1,"you'll": 1,"you're": 1,"you've": 1,"your": 1,"yours": 1,
    "yourself": 1,"yourselves": 1, "Subject": 1, "re": 1
}

## 基础函数

In [2]:
def extract_words(text, _stop_words):
    """ 提取邮件中的单词（重复单词只提取一次）

    :param text: 邮件正文
    :param _stop_words: 停用词
    :return: 单词列表
    """
    _words = re.findall('[a-zA-Z\d]{2,}', text)
    _word_set = []
    for w in _words:
        if w in _stop_words.keys() or re.search('\d', w):
            continue
        _word_set.append(w.lower())
    _word_set = list(set(_word_set))

    return _word_set
    
    
def load_email(filename, _stop_words):
    """ 根据文件名加载邮件
    
    :param filename: 文件名
    :param _stop_words: 停用词
    :return: 邮件（用一个单词列表表示）
    """
    with open(filename, "rb") as _fp:
        ec = chardet.detect(_fp.read())['encoding']
    with open(filename, "r", encoding=ec) as _fp:
        _email = extract_words(_fp.read(), _stop_words)
    return _email

    
def load_data_set(data_name):
    """ 加载数据

    :param data_name: 数据名：enron_email, ling_bare_email, CSDMC2010_email
    :return: 正常邮件集，垃圾邮件集
    """
    _emails_ham = {}; _emails_spam = {}
    for _fn in  os.listdir('./data/%s/ham' % data_name):
        if _fn == '.DS_Store':
            continue

        _email_list = load_email('./data/%s/ham/%s' % (data_name, _fn), stop_words)
        if len(_email_list) > 5:
            _emails_ham[_fn] = _email_list

    for _fn in  os.listdir('./data/%s/spam' % data_name):
        if _fn == '.DS_Store':
            continue

        _email_list = load_email('./data/%s/spam/%s' % (data_name, _fn), stop_words)
        if len(_email_list) > 5:
            _emails_spam[_fn] = _email_list

    return _emails_ham, _emails_spam

## 加载enron_email数据作为训练集

In [3]:
# 加载enron_email数据（较慢，可能需要1分钟）
enron_hams, enron_spams = load_data_set('enron_email')
print(len(enron_hams), len(enron_spams))

16363 16693


## 训练

enron_email 数据中取2000个作为测试集，不参与训练

In [4]:
def word_count(emails, test_fns=[]):
    """ 统计每个单词在邮件中出现的次数

    :param emails: 所有邮件
    :param test_fns: 测试用例的名字
    :return: 单词出现的次数
    """
    _word_count = {}
    for _fn in emails:
        if _fn in test_fns.keys():
            continue  # 排除测试用例
        for w in emails[_fn]:
            _word_count[w] = _word_count.get(w, 0) + 1
    return _word_count


def calc(email, _p_y1, _p_xi_y0, _p_xi_y1, _p_xi_y0_not_appear, _p_xi_y1_not_appear, is_use_not_appear=True):
    """ 计算一个邮件是否是垃圾邮件
    
    :param email: 邮件内容，由单词组成的list
    :param _p_y1: 样本中正常邮件所占的比率
    :param _p_xi_y0: 每个单词在垃圾邮件中出现的概率
    :param _p_xi_y1: 每个单词在正常邮件中出现的概率
    :param _p_xi_y0_not_appear: 垃圾邮件中未出现过的词，给一个很小的概率
    :param _p_xi_y1_not_appear: 正常邮件中未出现过的词，给一个很小的概率
    :param is_use_not_appear: 是否使用未出现过的词，True表示使用
    :return: 正常邮件与垃圾邮件概率的比值，大于1表示分类结果是正常邮件
    """
    rate = (_p_y1) / (1.0 - _p_y1)
    for w in email:
        if is_use_not_appear == False and (w not in _p_xi_y1 or w not in _p_xi_y0):
            continue
        rate *= _p_xi_y1.get(w, _p_xi_y0_not_appear) / _p_xi_y0.get(w, _p_xi_y1_not_appear)
        
    return rate


# 随机选取测试用例
test_spam_count = 2000; test_ham_count = 2000
enron_test_ham_fns = dict(zip(random.sample(enron_hams.keys(), test_ham_count), [1] * 2000))
enron_test_spam_fns = dict(zip(random.sample(enron_spams.keys(), test_ham_count), [1] * 2000))

# 统计每个单词分别在正常邮件、垃圾邮件中出现的次数
enron_ham_word_count = word_count(enron_hams, enron_test_ham_fns)
enron_spam_word_count = word_count(enron_spams, enron_test_spam_fns)
print(len(enron_ham_word_count), len(enron_spam_word_count))

# 计算概率
p_xi_y0 = {}; p_xi_y1 ={}
for w in enron_ham_word_count:
    p_xi_y1[w] = (enron_ham_word_count[w] + 1.0) / (len(enron_hams) + 2.0)  # 单词xi在正常邮件中出现的概率
for w in enron_spam_word_count:
    p_xi_y0[w] = (enron_spam_word_count[w] + 1.0) / (len(enron_spams) + 2.0)  # 单词xi在垃圾邮件中出现的概率
p_y1 = float(len(enron_hams)) / float(len(enron_hams) + len(enron_spams))  # 正常邮件的概率
p_xi_y1_not_appear = 1.0 / (2.0 + len(enron_spams))  # 拉普拉斯平滑，给未见过的词一个很小的概率
p_xi_y0_not_appear = 1.0 / (2.0 + len(enron_hams))  # 拉普拉斯平滑，给未见过的词一个很小的概率

print(p_y1)

47968 109712
0.49500847047434654


## 测试

### enron_email 测试集测试

In [5]:
# 测试
err_ham_count = 0; err_spam_count = 0
for fn in enron_test_ham_fns:
    rate = calc(enron_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate < 1:
        err_ham_count += 1

for fn in enron_test_spam_fns:
    rate = calc(enron_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate >= 1:
        err_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_ham_count, test_ham_count, 100.0 * err_ham_count / test_ham_count,\
       err_spam_count, test_spam_count, 100.0 * err_spam_count / test_spam_count))

# 忽略没有出现过的单词
err_ham_count = 0; err_spam_count = 0
for fn in enron_test_ham_fns:
    rate = calc(enron_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate < 1:
        err_ham_count += 1

for fn in enron_test_spam_fns:
    rate = calc(enron_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate >= 1:
        err_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_ham_count, test_ham_count, 100.0 * err_ham_count / test_ham_count,\
       err_spam_count, test_spam_count, 100.0 * err_spam_count / test_spam_count))

error ham: 15/2000 (0.75%); error spam: 30/2000 (1.50%)
error ham: 24/2000 (1.20%); error spam: 56/2000 (2.80%)


### ling_email 数据作为测试集

In [6]:
# 加载ling_email数据
ling_hams, ling_spams = load_data_set('ling_email')
print(len(ling_hams), len(ling_spams))

2410 480


In [7]:
# 测试ling_email数据
err_ling_ham_count = 0; err_ling_spam_count = 0
for fn in ling_hams:
    rate = calc(ling_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate < 1:
        err_ling_ham_count += 1

for fn in ling_spams:
    rate = calc(ling_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate >= 1:
        err_ling_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_ling_ham_count, len(ling_hams), 100.0 * err_ling_ham_count / len(ling_hams),\
       err_ling_spam_count, len(ling_spams), 100.0 * err_ling_spam_count / len(ling_spams)))

# 忽略没有出现过的单词
err_ling_ham_count = 0; err_ling_spam_count = 0
for fn in ling_hams:
    rate = calc(ling_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate < 1:
        err_ling_ham_count += 1

for fn in ling_spams:
    rate = calc(ling_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate >= 1:
        err_ling_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_ling_ham_count, len(ling_hams), 100.0 * err_ling_ham_count / len(ling_hams),\
       err_ling_spam_count, len(ling_spams), 100.0 * err_ling_spam_count / len(ling_spams)))

error ham: 543/2410 (22.53%); error spam: 14/480 (2.92%)
error ham: 281/2410 (11.66%); error spam: 17/480 (3.54%)


### CSDMC2010_email 数据作为测试集

可能正常邮件中有大量的单词在样本中没有出现过，导致正确率不高

In [8]:
# 加载CSDMC2010_email数据
csdmc_hams, csdmc_spams = load_data_set('CSDMC2010_email')
print(len(csdmc_hams), len(csdmc_spams))

2948 1376


In [9]:
# 测试CSDMC2010_email数据
err_csdmc_ham_count = 0; err_csdmc_spam_count = 0
for fn in csdmc_hams:
    rate = calc(csdmc_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate < 1:
        err_csdmc_ham_count += 1

for fn in csdmc_spams:
    rate = calc(csdmc_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear)
    if rate >= 1:
        err_csdmc_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_csdmc_ham_count, len(csdmc_hams), 100.0 * err_csdmc_ham_count / len(csdmc_hams),\
       err_csdmc_spam_count, len(csdmc_spams), 100.0 * err_csdmc_spam_count / len(csdmc_spams)))

# 忽略没有出现过的单词
err_csdmc_ham_count = 0; err_csdmc_spam_count = 0
for fn in csdmc_hams:
    rate = calc(csdmc_hams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate < 1:
        err_csdmc_ham_count += 1

for fn in csdmc_spams:
    rate = calc(csdmc_spams[fn], p_y1, p_xi_y0, p_xi_y1, p_xi_y0_not_appear, p_xi_y1_not_appear, False)
    if rate >= 1:
        err_csdmc_spam_count += 1

print('error ham: %s/%s (%.2f%%); error spam: %s/%s (%.2f%%)' % \
      (err_csdmc_ham_count, len(csdmc_hams), 100.0 * err_csdmc_ham_count / len(csdmc_hams),\
       err_csdmc_spam_count, len(csdmc_spams), 100.0 * err_csdmc_spam_count / len(csdmc_spams)))

error ham: 2111/2948 (71.61%); error spam: 2/1376 (0.15%)
error ham: 1637/2948 (55.53%); error spam: 17/1376 (1.24%)
