# 中文邮件垃圾分类


数据集https://pan.baidu.com/s/1LKYOtDsYEZ9FibavauWLuQ

参考https://zhuanlan.zhihu.com/p/49040330


In [8]:
import numpy as np
import pandas as pd
import jieba
from collections import Counter
from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm
from sklearn.model_selection import train_test_split
##读入数据，由于数据格式是文本格式，需要转换成pandas

stopwords = []
with open("D:\my\mldemo\emailcheck\data\chinease_stop.txt",encoding="utf-8") as fp:
    stopwords = fp.read().splitlines()


    

def readdata(label,filepath):
    data = []
    with open(filepath,encoding="utf-8") as fp:
        for line in fp:
            tmp = {}
            words = jieba.cut(fp.readline())
            w = trimstropword(words)
            tmp['email'] = " ".join(w)
            tmp['label'] = label
            data.append(tmp)
    return data

def trimstropword(words):
    trimdwords = []
    for word in words:
        
        if word not in stopwords:
            trimdwords.append(word)
    return trimdwords







In [25]:
data = pd.DataFrame.from_dict(readdata("spam","D:\my\mldemo\emailcheck\data\spam_5000.utf8"))

df2 = pd.DataFrame.from_dict(readdata("ham","D:\my\mldemo\emailcheck\data\ham_5000.utf8"))
data = pd.concat([data,df2])
data.head(10)

Unnamed: 0,email,label
0,我司 一家 实业 贸易 定税 企业 余额 票向 外开 费用 相对 低 操作 方式 为贵 ...,spam
1,来京 记得 找 社为 提供 优惠 旅游 价格 酒店 机票 火车票 北京 地接 在线 ...,spam
2,您好 公司 可代开 商品销售 运输业 建筑安装 业 广告业 服务业 发票 税务局 验后 ...,spam
3,"也许 能够 找到 苦苦 追寻 程序代码 , 代码 价值 远远 超过 价格 , 此道 中人...",spam
4,中信 国际 电子科技 有限公司 推出 新 产品 升职 步步高 做生意 发大财 找 情人 ...,spam
5,中信 国际 电子科技 有限公司 推出 新 产品 升职 步步高 做生意 发大财 找 情人 ...,spam
6,中信 国际 电子科技 有限公司 推出 新 产品 升职 步步高 做生意 发大财 找 情人 ...,spam
7,您好 公司 多余 发票 外代 开 国税 地税 运输 广告 海关 缴款 书 贵 公司 ...,spam
8,公司 主营 国际 国内 飞机票 包定 国内外 酒店 服务 热情周到 北京 四环 以内 免...,spam
9,"公司 负责人 你们好 一家 专业 代理 公司 , 现在 多余 发票 向外 优惠 代开 具...",spam


In [27]:
count1 = Counter(" ".join(data[data['label']=='ham']["email"]).split()).most_common(20)

df1 = pd.DataFrame.from_dict(count1)
df1 = df1.rename(columns={0: "words in non-spam", 1 : "count"})
count2 = Counter(" ".join(data[data['label']=='spam']["email"]).split()).most_common(20)
df2 = pd.DataFrame.from_dict(count2)
df2 = df2.rename(columns={0: "words in spam", 1 : "count_"})


In [28]:
df1

Unnamed: 0,words in non-spam,count
0,",",2764
1,说,1651
2,一个,1305
3,没有,981
4,题,771
5,标,760
6,知道,740
7,会,734
8,现在,691
9,觉得,677


In [29]:
df2

Unnamed: 0,words in spam,count_
0,公司,1800
1,com,1743
2,",",1512
3,http,1326
4,发票,1325
5,www,885
6,有限公司,764
7,服务,653
8,@,616
9,电话,604


In [51]:
f = feature_extraction.text.CountVectorizer()
X = f.fit_transform(data["email"])
np.shape(X)

(5001, 23400)

In [31]:

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, data['label'], test_size=0.33, random_state=42)
print([np.shape(X_train), np.shape(X_test)])



[(3350, 23400), (1651, 23400)]


In [33]:
list_alpha = np.arange(1/100000, 20, 0.11)
score_train = np.zeros(len(list_alpha))
score_test = np.zeros(len(list_alpha))
recall_test = np.zeros(len(list_alpha))
precision_test= np.zeros(len(list_alpha))
count = 0
for alpha in list_alpha:
    bayes = naive_bayes.MultinomialNB(alpha=alpha)
    bayes.fit(X_train, y_train)
    score_train[count] = bayes.score(X_train, y_train)
    score_test[count]= bayes.score(X_test, y_test)
    recall_test[count] = metrics.recall_score(y_test, bayes.predict(X_test), average="binary", pos_label="spam")
    precision_test[count] = metrics.precision_score(y_test, bayes.predict(X_test), average="binary", pos_label="spam")
    count = count + 1 
matrix = np.matrix(np.c_[list_alpha, score_train, score_test, recall_test, precision_test])
models = pd.DataFrame(data = matrix, columns = 
             ['alpha', 'Train Accuracy', 'Test Accuracy', 'Test Recall', 'Test Precision'])
models.head(n=20)

Unnamed: 0,alpha,Train Accuracy,Test Accuracy,Test Recall,Test Precision
0,1e-05,0.998806,0.986069,0.974148,0.998795
1,0.11001,0.998806,0.993943,0.990599,0.997633
2,0.22001,0.998209,0.994549,0.992949,0.996462
3,0.33001,0.998209,0.992732,0.992949,0.992949
4,0.44001,0.998209,0.992732,0.992949,0.992949
5,0.55001,0.998209,0.992732,0.992949,0.992949
6,0.66001,0.997313,0.992732,0.992949,0.992949
7,0.77001,0.996716,0.992732,0.994125,0.991794
8,0.88001,0.996716,0.993337,0.9953,0.991803
9,0.99001,0.996418,0.992732,0.994125,0.991794


In [34]:
best_index = models['Test Precision'].idxmax()
models.iloc[best_index, :]

alpha             0.000010
Train Accuracy    0.998806
Test Accuracy     0.986069
Test Recall       0.974148
Test Precision    0.998795
Name: 0, dtype: float64

In [35]:
bayes = naive_bayes.MultinomialNB(alpha=0.00001)
bayes.fit(X, data['label'])
score =  bayes.score(X, data['label'])


In [36]:
score

0.9994001199760048

In [62]:
t = pd.Series(["我司 免费 办理 发票 代开 业务","免费 领取 优惠券 联系 电话 13800000000","明天 上午 十点 办公室 开会","你的 发票 拿错 了","我们 彻底 搜索 了 课程 目录 并 找到 符合 您 兴趣 的 课程 和 专项 课程"])
Xt = f.transform(t)

np.shape(Xt)

(5, 23400)

In [63]:
predicted = bayes.predict(Xt)


In [64]:
predicted

array(['spam', 'ham', 'ham', 'spam', 'ham'], dtype='<U4')