kaggle 垃圾短信分类，数据集https://www.kaggle.com/uciml/sms-spam-collection-dataset 参考:https://www.kaggle.com/pablovargas/naive-bayes-svm-spam-filtering

使用朴素贝叶斯进行文本分类。学习文本处理（分词，停用词过滤），贝叶斯分类器

延深：使用贝叶斯分类器进行中文邮件分类

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm
from sklearn.model_selection import train_test_split
import re
%matplotlib inline  

## 数据分析

In [2]:
data = pd.read_csv('D:/my/mldemo/emailcheck/data/spam.csv',encoding='latin-1')
data['v2'] = data['v2'].str.lower()
def filterdata(s):
    numcount = len(re.findall("\d+", s))
    strlen = len(s)
    if numcount/strlen>0.05:
        s = "adphone " + s 
    r = re.search("(http[s]?:\/\/)?[\w\d]+\.\s[\w\d]+\.\s*[\w]+",s,re.M|re.I)
    if r!=None:
        s = " adweb " + s  
    return  s

data['v2'] = data['v2'].apply(filterdata)
data.head(n=20)


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"go until jurong point, crazy.. available only ...",,,
1,ham,ok lar... joking wif u oni...,,,
2,spam,free entry in 2 a wkly comp to win fa cup fina...,,,
3,ham,u dun say so early hor... u c already then say...,,,
4,ham,"nah i don't think he goes to usf, he lives aro...",,,
5,spam,freemsg hey there darling it's been 3 week's n...,,,
6,ham,even my brother is not like to speak with me. ...,,,
7,ham,as per your request 'melle melle (oru minnamin...,,,
8,spam,winner!! as a valued network customer you have...,,,
9,spam,had your mobile 11 months or more? u r entitle...,,,


In [3]:
count1 = Counter(" ".join(data[data['v1']=='ham']["v2"]).split()).most_common(20) #查看正常短信中最常出现的20个词分别出现的次数，并用表格显示
df1 = pd.DataFrame.from_dict(count1)
df1 = df1.rename(columns={0: "words in non-spam", 1 : "count"})
df1

Unnamed: 0,words in non-spam,count
0,i,2172
1,you,1665
2,to,1544
3,the,1113
4,a,1046
5,u,874
6,and,845
7,in,786
8,my,741
9,is,710


In [4]:
count2 = Counter(" ".join(data[data['v1']=='spam']["v2"]).split()).most_common(20) #查看垃圾短信中最常出现的20个词分别出现的次数
df2 = pd.DataFrame.from_dict(count2)
df2 = df1.rename(columns={0: "words in non-spam", 1 : "count"})
df2

Unnamed: 0,words in non-spam,count
0,i,2172
1,you,1665
2,to,1544
3,the,1113
4,a,1046
5,u,874
6,and,845
7,in,786
8,my,741
9,is,710


## 特征工程

从上可以看到常出现的词都是一些无意义的介词，虚词等，因此需要先去除停用词

In [5]:
# 使用sklearn中CountVectoryzer获取文本特征，转化成一个向量形式，出现过的词为1，未出现的为0
f = feature_extraction.text.CountVectorizer(stop_words = 'english') 

X = f.fit_transform(data["v2"])
## 看一下去除之后的内容
a = pd.SparseDataFrame(X[0,:]).T
a.describe()

Unnamed: 0,0
count,13.0
mean,1.0
std,0.0
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


## 训练

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, data['v1'], test_size=0.33, random_state=42)
print([np.shape(X_train), np.shape(X_test)])
a =data["v1"]
a.head(10)

[(3733, 8406), (1839, 8406)]


0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: v1, dtype: object

In [114]:
list_alpha = np.arange(1/100000, 20, 0.11)
score_train = np.zeros(len(list_alpha))
score_test = np.zeros(len(list_alpha))
recall_test = np.zeros(len(list_alpha))
precision_test= np.zeros(len(list_alpha))
count = 0
for alpha in list_alpha:
    bayes = naive_bayes.MultinomialNB(alpha=alpha)
    bayes.fit(X_train, y_train)
    score_train[count] = bayes.score(X_train, y_train)
    score_test[count]= bayes.score(X_test, y_test)
    recall_test[count] = metrics.recall_score(y_test, bayes.predict(X_test), average="binary", pos_label="spam")
    precision_test[count] = metrics.precision_score(y_test, bayes.predict(X_test), average="binary", pos_label="spam")
    count = count + 1 

matrix = np.matrix(np.c_[list_alpha, score_train, score_test, recall_test, precision_test])
models = pd.DataFrame(data = matrix, columns = 
             ['alpha', 'Train Accuracy', 'Test Accuracy', 'Test Recall', 'Test Precision'])
models.head(n=20)


Unnamed: 0,alpha,Train Accuracy,Test Accuracy,Test Recall,Test Precision
0,1e-05,0.998393,0.974986,0.924603,0.896154
1,0.11001,0.997589,0.976618,0.944444,0.891386
2,0.22001,0.997857,0.978793,0.948413,0.901887
3,0.33001,0.997857,0.978249,0.948413,0.898496
4,0.44001,0.997053,0.978249,0.948413,0.898496
5,0.55001,0.996518,0.977705,0.948413,0.895131
6,0.66001,0.996518,0.977705,0.944444,0.898113
7,0.77001,0.99625,0.977162,0.936508,0.900763
8,0.88001,0.995714,0.977162,0.936508,0.900763
9,0.99001,0.995714,0.977162,0.93254,0.903846


In [115]:
best_index = models['Test Precision'].idxmax()
models.iloc[best_index, :]

alpha             15.620010
Train Accuracy     0.980980
Test Accuracy      0.970092
Test Recall        0.781746
Test Precision     1.000000
Name: 142, dtype: float64