In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# 获取数据
def get_data():
    '''
    获取数据
    :return: 文本数据，对应的labels
    '''
    with open("ham_data.txt", encoding="utf8") as ham_f, open("spam_data.txt", encoding="utf8") as spam_f:
        # 正常的邮件数据
        ham_data = ham_f.readlines()
        # 垃圾邮件数据
        spam_data = spam_f.readlines()
        
        # 正常数据标记为1
        ham_label = np.ones(len(ham_data)).tolist()
        # 垃圾邮件数据标记为0
        spam_label = np.zeros(len(spam_data)).tolist()
        
        # 数据集集合
        corpus = ham_data + spam_data
         
        # 标记数据集合
        labels = ham_label + spam_label

    return corpus, labels

In [3]:
# 数据分割
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    '''
    :param corpus: 文本数据
    :param labels: label数据
    :param test_data_proportion:测试数据占比 
    :return: 训练数据,测试数据，训练label,测试label
    '''
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                        test_size=test_data_proportion, random_state=42)
    return train_X, test_X, train_Y, test_Y

In [4]:
# 移除空的数据
def remove_empty_docs(corpus, labels):
    filtered_corpus = []
    filtered_labels = []
    for doc, label in zip(corpus, labels):
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)

    return filtered_corpus, filtered_labels

In [5]:
corpus, labels = get_data()  # 获取数据集

print("总的数据量:", len(labels))

corpus, labels = remove_empty_docs(corpus, labels)

print('样本之一:', corpus[10])
print('样本的label:', labels[10])
label_name_map = ["垃圾邮件", "正常邮件"]
print('实际类型:', label_name_map[int(labels[10])], label_name_map[int(labels[5900])])

# 对数据进行划分
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,
                                                                        labels,
                                                                        test_data_proportion=0.3)

总的数据量: 10001
样本之一: 北京售票员可厉害，嘿嘿，有专座的，会直接拉着脖子指着鼻子让上面的人站起来让 座的，呵呵，比较赞。。。 杭州就是很少有人给让座，除非司机要求乘客那样做。 五一去杭州一个景点玩，车上有两个不到一岁的小孩，就是没有人给让座，没办法家长只能在车上把小孩的推车打开让孩子坐进去，但是孩子还是闹，只能抱着，景点离市区很远，车上很颠，最后家长坐在地上抱孩子，就是没有一个人给让座，要是在北京，一上车就有人让座了

样本的label: 1.0
实际类型: 正常邮件 垃圾邮件


In [6]:
# 归一化函数

import re
import string
import jieba

# 加载停用词
with open("stop_words.utf8", encoding="utf8") as f:
    stopword_list = f.readlines()

# 分词
def tokenize_text(text):
    tokens = jieba.cut(text)
    tokens = [token.strip() for token in tokens]
    return tokens


# 去掉特殊符号
def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

# 取出停用词
def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ''.join(filtered_tokens)
    return filtered_text

# 标准化数据集
def normalize_corpus(corpus, tokenize=False):
    # 声明一个变量用来存储标准化后的数据
    normalized_corpus = []
    for text in corpus:
            # 去掉特殊符号
        text = remove_special_characters(text)
            # 取出停用词
        text = remove_stopwords(text)
        normalized_corpus.append(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)

    return normalized_corpus

In [7]:

# 进行归一化
norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/yk/j1gndz3j7zzd7vxlzs6wj1q00000gn/T/jieba.cache
Loading model cost 0.985 seconds.
Prefix dict has been built succesfully.


In [8]:
# 特征提取

import gensim
import jieba

from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_extractor(corpus, ngram_range=(1, 1)):
    vectorizer = TfidfVectorizer(min_df=1,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

# tfidf 特征    标准化后的数据送入函数进行提取
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)


# 导入贝叶斯模型
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()


from sklearn import metrics

# 模型性能预测函数
def get_metrics(true_labels, predicted_labels):
    print('准确率:', np.round(
        metrics.accuracy_score(true_labels,
                               predicted_labels),
        2))
    print('精度:', np.round(
        metrics.precision_score(true_labels,
                                predicted_labels,
                                average='weighted'),
        2))
    print('召回率:', np.round(
        metrics.recall_score(true_labels,
                             predicted_labels,
                             average='weighted'),
        2))
    print('F1得分:', np.round(
        metrics.f1_score(true_labels,
                         predicted_labels,
                         average='weighted'),
        2))


# 模型调用函数，这样做的好处是，可以自己人选模型
def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    # 模型构建
    classifier.fit(train_features, train_labels)
    # 使用哪个模型做预测
    predictions = classifier.predict(test_features)
    # 评估模型预测性能
    get_metrics(true_labels=test_labels,
                predicted_labels=predictions)
    return predictions


# 基于tfidf的多项式朴素贝叶斯模型
print("基于tfidf的贝叶斯模型")
mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                                     train_features=tfidf_train_features,
                                                     train_labels=train_labels,
                                                     test_features=tfidf_test_features,
                                                     test_labels=test_labels)



基于tfidf的贝叶斯模型
准确率: 0.79
精度: 0.85
召回率: 0.79
F1得分: 0.78
