# 导入需要用到的库

In [61]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
import joblib

## 先将要分的类用键值对表示，使用int类型数据方便缩小tag的长度

In [62]:
LABEL_MAP = {'现代文' : 0, '文言文': 1, '诗' : 2, '词' : 3}
PATH = "D:/project/ancientModernProseClassifier/data/"

# ↓导入所有分词完成之后的文本和数据，保存在documents的列表中,标签保存在labels列表

In [63]:
documents = []
labels = []

for label_dir in os.listdir(PATH):
    file_path = os.path.join(PATH, label_dir)
    for file in os.listdir(file_path):
        filename = os.path.join(file_path, file)
        with open(filename, 'r', encoding='UTF-8') as fr:
            for line in fr.readlines():
                if (line != '\n'):
                    labels.append(LABEL_MAP[label_dir])
                    documents.append(line)

# 划分训练集和测试集

In [64]:
train_data_list, test_data_list, train_class_list, test_class_list = train_test_split(documents, labels, test_size=0.2)

#### 验证训练集和测试集的大小是否是0.8:0.2

In [65]:
len(train_data_list) / len(test_data_list)

3.9999976295622037

# 计算每段文本的TF-IDF，生成矩阵

In [66]:
train_tfidf_vector = TfidfVectorizer(token_pattern='\\b\\w+\\b', max_df=0.5)
train_x = train_tfidf_vector.fit_transform(train_data_list)

In [67]:
train_x

<1687451x3151396 sparse matrix of type '<class 'numpy.float64'>'
	with 48051151 stored elements in Compressed Sparse Row format>

In [68]:
test_tfidf_vector = TfidfVectorizer(max_df=0.5, vocabulary=train_tfidf_vector.vocabulary_)
test_x = test_tfidf_vector.fit_transform(test_data_list)

In [69]:
test_x

<421863x3151396 sparse matrix of type '<class 'numpy.float64'>'
	with 8746856 stored elements in Compressed Sparse Row format>

# 训练模型

In [72]:
MultinomialNB_clf = MultinomialNB(alpha = 0.1)
BernoulliNB_clf = BernoulliNB(alpha = 0.01)
MultinomialNB_clf.fit(train_x, train_class_list)
BernoulliNB_clf.fit(train_x, train_class_list)

BernoulliNB(alpha=0.01)

# 测试模型

In [77]:
MultinomialNB_pred = MultinomialNB_clf.predict(test_x)
BernoulliNB_pred = BernoulliNB_clf.predict(test_x)
print("多项式朴素贝叶斯的accuracy_socre为:", accuracy_score(test_class_list, MultinomialNB_pred))
print("伯努利朴素贝叶斯的accuracy_socre为:", accuracy_score(test_class_list, BernoulliNB_pred))

多项式朴素贝叶斯的accuracy_socre为: 0.914946321436106
伯努利朴素贝叶斯的accuracy_socre为: 0.90774019053579


# 保存模型

In [78]:
joblib.dump(train_tfidf_vector, 'TF-IDF_model.pkl')
joblib.dump(MultinomialNB_clf, 'MultinomialNB_clf_model.pkl')
joblib.dump(BernoulliNB_clf, 'BernoulliNB_clf_model.pkl')

['BernoulliNB_clf_model.pkl']