# 读取数据

In [1]:
from instrument import read_bunch

train_bunch_path = './data_bunch/train_bunch_balance.dat'       # 训练集保存地址，由于后期需要调用该数据且地址也相同，故选择该地址
train_bunch = read_bunch(train_bunch_path)

validate_bunch_path = './data_bunch/validate_bunch.dat' # 验证集保存地址，由于后期需要调用该数据且地址也相同，故选择该地址
validate_bunch = read_bunch(validate_bunch_path)

# 寻找 max_df 和 alpha

In [4]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics, feature_selection
from sklearn.metrics import classification_report
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from instrument import read_bunch
stop_words_list = None

results = [['min_df', 'max_df', 'feature percent', 'alpha', 'accuracy_best', 'accuracy_test']]
for min_df in [0,0.0001,0.005,0.01]:              # 控制 min_df ，尽量小
    for max_df in [0.6,0.7,0.8,0.9,0.99]:          # 控制 max_df ，尽量大
        tfidf_train = Bunch()
        tfidf_train = Bunch(Id=train_bunch.news_id, Label=train_bunch.news_pic_label, tdm=[], vocabulary={})
        train_vectorizer = TfidfVectorizer(stop_words=stop_words_list, sublinear_tf=True, min_df=min_df,
                                            max_df=max_df)
        tfidf_train.tdm = train_vectorizer.fit_transform(train_bunch.news_words_jieba)  # 取 jieba 分词结果进行模型训练
#         tfidf_train.tdm = train_vectorizer.fit_transform(train_bunch.news_words_ltp)  # 取 ltp 分词结果进行模型训练
        tfidf_train.vocabulary = train_vectorizer.vocabulary_

        # 训练集与测试集划分
        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(tfidf_train.tdm, tfidf_train.Label, test_size=0.3,
                                                            random_state=30)
        for feature_percent in [0.9,0.95,0.99]:   # 控制参加模型训练的特征的比例
            feature_select = feature_selection.SelectPercentile(feature_selection.chi2, percentile=feature_percent)
            x_train_fs = feature_select.fit_transform(x_train, y_train)
            x_test_fs = feature_select.transform(x_test)

            model = MultinomialNB(alpha=0.1)
            param = {'alpha': [0.9,0.5,0.099,0.055,0.001]}
            gs_model = GridSearchCV(model, param, scoring='accuracy', verbose=0, cv=5, n_jobs=-1)

            gs_model.fit(x_train_fs, y_train)
            y_true, y_pred = y_test, gs_model.predict(x_test_fs)

            print(''.center(130, '*'))
            print('Accuracy is %g' % metrics.accuracy_score(y_true, y_pred))
            print('The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is :',
                    gs_model.score(x_test_fs, y_test))
            print(classification_report(y_test, y_pred))
            print('Current best parameters and result :',
                    [min_df, max_df, feature_percent, gs_model.best_params_['alpha'], gs_model.best_score_,
                    gs_model.score(x_test_fs, y_test)])
            results.append([min_df, max_df, feature_percent, gs_model.best_params_['alpha'], gs_model.best_score_,
                            gs_model.score(x_test_fs, y_test)])
            print(''.center(130, '*'))

# 查找使贝叶斯模型在验证集上性能最佳的参数组合并进行输出
accu = []
for index in range(len(results)):
    accu.append(results[index][5])
index_best_param = accu.index(max(accu[1:]))
print('Best parameters: ', results[index_best_param])

# 将迭代结果写入本地
# import csv
# store_pamaters_path = './paramter/pamaters_jieba.csv'
# # store_pamaters_path = './paramter/pamaters_ltp.csv'
# with open(store_pamaters_path, 'w', newline="") as file:
#     csv.writer(file).writerows(results)

**********************************************************************************************************************************
Accuracy is 0.614139
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.6141394905974767
             precision    recall  f1-score   support

          0       0.58      0.80      0.67      7150
          1       0.61      0.24      0.35      4533
          2       0.69      0.68      0.69      5121

avg / total       0.62      0.61      0.59     16804

Current best parameters and result : [0, 0.6, 0.9, 0.001, 0.6379402688158331, 0.6141394905974767]
**********************************************************************************************************************************
**********************************************************************************************************************************
Accuracy is 0.615211
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.61521066412758

**********************************************************************************************************************************
Accuracy is 0.615151
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.615151154487027
             precision    recall  f1-score   support

          0       0.58      0.80      0.67      7150
          1       0.60      0.25      0.35      4533
          2       0.69      0.68      0.69      5121

avg / total       0.62      0.62      0.59     16804

Current best parameters and result : [0, 0.9, 0.99, 0.001, 0.6437552602718764, 0.615151154487027]
**********************************************************************************************************************************
**********************************************************************************************************************************
Accuracy is 0.614139
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.614139490597476

**********************************************************************************************************************************
Accuracy is 0.566591
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.5665912877886218
             precision    recall  f1-score   support

          0       0.51      0.92      0.66      7150
          1       0.78      0.04      0.08      4533
          2       0.74      0.54      0.63      5121

avg / total       0.65      0.57      0.49     16804

Current best parameters and result : [0.0001, 0.8, 0.95, 0.001, 0.5694865974648677, 0.5665912877886218]
**********************************************************************************************************************************
**********************************************************************************************************************************
Accuracy is 0.567662
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.56766246

**********************************************************************************************************************************
Accuracy is 0.451083
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.45108307545822424
             precision    recall  f1-score   support

          0       0.44      1.00      0.61      7150
          1       0.62      0.01      0.01      4533
          2       0.89      0.09      0.16      5121

avg / total       0.62      0.45      0.31     16804

Current best parameters and result : [0.005, 0.7, 0.9, 0.001, 0.4531102552985284, 0.45108307545822424]
**********************************************************************************************************************************
**********************************************************************************************************************************
Accuracy is 0.452333
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.45233277

**********************************************************************************************************************************
Accuracy is 0.452333
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.4523327779100214
             precision    recall  f1-score   support

          0       0.44      0.99      0.61      7150
          1       0.61      0.01      0.01      4533
          2       0.88      0.09      0.17      5121

avg / total       0.62      0.45      0.31     16804

Current best parameters and result : [0.005, 0.99, 0.99, 0.001, 0.45489555969292766, 0.4523327779100214]
**********************************************************************************************************************************
**********************************************************************************************************************************
Accuracy is 0.442811
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.4428112

**********************************************************************************************************************************
Accuracy is 0.446382
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.44638181385384434
             precision    recall  f1-score   support

          0       0.44      1.00      0.61      7150
          1       0.55      0.01      0.01      4533
          2       0.89      0.07      0.13      5121

avg / total       0.61      0.45      0.30     16804

Current best parameters and result : [0.01, 0.9, 0.95, 0.001, 0.4492590986763243, 0.44638181385384434]
**********************************************************************************************************************************
**********************************************************************************************************************************
Accuracy is 0.446501
The accuracy of Naive_Bayes Classifier on dataset wich splis from train data is : 0.44650083

# 预测

In [None]:
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import feature_selection
from sklearn.naive_bayes import MultinomialNB
# from instrument import read_bunch

# load train data and validate data
train_bunch_path = './data_bunch/train_bunch_balance.dat'
train_bunch = read_bunch(train_bunch_path)
validate_bunch_path = './data_bunch/validate_bunch_balance.dat'
validate_bunch = read_bunch(validate_bunch_path)

# 构建训练集的 TF-IDF 词向量空间对象（分为 jieba 和 ltp ）
tfidf_train = Bunch(Id=train_bunch.news_id, Label=train_bunch.news_pic_label, tdm=[], vocabulary={})
train_vectorizer = TfidfVectorizer(stop_words=stop_words_list, sublinear_tf=True, min_df=best_min_df,
                                       max_df=best_max_df)
tfidf_train.tdm = train_vectorizer.fit_transform(train_bunch.news_words_jieba)  # 取 jieba 分词结果进行模型训练
# tfidf_train.tdm = train_vectorizer.fit_transform(train_bunch.news_words_ltp)  # 取 ltp 分词结果进行模型训练
tfidf_train.vocabulary = train_vectorizer.vocabulary_
# 构建验证集的 TF-IDF 词向量空间对象
tfidf_validate = Bunch(Id=validate_bunch.news_id, tdm=[], vocabulary={})
tfidf_validate.vocabulary = tfidf_train.vocabulary
validate_vectorizer = TfidfVectorizer(stop_words=stop_words_list, sublinear_tf=True,
                                      min_df=best_min_df, max_df=best_max_df, vocabulary=tfidf_train.vocabulary)
tfidf_validate.tdm = validate_vectorizer.fit_transform(validate_bunch.news_words_jieba)  # jieba 分词结果
# tfidf_validate.tdm = validate_vectorizer.fit_transform(validate_bunch.news_words_ltp)  # ltp 分词结果

# 利用训练集所有数据训练模型并对验证集进行预测
model = MultinomialNB(best_alpha)
feature_select = feature_selection.SelectPercentile(feature_selection.chi2, percentile=best_feature_percentage)
x_train = feature_select.fit_transform(tfidf_train.tdm, tfidf_train.Label)
x_validate = feature_select.transform(tfidf_validate.tdm)
model.fit(x_train, tfidf_train.Label)
predict_label = model.predict(x_validate)