In [1]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=FutureWarning,
                        message="'multi_class' was deprecated")
warnings.filterwarnings("ignore", category=FutureWarning,
                        message="The SAMME.R algorithm .* is deprecated")
import datetime
import pandas as pd
from sklearn.utils import Bunch


In [2]:
# Load the training and evaluation workbooks and sanity-check them.
# Both files are read with the first row as header; the code relies on the
# columns 主题名 (literally "topic name") and 分词后内容 (literally
# "content after word segmentation").


def build_dataset(frame):
    """Wrap the label and text columns of ``frame`` in a ``Bunch``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 主题名 and 分词后内容.

    Returns
    -------
    Bunch
        ``data`` is the 分词后内容 column; ``target`` and ``target_names`` are
        both the 主题名 column (one entry per row, exactly as before).
    """
    # NOTE(review): target_names holds one label per row rather than the set of
    # distinct labels. Nothing visible in this notebook reads it, so it is kept
    # unchanged for compatibility -- confirm before relying on it.
    return Bunch(target=frame.主题名, target_names=frame.主题名, data=frame.分词后内容)


def summarize_dataset(dataset, expected_topics=('主题1', '主题2', '主题3', '主题4', '主题5')):
    """Count rows per topic and count rows whose text is unusable.

    Parameters
    ----------
    dataset : Bunch
        Object produced by ``build_dataset`` (``target`` and ``data`` are
        pandas Series).
    expected_topics : tuple of str
        Labels that are always listed in the result, even with a count of 0.

    Returns
    -------
    (dict, int)
        ``dict`` maps each label to its row count: expected labels first, in
        the given order, followed by any unexpected label found in the data
        (the previous hard-coded dict raised ``KeyError`` for those). Rows
        with a missing label are not counted in the dict.
        ``int`` is the number of rows whose text is missing or blank.
    """
    per_topic = dict.fromkeys(expected_topics, 0)
    for topic, n_rows in dataset.target.value_counts().items():
        # int() keeps the printed dict free of NumPy scalar wrappers.
        per_topic[topic] = int(n_rows)

    # A missing cell used to be stringified to 'nan' (length 3) and therefore
    # passed the "fewer than 1 character" test; treat it as empty instead.
    blank_mask = dataset.data.fillna('').astype(str).str.strip().eq('')
    return per_topic, int(blank_mask.sum())


# Load the training data
print('导入训练数据' + str(datetime.datetime.now()))

train_df = pd.read_excel('主题分类训练集.xlsx', header=0)
dataset_train = build_dataset(train_df)

# Validate the training data: per-topic counts and number of empty texts
categories, err = summarize_dataset(dataset_train)

# Print the per-topic counts, the total row count and the anomaly count
print(categories)
print('训练数据数量：' + str(len(dataset_train.data)) + '，异常数据数量：' + str(err)
      + '，结束时间：' + str(datetime.datetime.now()))
print('')

# Load the evaluation data
print('导入评估数据' + str(datetime.datetime.now()))

test_df = pd.read_excel('主题分类测试集.xlsx', header=0)
dataset_test = build_dataset(test_df)

# Validate the evaluation data with the same rule as the training data
# (a text is anomalous when it is missing or has fewer than 1 character)
categories, err = summarize_dataset(dataset_test)

# Print the per-topic counts, the total row count and the anomaly count
print(categories)

print('评估数据数量：' + str(len(dataset_test.data)) + '，异常数据数量：' + str(err)
      + '，结束时间：' + str(datetime.datetime.now()))


导入训练数据2024-12-24 10:33:21.533656
{'主题1': 36736, '主题2': 29553, '主题3': 29445, '主题4': 32538, '主题5': 34404}
训练数据数量：162676，异常数据数量：0，结束时间：2024-12-24 10:33:29.185294

导入评估数据2024-12-24 10:33:29.185294
{'主题1': 501, '主题2': 501, '主题3': 501, '主题4': 501, '主题5': 501}
评估数据数量：2505，异常数据数量：0，结束时间：2024-12-24 10:33:29.325902


In [3]:
# 2. Data preparation and understanding
print(f'开始时间：{datetime.datetime.now()}')

# Both vectorizers below are built with exactly the same options.
_vectorizer_kwargs = {'stop_words': 'english', 'decode_error': 'ignore'}

# Raw term counts
print(f'计算词频{datetime.datetime.now()}')
count_vect = CountVectorizer(**_vectorizer_kwargs)
X_train_counts = count_vect.fit_transform(dataset_train.data)

# Inspect the dimensions of the count matrix
print(X_train_counts.shape)
print(f'结束时间：{datetime.datetime.now()}')
print(' ')

# TF-IDF: term frequency x inverse document frequency
#   = (occurrences of the term / document length)
#     x log(total documents / documents containing the term)
# NOTE(review): this is the textbook definition; TfidfVectorizer applies its
# own smoothing/normalisation defaults, so exact values may differ -- confirm
# against the library documentation if the numbers matter.
print(f'计算TF-IDF{datetime.datetime.now()}')
tf_transformer = TfidfVectorizer(**_vectorizer_kwargs)
X_train_counts_tf = tf_transformer.fit_transform(dataset_train.data)

# Inspect the dimensions of the TF-IDF matrix
print(X_train_counts_tf.shape)

print(f'\n结束时间：{datetime.datetime.now()}')

开始时间：2024-12-24 10:33:29.333939
计算词频2024-12-24 10:33:29.333939
(162676, 39698)
结束时间：2024-12-24 10:33:30.593467
 
计算TF-IDF2024-12-24 10:33:30.593467
(162676, 39698)

结束时间：2024-12-24 10:33:31.645082


In [4]:
# 3. Compare candidate algorithms with shuffled k-fold cross-validation
print('开始时间：' + str(datetime.datetime.now()))

# Shared evaluation settings (the tuning cell further down reads these too)
num_folds = 10
seed = 7
scoring = 'accuracy'

# Candidate models.
# `multi_class` is deliberately not passed to LogisticRegression: the argument
# is deprecated (this notebook filters the resulting FutureWarning) and the
# library default selects the same behaviour as the former 'auto'.
models = {}
models['LR'] = LogisticRegression(solver='lbfgs', max_iter=300)  # logistic regression
# Seeded so that the CART score is reproducible from one run to the next.
models['CART'] = DecisionTreeClassifier(random_state=seed)  # decision tree
models['MNB'] = MultinomialNB()  # multinomial naive Bayes
models['KNN'] = KNeighborsClassifier()  # k-nearest neighbours

# One splitter for every model: with a fixed integer random_state each call
# yields the same folds, so there is no need to rebuild it inside the loop.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

# Score every model on the TF-IDF training matrix
results = []
for key, estimator in models.items():
    print(key + '算法' + str(datetime.datetime.now()))
    cv_results = cross_val_score(estimator, X_train_counts_tf, dataset_train.target, cv=kfold, scoring=scoring)
    results.append(cv_results)
    # mean accuracy (standard deviation across folds)
    print('%s : %f (%f)，结束时间：%s' % (key, cv_results.mean(), cv_results.std(), datetime.datetime.now()))

print('\n结束时间：' + str(datetime.datetime.now()))

开始时间：2024-12-24 10:33:31.653188
LR算法2024-12-24 10:33:31.653188
LR : 0.912021 (0.002375)，结束时间：2024-12-24 10:35:30.664876
CART算法2024-12-24 10:35:30.664876
CART : 0.807513 (0.003218)，结束时间：2024-12-24 10:48:02.888457
MNB算法2024-12-24 10:48:02.888457
MNB : 0.847144 (0.001962)，结束时间：2024-12-24 10:48:06.722548
KNN算法2024-12-24 10:48:06.722548
KNN : 0.764286 (0.002418)，结束时间：2024-12-24 10:54:32.548042

结束时间：2024-12-24 10:54:32.548042


In [5]:
# 4. Hyper-parameter tuning
print('开始时间：' + str(datetime.datetime.now()))


def tune_model(name, estimator, param_grid):
    """Grid-search one estimator on the TF-IDF training matrix and print the winner.

    Uses the notebook-level ``num_folds``, ``seed`` and ``scoring`` settings and
    the ``X_train_counts_tf`` / ``dataset_train`` objects built in earlier cells.

    Parameters
    ----------
    name : str
        Short label printed in front of the progress messages.
    estimator : sklearn estimator
        Unfitted model to tune.
    param_grid : dict
        Parameter name -> list of candidate values.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_score_``, ``best_params_``, ...).
    """
    print(name + ' 算法调参' + str(datetime.datetime.now()))
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring=scoring, cv=kfold)
    grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
    print('最优 : %s 使用 %s，结束时间：%s' % (grid_result.best_score_, grid_result.best_params_, datetime.datetime.now()))
    return grid_result


# (label, estimator, grid) for every model to tune, in execution order.
# NOTE(review): in the saved run every winning value sat on the edge of its
# grid (C=200, alpha=1.5, max_depth=200, n_neighbors=3), which suggests the
# candidate ranges should be widened -- confirm with a follow-up search.
tuning_jobs = [
    # Logistic regression is tuned through C. `multi_class` is not passed: it
    # is deprecated and the default selects the same behaviour as 'auto'.
    ('LR', LogisticRegression(solver='lbfgs', max_iter=10000),
     {'C': [13, 15, 20, 200]}),
    # Multinomial naive Bayes is tuned through the smoothing parameter alpha.
    ('MNB', MultinomialNB(),
     {'alpha': [0.001, 0.01, 0.1, 1.5]}),
    # The decision tree is tuned through its maximum depth; seeded so the
    # search gives the same answer on every run.
    ('CART', DecisionTreeClassifier(criterion='gini', random_state=seed),
     {'max_depth': [90, 100, 120, 150, 200]}),
    # k-nearest neighbours is tuned through the number of neighbours k.
    ('KNN', KNeighborsClassifier(),
     {'n_neighbors': [3, 5, 7, 9, 11]}),
]

# Keep every fitted search (previously each one overwrote the last, so only
# the KNN result was still available after the cell finished).
tuning_results = {}
for position, (name, estimator, param_grid) in enumerate(tuning_jobs):
    if position:
        print(' ')  # blank separator between two consecutive reports
    grid_result = tune_model(name, estimator, param_grid)
    tuning_results[name] = grid_result

开始时间：2024-12-24 10:54:32.563065
LR 算法调参2024-12-24 10:54:32.563065
最优 : 0.9339546331352864 使用 {'C': 200}，结束时间：2024-12-24 11:05:23.785624
 
MNB 算法调参2024-12-24 11:05:23.785624
最优 : 0.8484164942143181 使用 {'alpha': 1.5}，结束时间：2024-12-24 11:05:38.177778
 
CART 算法调参2024-12-24 11:05:38.177778
最优 : 0.6280582664147583 使用 {'max_depth': 200}，结束时间：2024-12-24 11:27:45.789688
 
KNN 算法调参2024-12-24 11:27:45.789688
最优 : 0.7693636755852352 使用 {'n_neighbors': 3}，结束时间：2024-12-24 11:57:20.094119


In [9]:
# 5. Train the chosen model on the full training set and score it on the
#    separate evaluation set.
print('开始时间：'+str(datetime.datetime.now()))

# C=200 is the best value reported by the saved LR grid-search output.
# `multi_class` is not passed: the argument is deprecated and the library
# default selects the same behaviour as the former 'auto'.
model = LogisticRegression(solver='lbfgs', max_iter=10000, C=200)
model.fit(X_train_counts_tf, dataset_train.target)  # train the model

# Only transform here: the vectorizer was already fitted on the training text,
# so the evaluation matrix uses the same vocabulary and weights.
X_test_counts = tf_transformer.transform(dataset_test.data)  # evaluation features
predictions = model.predict(X_test_counts)  # predict

print('准确率：' + str(accuracy_score(dataset_test.target, predictions)))
print(classification_report(dataset_test.target, predictions))

开始时间：2024-12-24 13:51:48.208469
准确率：0.9297405189620759
              precision    recall  f1-score   support

         主题1       0.92      0.94      0.93       501
         主题2       0.93      0.91      0.92       501
         主题3       0.94      0.94      0.94       501
         主题4       0.95      0.94      0.94       501
         主题5       0.91      0.93      0.92       501

    accuracy                           0.93      2505
   macro avg       0.93      0.93      0.93      2505
weighted avg       0.93      0.93      0.93      2505

