In [1]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=FutureWarning,
                        message="'multi_class' was deprecated")
warnings.filterwarnings("ignore", category=FutureWarning,
                        message="The SAMME.R algorithm .* is deprecated")
import datetime
import pandas as pd
from sklearn.utils import Bunch


In [2]:
# 导入训练数据
print('导入训练数据' + str(datetime.datetime.now()))

train_df = pd.read_excel('分类训练集.xlsx', header=0)
dataset_train = Bunch(target=train_df.歌名, target_names=train_df.歌名, data=train_df.分词后内容)

# 检查训练数据是否合规
categories = {'于是':0, '孤独患者':0, '罗生门':0}
i = 0; err = 0
while i < len(dataset_train.data):
    if len(str(dataset_train.data[i])) < 1: err = err + 1 # 文本少于1个字视为异常
    categories[dataset_train.target[i]] += 1 # 累计各类文本数量
    i = i + 1

# 打印出 各类训练数量、总数量 及 异常数量
print(categories) 
print('训练数据数量：' + str(len(dataset_train.data)) + '，异常数据数量：' + str(err) 
      + '，结束时间：' + str(datetime.datetime.now()))
print('')

# 导入评估数据
print('导入评估数据' + str(datetime.datetime.now()))

test_df = pd.read_excel('分类测试集.xlsx', header=0)
dataset_test = Bunch(target=test_df.歌名, target_names=test_df.歌名, data=test_df.分词后内容)

# 检查评估数据是否合规
categories = {'于是':0, '孤独患者':0, '罗生门':0}
i = 0; err = 0
while i < len(dataset_test.data):
    if len(str(dataset_test.data[i])) < 10: err = err + 1 # 文本少于10个字视为异常
    categories[dataset_test.target[i]] += 1 # 累计各类文本数量
    i = i + 1

# 打印出 各类评估数量、总数量 及 异常数量
print(categories) 

print('评估数据数量：' + str(len(dataset_test.data)) + '，异常数据数量：' + str(err) 
      + '，结束时间：' + str(datetime.datetime.now()))


导入训练数据2024-12-25 07:45:45.397573
{'于是': 54725, '孤独患者': 55583, '罗生门': 53368}
训练数据数量：163676，异常数据数量：0，结束时间：2024-12-25 07:45:54.179471

导入评估数据2024-12-25 07:45:54.179471
{'于是': 501, '孤独患者': 501, '罗生门': 503}
评估数据数量：1505，异常数据数量：979，结束时间：2024-12-25 07:45:54.289067


In [3]:
# 2.数据准备与理解
print('开始时间：' + str(datetime.datetime.now()))

# 计算词频
print('计算词频' + str(datetime.datetime.now()))
count_vect = CountVectorizer(stop_words='english', decode_error='ignore')
X_train_counts = count_vect.fit_transform(dataset_train.data)

# 查看数据维度
print(X_train_counts.shape)
print('结束时间：' + str(datetime.datetime.now()))
print(' ')

# 计算TF-IDF：词频 x 逆文本频率 = (特定词语的词频/文件长度) x log(总文档数/含有特定词的文档数)
print('计算TF-IDF' + str(datetime.datetime.now()))
tf_transformer = TfidfVectorizer(stop_words='english', decode_error='ignore')
X_train_counts_tf = tf_transformer.fit_transform(dataset_train.data)
# 查看数据维度
print(X_train_counts_tf.shape)

print('\n结束时间：' + str(datetime.datetime.now()))

开始时间：2024-12-25 07:45:54.302334
计算词频2024-12-25 07:45:54.302334
(163676, 39814)
结束时间：2024-12-25 07:45:55.273144
 
计算TF-IDF2024-12-25 07:45:55.273144
(163676, 39814)

结束时间：2024-12-25 07:45:56.169882


In [4]:
# 3.评估算法
print('开始时间：' + str(datetime.datetime.now()))

# 设置评估算法的基准
num_folds = 10
seed = 7
scoring = 'accuracy'

# 生成算法模型
models = {}
models['LR'] = LogisticRegression(solver='lbfgs', multi_class='auto',max_iter=300) # 逻辑回归
models['CART'] = DecisionTreeClassifier() # 决策树分类器
models['MNB'] = MultinomialNB() # 补素贝叶斯分类器
models['KNN'] = KNeighborsClassifier() # K邻近分类器

# 比较算法
results = []
for key in models:
    print(key + '算法' + str(datetime.datetime.now()))
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
#    kfold = KFold(n_splits=num_folds)
    cv_results = cross_val_score(models[key], X_train_counts_tf, dataset_train.target, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s : %f (%f)，结束时间：%s' % (key, cv_results.mean(), cv_results.std(), datetime.datetime.now()))
    
print('\n结束时间：' + str(datetime.datetime.now()))

开始时间：2024-12-25 07:45:56.188483
LR算法2024-12-25 07:45:56.188483
LR : 0.478262 (0.003680)，结束时间：2024-12-25 07:47:36.814708
CART算法2024-12-25 07:47:36.814708
CART : 0.477737 (0.001955)，结束时间：2024-12-25 08:06:08.761380
MNB算法2024-12-25 08:06:08.761380
MNB : 0.480712 (0.004684)，结束时间：2024-12-25 08:06:12.421295
KNN算法2024-12-25 08:06:12.421295
KNN : 0.451196 (0.006087)，结束时间：2024-12-25 08:13:39.511260

结束时间：2024-12-25 08:13:39.520478


In [5]:
# 4.算法调参
print('开始时间：' + str(datetime.datetime.now()))

# 调参LR
param_grid = {}
param_grid['C'] = [13, 15, 20, 200] # 逻辑回归的超参数是目标约束函数C

print('LR 算法调参' + str(datetime.datetime.now()))
model = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=10000)
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('最优 : %s 使用 %s，结束时间：%s' % (grid_result.best_score_, grid_result.best_params_, datetime.datetime.now()))
print(' ')

# 调参MNB
param_grid = {}
param_grid['alpha'] = [0.001, 0.01, 0.1, 1.5] # 补素贝叶斯是通过alpha调参

print('MNB 算法调参' + str(datetime.datetime.now()))
model = MultinomialNB()
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('最优 : %s 使用 %s，结束时间：%s' % (grid_result.best_score_, grid_result.best_params_, datetime.datetime.now()))
print(' ')

# 调参CART
param_grid = {}
param_grid['max_depth'] = [90, 100, 120, 150, 200] # 决策树是通过 最大深度 调参

print('CART 算法调参' + str(datetime.datetime.now()))
model = DecisionTreeClassifier(criterion = 'gini') # 决策树分类器
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
#kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('最优 : %s 使用 %s，结束时间：%s' % (grid_result.best_score_, grid_result.best_params_, datetime.datetime.now()))
print(' ')

# 调参KNN
param_grid = {}
param_grid['n_neighbors'] = [3, 5, 7, 9, 11] # K邻近是通过 邻近数(k) 调参

print('KNN 算法调参' + str(datetime.datetime.now()))
model = KNeighborsClassifier() # K邻近分类器

kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
#kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train_counts_tf, y=dataset_train.target)
print('最优 : %s 使用 %s，结束时间：%s' % (grid_result.best_score_, grid_result.best_params_, datetime.datetime.now()))

开始时间：2024-12-25 08:13:39.556793
LR 算法调参2024-12-25 08:13:39.556793
最优 : 0.4774921281115393 使用 {'C': 13}，结束时间：2024-12-25 08:33:53.615591
 
MNB 算法调参2024-12-25 08:33:53.615591
最优 : 0.48248980936773994 使用 {'alpha': 0.1}，结束时间：2024-12-25 08:34:07.255512
 
CART 算法调参2024-12-25 08:34:07.255512
最优 : 0.43678364336817027 使用 {'max_depth': 200}，结束时间：2024-12-25 08:55:45.024351
 
KNN 算法调参2024-12-25 08:55:45.024351
最优 : 0.4534752182959285 使用 {'n_neighbors': 9}，结束时间：2024-12-25 09:32:49.410026


In [7]:
# 6.选择最优算法，生成模型
from sklearn.metrics import classification_report
print ('6.选择最优算法，生成模型')
print('算法评估' + str(datetime.datetime.now()))

best_mnb = grid_result.best_estimator_
X_test_counts = count_vect.transform(dataset_test.data)
y_test = dataset_test.target
# 进行预测
y_pred = best_mnb.predict(X_test_counts) 
# 计算准确率
accuracy = accuracy_score(y_test, y_pred)
print(f'准确率: {accuracy}')
class_report = classification_report(dataset_test.target, y_pred)
print('Classification Report:')
print(class_report)
print('MNB 算法调参结束' + str(datetime.datetime.now()))

6.选择最优算法，生成模型
算法评估2024-12-25 09:33:02.490245
准确率: 0.47840531561461797
Classification Report:
              precision    recall  f1-score   support

          于是       0.46      0.67      0.55       501
        孤独患者       0.37      0.25      0.30       501
         罗生门       0.58      0.51      0.55       503

    accuracy                           0.48      1505
   macro avg       0.47      0.48      0.47      1505
weighted avg       0.47      0.48      0.47      1505

MNB 算法调参结束2024-12-25 09:33:02.548383
