In [4]:
import torch
import torch.nn as nn
import numpy as np

import pandas as pd
import joblib

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from transformers import BertTokenizer, BertConfig, BertModel, Trainer, TrainingArguments, BertForSequenceClassification


In [5]:
class CustomBertForSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier whose head also receives two time features.

    The pooled BERT output (``config.hidden_size`` values per sample) is
    concatenated with ``month`` and ``hour`` before dropout and the final
    linear layer, so the classifier has ``config.hidden_size + 2`` inputs and
    ``config.num_labels`` outputs.
    """

    def __init__(self, config):
        super().__init__(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Two extra input features (month and hour) on top of the pooled output.
        self.classifier = nn.Linear(config.hidden_size + 2, config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, month=None, hour=None, labels=None):
        """Run BERT, append the time features and classify.

        Args:
            input_ids: Token ids passed straight to the underlying BERT model.
            attention_mask: Attention mask passed straight to BERT.
            month: Per-sample month values, expected shape ``[batch_size]``.
                Required even though the signature defaults it to ``None``.
            hour: Per-sample hour values, expected shape ``[batch_size]``.
                Required even though the signature defaults it to ``None``.
            labels: Optional targets. When given, a loss is computed and
                prepended to the returned tuple.

        Returns:
            A tuple ``(logits, *extra_bert_outputs)`` or, when ``labels`` is
            given, ``(loss, logits, *extra_bert_outputs)``.

        Raises:
            ValueError: If ``month`` or ``hour`` is missing.
        """
        if month is None or hour is None:
            raise ValueError(
                "CustomBertForSequenceClassification.forward requires both "
                "'month' and 'hour' (one value per sample)."
            )

        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        # Build the time features on the same device and in the same dtype as
        # the pooled output: this keeps half-precision models working and
        # tolerates month/hour arriving as arrays, lists or mixed-dtype tensors.
        target = {"dtype": pooled_output.dtype, "device": pooled_output.device}
        time_features = torch.stack(
            (torch.as_tensor(month, **target), torch.as_tensor(hour, **target)),
            dim=1,
        )
        pooled_output = torch.cat((pooled_output, time_features), dim=1)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # Keep whatever BERT returned after the sequence and pooled outputs.
        outputs = (logits,) + outputs[2:]

        if labels is not None:
            if self.num_labels == 1:
                # Single output: treated as regression.
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs

In [6]:
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the fitted encoders for the major and minor categories.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load encoder files from a trusted source.
major_encoder = joblib.load('major_encoder.pkl')
minor_encoder = joblib.load('minor_encoder.pkl')

# Label counts; filled in from labels_info.txt below.
num_major_labels = None
num_minor_labels = None
num_labels = None

# Read the label-count file.
with open('labels_info.txt', 'r') as f:
    lines = f.readlines()

# Parse the "<description>: <integer>" lines into the variables above.
for line in lines:
    if line.startswith("Number of major labels:"):
        num_major_labels = int(line.split(": ")[1].strip())
    elif line.startswith("Number of minor labels:"):
        num_minor_labels = int(line.split(": ")[1].strip())
    elif line.startswith("Total number of labels:"):
        num_labels = int(line.split(": ")[1].strip())

# Fail early instead of silently building a wrongly sized model
# (num_labels=None) or slicing the logits with None (num_major_labels=None),
# which would select every column and produce wrong predictions.
if num_labels is None or num_major_labels is None:
    raise ValueError(
        "labels_info.txt must contain both 'Number of major labels:' and "
        "'Total number of labels:' lines"
    )

# Build the custom model and load the fine-tuned weights.
# The single logits vector holds the major-category scores first, followed by
# the minor-category scores (see how predict() slices it).
config = BertConfig.from_pretrained('bert-base-chinese', num_labels=num_labels)
model = CustomBertForSequenceClassification.from_pretrained('bert-base-chinese', config=config)
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('model.pth', map_location=device))
# The evaluation code moves its inputs to `device`, so the model must live
# there as well; otherwise inference fails on CUDA machines.
model.to(device)
model.eval()

# Read the whole CSV file (replace with your own CSV path).
csv_file = "data/data_cleaned_enhanced.csv"
df = pd.read_csv(csv_file, header=0)

def preprocess_data(data):
    """Return a copy of ``data`` with time features and encoded labels added.

    Args:
        data: DataFrame with the columns ``date``, ``time`` (strings in
            ``%H:%M:%S`` format), ``bjlbmc`` (major category) and ``bjlxmc``
            (minor category).

    Returns:
        A new DataFrame in which ``date`` is parsed to datetimes, ``time`` is
        converted to ``datetime.time`` objects, and the columns ``month``,
        ``hour``, ``major_label_encoded`` and ``minor_label_encoded`` are
        added. The input frame is not modified.
    """
    # Work on a copy: mutating the caller's frame would make a second call on
    # the same frame fail, because 'time' would no longer hold strings.
    data = data.copy()

    # Extract the date and time features.
    data['date'] = pd.to_datetime(data['date'])
    parsed_time = pd.to_datetime(data['time'], format='%H:%M:%S')
    data['time'] = parsed_time.dt.time
    data['month'] = data['date'].dt.month
    # Vectorised hour extraction instead of a per-row Python lambda.
    data['hour'] = parsed_time.dt.hour

    # Encode the labels with the module-level fitted encoders.
    data['major_label_encoded'] = major_encoder.transform(data['bjlbmc'])
    data['minor_label_encoded'] = minor_encoder.transform(data['bjlxmc'])

    return data

# Run inference.
def predict(input_ids, attention_mask, month, hour, true_major_labels, true_minor_labels, batch_size=32):
    """Predict major/minor categories and score them against the true labels.

    Uses the module-level ``model``, ``num_major_labels`` and the two label
    encoders. The first ``num_major_labels`` logit columns are read as the
    major-category scores, the remaining columns as the minor-category scores.

    Args:
        input_ids: Token id tensor, one row per sample.
        attention_mask: Attention mask tensor, one row per sample.
        month: Per-sample month tensor.
        hour: Per-sample hour tensor.
        true_major_labels: Encoded true major labels, one per sample.
        true_minor_labels: Encoded true minor labels, one per sample.
        batch_size: Maximum number of samples per forward pass. ``None`` runs
            everything in a single pass. Chunking keeps memory use bounded
            for large inputs.

    Returns:
        ``(major_labels, minor_labels, major_accuracy, minor_accuracy)`` where
        the labels are decoded with the encoders' ``inverse_transform`` and
        the accuracies are computed on the encoded predictions.

    Raises:
        ValueError: If ``batch_size`` is given and is smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    total = input_ids.shape[0]
    step = max(1, total if batch_size is None else batch_size)

    logit_chunks = []
    with torch.no_grad():
        # range(..., max(total, 1), ...) still makes one (empty) call for an
        # empty input, matching what a single unchunked pass would do.
        for start in range(0, max(total, 1), step):
            stop = start + step
            outputs = model(
                input_ids=input_ids[start:stop],
                attention_mask=attention_mask[start:stop],
                month=month[start:stop],
                hour=hour[start:stop],
            )
            logit_chunks.append(outputs[0].cpu())

    logits = torch.cat(logit_chunks, dim=0).numpy()

    major_preds = np.argmax(logits[:, :num_major_labels], axis=1)
    minor_preds = np.argmax(logits[:, num_major_labels:], axis=1)

    major_labels = major_encoder.inverse_transform(major_preds)
    minor_labels = minor_encoder.inverse_transform(minor_preds)

    # Compute the accuracies on the encoded labels.
    major_accuracy = accuracy_score(true_major_labels, major_preds)
    minor_accuracy = accuracy_score(true_minor_labels, minor_preds)

    return major_labels, minor_labels, major_accuracy, minor_accuracy

# Preprocess the data.
df = preprocess_data(df)

# Split the rows into consecutive groups of n rows so that a whole group always
# lands in the same fold.
# NOTE(review): presumably each group holds one sample plus its augmented
# variants -- confirm against how the CSV was generated.
n = 4
groups = [df.iloc[i:i+n] for i in range(0, len(df), n)]

# One DataFrame per group, each with a fresh index.
group_dfs = [group.reset_index(drop=True) for group in groups]

# Positions of the groups; KFold splits these rather than individual rows.
group_indices = list(range(len(group_dfs)))

# Evaluate the already-trained model on 10 disjoint partitions of the groups.
# No training happens here, so the "train" side of each split is not used.
# NOTE(review): if model.pth was fitted on this same CSV, these accuracies
# are measured on training data and will be optimistic -- confirm.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
major_accuracies = []
minor_accuracies = []

for train_index, test_index in kf.split(group_indices):
    # Assemble this fold's evaluation rows.
    test_groups = [group_dfs[i] for i in test_index]
    test_df = pd.concat(test_groups).reset_index(drop=True)

    # Columns needed for inference and scoring.
    test_texts = test_df['content']
    test_major = test_df['major_label_encoded']
    test_minor = test_df['minor_label_encoded']
    test_month = test_df['month']
    test_hour = test_df['hour']

    # Tokenize the texts to fixed-length tensors.
    tokenized_test = tokenizer(list(test_texts), padding='max_length', truncation=True, max_length=64, return_tensors='pt')

    # Move the model inputs to the target device.
    input_ids = tokenized_test['input_ids'].to(device)
    attention_mask = tokenized_test['attention_mask'].to(device)
    month = torch.tensor(test_month.values).to(device)
    hour = torch.tensor(test_hour.values).to(device)

    # Run inference and score this fold.
    major_labels, minor_labels, major_accuracy, minor_accuracy = predict(input_ids, attention_mask, month, hour, test_major.values, test_minor.values)

    # Report the accuracies for this fold.
    print("Fold results:")
    print(f"Major Label Accuracy: {major_accuracy}")
    print(f"Minor Label Accuracy: {minor_accuracy}")

    major_accuracies.append(major_accuracy)
    minor_accuracies.append(minor_accuracy)

# Report the accuracies averaged over all folds.
print(f"Average Major Label Accuracy: {np.mean(major_accuracies)}")
print(f"Average Minor Label Accuracy: {np.mean(minor_accuracies)}")

Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold results:
Major Label Accuracy: 0.7658227848101266
Minor Label Accuracy: 0.6170886075949367
Fold results:
Major Label Accuracy: 0.8205128205128205
Minor Label Accuracy: 0.717948717948718
Fold results:
Major Label Accuracy: 0.967948717948718
Minor Label Accuracy: 0.9647435897435898
Fold results:
Major Label Accuracy: 0.9487179487179487
Minor Label Accuracy: 0.907051282051282
Fold results:
Major Label Accuracy: 0.9483870967741935
Minor Label Accuracy: 0.8935483870967742
Fold results:
Major Label Accuracy: 0.9006410256410257
Minor Label Accuracy: 0.8814102564102564
Fold results:
Major Label Accuracy: 0.9391025641025641
Minor Label Accuracy: 0.8878205128205128
Fold results:
Major Label Accuracy: 0.9551282051282052
Minor Label Accuracy: 0.9134615384615384
Fold results:
Major Label Accuracy: 0.9519230769230769
Minor Label Accuracy: 0.9391025641025641
Fold results:
Major Label Accuracy: 0.9775641025641025
Minor Label Accuracy: 0.9294871794871795
Average Major Label Accuracy: 0.91757483431