### 数据预处理

In [None]:
import pandas as pd
import re

In [None]:
# Load the raw records from CSV; the first row of the file is used as the header.
dataset = pd.read_csv("data/bjsj.csv", engine='python', header=0, encoding='utf-8')

In [None]:
# Preview the first rows of the loaded frame.
dataset.head()

In [None]:
# Columns that the preprocessing cells derive from the raw data. If any of
# them is present, `dataset` has already been processed at least partially.
_DERIVED_COLUMNS = {'date', 'time', 'content'}

# Drop the column at index 7 only while the frame is still in its raw shape.
# Without the second condition a re-run would see more than 7 columns again
# (the derived ones were appended) and delete whichever column now sits at
# index 7, i.e. one of the derived columns.
if len(dataset.columns) > 7 and _DERIVED_COLUMNS.isdisjoint(dataset.columns):
    dataset = dataset.drop(dataset.columns[7], axis=1)

# Split "bjsj" on whitespace into a date part and a time part and parse each.
# The two columns are guarded independently so that a missing one is rebuilt
# even when the other one already exists.
if 'date' not in dataset.columns:
    dataset['date'] = pd.to_datetime(dataset['bjsj'].str.split().str[0], format='%Y-%m-%d')
if 'time' not in dataset.columns:
    dataset['time'] = pd.to_datetime(dataset['bjsj'].str.split().str[1], format='%H:%M:%S.%f').dt.time



# Keywords that introduce the alarm description. Compiled once because the
# function is applied to every row of the frame.
_CONTENT_PATTERN = re.compile(r'(?:报警：|称|在)(.*)')


def extract_info(text):
    """Return the alarm description contained in *text*.

    The description is whatever follows the first occurrence of one of the
    keywords "报警：", "称" or "在", up to the end of that line ("." does not
    match a newline). When no keyword is present the whole text is used.
    Surrounding whitespace is stripped in both cases.

    Args:
        text: Raw alarm text. Missing values (``None`` or NaN) yield an empty
            string instead of raising; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The extracted description as a ``str``.
    """
    # Empty CSV cells arrive as None/NaN; NaN is the only float that differs
    # from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    match = _CONTENT_PATTERN.search(text)
    content = match.group(1) if match else text
    return content.strip()

# Derive the 'content' column once; if it is already present (the cell was
# run before) the frame is left untouched.
if 'content' not in dataset.columns:
    # Run extract_info over every raw alarm text in 'bjnr'.
    dataset['content'] = dataset['bjnr'].map(extract_info)

dataset.head()

In [None]:
# Show column dtypes and non-null counts of the preprocessed frame.
dataset.info()

In [None]:
# Remove duplicate rows: rows are considered duplicates when they share the
# same value in the frame's second column (this drops rows, not columns).
dataset = dataset.drop_duplicates(subset=dataset.columns[1])

In [None]:
# 对文本进行增强
from nlpcda import Similarword, RandomDeleteChar

# `input_csv` is another name for the preprocessed `dataset` DataFrame.
input_csv = dataset

# Set up the augmenters: synonym replacement and random character deletion.
smw = Similarword(create_num=2, change_rate=0.5)
rdc = RandomDeleteChar(create_num=2, change_rate=0.3)

# 定义函数来对文本进行增强
def augment_text(text):
    """Return augmented variants of *text*.

    Each augmenter is asked twice, synonym replacement (`smw`) first and
    random character deletion (`rdc`) second. From every call the element at
    index 1 of the returned list is kept; a call that returns fewer than two
    elements contributes nothing, so the result holds between 0 and 4 texts.
    """
    variants = []

    for augmenter in (smw, rdc):
        for _ in range(2):  # two attempts per augmenter
            candidates = augmenter.replace(text)
            # Index 1 only exists when more than one text came back.
            if len(candidates) > 1:
                variants.append(candidates[1])

    return variants

# For every row, generate augmented versions of its "content" and emit one
# copy of the row per augmented text (the unmodified row itself is not kept).
augmented_data = []

for _, source_row in input_csv.iterrows():
    for variant in augment_text(source_row['content']):
        variant_row = source_row.copy()
        variant_row['content'] = variant
        augmented_data.append(variant_row)

# Assemble the collected rows into a DataFrame.
augmented_df = pd.DataFrame(augmented_data)


In [None]:
# Show column dtypes and non-null counts of the augmented frame.
augmented_df.info()

In [None]:
# Count the rows per distinct "bjlbmc" value.
augmented_df["bjlbmc"].value_counts()

In [None]:
# Count the rows per distinct "bjlxmc" value.
augmented_df["bjlxmc"].value_counts()

In [None]:
# Write the augmented frame to CSV without the index column.
augmented_df.to_csv('data/data_cleaned_enhanced.csv', index=False)

### 构建数据集

In [None]:
import torch
import numpy
import types
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader, Subset

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

from datasets import Dataset

from transformers import BertTokenizer, BertConfig, BertModel, Trainer, TrainingArguments
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup

from tqdm import tqdm, trange

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

In [None]:
# Read the data; the first CSV row is the header.

df = pd.read_csv("data/data_cleaned_enhanced.csv", header=0)

In [None]:
# Preview the first rows of the frame that was just read.
df.head()

In [None]:
major_vocab = df["bjlbmc"].unique()
minor_vocab = df["bjlxmc"].unique()

# One LabelEncoder per label column: "bjlbmc" is the major category and
# "bjlxmc" the minor category.
major_encoder = LabelEncoder()
minor_encoder = LabelEncoder()

# Encode the labels as integers.
df['major_label_encoded'] = major_encoder.fit_transform(df['bjlbmc'])
df['minor_label_encoded'] = minor_encoder.fit_transform(df['bjlxmc'])

# Take the class counts from what the encoders actually learned, so the counts
# always match the range of the encoded integers (nunique() skips missing
# values, which an encoder may still count as a class).
num_major_labels = len(major_encoder.classes_)  # major categories
num_minor_labels = len(minor_encoder.classes_)  # minor categories

# num_labels is the sum of both counts: the major classes come first, the
# minor classes follow.
num_labels = num_major_labels + num_minor_labels
# Save the label counts to a text file.
with open('labels_info.txt', 'w', encoding='utf-8') as f:
    f.write(f"Number of major labels: {num_major_labels}\n")
    f.write(f"Number of minor labels: {num_minor_labels}\n")
    f.write(f"Total number of labels: {num_labels}\n")


import joblib
# Persist the fitted LabelEncoder objects.
joblib.dump(major_encoder, 'major_encoder.pkl')
joblib.dump(minor_encoder, 'minor_encoder.pkl')

# Extract date and time features.
df['month'] = pd.to_datetime(df['date']).dt.month
# A saved time reads "HH:MM:SS" or, when it has a sub-second part,
# "HH:MM:SS.ffffff". Cut everything from the first "." so both forms parse
# with the same format; only the hour is needed.
df['hour'] = pd.to_datetime(df['time'].str.split('.').str[0], format='%H:%M:%S').dt.hour

In [None]:
# # 划分训练集和测试集
# train_texts, test_texts, train_major, test_major, train_minor, test_minor, train_month, test_month, train_hour, test_hour = train_test_split(
#     df['content'], df['major_label_encoded'], df['minor_label_encoded'], df['month'], df['hour'], test_size=0.2, random_state=42)

In [None]:
# Group the rows by the raw alarm text ("bjnr") they were derived from, so
# that all augmented variants of one source record stay on the same side of
# the split. Fixed chunks of 4 consecutive rows only line up with the source
# records when every record produced exactly 4 variants; a record with fewer
# variants shifts every later chunk and lets near-duplicates of one record
# end up in both train and test.
n = 4  # nominal number of variants per source record; not used for grouping
groups = [group for _, group in df.groupby('bjnr', sort=False, dropna=False)]

# Give every group its own 0-based index.
group_dfs = [group.reset_index(drop=True) for group in groups]

# Positions of the groups; these are what gets split.
group_indices = list(range(len(group_dfs)))

# Split the group positions 80/20.
train_indices, test_indices = train_test_split(group_indices, test_size=0.2, random_state=42)

# Collect the groups that belong to each side.
train_groups = [group_dfs[i] for i in train_indices]
test_groups = [group_dfs[i] for i in test_indices]

# Concatenate the groups into the train and test DataFrames.
train_df = pd.concat(train_groups).reset_index(drop=True)
test_df = pd.concat(test_groups).reset_index(drop=True)

# Pull out the individual columns of both sides.
train_texts = train_df['content']
test_texts = test_df['content']
train_major = train_df['major_label_encoded']
test_major = test_df['major_label_encoded']
train_minor = train_df['minor_label_encoded']
test_minor = test_df['minor_label_encoded']
train_month = train_df['month']
test_month = test_df['month']
train_hour = train_df['hour']
test_hour = test_df['hour']

In [None]:
# Load the BERT tokenizer for the 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [None]:
# Collect the model inputs of each split in a frame, then wrap each frame as
# a Dataset object.
train_frame = pd.DataFrame({
    'text': train_texts,
    'major_label': train_major,
    'minor_label': train_minor,
    'month': train_month,
    'hour': train_hour,
})
test_frame = pd.DataFrame({
    'text': test_texts,
    'major_label': test_major,
    'minor_label': test_minor,
    'month': test_month,
    'hour': test_hour,
})

train_dataset = Dataset.from_pandas(train_frame)
test_dataset = Dataset.from_pandas(test_frame)

In [None]:
# 对文本进行tokenize，并转换为BERT的输入格式
def tokenize_texts(example):
    """Tokenize ``example['text']`` and carry the time features along.

    The text is padded/truncated to 64 tokens. The 'month' and 'hour' entries
    of *example* are copied into the returned encoding unchanged.
    """
    encoding = tokenizer(
        example['text'],
        padding='max_length',
        truncation=True,
        max_length=64,
    )
    for field in ('month', 'hour'):
        encoding[field] = example[field]
    return encoding

# Tokenize both splits in batched mode.
train_dataset = train_dataset.map(tokenize_texts, batched=True)
test_dataset = test_dataset.map(tokenize_texts, batched=True)

# 转换标签为张量类型
def convert_labels(example):
    """Replace the label and time fields of *example* with torch tensors.

    The fields 'major_label', 'minor_label', 'month' and 'hour' are converted
    in place; every other entry is left as is. The same mapping is returned.
    """
    for field in ('major_label', 'minor_label', 'month', 'hour'):
        example[field] = torch.tensor(example[field])
    return example

# Run convert_labels over every example of both splits.
train_dataset = train_dataset.map(convert_labels)
test_dataset = test_dataset.map(convert_labels)

# Return these columns as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'major_label', 'minor_label', 'month', 'hour'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'major_label', 'minor_label', 'month', 'hour'])

# Wrap the datasets in DataLoaders (batches of 8); only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

### 训练模型

In [None]:
class CustomBertForSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier whose linear head also sees month and hour.

    The pooled BERT output is concatenated with two extra features (month and
    hour) before dropout and the final linear layer, so the head takes
    ``hidden_size + 2`` inputs and emits ``config.num_labels`` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Two more inputs than the pooled vector: month and hour.
        self.classifier = nn.Linear(config.hidden_size + 2, config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, month=None, hour=None, labels=None):
        """Compute the logits and, when *labels* is given, the loss.

        Returns a tuple ``(logits, *rest)`` where ``rest`` is whatever the
        BERT encoder returned after its first two outputs. With *labels* the
        loss is prepended: ``(loss, logits, *rest)``. The loss is MSE when
        ``num_labels == 1`` and cross-entropy over all logits otherwise.
        """
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask)

        # Put month and hour side by side as two float columns; this assumes
        # both tensors are 1-D with one entry per sample.
        time_features = torch.stack((month, hour), dim=1).float()
        features = torch.cat((bert_outputs.pooler_output, time_features), dim=1)

        logits = self.classifier(self.dropout(features))
        result = (logits,) + bert_outputs[2:]

        if labels is None:
            return result

        if self.num_labels == 1:
            loss = nn.MSELoss()(logits.view(-1), labels.view(-1))
        else:
            loss = nn.CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss,) + result

In [None]:
# Load the BERT configuration and the model; the classifier outputs
# `num_labels` logits (major and minor class counts combined).
config = BertConfig.from_pretrained('bert-base-chinese', num_labels=num_labels)

model = CustomBertForSequenceClassification.from_pretrained('bert-base-chinese', config=config)  # multi-label classification; adjust the number of output classes as needed
model.to(device)

In [None]:
# # 定义优化器和损失函数
# optimizer = AdamW(model.parameters(), lr=4e-5, weight_decay=2e-3)
# criterion = nn.CrossEntropyLoss()  # 适用于多分类任务，根据实际情况可能需要调整损失函数

# # 训练循环
# num_epochs = 10
# for epoch in range(num_epochs):
#     # 训练模式
#     model.train()
#     train_loss = 0.0
#     for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         major_labels = batch['major_label'].to(device)
#         minor_labels = batch['minor_label'].to(device)
#         month = batch['month'].to(device)
#         hour = batch['hour'].to(device)

#         # 前向传播
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, month=month, hour=hour)
#         logits = outputs[0]  # outputs包含损失和logits，第一个元素是logits

#         # 计算损失
#         loss_major = criterion(logits[:, :num_major_labels], major_labels)
#         loss_minor = criterion(logits[:, num_major_labels:], minor_labels)
#         loss = loss_major + loss_minor

#         # 反向传播和优化
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         train_loss += loss.item()

#     # 打印每个epoch的训练损失
#     print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss / len(train_loader)}")

#     # 评估模式
#     model.eval()
#     test_loss = 0.0
#     for batch in tqdm(test_loader, desc="Evaluating"):
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         major_labels = batch['major_label'].to(device)
#         minor_labels = batch['minor_label'].to(device)
#         month = batch['month'].to(device)
#         hour = batch['hour'].to(device)

#         # 前向传播
#         with torch.no_grad():
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask, month=month, hour=hour)
#             logits = outputs[0]  # outputs包含损失和logits，第一个元素是logits

#             # 计算损失
#             loss_major = criterion(logits[:, :num_major_labels], major_labels)
#             loss_minor = criterion(logits[:, num_major_labels:], minor_labels)
#             loss = loss_major + loss_minor

#             test_loss += loss.item()

#     # 打印每个epoch的评估损失
#     print(f"Epoch {epoch + 1}/{num_epochs}, Test Loss: {test_loss / len(test_loader)}")

# # 保存模型
# torch.save(model.state_dict(), 'model.pth')

In [None]:
# K-fold cross-validation over the row groups: every fold trains a fresh
# model and is scored by its validation accuracy after the last epoch.
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# One entry per fold: the validation accuracy reached after the final epoch.
major_accs = []
minor_accs = []

for fold, (train_index, val_index) in enumerate(kf.split(group_indices), start=1):
    train_groups = [group_dfs[i] for i in train_index]
    val_groups = [group_dfs[i] for i in val_index]

    train_df = pd.concat(train_groups).reset_index(drop=True)
    val_df = pd.concat(val_groups).reset_index(drop=True)

    # Pull out the individual columns of the training and validation sets.
    train_texts = train_df['content']
    val_texts = val_df['content']
    train_major = train_df['major_label_encoded']
    val_major = val_df['major_label_encoded']
    train_minor = train_df['minor_label_encoded']
    val_minor = val_df['minor_label_encoded']
    train_month = train_df['month']
    val_month = val_df['month']
    train_hour = train_df['hour']
    val_hour = val_df['hour']

    # Build the Dataset objects for training and validation.
    train_dataset = Dataset.from_pandas(pd.DataFrame({'text': train_texts, 'major_label': train_major, 'minor_label': train_minor, 'month': train_month, 'hour': train_hour}))
    val_dataset = Dataset.from_pandas(pd.DataFrame({'text': val_texts, 'major_label': val_major, 'minor_label': val_minor, 'month': val_month, 'hour': val_hour}))

    # Tokenize the texts into BERT inputs.
    train_dataset = train_dataset.map(tokenize_texts, batched=True)
    val_dataset = val_dataset.map(tokenize_texts, batched=True)

    # Convert labels and time features.
    train_dataset = train_dataset.map(convert_labels)
    val_dataset = val_dataset.map(convert_labels)

    # Return these columns as torch tensors.
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'major_label', 'minor_label', 'month', 'hour'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'major_label', 'minor_label', 'month', 'hour'])

    # Wrap the datasets in DataLoaders; only the training loader shuffles.
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    # Start every fold from the pretrained checkpoint so folds stay independent.
    model = CustomBertForSequenceClassification.from_pretrained('bert-base-chinese', config=config)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=4e-5, weight_decay=2e-3)
    criterion = nn.CrossEntropyLoss()

    num_epochs = 3  # reduced to 3 epochs for demonstration purposes
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Fold {fold}, Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            major_labels = batch['major_label'].to(device)
            minor_labels = batch['minor_label'].to(device)
            month = batch['month'].to(device)
            hour = batch['hour'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, month=month, hour=hour)
            logits = outputs[0]

            # The first num_major_labels logits score the major classes, the
            # remaining ones the minor classes; the two losses are summed.
            loss_major = criterion(logits[:, :num_major_labels], major_labels)
            loss_minor = criterion(logits[:, num_major_labels:], minor_labels)
            loss = loss_major + loss_minor

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        print(f"Fold {fold}, Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss / len(train_loader)}")

        model.eval()
        val_loss = 0.0
        val_preds = []
        val_labels = []
        for batch in tqdm(val_loader, desc=f"Fold {fold}, Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            major_labels = batch['major_label'].to(device)
            minor_labels = batch['minor_label'].to(device)
            month = batch['month'].to(device)
            hour = batch['hour'].to(device)

            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, month=month, hour=hour)
                logits = outputs[0]

                loss_major = criterion(logits[:, :num_major_labels], major_labels)
                loss_minor = criterion(logits[:, num_major_labels:], minor_labels)
                loss = loss_major + loss_minor

                val_loss += loss.item()

                # Collect (major, minor) predictions and labels for the accuracy.
                preds_major = logits[:, :num_major_labels].argmax(dim=1).detach().cpu().numpy()
                preds_minor = logits[:, num_major_labels:].argmax(dim=1).detach().cpu().numpy()
                val_preds.extend(list(zip(preds_major, preds_minor)))
                val_labels.extend(list(zip(major_labels.cpu().numpy(), minor_labels.cpu().numpy())))

        val_major_labels, val_minor_labels = zip(*val_labels)
        val_major_preds, val_minor_preds = zip(*val_preds)
        acc_major = accuracy_score(val_major_labels, val_major_preds)
        acc_minor = accuracy_score(val_minor_labels, val_minor_preds)
        print(f"Fold {fold}, Epoch {epoch + 1}/{num_epochs}, Validation Loss: {val_loss / len(val_loader)}, Major Accuracy: {acc_major}, Minor Accuracy: {acc_minor}")

    # Record the fold's result once, after its last epoch. Appending inside
    # the epoch loop would mix accuracies of partially trained models into
    # the cross-fold average below.
    major_accs.append(acc_major)
    minor_accs.append(acc_minor)

# Report the average final-epoch accuracy over all folds.
print(f"Average Major Accuracy across all folds: {sum(major_accs) / len(major_accs)}")
print(f"Average Minor Accuracy across all folds: {sum(minor_accs) / len(minor_accs)}")