### 数据预处理

In [1]:
import pandas as pd
import re

In [2]:
dataset = pd.read_csv("data/bjsj.csv", engine='python', header=0, encoding='utf-8')

In [3]:
dataset.head()

Unnamed: 0,bjsj,bjnr,bjlbdm,bjlxdm,bjxldm,bjlbmc,bjlxmc,bjxlmc
0,2024-04-11 21:49:00.0,2024年4月11日 21时48分55秒 薛一铭( 180****5228 ，142427*...,10,100100,100120.0,刑事案件,盗窃,
1,2024-04-11 21:43:25.0,2024年4月11日 21时43分22秒 郭女士( 139****6828 ) 报警：沙XX...,10,100100,100199.0,刑事案件,盗窃,
2,2024-04-11 21:16:36.0,2024年4月11日 21时16分35秒 牛女士( 151****7579 ，142329*...,10,100100,100120.0,刑事案件,盗窃,
3,2024-04-11 21:09:29.0,2024年4月11日 21时9分28秒 王先生( 151****8799、140111***...,10,100100,100120.0,刑事案件,盗窃,
4,2024-04-11 21:01:51.0,2024年4月11日 10时44分48秒 黄志明( 151****3088 、350524*...,10,100100,100199.0,刑事案件,盗窃,


In [4]:
if len(dataset.columns) > 7:
    dataset = dataset.drop(dataset.columns[7], axis=1)

# 判断是否已存在 'date' 和 'time' 列，避免重复执行
if 'date' not in dataset.columns and 'time' not in dataset.columns:
    # 拆分“bjsj”列成“日期”和“时间”两列，并转换为相应格式
    dataset['date'] = pd.to_datetime(dataset['bjsj'].str.split().str[0], format='%Y-%m-%d')
    dataset['time'] = pd.to_datetime(dataset['bjsj'].str.split().str[1], format='%H:%M:%S.%f').dt.time



def extract_info(text):
    # 使用正则表达式匹配关键词后面的内容作为报警内容
    match = re.search(r'(?:报警：|称|在)(.*)', text)
    if match:
        content = match.group(1)
        return content.strip()  # 去除首尾空格
    else:
        return text.strip()  # 如果没有匹配到关键词，则整行内容作为报警内容

# 判断是否已存在 'content' 列，避免重复执行
if 'content' not in dataset.columns:
    # 提取报警内容
    dataset['content'] = dataset['bjnr'].apply(extract_info)

dataset.head()

Unnamed: 0,bjsj,bjnr,bjlbdm,bjlxdm,bjxldm,bjlbmc,bjlxmc,date,time,content
0,2024-04-11 21:49:00.0,2024年4月11日 21时48分55秒 薛一铭( 180****5228 ，142427*...,10,100100,100120.0,刑事案件,盗窃,2024-04-11,21:49:00,XX村北XX街，女朋友放在口袋内一部价值9000元苹果手机被盗。
1,2024-04-11 21:43:25.0,2024年4月11日 21时43分22秒 郭女士( 139****6828 ) 报警：沙XX...,10,100100,100199.0,刑事案件,盗窃,2024-04-11,21:43:25,沙XX街天和XX小区6号楼3单元2802，现房门锁子打不开，门锁上有被砸痕迹，恐家内进入小偷...
2,2024-04-11 21:16:36.0,2024年4月11日 21时16分35秒 牛女士( 151****7579 ，142329*...,10,100100,100120.0,刑事案件,盗窃,2024-04-11,21:16:36,XX街寇XX路口往北200米路西处，放在口袋内一部价值1000余元的VIVO手机被盗。
3,2024-04-11 21:09:29.0,2024年4月11日 21时9分28秒 王先生( 151****8799、140111***...,10,100100,100120.0,刑事案件,盗窃,2024-04-11,21:09:29,亲XX街王府井百货门口，放在衣服口袋内价值17000余元的华为手机被盗。
4,2024-04-11 21:01:51.0,2024年4月11日 10时44分48秒 黄志明( 151****3088 、350524*...,10,100100,100199.0,刑事案件,盗窃,2024-04-11,21:01:51,昨天，将车停放在南中环当代城摩马门口，现发现三元催化被盗，价值35000元。


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924 entries, 0 to 923
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   bjsj     924 non-null    object        
 1   bjnr     924 non-null    object        
 2   bjlbdm   924 non-null    int64         
 3   bjlxdm   924 non-null    int64         
 4   bjxldm   527 non-null    float64       
 5   bjlbmc   924 non-null    object        
 6   bjlxmc   924 non-null    object        
 7   date     924 non-null    datetime64[ns]
 8   time     924 non-null    object        
 9   content  924 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 72.3+ KB


In [6]:
# 去除重复列
dataset = dataset.drop_duplicates(subset=dataset.columns[1])

In [7]:
# 对文本进行增强
from nlpcda import Similarword, RandomDeleteChar

# 假设 dataset 是你的数据集 DataFrame
input_csv = dataset

# 初始化增强方法
smw = Similarword(create_num=2, change_rate=0.5)
rdc = RandomDeleteChar(create_num=2, change_rate=0.3)

# 定义函数来对文本进行增强
def augment_text(text):
    augmented_texts = []
    
    # 同义词替换增强
    for _ in range(2):  # 创建两个增强文本
        augmented_text_list = smw.replace(text)
        if len(augmented_text_list) > 1:
            augmented_text = augmented_text_list[1]  # 取第二个增强后的文本
            augmented_texts.append(augmented_text)
    
    # 随机字删除增强
    for _ in range(2):  # 创建两个增强文本
        augmented_text_list = rdc.replace(text)
        if len(augmented_text_list) > 1:
            augmented_text = augmented_text_list[1]  # 取第二个增强后的文本
            augmented_texts.append(augmented_text)
    
    return augmented_texts

# 对每行的 "content" 列进行增强，并保存增强后的数据
augmented_data = []

for index, row in input_csv.iterrows():
    original_content = row['content']
    augmented_contents = augment_text(original_content)
    
    for augmented_content in augmented_contents:
        new_row = row.copy()
        new_row['content'] = augmented_content
        augmented_data.append(new_row)

# 将增强后的数据转换为DataFrame，并保存为新的CSV文件
augmented_df = pd.DataFrame(augmented_data)


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/28/dhnkpq7x2ng78vx80gq2gwmc0000gn/T/jieba.cache


Simbert不能正常使用，除非你安装：bert4keras、tensorflow ，为了安装快捷，没有默认安装.... No module named 'bert4keras'


Loading model cost 0.297 seconds.
Prefix dict has been built successfully.


load :/Users/daypu/anaconda3/envs/pytorch/lib/python3.10/site-packages/nlpcda/data/同义词.txt done


In [8]:
augmented_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3122 entries, 0 to 923
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   bjsj     3122 non-null   object        
 1   bjnr     3122 non-null   object        
 2   bjlbdm   3122 non-null   int64         
 3   bjlxdm   3122 non-null   int64         
 4   bjxldm   1728 non-null   float64       
 5   bjlbmc   3122 non-null   object        
 6   bjlxmc   3122 non-null   object        
 7   date     3122 non-null   datetime64[ns]
 8   time     3122 non-null   object        
 9   content  3122 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 268.3+ KB


In [9]:
augmented_df["bjlbmc"].value_counts()

bjlbmc
治安警情      1078
刑事案件       484
交通警情       472
群众求助       352
社会联动       192
消防救援       164
举报线索       156
群体事件       154
其他报警类别      42
投诉监督        24
灾害事故         4
Name: count, dtype: int64

In [10]:
augmented_df["bjlxmc"].value_counts()

bjlxmc
诈骗             148
敲诈勒索           148
盗窃             144
打架斗殴            80
其它举报线索          80
环保执法            80
其它群众求助          80
其它交通管理          80
交通违法            80
交通事故            80
其它治安警情          80
交通设施            80
聚众上访            78
其它刑警情           76
提供线索            76
其它群体警情          76
色情淫秽            74
恐吓              72
失物求助            72
火灾              72
交通逃逸            72
其它消防救援          72
妨碍公务            70
扰乱秩序            70
治安纠纷            66
抢夺              64
其它社会联动          64
家庭暴力            64
强奸              64
交通秩序            60
抢劫              60
赌博              60
自杀求助            60
故意毁坏公私财物        58
走失求助            56
水、电、气、热险情求助     52
侵犯人身权利          48
其它报警类型          42
安全生产监督          32
非法侵入他人住宅        32
制贩、使用假币         20
开锁求助            20
抢险救援            20
交通保障            20
群众投诉            16
伤害              12
伪造票证、凭证         12
贩毒              12
绑架               8
坠楼求助             8
纵火               8
虐待               8
其他投诉监

In [11]:
# 保存修改后的CSV文件
augmented_df.to_csv('data/data_cleaned_enhanced.csv', index=False)

### 构建数据集

In [12]:
import torch
import numpy
import types
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import ExponentialLR

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from datasets import Dataset

from transformers import BertTokenizer, BertConfig, BertModel, Trainer, TrainingArguments
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup

from tqdm import tqdm, trange

comet_ml is installed but `COMET_API_KEY` is not set.


In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cpu')

In [14]:
# 读取数据

df = pd.read_csv("data_cleaned_enhanced.csv", header=0)

In [15]:
df.head()

Unnamed: 0,bjsj,bjnr,bjlbdm,bjlxdm,bjxldm,bjlbmc,bjlxmc,date,time,content
0,2024-04-11 21:49:00.0,2024年4月11日 21时48分55秒 薛一铭( 180****5228 ，142427*...,10,100100,100120.0,刑事案件,盗窃,2024-04-11,21:49:00,XX村北XX街，女朋友放在口袋内一部价9000元苹果手机被盗。
1,2024-04-11 21:49:00.0,2024年4月11日 21时48分55秒 薛一铭( 180****5228 ，142427*...,10,100100,100120.0,刑事案件,盗窃,2024-04-11,21:49:00,XX村北XX街，女朋友雄居口袋内一部价9000元苹手机被盗。
2,2024-04-11 21:49:00.0,2024年4月11日 21时48分55秒 薛一铭( 180****5228 ，142427*...,10,100100,100120.0,刑事案件,盗窃,2024-04-11,21:49:00,XX村北XX，女朋友放在口袋一部价值9000元苹果手机被盗
3,2024-04-11 21:49:00.0,2024年4月11日 21时48分55秒 薛一铭( 180****5228 ，142427*...,10,100100,100120.0,刑事案件,盗窃,2024-04-11,21:49:00,XX村北XX街女朋友放在口袋内一部价值9000元苹果手机被盗
4,2024-04-11 21:43:25.0,2024年4月11日 21时43分22秒 郭女士( 139****6828 ) 报警：沙XX...,10,100100,100199.0,刑事案件,盗窃,2024-04-11,21:43:25,沙XX街天和XX小区6号楼3单元2802，现房门锁子打不开，门锁上有被砸痕迹，恐家内进入鸡鸣...


In [16]:
major_vocab = df["bjlbmc"].unique()
minor_vocab = df["bjlxmc"].unique()

# 创建LabelEncoder对象
major_encoder = LabelEncoder()
minor_encoder = LabelEncoder()

# 对标签进行编码
df['major_label_encoded'] = major_encoder.fit_transform(df['bjlbmc'])
df['minor_label_encoded'] = minor_encoder.fit_transform(df['bjlxmc'])

# 计算主要标签和次要标签的类别数量
num_major_labels = df['bjlbmc'].nunique() # 大类
num_minor_labels = df['bjlxmc'].nunique() # 小类

# 设置 num_labels 为主要标签和次要标签类别数量之和
num_labels = num_major_labels + num_minor_labels
# 将各类标签保存到文本文件中
with open('labels_info.txt', 'w') as f:
    f.write(f"Number of major labels: {num_major_labels}\n")
    f.write(f"Number of minor labels: {num_minor_labels}\n")
    f.write(f"Total number of labels: {num_labels}\n")


import joblib
# 保存 LabelEncoder 对象
joblib.dump(major_encoder, 'major_encoder.pkl')
joblib.dump(minor_encoder, 'minor_encoder.pkl')

# 提取日期和时间特征
df['month'] = pd.to_datetime(df['date']).dt.month
df['hour'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.hour

In [17]:
# 划分训练集和测试集
train_texts, test_texts, train_major, test_major, train_minor, test_minor, train_month, test_month, train_hour, test_hour = train_test_split(
    df['content'], df['major_label_encoded'], df['minor_label_encoded'], df['month'], df['hour'], test_size=0.2, random_state=42)

In [18]:
# 加载BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [19]:
# 创建训练集和测试集的Dataset对象
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': train_texts, 'major_label': train_major, 'minor_label': train_minor, 'month': train_month, 'hour': train_hour}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': test_texts, 'major_label': test_major, 'minor_label': test_minor, 'month': test_month, 'hour': test_hour}))

In [20]:
# 对文本进行tokenize，并转换为BERT的输入格式
def tokenize_texts(example):
    encoding = tokenizer(example['text'], padding='max_length', truncation=True, max_length=64)
    encoding['month'] = example['month']
    encoding['hour'] = example['hour']
    # encoding['minute'] = example['minute']
    return encoding

train_dataset = train_dataset.map(tokenize_texts, batched=True)
test_dataset = test_dataset.map(tokenize_texts, batched=True)

# 转换标签为张量类型
def convert_labels(example):
    example['major_label'] = torch.tensor(example['major_label'])
    example['minor_label'] = torch.tensor(example['minor_label'])
    example['month'] = torch.tensor(example['month'])
    example['hour'] = torch.tensor(example['hour'])
    # example['minute'] = torch.tensor(example['minute'])
    return example

train_dataset = train_dataset.map(convert_labels)
test_dataset = test_dataset.map(convert_labels)

# 设置返回张量格式
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'major_label', 'minor_label', 'month', 'hour'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'major_label', 'minor_label', 'month', 'hour'])

# 将数据集转换为 DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

Map:   0%|          | 0/2497 [00:00<?, ? examples/s]

Map:   0%|          | 0/625 [00:00<?, ? examples/s]

Map:   0%|          | 0/2497 [00:00<?, ? examples/s]

Map:   0%|          | 0/625 [00:00<?, ? examples/s]

### 训练模型

In [21]:
class CustomBertForSequenceClassification(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size + 2, config.num_labels)  # 增加了2个特征

    def forward(self, input_ids=None, attention_mask=None, month=None, hour=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # BERT模型的pooler_output
        
        # 将时间特征拼接到pooled_output中
        time_features = torch.stack((month, hour), dim=1).float()  # 创建时间特征张量，假设 month 和 hour 的形状都是 [batch_size]
        pooled_output = torch.cat((pooled_output, time_features), dim=1)  # 在最后一个维度上拼接
        
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        outputs = (logits,) + outputs[2:]  # 将 logits 与 BERT 模型的其他输出组合在一起
        
        if labels is not None:
            if self.num_labels == 1:
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs
        
        return outputs

In [22]:
# 加载BERT模型和训练参数
config = BertConfig.from_pretrained('bert-base-chinese', num_labels=num_labels)

model = CustomBertForSequenceClassification.from_pretrained('bert-base-chinese', config=config)  # 多标签分类，输出类别数量需适当调整
model.to(device)

Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomBertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=

In [23]:
# 定义优化器和损失函数
optimizer = AdamW(model.parameters(), lr=6e-5, weight_decay=2e-3)
criterion = nn.CrossEntropyLoss()  # 适用于多分类任务，根据实际情况可能需要调整损失函数

# 训练循环
num_epochs = 10
for epoch in range(num_epochs):
    # 训练模式
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        major_labels = batch['major_label'].to(device)
        minor_labels = batch['minor_label'].to(device)
        month = batch['month'].to(device)
        hour = batch['hour'].to(device)

        # 前向传播
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, month=month, hour=hour)
        logits = outputs[0]  # outputs包含损失和logits，第一个元素是logits

        # 计算损失
        loss_major = criterion(logits[:, :num_major_labels], major_labels)
        loss_minor = criterion(logits[:, num_major_labels:], minor_labels)
        loss = loss_major + loss_minor

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # 打印每个epoch的训练损失
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss / len(train_loader)}")

    # 评估模式
    model.eval()
    test_loss = 0.0
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        major_labels = batch['major_label'].to(device)
        minor_labels = batch['minor_label'].to(device)
        month = batch['month'].to(device)
        hour = batch['hour'].to(device)

        # 前向传播
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, month=month, hour=hour)
            logits = outputs[0]  # outputs包含损失和logits，第一个元素是logits

            # 计算损失
            loss_major = criterion(logits[:, :num_major_labels], major_labels)
            loss_minor = criterion(logits[:, num_major_labels:], minor_labels)
            loss = loss_major + loss_minor

            test_loss += loss.item()

    # 打印每个epoch的评估损失
    print(f"Epoch {epoch + 1}/{num_epochs}, Test Loss: {test_loss / len(test_loader)}")

# 保存模型
torch.save(model.state_dict(), 'model.pth')

Epoch 1/10: 100%|██████████| 313/313 [02:49<00:00,  1.84it/s]


Epoch 1/10, Train Loss: 4.024419457386858


Evaluating: 100%|██████████| 79/79 [00:11<00:00,  6.64it/s]


Epoch 1/10, Test Loss: 2.4272379528118084


Epoch 2/10: 100%|██████████| 313/313 [02:44<00:00,  1.90it/s]


Epoch 2/10, Train Loss: 1.6552274932686133


Evaluating: 100%|██████████| 79/79 [00:11<00:00,  6.61it/s]


Epoch 2/10, Test Loss: 1.2269477840465834


Epoch 3/10: 100%|██████████| 313/313 [02:47<00:00,  1.87it/s]


Epoch 3/10, Train Loss: 0.8663716019628147


Evaluating: 100%|██████████| 79/79 [00:12<00:00,  6.40it/s]


Epoch 3/10, Test Loss: 0.8204853678994541


Epoch 4/10: 100%|██████████| 313/313 [02:46<00:00,  1.88it/s]


Epoch 4/10, Train Loss: 0.5103789413460909


Evaluating: 100%|██████████| 79/79 [00:12<00:00,  6.49it/s]


Epoch 4/10, Test Loss: 0.47746341067213044


Epoch 5/10: 100%|██████████| 313/313 [02:51<00:00,  1.82it/s]


Epoch 5/10, Train Loss: 0.2414186061404574


Evaluating: 100%|██████████| 79/79 [00:14<00:00,  5.63it/s]


Epoch 5/10, Test Loss: 0.41155912477193


Epoch 6/10: 100%|██████████| 313/313 [02:49<00:00,  1.85it/s]


Epoch 6/10, Train Loss: 0.15522024259209252


Evaluating: 100%|██████████| 79/79 [00:12<00:00,  6.29it/s]


Epoch 6/10, Test Loss: 0.48473890290796


Epoch 7/10: 100%|██████████| 313/313 [02:47<00:00,  1.87it/s]


Epoch 7/10, Train Loss: 0.3149445329468471


Evaluating: 100%|██████████| 79/79 [00:12<00:00,  6.17it/s]


Epoch 7/10, Test Loss: 0.5450129973454566


Epoch 8/10: 100%|██████████| 313/313 [02:49<00:00,  1.85it/s]


Epoch 8/10, Train Loss: 0.20756113719635497


Evaluating: 100%|██████████| 79/79 [00:13<00:00,  6.00it/s]


Epoch 8/10, Test Loss: 0.2434947330598967


Epoch 9/10: 100%|██████████| 313/313 [02:52<00:00,  1.82it/s]


Epoch 9/10, Train Loss: 0.09833763912915232


Evaluating: 100%|██████████| 79/79 [00:12<00:00,  6.49it/s]


Epoch 9/10, Test Loss: 0.0980232213918544


Epoch 10/10: 100%|██████████| 313/313 [02:48<00:00,  1.86it/s]


Epoch 10/10, Train Loss: 0.05403644794199509


Evaluating: 100%|██████████| 79/79 [00:12<00:00,  6.45it/s]


Epoch 10/10, Test Loss: 0.23954555055692414
