In [1]:
import json
import numpy as np
import torch
import random
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from torch.nn.utils.rnn import pad_sequence
import copy

from model import RawGlobalPointer, ERENet
from utils import sparse_multilabel_categorical_crossentropy, MetricsCalculator_bdci

In [2]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.

    When CUDA is available the current GPU is seeded as well and cuDNN is
    restricted to deterministic convolution algorithms. Seeding of all GPUs
    via ``torch.cuda.manual_seed_all`` is not performed here.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # CPU generator

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)  # generator of the current GPU
    # If True, cuDNN only uses deterministic convolution algorithms.
    torch.backends.cudnn.deterministic = True


# Seed every random number generator once, before any data shuffling or model construction.
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# rel2id.json holds two mappings, in this order: id -> relation name, relation name -> id.
with open('datasets_bdci/rel2id.json', 'r', encoding='utf-8') as f:
    relation_maps = json.load(f)

id_to_rel, rel_to_id = relation_maps
print(id_to_rel)
print(rel_to_id)

{'0': '部件故障', '1': '性能故障', '2': '检测工具', '3': '组成'}
{'部件故障': 0, '性能故障': 1, '检测工具': 2, '组成': 3}


In [5]:
def load_data(filename):
    """Load a JSON-lines annotation file into a list of examples.

    Every non-blank line must be a JSON object with a "text" field and a
    "spo_list" array whose items have the form
    {"h": {"name", "pos"}, "t": {"name", "pos"}, "relation"}.

    Args:
        filename: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        A list of dicts shaped like
        {"text": text, "spo_list": [(h_name, h_pos, t_name, t_pos, relation), ...]}.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    examples = []
    with open(filename, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # A blank line (typically a trailing newline at the end of the file) carries no
            # record; json.loads would raise on it, so skip it.
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            examples.append({
                "text": record["text"],
                "spo_list": [
                    (spo['h']['name'], spo['h']['pos'], spo['t']['name'], spo['t']['pos'], spo['relation'])
                    for spo in record["spo_list"]
                ],
            })
    return examples


# Load the training annotations and print the first example as a sanity check.
data = load_data('datasets_bdci/train_bdci.json')
print(data[0])

{'text': '62号汽车故障报告综合情况:故障现象:加速后，丢开油门，发动机熄火。', 'spo_list': [('发动机', [28, 31], '熄火', [31, 33], '部件故障')]}


In [6]:
class CustomDataset(Dataset):
    """Map-style dataset over an in-memory sequence of annotated examples."""

    def __init__(self, items):
        # Only a reference is stored; the sequence is not copied.
        self._items = items

    def __len__(self):
        return len(self._items)

    def __getitem__(self, index):
        record = self._items[index]
        # Return a fresh dict holding just the text and its triples.
        return {key: record[key] for key in ('text', 'spo_list')}

In [7]:
# Checkpoint id shared by the tokenizer and the encoder.
PRETRAINED_NAME = 'IDEA-CCNL/Erlangshen-DeBERTa-v2-320M-Chinese'

# NOTE: despite the variable name, use_fast=False requests the slow tokenizer.
tokenizer_fast = AutoTokenizer.from_pretrained(PRETRAINED_NAME, use_fast=False)
# '[SP]' is the token that padding_space substitutes for a literal space character.
tokenizer_fast.add_tokens(new_tokens=['[SP]'])
print(tokenizer_fast)

pretrained = AutoModel.from_pretrained(PRETRAINED_NAME)
# Resize the embedding matrix to the tokenizer's new length so the added token has a row.
pretrained.resize_token_embeddings(len(tokenizer_fast))
print(pretrained.num_parameters())

PreTrainedTokenizer(name_or_path='IDEA-CCNL/Erlangshen-DeBERTa-v2-320M-Chinese', vocab_size=12800, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


Some weights of the model checkpoint at IDEA-CCNL/Erlangshen-DeBERTa-v2-320M-Chinese were not used when initializing DebertaV2Model: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


319094784


In [8]:
def padding_space(d):
    """Split a sentence into a list of characters, replacing each space with '[SP]'.

    Args:
        d: The sentence as a string.

    Returns:
        A list with one entry per character of ``d``; every ' ' becomes the
        token '[SP]', all other characters are kept unchanged.
    """
    # A plain comprehension avoids a round-trip through a fixed-width NumPy string array,
    # which would silently turn a NUL character into an empty string.
    return ['[SP]' if ch == ' ' else ch for ch in d]


def get_collate_fn(tokenizer, max_len=2048):
    """Build the DataLoader collate function, binding ``tokenizer`` and ``max_len`` by closure.

    Args:
        tokenizer: Callable tokenizer. It is called on pre-split character lists and must
            return a mapping with 'input_ids', 'token_type_ids' and 'attention_mask' tensors.
        max_len: Maximum sequence length handed to the tokenizer; truncation is enabled.

    Returns:
        ``collate_fn(data)``, which maps a list of {'text', 'spo_list'} samples to the tuple
        (input_ids, attention_mask, token_type_ids, entity_labels, head_labels, tail_labels,
        texts, spo_lists) where

        * entity_labels has shape [batch, 2, n, 2]: channel 0 holds subject spans and
          channel 1 object spans, each as (start, inclusive_end);
        * head_labels / tail_labels have shape [batch, num_relations, n, 2] and hold
          (subject_start, object_start) / (subject_end, object_end) pairs per relation;
        * missing entries are filled with (0, 0).

    The relation ids come from the module-level ``rel_to_id`` mapping.
    NOTE(review): annotation offsets are used as token positions without any shift for a
    leading special token; extract_spoes decodes with the same convention — confirm this
    is intended.
    """

    def collate_fn(data):
        batch_size = len(data)
        texts = [padding_space(sample['text']) for sample in data]
        spo_lists = [sample['spo_list'] for sample in data]

        encoder_text = tokenizer(texts, padding=True, max_length=max_len, truncation=True, is_split_into_words=True,
                                 return_tensors='pt')
        # Look the tensors up by key: unpacking .values() positionally would silently mislabel
        # them if the tokenizer returned its fields in a different order.
        input_ids = encoder_text['input_ids']
        token_type_ids = encoder_text['token_type_ids']
        attention_mask = encoder_text['attention_mask']

        # One label list per relation, sized from the mapping instead of a hard-coded count.
        num_relations = len(rel_to_id)

        entity_labels, head_labels, tail_labels = [], [], []
        for b in range(batch_size):
            entity_spans = [[], []]
            head_pairs = [[] for _ in range(num_relations)]
            tail_pairs = [[] for _ in range(num_relations)]
            if spo_lists[b]:
                for _, s_pos, _, o_pos, rel in spo_lists[b]:
                    rel_id = rel_to_id[rel]
                    # Annotated end offsets are exclusive; the labels use inclusive ends.
                    entity_spans[0].append((s_pos[0], s_pos[1] - 1))
                    entity_spans[1].append((o_pos[0], o_pos[1] - 1))
                    head_pairs[rel_id].append((s_pos[0], o_pos[0]))
                    tail_pairs[rel_id].append((s_pos[1] - 1, o_pos[1] - 1))
            else:
                # No triples for this sample: use a single (0, 0) placeholder per channel.
                entity_spans[0].append((0, 0))
                entity_spans[1].append((0, 0))

            # Every relation needs at least one row so that it can be turned into a tensor.
            for pairs in head_pairs + tail_pairs:
                if not pairs:
                    pairs.append((0, 0))

            # [2, n, 2] -> [n, 2, 2] so that pad_sequence pads along n across the batch.
            entity_labels.append(torch.transpose(torch.tensor(entity_spans), 0, 1))

            # Pad across relations first ([num_relations, n, 2]), then put n in front.
            head_padded = pad_sequence([torch.tensor(pairs) for pairs in head_pairs], batch_first=True)
            head_labels.append(torch.transpose(head_padded, 0, 1))
            tail_padded = pad_sequence([torch.tensor(pairs) for pairs in tail_pairs], batch_first=True)
            tail_labels.append(torch.transpose(tail_padded, 0, 1))

        # Pad along n across the batch, then move the channel axis back in front of n.
        entity_labels = torch.transpose(pad_sequence(entity_labels, batch_first=True), 1, 2)
        head_labels = torch.transpose(pad_sequence(head_labels, batch_first=True), 1, 2)
        tail_labels = torch.transpose(pad_sequence(tail_labels, batch_first=True), 1, 2)
        return input_ids, attention_mask, token_type_ids, entity_labels, head_labels, tail_labels, texts, spo_lists

    return collate_fn


# Build the shuffled training loader and print the shapes of one batch as a smoke test.
train_loader = DataLoader(CustomDataset(items=data), batch_size=2, shuffle=True,
                          collate_fn=get_collate_fn(tokenizer_fast))
for batch in train_loader:
    # Positions 0, 3 and 4 of the batch tuple: input_ids, entity_labels, head_labels.
    for position in (0, 3, 4):
        print(batch[position].shape)
    break

torch.Size([2, 971])
torch.Size([2, 2, 11, 2])
torch.Size([2, 4, 8, 2])


In [9]:
# Build the three RawGlobalPointer heads and wrap them, together with a copy of the encoder, in ERENet.
# NOTE(review): statement order is left untouched; the heads are presumably initialised from the
# seeded RNG in construction order, so reordering could change the initial weights — confirm.
hidden_size = pretrained.config.hidden_size
mention_detect = RawGlobalPointer(hidden_size, 2, 64).to(device)  # no entity typing: the 2 outputs only separate subject entities from object entities
s_o_head = RawGlobalPointer(hidden_size, len(id_to_rel), 64, RoPE=False, tril_mask=False).to(
    device)  # one output per relation; RoPE and tril_mask are both switched off
s_o_tail = RawGlobalPointer(hidden_size, len(id_to_rel), 64, RoPE=False, tril_mask=False).to(
    device)  # one output per relation; RoPE and tril_mask are both switched off
# deepcopy gives the network its own encoder, leaving the module-level `pretrained` object untouched.
net = ERENet(copy.deepcopy(pretrained), mention_detect, s_o_head, s_o_tail).to(device)

optimizer = torch.optim.AdamW(net.parameters(), lr=2e-5)

In [10]:
def extract_spoes(logits1, logits2, logits3, texts, id2predicate):
    """Decode (subject, object, relation) predictions from the three score tensors.

    Args:
        logits1: Entity scores, indexed [batch, channel, start, end]; channel 0 is read as
            subject spans, every other channel as object spans.
        logits2: Head scores, indexed [batch, relation, subject_start, object_start].
        logits3: Tail scores, indexed [batch, relation, subject_end, object_end].
        texts: Per-sample token lists as produced by ``padding_space``.
        id2predicate: Mapping from the relation id as a string (e.g. '0') to its name.

    Returns:
        One list per sample of tuples (h_name, (h_start, h_end), t_name, (t_start, t_end),
        relation) with exclusive end offsets. A score counts as positive when it is > 0.0.

    The input tensors are never modified.
    """
    # On CPU, Tensor.numpy() shares memory with the tensor, so the in-place masking below would
    # overwrite the caller's logits1; work on a private copy instead.
    logits1 = logits1.detach().cpu().numpy().copy()
    logits2 = logits2.detach().cpu().numpy()
    logits3 = logits3.detach().cpu().numpy()
    batch_size = logits1.shape[0]

    # Push the first and last sequence positions (the '[CLS]' / '[SEP]' slots) to -inf so they
    # can never start or end an entity.
    logits1[:, :, [0, -1]] -= np.inf
    logits1[:, :, :, [0, -1]] -= np.inf

    subjects = [[] for _ in range(batch_size)]
    objects = [[] for _ in range(batch_size)]
    for b, channel, start, end in zip(*np.where(logits1 > 0.0)):  # threshold 0.0
        # No entity typing: the channel only separates subjects from objects.
        target = subjects if channel == 0 else objects
        target[b].append((int(start), int(end)))

    spoes = [[] for _ in range(batch_size)]
    for b in range(batch_size):
        # Map the '[SP]' placeholder back to a real space.
        text_b = [' ' if tok == '[SP]' else tok for tok in texts[b]]
        # Subjects and objects are detected independently, so there is no reliable pairing
        # between the two lists; every subject/object combination is scored.
        for sh, st in subjects[b]:
            for oh, ot in objects[b]:
                head_rels = np.where(logits2[b, :, sh, oh] > 0.0)[0].tolist()  # threshold 0.0
                tail_rels = np.where(logits3[b, :, st, ot] > 0.0)[0].tolist()  # threshold 0.0
                # A relation holds only if both the head pair and the tail pair agree on it;
                # sorting keeps the output order deterministic.
                relations = sorted(set(head_rels) & set(tail_rels))
                if not relations:
                    continue
                sht_str = ''.join(text_b[sh: st + 1])
                oht_str = ''.join(text_b[oh: ot + 1])
                for p in relations:
                    # Prediction format: (h_name, hpos, t_name, tpos, r)
                    spoes[b].append((sht_str, (sh, st + 1), oht_str, (oh, ot + 1), id2predicate[str(p)]))
    return spoes

In [11]:
# Model training
def train(model, dataloader, optimizer, id2predicate, device):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Args:
        model: Network called as ``model(input_ids, attention_mask, token_type_ids)`` and
            returning the three logits tensors.
        dataloader: Iterable of 8-element batches: six tensors followed by the token lists
            and the gold triples.
        optimizer: Optimizer stepped once per batch.
        id2predicate: Relation-id-to-name mapping forwarded to ``extract_spoes``.
        device: Device the batch tensors are moved to.

    Every 100th step the current batch is decoded and its loss, precision, recall and F1
    are printed.
    """
    model.train()

    for step, batch in enumerate(dataloader, start=1):
        *tensors, texts, spo_lists = batch
        # Move the tensors to the target device; input_ids.shape=[batch_size, seq_len]
        (input_ids, attention_mask, token_type_ids,
         entity_labels, head_labels, tail_labels) = [tensor.to(device) for tensor in tensors]

        # logits1.shape=[batch_size, 2, seq_len, seq_len]
        # logits2.shape=[batch_size, len(schema), seq_len, seq_len]
        # logits3.shape=[batch_size, len(schema), seq_len, seq_len]
        logits1, logits2, logits3 = model(input_ids, attention_mask, token_type_ids)

        targets_and_scores = (
            (entity_labels, logits1),
            (head_labels, logits2),
            (tail_labels, logits3),
        )
        losses = [sparse_multilabel_categorical_crossentropy(y_true=target, y_pred=scores)
                  for target, scores in targets_and_scores]
        # Averaging the three losses lets the entity and relation objectives share information.
        loss = sum(losses) / 3

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 100 != 0:
            continue

        # Precision, recall and F1 score of the current batch only.
        y_pred = extract_spoes(logits1, logits2, logits3, texts, id2predicate)
        mc = MetricsCalculator_bdci()
        mc.calc_confusion_matrix(y_pred, spo_lists)
        report = '| step {:5d} | loss {:9.5f} | precision {:8.5f} | recall {:8.5f} | f1 {:8.5f} |'.format(
            step, loss.item(), mc.precision, mc.recall, mc.f1)
        print(report)

In [12]:
# Train for 30 epochs, printing a banner with the epoch index before each one.
for epoch in range(30):
    rule = '-' * 50
    print(rule + str(epoch) + rule)
    train(net, train_loader, optimizer, id_to_rel, device)

--------------------------------------------------0--------------------------------------------------


  attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / torch.tensor(
  score += c2p_att / torch.tensor(scale, dtype=c2p_att.dtype)
  score += p2c_att / torch.tensor(scale, dtype=p2c_att.dtype)


| step   100 | loss  22.03539 | precision  0.00000 | recall  0.00000 | f1  0.00000 |
| step   200 | loss  22.05331 | precision  0.00000 | recall  0.00000 | f1  0.00000 |
| step   300 | loss  11.94513 | precision  0.00000 | recall  0.00000 | f1  0.00000 |
| step   400 | loss  -2.82410 | precision  1.00000 | recall  0.66667 | f1  0.80000 |
| step   500 | loss  28.19783 | precision  0.00000 | recall  0.00000 | f1  0.00000 |
| step   600 | loss   4.90600 | precision  1.00000 | recall  1.00000 | f1  1.00000 |
| step   700 | loss  17.45040 | precision  0.00000 | recall  0.00000 | f1  0.00000 |
--------------------------------------------------1--------------------------------------------------
| step   100 | loss   4.86326 | precision  0.40000 | recall  0.28571 | f1  0.33333 |
| step   200 | loss   3.64981 | precision  0.00000 | recall  0.00000 | f1  0.00000 |
| step   300 | loss  17.02134 | precision  0.22222 | recall  0.18182 | f1  0.20000 |
| step   400 | loss  14.05476 | precision  0.500

In [13]:
# Model inference
def predict(model, filename, tokenizer, id2predicate, device):
    """Predict triples for every record of a JSON-lines file, one record at a time.

    Args:
        model: Network called as ``model(input_ids, attention_mask, token_type_ids)`` and
            returning the three logits tensors; it is switched to eval mode.
        filename: UTF-8 JSON-lines file whose records carry "ID" and "text" fields.
        tokenizer: Tokenizer called on the pre-split character list; it must return
            'input_ids', 'attention_mask' and 'token_type_ids'.
        id2predicate: Relation-id-to-name mapping forwarded to ``extract_spoes``.
        device: Device the encoded inputs are moved to.

    Returns:
        A list of dicts {"ID", "text", "spo_list"}, where each spo_list item is
        {"h": {"name", "pos"}, "t": {"name", "pos"}, "relation"}.
    """
    model.eval()

    predict_list = []
    # Inference only: no_grad avoids building an autograd graph for every forward pass.
    with open(filename, 'r', encoding='utf-8') as f, torch.no_grad():
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
            if not line.strip():
                continue
            line = json.loads(line)  # one record is predicted per iteration

            ID, text = line["ID"], line["text"]
            texts = [padding_space(text)]

            text_encode = tokenizer(texts, is_split_into_words=True, return_tensors='pt')
            input_ids = text_encode['input_ids'].to(device)
            attention_mask = text_encode['attention_mask'].to(device)
            # Read the real segment ids; previously the attention mask was passed here.
            token_type_ids = text_encode['token_type_ids'].to(device)

            logits1, logits2, logits3 = model(input_ids, attention_mask, token_type_ids)
            y_pred = extract_spoes(logits1, logits2, logits3, texts, id2predicate)

            # The batch holds a single sample, so only y_pred[0] is populated.
            spo_list = [
                {"h": {"name": h_name, "pos": list(h_pos)},
                 "t": {"name": t_name, "pos": list(t_pos)},
                 "relation": relation}
                for h_name, h_pos, t_name, t_pos, relation in y_pred[0]
            ]
            predict_list.append({"ID": ID, "text": text, "spo_list": spo_list})
    return predict_list

In [14]:
# Run the trained network over the evaluation file and collect one prediction dict per record.
predict_result = predict(net, 'datasets_bdci/evalA.json', tokenizer_fast, id_to_rel, device)

In [15]:
# Display the first prediction as a spot check.
predict_result[0]

{'ID': 'AE0001',
 'text': '三、故障排除(1)发生电缆分支箱带电后,先断开电缆分支箱电源。(2)及时汇报上级部门,并由95598抢修平台发布停电信息。(3)分析故障原因。一般情况下设备外壳带电是由于相线接地、零线断线或三相负荷不平衡和接地电阻不符合要求引起。应先分析用户侧用电使用情况,如用户侧用电正常,则说明相线不完全接地或三相负荷不平衡和接地电阻不符合要求引起,如用户侧用电不正常,则说明相线完全接地或零线断线等。(4)根据分析结果制定查找故障点方法,正确填写配电故障紧急抢修单,选择查找故障所需的工器具和材料。(5)查找故障点时为确保作业人身安全,操作时应按带电作业的安全要求进行。(6)查找故障点。首先确认该电缆分支箱已停电,出线负荷已断开;检查电缆分支箱内的设备(导线)与分支箱外壳有无明显导通,可用万用表欧姆挡测量各相与分支箱外壳是否导通;再检查零线连接是否牢固或有无断线;最后用接地电阻摇表测量接地电阻是否合格,确定故障点位置。(7)落实安全组织措施后,工作许可人应做好线路停电、验电、挂设接地线、悬挂标志牌等安全技术措施,并向工作负责人办理许可手续。(8)施工前工作负责人向全体工作班成员进行"三交三查",班组人员确认签名。(9)故障点处理。使用合格的工器具,做好防触电等安全措施,正确处理故障点。(10)工作负责人对施工质量进行验收,并符合设计要求。(11)拆除现场安全围栏等设施,收回工器具、材料并清理现场,工作负责人召开站班会,组织抢修人员撤离作业现场。(12)工作负责人向工作许可人汇报工作结束,工作许可人拆除所有安全措施后,按操作步骤进行送电。(13)工作总结,并由 95598 抢修平台发布送电信息。',
 'spo_list': [{'h': {'name': '电缆分支箱', 'pos': [11, 16]},
   't': {'name': '带电', 'pos': [16, 18]},
   'relation': '部件故障'},
  {'h': {'name': '电缆分支箱', 'pos': [11, 16]},
   't': {'name': '断线', 'pos': [92, 94]},
   'relation': '部件故障'},
  {'h': {'name': '相线', 'pos': [85, 87]},

In [16]:
# Save the predictions as JSON lines: one JSON object per line, non-ASCII text kept readable.
# The context manager closes the file even if serialisation or a write fails part-way.
with open('evalResult.json', 'w', encoding='utf-8') as f:
    for record in predict_result:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')