In [1]:
from transformers import BertTokenizerFast
from tqdm import tqdm
import json

In [2]:
# Load the tokenizer saved in the local ./save_tokenizer/ directory and print
# its repr so the loaded settings (vocab size, special tokens, ...) can be
# checked by eye.
tokenizer_fast = BertTokenizerFast.from_pretrained('./save_tokenizer/')
print(tokenizer_fast)

PreTrainedTokenizerFast(name_or_path='./save_tokenizer/', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [3]:
# Sanity-check the annotation format on the first parseable record: print the
# space-stripped text and its entity list, then index into the text to confirm
# how the entity offsets relate to Python indexing.
with open('datasets/train.conll_convert.conll', 'r', encoding='utf-8') as f:
    # Iterate the file lazily: only the first valid record is needed, so there
    # is no reason to load every line into memory with readlines().
    for line in f:
        try:
            sentence = json.loads(line)
        except json.JSONDecodeError:
            # Lines that are not valid JSON are skipped.
            continue

        # The slices printed below are taken from the text with its spaces
        # removed, which is the string the entity offsets line up with.
        sent = sentence['sent'].replace(' ', '')
        print(sent)
        print(sentence['ners'], end='\n\n')

        # `end - 1` is the position of an entity's last character, e.g.
        # 2 - 1 is the position of '廓', the last character of '胸廓'.
        print(*(sent[end - 1] for end in (2, 4, 7, 9)), sep='\n', end='\n\n')

        # String slicing excludes the element at the end position.
        # NOTE(review): (5, 6) prints only '气' while the annotated span for
        # '气管' is (5, 7) — confirm whether this is intentional.
        for start, end in ((0, 2), (2, 4), (5, 6), (7, 9)):
            print(sent[start:end])
        break

胸廓对称，气管居中。所见骨骼骨质结构完整。双肺纹理清晰。两肺门影不大。心影横径增大，左心缘饱满。两侧膈面光整，两侧肋膈角锐利。1.两肺未见明显活动性病变，随诊。2.心影改变请结合临床。
[[0, 2, '器官组织', '胸廓'], [2, 4, '阴性表现', '对称'], [5, 7, '器官组织', '气管'], [7, 9, '阴性表现', '居中'], [12, 16, '器官组织', '骨骼骨质'], [16, 18, '属性', '结构'], [18, 20, '阴性表现', '完整'], [21, 23, '器官组织', '双肺'], [23, 25, '属性', '纹理'], [25, 27, '阴性表现', '清晰'], [28, 32, '器官组织', '两肺门影'], [32, 34, '阴性表现', '不大'], [35, 37, '器官组织', '心影'], [37, 39, '属性', '横径'], [39, 41, '阳性表现', '增大'], [42, 45, '器官组织', '左心缘'], [45, 47, '阳性表现', '饱满'], [48, 52, '器官组织', '两侧膈面'], [52, 54, '阴性表现', '光整'], [55, 60, '器官组织', '两侧肋膈角'], [60, 62, '阴性表现', '锐利'], [65, 67, '器官组织', '两肺'], [67, 69, '否定描述', '未见'], [69, 71, '修饰描述', '明显'], [71, 74, '修饰描述', '活动性'], [74, 76, '异常现象', '病变'], [82, 84, '器官组织', '心影'], [84, 86, '异常现象', '改变']]

廓
称
管
中

胸廓
对称
气
居中


In [4]:
def _strip_space_offsets(text, offsets):
    """Re-base character offsets from ``text`` onto ``text.replace(' ', '')``.

    Args:
        text: The string the offsets were computed on.
        offsets: Iterable of ``(start, end)`` character offsets into ``text``
            (``end`` exclusive).

    Returns:
        A list of ``(start, end)`` tuples where each bound has been reduced by
        the number of spaces that precede it in ``text``. ``(0, 0)`` entries
        come back unchanged.
    """
    # spaces_before[p] == number of ' ' characters in text[:p]
    spaces_before = [0]
    for char in text:
        spaces_before.append(spaces_before[-1] + (char == ' '))
    return [(start - spaces_before[start], end - spaces_before[end])
            for start, end in offsets]


def _entity_token_spans(ners, offset_mapping):
    """Convert character-level entity annotations to token-level spans.

    Args:
        ners: Iterable of ``[start, end, entity_type, entity_text]`` entries
            whose ``start``/``end`` (end exclusive) are character offsets in
            the same coordinate system as ``offset_mapping``.
        offset_mapping: Per-token ``(start, end)`` character offsets; entries
            equal to ``(0, 0)`` are ignored.

    Returns:
        A list of ``[start_token, end_token, entity_type, entity_text]`` with
        ``end_token`` inclusive. Entities whose boundaries do not coincide with
        token boundaries are left out.
    """
    start_mapping = {}
    end_mapping = {}
    for token_idx, (start, end) in enumerate(offset_mapping):
        if (start, end) != (0, 0):
            start_mapping[start] = token_idx
            # end - 1 is the position of the token's last character.
            end_mapping[end - 1] = token_idx

    spans = []
    for ent in ners:
        start_idx, end_idx, entity_type, entity_text = ent[0], ent[1] - 1, ent[2], ent[3]
        if start_idx in start_mapping and end_idx in end_mapping:
            # Append only when both boundaries were found. Appending
            # unconditionally would reuse the previous entity's span (or raise
            # NameError on the very first entity) for an unalignable entity.
            spans.append([start_mapping[start_idx], end_mapping[end_idx],
                          entity_type, entity_text])
    return spans


sentences = []

with open('datasets/train.conll_convert.conll', 'r', encoding='utf-8') as f:
    # readlines() is kept so tqdm knows the total and can render a full bar.
    for line in tqdm(f.readlines()):
        try:
            sentence = json.loads(line)
        except json.JSONDecodeError:
            # Lines that are not valid JSON are skipped.
            continue

        offset_mapping_output = tokenizer_fast(sentence['sent'], return_offsets_mapping=True)["offset_mapping"]
        # Account for the spaces in the original sentence: shift the tokenizer
        # offsets so they index the sentence with its spaces removed.
        offset_mapping = _strip_space_offsets(sentence['sent'], offset_mapping_output)

        sentence['spans'] = _entity_token_spans(sentence['ners'], offset_mapping)
        sentences.append(sentence)

100%|██████████| 144757/144757 [00:02<00:00, 54946.12it/s]


In [5]:
# Write the collected records to datasets/train_ner.json as a single JSON
# document. ensure_ascii=False keeps non-ASCII characters as-is instead of
# \uXXXX escapes, and indent=2 pretty-prints the output.
with open('datasets/train_ner.json', 'w', encoding='utf-8') as f:
    json.dump(sentences, f, ensure_ascii=False, indent=2)