In [153]:
import json
import numpy as np
from transformers import BertTokenizerFast

In [154]:
# Read the CMeEE training set from its JSON file.
with open('CMeEE_train.json', encoding='utf-8') as train_file:
    data_raw = json.loads(train_file.read())

data_raw  # a list of records

[{'text': '【病原和流行病学】狂犬病病毒（rabiesvirus）属弹状病毒科狂犬病病毒属。',
  'entities': [{'start_idx': 9,
    'end_idx': 13,
    'type': 'mic',
    'entity': '狂犬病病毒'},
   {'start_idx': 15, 'end_idx': 25, 'type': 'mic', 'entity': 'rabiesvirus'},
   {'start_idx': 28, 'end_idx': 31, 'type': 'mic', 'entity': '弹状病毒'},
   {'start_idx': 33, 'end_idx': 37, 'type': 'mic', 'entity': '狂犬病病毒'}]},
 {'text': '对儿童SARST细胞亚群的研究表明，与成人SARS相比，儿童细胞下降不明显，证明上述推测成立。',
  'entities': [{'start_idx': 3,
    'end_idx': 9,
    'type': 'bod',
    'entity': 'SARST细胞'},
   {'start_idx': 19, 'end_idx': 24, 'type': 'dis', 'entity': '成人SARS'}]},
 {'text': '研究证实，细胞减少与肺内病变程度及肺内炎性病变吸收程度密切相关。',
  'entities': [{'start_idx': 10, 'end_idx': 10, 'type': 'bod', 'entity': '肺'},
   {'start_idx': 10, 'end_idx': 13, 'type': 'sym', 'entity': '肺内病变'},
   {'start_idx': 17, 'end_idx': 17, 'type': 'bod', 'entity': '肺'},
   {'start_idx': 17, 'end_idx': 22, 'type': 'sym', 'entity': '肺内炎性病变'}]}]

In [155]:
# Display the first record of the loaded data.
data_raw[0]

{'text': '【病原和流行病学】狂犬病病毒（rabiesvirus）属弹状病毒科狂犬病病毒属。',
 'entities': [{'start_idx': 9,
   'end_idx': 13,
   'type': 'mic',
   'entity': '狂犬病病毒'},
  {'start_idx': 15, 'end_idx': 25, 'type': 'mic', 'entity': 'rabiesvirus'},
  {'start_idx': 28, 'end_idx': 31, 'type': 'mic', 'entity': '弹状病毒'},
  {'start_idx': 33, 'end_idx': 37, 'type': 'mic', 'entity': '狂犬病病毒'}]}

In [156]:
sample_text = data_raw[0]['text']

# Index 9 is the position of the first character ('狂') of the first entity.
print(sample_text[9], end='\n\n')

# 13 is the position of the closing character ('毒'); 25, 31 and 37 are the
# end_idx values of the other entities.
for last_char_idx in (13, 25, 31):
    print(sample_text[last_char_idx])
print(sample_text[37], end='\n\n')

# String slicing excludes the stop position, so 1 is added to end_idx.
for span_start, span_end in ((9, 13), (15, 25), (28, 31), (33, 37)):
    print(sample_text[span_start:span_end + 1])

狂

毒
s
毒
毒

狂犬病病毒
rabiesvirus
弹状病毒
狂犬病病毒


In [157]:
# Build the fast tokenizer from the pretrained model name; the bare name on the
# last line shows the tokenizer's repr as the cell output.
pretrained_name = 'IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese'
tokenizer = BertTokenizerFast.from_pretrained(pretrained_name)
tokenizer

PreTrainedTokenizerFast(name_or_path='IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese', vocab_size=12800, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [158]:
# Show how the tokenizer splits the text of the first record into tokens.
tokenizer.tokenize(data_raw[0]['text'])

['【',
 '病',
 '原',
 '和',
 '流',
 '行',
 '病',
 '学',
 '】',
 '狂',
 '犬',
 '病',
 '病',
 '毒',
 '（',
 'ra',
 '##bi',
 '##e',
 '##s',
 '##vi',
 '##r',
 '##us',
 '）',
 '属',
 '弹',
 '状',
 '病',
 '毒',
 '科',
 '狂',
 '犬',
 '病',
 '病',
 '毒',
 '属',
 '。']

In [159]:
# Encode the first record as a batch of one and request the character offsets.
first_text_batch = [data_raw[0]['text']]
outputs = tokenizer(
    first_text_batch,
    truncation=True,
    max_length=512,
    padding=True,
    return_offsets_mapping=True,
)
offset_mapping = outputs["offset_mapping"]
offset_mapping  # one (char_start, char_end) pair per token, per sample

[[(0, 0),
  (0, 1),
  (1, 2),
  (2, 3),
  (3, 4),
  (4, 5),
  (5, 6),
  (6, 7),
  (7, 8),
  (8, 9),
  (9, 10),
  (10, 11),
  (11, 12),
  (12, 13),
  (13, 14),
  (14, 15),
  (15, 17),
  (17, 19),
  (19, 20),
  (20, 21),
  (21, 23),
  (23, 24),
  (24, 26),
  (26, 27),
  (27, 28),
  (28, 29),
  (29, 30),
  (30, 31),
  (31, 32),
  (32, 33),
  (33, 34),
  (34, 35),
  (35, 36),
  (36, 37),
  (37, 38),
  (38, 39),
  (39, 40),
  (0, 0)]]

In [160]:
# (0, 0) marks special tokens (e.g. '[CLS]', '[SEP]', '[PAD]'); they are skipped.
# token_idx is the position of the token in the sequence (0-based, special tokens counted).
# span[1] - 1 is the position of the token's last character.
start_mapping = [
    {span[0]: token_idx for token_idx, span in enumerate(sample_offsets) if span != (0, 0)}
    for sample_offsets in offset_mapping
]
end_mapping = [
    {span[1] - 1: token_idx for token_idx, span in enumerate(sample_offsets) if span != (0, 0)}
    for sample_offsets in offset_mapping
]
print(start_mapping)
print(end_mapping)

[{0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 17: 17, 19: 18, 20: 19, 21: 20, 23: 21, 24: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 34: 31, 35: 32, 36: 33, 37: 34, 38: 35, 39: 36}]
[{0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 16: 16, 18: 17, 19: 18, 20: 19, 22: 20, 23: 21, 25: 22, 26: 23, 27: 24, 28: 25, 29: 26, 30: 27, 31: 28, 32: 29, 33: 30, 34: 31, 35: 32, 36: 33, 37: 34, 38: 35, 39: 36}]


In [161]:
for entity in data_raw[0]['entities']:
    print(entity)
    start_idx, end_idx = entity['start_idx'], entity['end_idx']
    entity_type, entity_text = entity['type'], entity['entity']
    # Nothing is reported for an entity whose start or end character is not a
    # key of the corresponding mapping.
    if start_idx not in start_mapping[0] or end_idx not in end_mapping[0]:
        continue
    start_span = start_mapping[0][start_idx]
    end_span = end_mapping[0][end_idx]
    # The entity is made of tokens [start_span, end_span] (0-based).
    print("start_span: {},end_span: {}".format(start_span, end_span))

{'start_idx': 9, 'end_idx': 13, 'type': 'mic', 'entity': '狂犬病病毒'}
start_span: 10,end_span: 14
{'start_idx': 15, 'end_idx': 25, 'type': 'mic', 'entity': 'rabiesvirus'}
start_span: 16,end_span: 22
{'start_idx': 28, 'end_idx': 31, 'type': 'mic', 'entity': '弹状病毒'}
start_span: 25,end_span: 28
{'start_idx': 33, 'end_idx': 37, 'type': 'mic', 'entity': '狂犬病病毒'}
start_span: 30,end_span: 34


### 注意：连续空格与中英文混排文本的分词问题

In [162]:
# Sample text mixing English words, a run of consecutive spaces, and digits
# glued to Latin letters inside Chinese text.
test_text = 'variety of diseases，     想要4atm和5sim!'

In [163]:
# A run of several spaces ('    ') is treated as a single space (' ').
# '4atm' is split into ['4', '##at', '##m'] ('atm' is one entity and is wrongly split here).
test_encoding = tokenizer(test_text, add_special_tokens=False)
init_t = test_encoding['input_ids']
print(init_t)
print(tokenizer.convert_ids_to_tokens(init_t))

['var', '##i', '##e', '##ty', 'of', 'dis', '##e', '##as', '##e', '##s', '，', '想', '要', '4', '##at', '##m', '和', '5', '##s', '##im', '!']
[10933, 11902, 11898, 12386, 9091, 10710, 11898, 12299, 11898, 11912, 183, 2454, 6459, 214, 12300, 11906, 1091, 216, 11912, 12331, 104]
['var', '##i', '##e', '##ty', 'of', 'dis', '##e', '##as', '##e', '##s', '，', '想', '要', '4', '##at', '##m', '和', '5', '##s', '##im', '!']


In [164]:
# Solution: split the text into single characters, replace every space with a
# dedicated '[SP]' token, and pass the characters to the tokenizer as pre-split words.
test_chars = np.array(list(test_text))
test_text_sp = np.where(test_chars == ' ', '[SP]', test_chars).tolist()
tokenizer.add_tokens(new_tokens=['[SP]'])  # add '[SP]' to the tokenizer as a new token
sp_encoding = tokenizer(test_text_sp, is_split_into_words=True)
sp_t = sp_encoding['input_ids']
print(sp_t)
print(tokenizer.convert_ids_to_tokens(sp_t))

[1, 8273, 8252, 8269, 8260, 8256, 8271, 8276, 12800, 8266, 8257, 12800, 8255, 8260, 8270, 8256, 8252, 8270, 8256, 8270, 183, 12800, 12800, 12800, 12800, 12800, 2454, 6459, 214, 8252, 8271, 8264, 1091, 216, 8270, 8260, 8264, 104, 2]
['[CLS]', 'v', 'a', 'r', 'i', 'e', 't', 'y', '[SP]', 'o', 'f', '[SP]', 'd', 'i', 's', 'e', 'a', 's', 'e', 's', '，', '[SP]', '[SP]', '[SP]', '[SP]', '[SP]', '想', '要', '4', 'a', 't', 'm', '和', '5', 's', 'i', 'm', '!', '[SEP]']
