# CLUE-NER数据预处理
- 数据分析
- 数据处理

## 读取数据

In [188]:
import srsly
from datasets import Dataset, load_from_disk, DatasetDict, ClassLabel, Sequence
from transformers import AutoTokenizer, DataCollatorForTokenClassification, BertTokenizer, BertTokenizerFast
from spacy.training import offsets_to_biluo_tags
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import pandas as pd
import torch
from spacy.tokens import Doc
import jieba
import datasets

In [89]:
train_data = list(srsly.read_jsonl('clue_ner/train.json'))

In [90]:
train_data[0]['text'] = '浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读.'

In [91]:
train_data[0]['text']

'浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读.'

In [92]:
train_data[1506]  #这个数据出现了嵌套 删除

{'text': '据了解，日前有媒体上刊发了题为《招商银行：投资永隆银行浮亏逾百亿港元》的文章，',
 'label': {'book': {'《招商银行：投资永隆银行浮亏逾百亿港元》': [[15, 34]]},
  'company': {'招商银行': [[16, 19]], '永隆银行': [[23, 26]]}}}

In [93]:
del train_data[1506]

In [94]:
train_data[1506]

{'text': '姜山：切沃在迎战近况极佳的尤文之前刚刚换帅，不过亚齐尼的接任者也很难给人太多的信心———',
 'label': {'organization': {'切沃': [[3, 4]], '尤文': [[13, 14]]},
  'name': {'姜山': [[0, 1]], '亚齐尼': [[24, 26]]}}}

In [95]:
train_data[24] = {'text': '当记者在泗水街头随便询问10余名市民却发现，只有寥寥几人知道当地有如此神奇的东西。',
 'label': {'address': {'泗水': [[4, 5]]}, 'position': {'记者': [[1, 2]]}}}
train_data[24]

{'text': '当记者在泗水街头随便询问10余名市民却发现，只有寥寥几人知道当地有如此神奇的东西。',
 'label': {'address': {'泗水': [[4, 5]]}, 'position': {'记者': [[1, 2]]}}}

In [96]:
train_data[1722]['label'] = {'game': {'dota': [[27, 30]]}}

In [97]:
train_data[1986]['label'] = {'address': {'香港': [[28, 29]]}, 'movie': {'《狩猎聚会》': [[4, 9]], 'thehuntingparty': [[11, 25]], '猎狐行动': [[33, 36]]}}

In [98]:
train_data[3016] = {'text': 'Moon星际2无力GSL,A级预选落败',
 'label': {'game': {'星际2': [[4, 6]]},
  'organization': {'GSL': [[9, 11]]},
  'name': {'Moon': [[0, 3]]}}}

In [99]:
train_data[3016]

{'text': 'Moon星际2无力GSL,A级预选落败',
 'label': {'game': {'星际2': [[4, 6]]},
  'organization': {'GSL': [[9, 11]]},
  'name': {'Moon': [[0, 3]]}}}

In [100]:
train_data[5175] = {'text': 'StarCraft2 Forum在倒计时的上方有一行字：“它将来临……”',
 'label': {'game': {'StarCraft2': [[0, 9]]}}}
train_data[5175]

{'text': 'StarCraft2 Forum在倒计时的上方有一行字：“它将来临……”',
 'label': {'game': {'StarCraft2': [[0, 9]]}}}

In [101]:
train_data[6554] = {'text': '朴茨茅斯队vs ac米兰队两队在欧洲三大杯历史上没有交锋战绩。',
 'label': {'organization': {'朴茨茅斯队': [[0, 4]], 'ac米兰队': [[8, 12]]}}}
train_data[6554]

{'text': '朴茨茅斯队vs ac米兰队两队在欧洲三大杯历史上没有交锋战绩。',
 'label': {'organization': {'朴茨茅斯队': [[0, 4]], 'ac米兰队': [[8, 12]]}}}

In [102]:
train_data[6907]

{'text': '最后一种模式和War3DotA中的死亡模式类似，当你的英雄死亡，',
 'label': {'game': {'War3': [[7, 10]], 'DotA': [[11, 14]]}}}

In [103]:
valid_data = list(srsly.read_jsonl('clue_ner/dev.json'))
valid_data[0]

{'text': '彭小军认为，国内银行现在走的是台湾的发卡模式，先通过跑马圈地再在圈的地里面选择客户，',
 'label': {'address': {'台湾': [[15, 16]]}, 'name': {'彭小军': [[0, 2]]}}}

## 数据分析

In [104]:
def to_df(data):
    data_ls = []
    for d in data:
        text = d['text']
        for label in d['label']:
            for span in d['label'][label]:
                start_index = d['label'][label][span][0][0]
                end_index = d['label'][label][span][0][1]
                data_ls.append([text, span, label, start_index, end_index])
    return pd.DataFrame(data_ls, columns=['text', 'span','label', 'start', 'end'])

In [105]:
train_df = to_df(train_data)
valid_df = to_df(valid_data)

In [106]:
valid_df['label'].value_counts()

name            451
position        425
company         366
address         364
organization    344
game            287
government      244
scene           199
book            152
movie           150
Name: label, dtype: int64

## 数据处理为huggingface 和 spacy 格式

In [107]:
tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')

In [108]:
tokenizer.add_special_tokens({'additional_special_tokens':["A", 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V','W','X','Y','Z', '0', '1','2','3','4','5','6','7','8','9', 'ac','米兰']})

27

In [109]:
print(tokenizer.tokenize('StarCraft2 Forum在倒计时的上方有一行字：“它将来临……”'))

['S', 'ta', '##r', 'C', 'ra', '##ft', '2', 'F', 'or', '##um', '在', '倒', '计', '时', '的', '上', '方', '有', '一', '行', '字', '：', '[UNK]', '它', '将', '来', '临', '[UNK]', '[UNK]', '[UNK]']


In [110]:
nlp = spacy.blank('zh')
tokens = tokenizer.tokenize('今天非常firable')
doc = Doc(nlp.vocab, words=tokens)

In [124]:

def process(data_ls):
    dataset = {'tokens':[], 'labels':[]}
    docs = []
    for data in tqdm(data_ls):
        entities = []
        dataset['tokens'].append([s for s in data['text']])
        mapping = tokenizer(data['text'], add_special_tokens=False, return_offsets_mapping=True)['offset_mapping']
        tokens = tokenizer.tokenize(data['text'], add_special_tokens=False)
        doc = Doc(nlp.vocab, words=tokens)
        spans = []
        for label in data['label']:
            for ent in data['label'][label]:
                char_start = data['label'][label][ent][0][0]
                char_end = data['label'][label][ent][0][1]+1
                entity = []
                # try:
                for i, v in enumerate(mapping):
                    if v[0] == char_start and v[1] == char_end:
                        entity.append(i)
                        entity.append(i+1)
                    if v[0] == char_start and v[1]!= char_end:
                        entity.append(i)
                    elif v[1] == char_end and v[0] != char_start:
                        entity.append(i)
                entity.append(label)
                # print(entity)
                assert len(entity) == 3
                # except AssertionError:
                #     print(entity)
                entities.append(entity)
                e_span = spacy.tokens.Span(doc, entity[0], entity[1], label)
                spans.append(e_span)
        doc.set_ents(spans)
        docs.append(doc)
        tags = [t.ent_iob_ if t.ent_iob_ == 'O' else t.ent_iob_ + '-' + t.ent_type_ for t in doc ]
        dataset['labels'].append(tags)
    return dataset, docs

In [125]:
train_ds, train_docs= process(train_data)
valid_ds, valid_docs = process(valid_data)

100%|██████████| 10747/10747 [00:05<00:00, 1969.46it/s]
100%|██████████| 1343/1343 [00:00<00:00, 2147.58it/s]


In [126]:
index=4222
print(train_data[index]['text'])
print(train_data[index]['label'])
print(tokenizer.tokenize(train_data[index]['text']))
tokenizer(train_data[index]['text'], return_offsets_mapping=True, add_special_tokens=False)['offset_mapping']

过多地依赖范布隆霍斯特、马凯和托马森（正是从费耶诺德辗转ac米兰、斯图加特和维拉利尔，
{'organization': {'费耶诺德': [[22, 25]], '米兰': [[30, 31]], '斯图加特': [[33, 36]], '维拉利尔': [[38, 41]]}, 'name': {'范布隆霍斯特': [[5, 10]], '马凯': [[12, 13]], '托马森': [[15, 17]]}}
['过', '多', '地', '依', '赖', '范', '布', '隆', '霍', '斯', '特', '、', '马', '凯', '和', '托', '马', '森', '（', '正', '是', '从', '费', '耶', '诺', '德', '辗', '转', 'ac', '米兰', '、', '斯', '图', '加', '特', '和', '维', '拉', '利', '尔', '，']


[(0, 1),
 (1, 2),
 (2, 3),
 (3, 4),
 (4, 5),
 (5, 6),
 (6, 7),
 (7, 8),
 (8, 9),
 (9, 10),
 (10, 11),
 (11, 12),
 (12, 13),
 (13, 14),
 (14, 15),
 (15, 16),
 (16, 17),
 (17, 18),
 (18, 19),
 (19, 20),
 (20, 21),
 (21, 22),
 (22, 23),
 (23, 24),
 (24, 25),
 (25, 26),
 (26, 27),
 (27, 28),
 (28, 30),
 (30, 32),
 (32, 33),
 (33, 34),
 (34, 35),
 (35, 36),
 (36, 37),
 (37, 38),
 (38, 39),
 (39, 40),
 (40, 41),
 (41, 42),
 (42, 43)]

In [127]:
train_ds['labels'][1]

['O',
 'O',
 'O',
 'O',
 'B-game',
 'I-game',
 'I-game',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [128]:
train_sp, valid_sp = DocBin(docs=train_docs), DocBin(docs=valid_docs)

In [260]:
train_sp.to_disk('cluener_train.spacy')
valid_sp.to_disk('cluener_valid.spacy')

In [277]:
docs = list(DocBin().from_disk(path='cluener_train.spacy').get_docs(nlp.vocab))

In [150]:
# train_ds['tokens']

In [129]:
train_hf = Dataset.from_dict(train_ds)
valid_hf = Dataset.from_dict(valid_ds)

In [130]:
dataset = DatasetDict()
dataset['train'] = train_hf
dataset['validation'] = valid_hf

In [131]:
dataset.save_to_disk('clue_ner')

In [132]:
ds = load_from_disk('clue_ner/')

In [134]:
print(ds['train'][10])

{'tokens': ['主', '要', '属', '于', '结', '构', '性', '理', '财', '产', '品', '。', '上', '周', '交', '通', '银', '行', '发', '行', '了', '“', '天', '添', '利', '”', '系', '列', '理', '财', '产', '品', '，', '投', '资', '者', '在', '封', '闭', '期', '申', '购', '该', '系', '列', '理', '财', '产', '品', '，'], 'labels': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-company', 'I-company', 'I-company', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [370]:
def set_transform(example):
        tokens = example['tokens'][0]
        inputs = tokenizer(tokens, is_split_into_words=True, padding='max_length', max_length=50)
        inputs = dict(zip(inputs.keys(), map(torch.tensor, inputs.values())))
        labels = example['labels'][0]
        # print(labels)
        labels = [-100] + [label2id[label] for label in labels] 
        labels = labels + (50 - len(labels)) * [-100]
        labels = torch.tensor(labels)
        assert len(labels) == len(inputs['input_ids'])
        return {'inputs':[inputs], 'labels':[labels]}

In [371]:
ds.set_transform(set_transform)

In [376]:
ds['train']

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 10747
})

In [372]:
ds['train'][0]

{'inputs': {'input_ids': tensor([ 101, 3851, 1555, 7213, 6121,  821,  689,  928, 6587, 6956, 1383, 5439,
          3424, 1300, 1894, 1156,  794, 1369,  671,  702, 6235, 2428, 2190,  758,
          6887, 7305, 3546, 6822, 6121,  749, 6237, 6438,  119,  102,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0])},
 'labels': tensor([-100,    5,   32,   32,   23,   18,   18,   18,   18,   18,    9,   15,
            8,   18,   18,   18,   18,   18,   18,   18,   18,   18,   18,   18,
           18,   18,   18,   18,   18,   18,   18,  

## 处理为span格式 

In [152]:
def label2span(data_ls):
    dataset = {'text':[], 'tokens': [], 'spans':[]}
    for data in data_ls:
        dataset['text'].append([data['text']])
        dataset['tokens'].append([s for s in data['text']])
        spans = []
        for label in data['label']:
            for ent in data['label'][label]:
                for offset in data['label'][label][ent]:
                    spans.append([label, str(offset[0]), str(offset[1]), ent])
        dataset['spans'].append(spans)
    return dataset
                    
        
    

In [157]:
train_dict = label2span(train_data)
val_dict =label2span(valid_data)

In [155]:
train_dict['spans'][2]

[['organization', '0', '3', '那不勒斯'],
 ['organization', '6', '8', '锡耶纳'],
 ['organization', '11', '12', '桑普'],
 ['organization', '15', '17', '热那亚']]

In [180]:
train_ds = Dataset.from_dict(train_dict)
val_ds = Dataset.from_dict(val_dict)

In [181]:
ds = DatasetDict()

In [182]:
ds['train'] = train_ds
ds['validation'] = val_ds

In [183]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'spans'],
        num_rows: 10747
    })
    validation: Dataset({
        features: ['text', 'tokens', 'spans'],
        num_rows: 1343
    })
})

In [184]:
ds.save_to_disk('clue_ner_span')

In [207]:
ds = datasets.load_from_disk('clue_ner_span')

In [210]:
print(ds['train'][0])

{'text': ['浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读.'], 'tokens': ['浙', '商', '银', '行', '企', '业', '信', '贷', '部', '叶', '老', '桂', '博', '士', '则', '从', '另', '一', '个', '角', '度', '对', '五', '道', '门', '槛', '进', '行', '了', '解', '读', '.'], 'spans': [['name', '9', '11', '叶老桂'], ['company', '0', '3', '浙商银行']]}


In [191]:
set_labels = sorted(set([span[0] for spans in ds['train']['spans'] for span in spans]))
label2id = {label: i for i, label in enumerate(set_labels)}

In [192]:
label2id

{'address': 0,
 'book': 1,
 'company': 2,
 'game': 3,
 'government': 4,
 'movie': 5,
 'name': 6,
 'organization': 7,
 'position': 8,
 'scene': 9}

In [196]:
def set_transform(example):

        tokens = example['tokens'][0]
        inputs = tokenizer(
            tokens, 
            add_special_tokens=False,
            is_split_into_words=True, 
            padding='max_length',  
            max_length=50)
        inputs = dict(zip(inputs.keys(), map(torch.tensor, inputs.values())))
        spans = example['spans'][0]
        span_ids = torch.zeros(len(label2id), 50, 50)
        for span in spans :
            span_ids[label2id[span[0]],  int(span[1]), int(span[2])] = 1
        return {'inputs':[inputs], 'span_ids':[span_ids]}

In [214]:
ds.set_transform(set_transform)

In [220]:
id1 = ds['train'][0]['span_ids']
id2 = ds['train'][1]['span_ids']

In [222]:
ls = []
ls.append(id1)
ls.append(id2)

In [224]:
torch.cat(ls).shape

torch.Size([20, 50, 50])

In [204]:
ids[ids.gt(0)]

tensor([1., 1.])

In [206]:
ids.shape

torch.Size([10, 50, 50])

In [211]:
!zip -r clue_ner_span.zip clue_ner_span

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: clue_ner_span/ (stored 0%)
  adding: clue_ner_span/train/ (stored 0%)
  adding: clue_ner_span/train/state.json (deflated 41%)
  adding: clue_ner_span/train/dataset_info.json (deflated 74%)
  adding: clue_ner_span/train/dataset.arrow (deflated 56%)
  adding: clue_ner_span/dataset_dict.json (stored 0%)
  adding: clue_ner_span/validation/ (stored 0%)
  adding: clue_ner_span/validation/state.json (deflated 40%)
  adding: clue_ner_span/validation/dataset_info.json (deflated 74%)
  adding: clue_ner_span/validation/dataset.arrow (deflated 54%)


In [226]:
ts = torch.tensor([0,0,0])

In [229]:
ts[0] =1
ts

tensor([1, 0, 0])