# CLUE-NER数据预处理
- 数据分析
- 数据处理

## 读取数据

In [1]:
import srsly
from datasets import Dataset, load_from_disk, DatasetDict, ClassLabel, Sequence
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from spacy.training import offsets_to_biluo_tags
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import pandas as pd
import torch

In [2]:
train_data = list(srsly.read_jsonl('cluener/train.json'))

In [3]:
train_data[0]['text'] = '浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读.'

In [4]:
train_data[0]['text']

'浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读.'

In [5]:
train_data[1506]  #这个数据出现了嵌套 删除

{'text': '据了解，日前有媒体上刊发了题为《招商银行：投资永隆银行浮亏逾百亿港元》的文章，',
 'label': {'book': {'《招商银行：投资永隆银行浮亏逾百亿港元》': [[15, 34]]},
  'company': {'招商银行': [[16, 19]], '永隆银行': [[23, 26]]}}}

In [6]:
del train_data[1506]

In [7]:
train_data[1506]

{'text': '姜山：切沃在迎战近况极佳的尤文之前刚刚换帅，不过亚齐尼的接任者也很难给人太多的信心———',
 'label': {'organization': {'切沃': [[3, 4]], '尤文': [[13, 14]]},
  'name': {'姜山': [[0, 1]], '亚齐尼': [[24, 26]]}}}

In [8]:
valid_data = list(srsly.read_jsonl('cluener/dev.json'))

## 数据分析

In [9]:
def to_df(data):
    data_ls = []
    for d in data:
        text = d['text']
        for label in d['label']:
            for span in d['label'][label]:
                start_index = d['label'][label][span][0][0]
                end_index = d['label'][label][span][0][1]
                data_ls.append([text, span, label, start_index, end_index])
    return pd.DataFrame(data_ls, columns=['text', 'span','label', 'start', 'end'])

In [34]:
train_df = to_df(train_data)
valid_df = to_df(valid_data)

In [35]:
valid_df['label'].value_counts()

name            451
position        425
company         366
address         364
organization    344
game            287
government      244
scene           199
book            152
movie           150
Name: label, dtype: int64

## 数据处理为huggingface 和 spacy 格式

In [40]:
nlp = spacy.blank('zh')
def process(data_ls):
    dataset = {'tokens':[], 'labels':[]}
    docs = []
    for data in tqdm(data_ls):
        entities = []
        dataset['tokens'].append([s for s in data['text']])
        doc = nlp(data['text'])
        spans = []
        for label in data['label']:
            for ent in data['label'][label]:
                entity = []
                entity.append(data['label'][label][ent][0][0])
                entity.append(data['label'][label][ent][0][1]+1)
                entity.append(label)
                entities.append(entity)
                e_span = spacy.tokens.Span(doc, data['label'][label][ent][0][0], data['label'][label][ent][0][1]+1, label)
                spans.append(e_span)
        doc.set_ents(spans)
        docs.append(doc)
        tags = [t.ent_iob_ if t.ent_iob_ == 'O' else t.ent_iob_ + '-' + t.ent_type_ for t in doc ]
        dataset['labels'].append(tags)
    return dataset, docs

In [41]:
train_ds, train_docs= process(train_data)
valid_ds, valid_docs = process(valid_data)

100%|██████████| 10747/10747 [00:01<00:00, 7726.99it/s]
100%|██████████| 1343/1343 [00:00<00:00, 10430.48it/s]


In [42]:
train_ds['labels'][1]

['O',
 'O',
 'O',
 'O',
 'B-game',
 'I-game',
 'I-game',
 'I-game',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [259]:
train_sp, valid_sp = DocBin(docs=train_docs), DocBin(docs=valid_docs)

In [260]:
train_sp.to_disk('cluener_train.spacy')
valid_sp.to_disk('cluener_valid.spacy')

In [277]:
docs = list(DocBin().from_disk(path='cluener_train.spacy').get_docs(nlp.vocab))

In [284]:
docs[0]

浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读.

In [43]:
train_hf = Dataset.from_dict(train_ds)
valid_hf = Dataset.from_dict(valid_ds)

In [44]:
dataset = DatasetDict()
dataset['train'] = train_hf
dataset['validation'] = valid_hf

In [45]:
dataset.save_to_disk('clue_ner')

In [2]:
ds = load_from_disk('clue_ner/')

In [15]:
set_labels = set([label for labels in ds['train']['labels'] for label in labels])
label2id = sorted({label:id for id,label in enumerate(set_labels)})

In [16]:
label2id

['B-address',
 'B-book',
 'B-company',
 'B-game',
 'B-government',
 'B-movie',
 'B-name',
 'B-organization',
 'B-position',
 'B-scene',
 'I-address',
 'I-book',
 'I-company',
 'I-game',
 'I-government',
 'I-movie',
 'I-name',
 'I-organization',
 'I-position',
 'I-scene',
 'O']

In [50]:
srsly.write_json('clue_ner/label2id.json', label2id)

In [370]:
def set_transform(example):
        tokens = example['tokens'][0]
        inputs = tokenizer(tokens, is_split_into_words=True, padding='max_length', max_length=50)
        inputs = dict(zip(inputs.keys(), map(torch.tensor, inputs.values())))
        labels = example['labels'][0]
        # print(labels)
        labels = [-100] + [label2id[label] for label in labels] 
        labels = labels + (50 - len(labels)) * [-100]
        labels = torch.tensor(labels)
        assert len(labels) == len(inputs['input_ids'])
        return {'inputs':[inputs], 'labels':[labels]}

In [371]:
ds.set_transform(set_transform)

In [376]:
ds['train']

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 10747
})

In [372]:
ds['train'][0]

{'inputs': {'input_ids': tensor([ 101, 3851, 1555, 7213, 6121,  821,  689,  928, 6587, 6956, 1383, 5439,
          3424, 1300, 1894, 1156,  794, 1369,  671,  702, 6235, 2428, 2190,  758,
          6887, 7305, 3546, 6822, 6121,  749, 6237, 6438,  119,  102,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0])},
 'labels': tensor([-100,    5,   32,   32,   23,   18,   18,   18,   18,   18,    9,   15,
            8,   18,   18,   18,   18,   18,   18,   18,   18,   18,   18,   18,
           18,   18,   18,   18,   18,   18,   18,  

In [362]:
from torch.utils.data import DataLoader

In [363]:
loader = DataLoader(dataset=ds['train'])

In [364]:
next(iter(loader))

{'inputs': {'input_ids': tensor([[ 101, 3851, 1555, 7213, 6121,  821,  689,  928, 6587, 6956, 1383, 5439,
           3424, 1300, 1894, 1156,  794, 1369,  671,  702, 6235, 2428, 2190,  758,
           6887, 7305, 3546, 6822, 6121,  749, 6237, 6438,  119,  102,    0,    0,
              0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
              0,    0]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0]])},
 'labels': tensor([[-100,    5,   32,   32,   23,   18,   18,   18,   18,   18,    9,   15,
             8,   18,   18,   18,   18,   18,   18,   18,   18,   18,   18,   18,
            18,   18,   18,   18,   

In [51]:
!zip -r clue_ner.zip clue_ner

  adding: clue_ner/ (stored 0%)
  adding: clue_ner/train/ (stored 0%)
  adding: clue_ner/train/state.json (deflated 40%)
  adding: clue_ner/train/dataset_info.json (deflated 67%)
  adding: clue_ner/train/dataset.arrow (deflated 68%)
  adding: clue_ner/dataset_dict.json (stored 0%)
  adding: clue_ner/label2id.json (deflated 55%)
  adding: clue_ner/validation/ (stored 0%)
  adding: clue_ner/validation/state.json (deflated 40%)
  adding: clue_ner/validation/dataset_info.json (deflated 67%)
  adding: clue_ner/validation/dataset.arrow (deflated 66%)


In [374]:
tokenizer('玩一玩cdol', return_offsets_mapping=True, add_special_tokens=False)

{'input_ids': [4381, 671, 4381, 8405, 8798], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1], 'offset_mapping': [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7)]}