In [1]:
import os
import json
from tqdm.notebook import tqdm
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
class CMeEE(Dataset):
    """Token-classification dataset built from a CMeEE-style JSON file.

    The file is expected to be a JSON list of items, each with a ``'text'``
    string and an ``'entities'`` list whose entries carry ``'start_idx'``,
    ``'end_idx'`` (inclusive) and ``'type'``.

    Attributes:
        type2idx: Maps ``'B_<TYPE>'`` / ``'I_<TYPE>'`` tags to label ids
            starting at 1; id 0 is left for the outside ('O') label.
        all_input_ids: One 1-D tensor of token ids per item.
        all_token_type_ids: One 1-D tensor of label ids per item (the name is
            historical; these are the training labels, not segment ids).

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=512, truncation=True)``
            and returning a mapping with an ``'input_ids'`` tensor of shape
            ``(1, seq_len)``.
        data_path: Path of the JSON file to load.
    """

    def __init__(self, tokenizer, data_path='../../../datasets/NER/CMeEE-V2/CMeEE-V2_train.json'):
        super().__init__()
        # Read as UTF-8 explicitly: the platform default encoding is not
        # guaranteed to decode Chinese text correctly.
        with open(data_path, encoding='utf-8') as json_file:
            data = json.load(json_file)

        entity_types = {entity['type'] for item in data for entity in item['entities']}

        # Sort before numbering: set iteration order for strings varies between
        # interpreter runs, which would silently change the label ids and
        # invalidate any checkpoint trained with a different ordering.
        self.type2idx = {}
        for rank, entity_type in enumerate(sorted(entity_types)):
            tag = entity_type.upper()
            self.type2idx['B_' + tag] = 2 * rank + 1
            self.type2idx['I_' + tag] = 2 * rank + 2

        self.all_input_ids = []
        self.all_token_type_ids = []

        for item in tqdm(data):
            tokenized = tokenizer(item['text'], return_tensors='pt', max_length=512, truncation=True)
            input_ids = tokenized['input_ids'].squeeze(0)
            self.all_input_ids.append(input_ids)

            # Start from an all-'O' (0) label sequence aligned with input_ids.
            labels = torch.zeros_like(input_ids)
            # NOTE(review): start_idx/end_idx are applied directly as positions
            # in the tokenized sequence. If the tokenizer prepends a special
            # token or merges several characters into one token, labels are
            # offset from the tokens they describe. get_predict_ner relies on
            # the same convention, so any change must be made in both places.
            for entity in item['entities']:
                start_idx = entity['start_idx']
                end_idx = entity['end_idx']
                entity_tag = entity['type'].upper()
                # First position gets B_, the rest (possibly none) get I_.
                # Slices past the (truncated) sequence end are simply empty.
                # Later entities overwrite earlier ones where they overlap.
                labels[start_idx: start_idx + 1] = self.type2idx['B_' + entity_tag]
                labels[start_idx + 1: end_idx + 1] = self.type2idx['I_' + entity_tag]
            self.all_token_type_ids.append(labels)

    def __getitem__(self, idx):
        """Return the ``(input_ids, labels)`` tensor pair for item ``idx``."""
        return self.all_input_ids[idx], self.all_token_type_ids[idx]

    def __len__(self):
        """Return the number of items in the dataset."""
        return len(self.all_input_ids)

In [3]:
# Local directory holding the pretrained bert-base-chinese files.
pretrained_model = '../../../models/bert-base-chinese/'
tokenizer = BertTokenizer.from_pretrained(pretrained_model)

# Constructing the dataset reads the training JSON and tokenizes every item
# up front (see CMeEE.__init__), which is what the progress bar below tracks.
dataset = CMeEE(tokenizer)

  0%|          | 0/15000 [00:00<?, ?it/s]

In [4]:
def collate_fn(batch_data):
    """Pad a list of ``(input_ids, labels)`` pairs into two batch tensors.

    Args:
        batch_data: List of ``(input_ids, labels)`` pairs of 1-D tensors.

    Returns:
        ``(x_batch, y_batch)``, both of shape ``(batch, max_len)``. Input ids
        are padded with 0. Labels are padded with -100, the default
        ``ignore_index`` of ``torch.nn.CrossEntropyLoss``, so padded positions
        do not contribute to the loss. Padding labels with 0 would make them
        indistinguishable from the outside ('O') label.
    """
    input_ids = [x for x, _ in batch_data]
    labels = [y for _, y in batch_data]
    x_batch = pad_sequence(input_ids, padding_value=0, batch_first=True)
    y_batch = pad_sequence(labels, padding_value=-100, batch_first=True)
    return x_batch, y_batch
    
# Shuffled mini-batches of 8 items, padded per batch by collate_fn.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=8,
    collate_fn=collate_fn,
    shuffle=True,
)

In [5]:
class BERT_NER_Model(torch.nn.Module):
    """BERT encoder followed by a per-token linear classifier.

    Args:
        pretrained_path: Directory (or name) passed to
            ``BertModel.from_pretrained``.
        num_labels: Size of the classifier output. When omitted it defaults to
            ``len(dataset.type2idx) + 1`` (all B_/I_ tags plus 'O'), read from
            the notebook-level ``dataset``.
    """

    def __init__(self, pretrained_path='../../../models/bert-base-chinese/', num_labels=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_path)
        if num_labels is None:
            num_labels = len(dataset.type2idx) + 1
        # Take the width from the loaded config instead of hard-coding 768.
        self.fc = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, x, attention_mask=None):
        """Return per-token logits of shape ``(batch, seq_len, num_labels)``.

        Args:
            x: Tensor of token ids, shape ``(batch, seq_len)``.
            attention_mask: Optional mask (1 = real token, 0 = padding). When
                omitted it is derived from ``x`` using the encoder's pad token
                id, so padded positions in a batch are not attended to.
        """
        if attention_mask is None:
            pad_token_id = self.bert.config.pad_token_id
            if pad_token_id is not None:
                attention_mask = x.ne(pad_token_id).long()
        last_hidden_state = self.bert(x, attention_mask=attention_mask).last_hidden_state
        return self.fc(last_hidden_state)
    
model = BERT_NER_Model().to(device)
# Resume from a previous run when a checkpoint file sits in the working directory.
if os.path.isfile('BERT_NER_CMeEE.pt'):
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load('BERT_NER_CMeEE.pt', map_location=device))
    print('load ckpt')

Some weights of the model checkpoint at ../../../models/bert-base-chinese/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


load ckpt


In [6]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Make sure dropout is active even if this cell is re-run after model.eval().
model.train()

for epoch in range(1, 11):
    total_loss = 0
    num_sample = 0
    with tqdm(dataloader, unit='batch') as tepoch:
        tepoch.set_description(f'Epoch {epoch}')
        for x, y in tepoch:
            x = x.to(device)
            y = y.to(device)

            output = model(x)
            # Flatten to (batch * seq_len, num_labels) vs (batch * seq_len,).
            loss = criterion(output.reshape(-1, output.shape[-1]), y.reshape(-1))

            # loss is already a mean over the batch; weight it by the batch
            # size so that total_loss / num_sample is a true running average
            # rather than the average divided by the batch size.
            total_loss += loss.item() * x.shape[0]
            num_sample += x.shape[0]
            tqdm_postfix = format(total_loss / num_sample, '.6f')
            tepoch.set_postfix(loss=tqdm_postfix)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

# Persist the fine-tuned weights so the next run can resume from them.
torch.save(model.state_dict(), './BERT_NER_CMeEE.pt')

  0%|          | 0/1875 [00:00<?, ?batch/s]

  0%|          | 0/1875 [00:00<?, ?batch/s]

  0%|          | 0/1875 [00:00<?, ?batch/s]

  0%|          | 0/1875 [00:00<?, ?batch/s]

  0%|          | 0/1875 [00:00<?, ?batch/s]

  0%|          | 0/1875 [00:00<?, ?batch/s]

  0%|          | 0/1875 [00:00<?, ?batch/s]

  0%|          | 0/1875 [00:00<?, ?batch/s]

  0%|          | 0/1875 [00:00<?, ?batch/s]

  0%|          | 0/1875 [00:00<?, ?batch/s]

In [58]:
def get_predict_ner(text, argmax, idx2type):
    """Decode per-position label ids into a list of entity spans.

    A span starts at any non-outside label that either carries a ``B`` prefix
    or differs in type from the label before it, and extends over the
    following positions that have the same type without a ``B`` prefix.
    Every span is emitted, including a single-position span at the very end
    of the sequence, and spans separated by outside labels are never merged.

    Args:
        text: Sequence the positions index into; each span's surface form is
            ``''.join(text[start:end + 1])``.
        argmax: Iterable of predicted label ids, one per position.
        idx2type: Mapping from label id to tag. Id 0 is the outside label;
            every other tag has the form ``'<B|I>_<TYPE>'``.

    Returns:
        List of dicts with keys ``'ner'``, ``'type'``, ``'start'`` and
        ``'end'`` (inclusive), in order of appearance.
    """
    predict_ner = []
    span_start = None
    span_type = None

    def _emit(start, end, entity_type):
        # Positions are used as-is as offsets into text.
        predict_ner.append({'ner': ''.join(text[start: end + 1]),
                            'type': entity_type,
                            'start': start,
                            'end': end})

    position = -1
    for position, label_idx in enumerate(argmax):
        if label_idx == 0:
            prefix, entity_type = None, None
        else:
            prefix, entity_type = idx2type[label_idx].split('_', 1)

        continues_span = (entity_type is not None
                          and entity_type == span_type
                          and prefix != 'B')

        # Close the open span when this position does not extend it.
        if span_type is not None and not continues_span:
            _emit(span_start, position - 1, span_type)
            span_type = None

        # Open a new span on any entity label that did not extend the old one.
        if entity_type is not None and not continues_span:
            span_start, span_type = position, entity_type

    # Flush a span that runs to the end of the sequence.
    if span_type is not None:
        _emit(span_start, position, span_type)

    return predict_ner

# Read the dev split as UTF-8 explicitly; the platform default encoding is not
# guaranteed to decode Chinese text correctly.
with open('../../../datasets/NER/CMeEE-V2/CMeEE-V2_dev.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Pick one dev item and display its gold entities for comparison with the
# model's prediction.
text = data[4]['text']
data[4]['entities']

[{'start_idx': 26, 'end_idx': 27, 'type': 'sym', 'entity': '发热'},
 {'start_idx': 29, 'end_idx': 30, 'type': 'sym', 'entity': '咳嗽'},
 {'start_idx': 32, 'end_idx': 34, 'type': 'sym', 'entity': '热峰高'},
 {'start_idx': 36, 'end_idx': 44, 'type': 'sym', 'entity': '腋温多在39℃以上'},
 {'start_idx': 36, 'end_idx': 37, 'type': 'ite', 'entity': '腋温'},
 {'start_idx': 46, 'end_idx': 47, 'type': 'sym', 'entity': '发热'}]

In [59]:
encoded = tokenizer(text, max_length=512, truncation=True, return_tensors='pt')['input_ids']
encoded = encoded.to(device)

model.eval()
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    output = model(encoded)
# output[0] is the single sequence in the batch; take the best label per position.
argmax = output[0].argmax(dim=1).tolist()

# Despite its name, this maps label id -> tag (the inverse of
# dataset.type2idx); id 0 is the outside label.
type2idx = {v: k for k, v in dataset.type2idx.items()}
type2idx[0] = 'O'
get_predict_ner(text, argmax, type2idx)

[{'ner': '发热', 'type': 'SYM', 'start': 26, 'end': 27},
 {'ner': '咳嗽', 'type': 'SYM', 'start': 29, 'end': 30},
 {'ner': '热峰高', 'type': 'SYM', 'start': 32, 'end': 34},
 {'ner': '腋温多在39℃以上', 'type': 'SYM', 'start': 36, 'end': 44},
 {'ner': '发热持续', 'type': 'SYM', 'start': 46, 'end': 49}]