In [1]:
import os
import numpy as np
import random
from tqdm import tqdm

import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from transformers import BertTokenizer
from transformers import AutoModelForMaskedLM

In [2]:
class PD2014NER(Dataset):
    """NER dataset built from two parallel, line-aligned text files.

    Line ``i`` of ``source_path`` holds one sentence as space-separated
    tokens; line ``i`` of ``target_path`` holds the matching space-separated
    BIO tags.

    Args:
        source_path: Path to the file of space-separated sentence tokens.
        target_path: Path to the file of space-separated BIO tags.
        bio2idx: Mapping from BIO tag string to integer class index.
        tokenizer: Object whose ``encode(list_of_tokens)`` returns a list of
            token ids (the notebook passes a ``BertTokenizer``, which
            surrounds the ids with [CLS]/[SEP]).
        max_len: Maximum encoded length; two positions are reserved for the
            special tokens, so at most ``max_len - 2`` tokens are accepted.
        sample: If truthy, keep only a random subset of this many line pairs
            (clamped to the number of available pairs).

    Each item is ``(token_ids, label_ids)`` as two ``torch.LongTensor``s.
    ``label_ids`` has one entry per source token, i.e. it does not cover the
    special tokens added by the tokenizer.
    """

    def __init__(self, source_path, target_path, bio2idx, tokenizer, max_len=512, sample=None):
        super().__init__()

        # Context managers make sure both handles are closed after reading.
        # NOTE(review): the platform default encoding is still used here;
        # consider passing encoding='utf-8' once the corpus encoding is confirmed.
        with open(source_path, 'r') as source_file:
            sources = source_file.readlines()  # raw sentences
        with open(target_path, 'r') as target_file:
            targets = target_file.readlines()  # BIO labels

        src_tgt = list(zip(sources, targets))
        if sample:
            # Clamp so a small corpus yields "everything" rather than ValueError.
            src_tgt = random.sample(src_tgt, k=min(sample, len(src_tgt)))

        self.sentences = []
        self.labels = []

        for sentence, sentence_bio in tqdm(src_tgt):
            stripped = sentence.strip()
            if not stripped:
                continue
            tokens = stripped.split(' ')
            tags = sentence_bio.strip().split(' ')
            # Bound the number of *tokens*, not the number of characters in the
            # raw line (which also counts the separating spaces and newline).
            if len(tokens) > max_len - 2:
                continue
            # A token/tag count mismatch cannot be aligned with the logits later.
            if len(tokens) != len(tags):
                continue
            self.sentences.append(tokenizer.encode(tokens))
            self.labels.append([bio2idx[bio] for bio in tags])

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids)`` for example ``idx`` as LongTensors."""
        return (torch.LongTensor(self.sentences[idx]), torch.LongTensor(self.labels[idx]))

    def __len__(self):
        """Return the number of retained examples."""
        return len(self.labels)

In [5]:
# Seed the RNGs before anything stochastic runs so that the random subsample
# drawn inside PD2014NER is the same on every Restart & Run All.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Parallel, line-aligned corpus files: tokens in one, BIO tags in the other.
source_path = '../../../datasets/NER/pd2014/source_BIO_2014_cropus.txt'
target_path = '../../../datasets/NER/pd2014/target_BIO_2014_cropus.txt'

# Tag inventory; a tag's list position is its class index ('O' is class 0).
BIO = ['O', 'B_LOC', 'I_LOC', 'B_ORG', 'I_ORG', 'B_PER', 'I_PER', 'B_T', 'I_T']
bio2idx = {v: k for k, v in enumerate(BIO)}
idx2bio = {k: v for k, v in enumerate(BIO)}

tokenizer = BertTokenizer.from_pretrained('../../../pretrained-models/bert-base-chinese/')

# Work on a random 10000-line subsample of the corpus.
dataset = PD2014NER(source_path, target_path, bio2idx, tokenizer, sample=10000)

def collate_fn(data_batch):
    """Pad a list of ``(token_ids, label_ids)`` pairs into two batch tensors.

    Args:
        data_batch: Iterable of ``(x, y)`` 1-D tensor pairs as returned by the
            dataset's ``__getitem__``.

    Returns:
        ``(x_batch, y_batch)``, both batch-first. ``x_batch`` is padded with
        the tokenizer's pad id. ``y_batch`` is padded with -100, the default
        ``ignore_index`` of ``torch.nn.CrossEntropyLoss``, so padded positions
        contribute nothing to the loss instead of being trained as class 0
        (the real ``'O'`` label).
    """
    x_batch, y_batch = [], []
    for x, y in data_batch:
        x_batch.append(x)
        y_batch.append(y)
    x_batch = pad_sequence(x_batch, padding_value=tokenizer.pad_token_id, batch_first=True)
    y_batch = pad_sequence(y_batch, padding_value=-100, batch_first=True)
    return x_batch, y_batch

# Shuffled mini-batches of 16 examples; collate_fn pads each batch to a common length.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [6]:
# Inspect the first example: a (token ids, label ids) pair of LongTensors.
dataset[0]

(tensor([ 101, 3360,  898, 3247, 1762,  123,  121,  122,  123, 2399,  129, 3299,
          837, 1139,  680, 1750, 1912,  782, 3360,  754, 6631,  113,  100,  150,
          143,  160,  154,  147,  161,  114,  769, 2518,  117, 2496, 3198, 1961,
         6851, 7463,  697,  782, 3193, 2347, 6371, 6399,  122,  121, 1914, 2399,
          117,  852,  671, 4684, 1168, 6134, 1995, 2042, 4851,  677, 2798, 4692,
         2190, 4706,  511,  791, 2399,  130, 3299, 3360,  898, 3247, 6626,  840,
         3142, 3724, 2110,  117,  680, 1762, 5401, 1744, 2339,  868, 4638, 4511,
         1351,  100,  150,  143,  160,  154,  147,  161, 2861, 6818, 6655, 4895,
          117, 2697, 2658, 4937, 2137, 4638,  800,  812,  738, 5846, 4495, 1066,
         5299, 2157, 2431, 4638, 2682, 3791,  511,  102]),
 tensor([5, 6, 6, 0, 7, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
num_class = len(BIO)

model = AutoModelForMaskedLM.from_pretrained('../../../pretrained-models/bert-base-chinese/')
# Replace the masked-LM output projection with a per-token tag classifier.
# The input width is read from the model config rather than hard-coded, so
# the cell keeps working if a checkpoint with a different hidden size is used.
model.cls.predictions.decoder = torch.nn.Linear(model.config.hidden_size, num_class, bias=True)
model = model.to(device)


optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
# Token-level cross-entropy over the num_class BIO tags.
criterion = torch.nn.CrossEntropyLoss()

Some weights of the model checkpoint at ../../../pretrained-models/bert-base-chinese/ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Display the module tree (e.g. to check the replaced prediction decoder).
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:

for epoch in range(20):
    # The inference cell switches the model to eval(); make sure dropout is
    # active again whenever this cell is (re)run.
    model.train()
    total_loss_train = 0
    for x, y in tqdm(dataloader):
        x = x.to(device)
        y = y.to(device)

        # Tell BERT which positions are padding so real tokens do not attend to them.
        attention_mask = (x != tokenizer.pad_token_id).long()
        logits = model(x, attention_mask=attention_mask).logits
        # Drop the leading [CLS] position and keep exactly as many positions as
        # there are label columns, which also cuts off the trailing position.
        logits = logits[:, 1: y.shape[1] + 1, :]
        loss = criterion(logits.reshape(-1, num_class), y.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()

    # `loss` is already a per-batch mean, so average over the number of
    # batches, not over the number of examples.
    print(f'Epochs:{epoch + 1}|Train Loss:{total_loss_train / len(dataloader): .6f}')


In [None]:
def get_predict_ner(text, argmax):
    """Decode per-token class indices into a list of named-entity spans.

    Args:
        text: Sequence of tokens (one entry per position in ``argmax``).
        argmax: Sequence of predicted class indices, looked up in the
            notebook-level ``idx2bio`` mapping (``'O'`` or ``'B_X'`` / ``'I_X'``).

    Returns:
        A list of dicts, in order of appearance, each with keys ``'ner'``
        (the joined tokens of the span), ``'type'`` (the entity type ``X``),
        ``'start'`` and ``'end'`` (inclusive token indices).

    Decoding rules:
        * ``B_X`` always opens a new entity of type ``X``.
        * ``I_X`` extends the open entity if it has type ``X``; otherwise it
          opens a new one (lenient handling of a missing ``B_`` tag).
        * ``O`` closes the open entity.
        * An entity still open at the end of the sequence is emitted too.
    """
    predict_ner = []
    open_start = None  # index where the currently open entity begins
    open_type = None   # its entity type, or None when no entity is open

    def close_entity(end):
        # `end` is the inclusive index of the last token of the open entity.
        predict_ner.append({'ner': ''.join(text[open_start: end + 1]),
                            'type': open_type,
                            'start': open_start,
                            'end': end})

    for pos, label_idx in enumerate(argmax):
        tag = idx2bio[label_idx]
        if tag == 'O':
            prefix, ent_type = 'O', None
        else:
            prefix, ent_type = tag.split('_', 1)

        extends_open = prefix == 'I' and ent_type == open_type
        if open_type is not None and not extends_open:
            close_entity(pos - 1)
            open_start, open_type = None, None
        if ent_type is not None and open_type is None:
            open_start, open_type = pos, ent_type

    if open_type is not None:
        close_entity(len(argmax) - 1)
    return predict_ner

In [None]:
# Character-level input: one list entry per character, matching the label granularity.
text = list('江苏省的刘舒然真是太傻逼了！')
encoded = tokenizer.encode(text)
encoded = torch.LongTensor(encoded).unsqueeze(0)  # add the batch dimension
encoded = encoded.to(device)

model.eval()

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    output = model(encoded)
# Take the most likely class per position and drop the first and last
# positions (the special tokens added by tokenizer.encode).
argmax = output.logits[0].argmax(dim=1).tolist()[1: -1]

get_predict_ner(text, argmax)