In [1]:
import os
# Move the working directory two levels up before the project imports run.
# NOTE(review): presumably this targets the repository root so that the
# `dd_nlp_arsenal` imports and the relative paths used later resolve — confirm.
# The path is relative, so re-running this cell climbs two more levels.
os.chdir('../../')

In [2]:
from dd_nlp_arsenal.dataset.global_pointer_named_entity_recognition_dataset import GlobalPointerNERDataset
# from dd_nlp_arsenal.processor.tokenizer.nezha import TransfomerTokenizer

import pandas as pd
from transformers import AdamW
from torch import nn

import torch

In [3]:
from dd_nlp_arsenal.processor.tokenizer.ori_transformer import SpanTokenizer
from dd_nlp_arsenal.model.ner.global_pointer_bert.global_pointer_nezha import GlobalPointerNezha
from dd_nlp_arsenal.model.ner.global_pointer_bert.global_pointer_bert import GlobalPointerBert

In [4]:
def get_entity_bio(seq, id2label=None):
    """Extract entity spans from a BIO-tagged sequence.

    Args:
        seq (list): sequence of labels. Items are either tag strings such as
            ``'B-PER'`` or label ids that are looked up in ``id2label``.
        id2label (optional): mapping from label id to tag string. It is only
            consulted for non-string items, so it can be omitted when ``seq``
            already holds tag strings.
    Returns:
        list: list of [chunk_type, chunk_start, chunk_end]; both indices are
        inclusive positions in ``seq``.
    Example:
        seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        get_entity_bio(seq)
        #output
        [['PER', 0, 1], ['LOC', 3, 3]]
    """
    chunks = []
    # [type, start, end]; -1 marks "no chunk currently open".
    chunk = [-1, -1, -1]
    last_index = len(seq) - 1
    for indx, tag in enumerate(seq):
        if not isinstance(tag, str):
            tag = id2label[tag]
        if tag.startswith("B-"):
            # A new entity starts: flush the one that was open, if any.
            if chunk[2] != -1:
                chunks.append(chunk)
            chunk = [tag.split('-')[1], indx, indx]
            if indx == last_index:
                chunks.append(chunk)
        elif tag.startswith('I-') and chunk[1] != -1:
            # Only extend the open chunk when the continuation has the same
            # type; a mismatching I- tag leaves the chunk open and unchanged.
            _type = tag.split('-')[1]
            if _type == chunk[0]:
                chunk[2] = indx

            if indx == last_index:
                chunks.append(chunk)
        else:
            # 'O' (or an I- tag with nothing open) closes the current chunk.
            if chunk[2] != -1:
                chunks.append(chunk)
            chunk = [-1, -1, -1]
    return chunks

In [5]:
# Parse the token-per-line BIO file into `datalist` (one dict per sentence with
# its text and entity spans) and collect the label types seen in `label_set`.
datalist = []
with open('/Users/anulz/gobal_pointer_baseline.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    # Sentinel blank line so a sentence buffered at end of file is flushed.
    lines.append('\n')

    text = []
    labels = []
    label_set = set()

    # Only the first 1000 lines are parsed; a sentence cut off by this limit
    # is never flushed and is therefore dropped.
    for line in lines[:1000]:
        if line == '\n':
            # Blank line = sentence boundary. Take the buffered sentence and
            # reset the buffers *before* any early exit: previously an empty
            # sentence (e.g. two consecutive blank lines) left `text` bound to
            # '' and the next token line crashed on `''.append`.
            sentence = ''.join(text)
            sentence_labels = labels
            text = []
            labels = []

            if sentence == '':
                continue

            entity_labels = []
            for _type, _start_idx, _end_idx in get_entity_bio(sentence_labels, id2label=None):
                entity_labels.append({
                    'start_idx': _start_idx,
                    'end_idx': _end_idx,
                    'type': _type,
                    'entity': sentence[_start_idx: _end_idx+1]
                })

            datalist.append({
                'text': sentence,
                'label': entity_labels
            })

        elif line == '  O\n':
            # The token itself is a space, tagged 'O'.
            text.append(' ')
            labels.append('O')
        else:
            line = line.strip('\n').split()
            if len(line) == 1:
                # Whitespace splitting swallowed the token, so only the label
                # is left; the token is taken to be a single space.
                term = ' '
                label = line[0]
            else:
                term, label = line
            text.append(term)
            # Keep only the part after the last '-' ('B-PER' -> 'PER'); a bare
            # 'O' is added to the set unchanged.
            label_set.add(label.split('-')[-1])
            labels.append(label)

In [6]:
# _ans = []
# with open('/Users/anulz/gobal_pointer_baseline.txt', 'r', encoding='utf-8') as f:
#     lines = f.readlines()
#     lines.append('\n')
#     for line in lines: 
#         if line[0] == ' ' and line != '  O\n':
#             _ans.append('  O\n')
#         else:
#             _ans.append(line)

# with open('/Users/anulz/gobal_pointer_baseline_postprocess.txt', 'w', encoding='utf-8') as f:
#     for line in _ans:
#         f.write(line)

In [7]:
# The split here is arbitrary and only meant for a quick look at the metrics;
# for real use, split with sklearn or use cross-validation instead.
# NOTE(review): the training frame keeps every sample, so the dev rows (the
# last 400 samples) are also part of the training data.
def _with_stringified_labels(records):
    """Build a DataFrame from sample dicts, storing each label list as its str()."""
    frame = pd.DataFrame(records)
    frame['label'] = frame['label'].map(str)
    return frame


train_data_df = _with_stringified_labels(datalist)
dev_data_df = _with_stringified_labels(datalist[-400:])

In [8]:
# Sorted list of the label types gathered in `label_set`, so the category
# order does not depend on set iteration order.
label_list = sorted(label_set)

In [9]:
# Wrap both frames in GlobalPointer NER datasets. The training dataset is
# built from `label_list`; the dev dataset is given the training dataset's
# `categories` attribute instead of deriving its own.
ner_train_dataset = GlobalPointerNERDataset(train_data_df, categories=label_list)
ner_dev_dataset = GlobalPointerNERDataset(dev_data_df, categories=ner_train_dataset.categories)

In [10]:
# Earlier alternative, kept for reference:
# tokenizer = Tokenizer(vocab='hfl/chinese-bert-wwm', max_seq_len=128)

# Tokenizer built from a local NeZha checkpoint directory.
# NOTE(review): 128 is presumably the maximum sequence length — confirm
# against SpanTokenizer's signature.
# NOTE(review): the path is absolute and machine-specific.
tokenizer = SpanTokenizer('/Users/anulz/github/code/NLP/PTM/nezha-cn-base/',128)

In [11]:
# Run both datasets through the same tokenizer.
# NOTE(review): presumably this converts the samples to model inputs in place
# (nothing is returned or reassigned here) — confirm in GlobalPointerNERDataset.
ner_train_dataset.convert_to_ids(tokenizer)
ner_dev_dataset.convert_to_ids(tokenizer)

In [12]:
class Config():
    """Settings object passed to the model, the NER task and ``task.fit``.

    All values are class attributes; ``num_labels`` is filled in right after
    instantiation from the training dataset.
    """
    seed = 2022
    # NOTE(review): the tokenizer above is created with 128, not 100 — confirm
    # which of the two limits is actually applied.
    max_sen_len = 100
    train_batch_size = 256
    val_batch_size = 256
    num_labels = None  # set below from the training dataset's cat2id
    # NOTE(review): absolute, machine-specific path.
    bert_pretrained_name = '/Users/anulz/github/code/NLP/PTM/nezha-cn-base/'
    # NOTE(review): looks like a missing path separator ('.ckpt' + 'best_ner.pth') — confirm.
    trained_model_path = '.ckptbest_ner.pth'
    pre_model_type = 'nezha-cn-base'
    # NOTE(review): presumably -1 selects the CPU (the task log prints "device: cpu") — confirm.
    cuda_device = -1
    is_full_data = False
    params_path = './2022-gaiic-ner/'
    # Model training parameters
    hidden_size = 768
    num_workers = 0
    multi_gpu = False
    n_epoch = 6
    min_store_epoch = 5
    scheduler_type = None
    gp_head_size = 64   # presumably an alternative value the author tried: 128
    # Training-trick parameters
    attack_func = 'fgm'  # values noted by the author: fgm, pgd
    pgd_k = 3
    is_use_rdrop = False
    alpha = 0.25
    is_use_swa = False
    ema_decay = 0.995
config = Config()
# The number of labels is the size of the training dataset's category-to-id mapping.
config.num_labels = len(ner_train_dataset.cat2id)

In [13]:
class GlobalPointerCrossEntropy(nn.Module):
    """Multi-label categorical cross-entropy over GlobalPointer span scores."""

    def __init__(self):
        super().__init__()

    @staticmethod
    def multilabel_categorical_crossentropy(y_true, y_pred):
        """Per-row multi-label loss.

        ``y_true`` holds 0/1 indicators and ``y_pred`` raw scores of the same
        shape; the last dimension enumerates the candidate classes. Returns one
        loss value per leading index (the last dimension is reduced).
        """
        # Flip the sign of the scores of positive classes.
        signed = (1 - 2 * y_true) * y_pred
        # Push the entries that do not belong to each group far below zero so
        # they vanish inside logsumexp.
        negatives = signed - y_true * 1e12
        positives = signed - (1 - y_true) * 1e12
        # An extra zero score per row acts as the decision threshold.
        threshold_col = torch.zeros_like(signed[..., :1])
        negatives = torch.cat([negatives, threshold_col], dim=-1)
        positives = torch.cat([positives, threshold_col], dim=-1)
        return torch.logsumexp(negatives, dim=-1) + torch.logsumexp(positives, dim=-1)

    def forward(self, logits, target):
        """
        logits: [N, C, L, L]
        target: converted with ``to_dense()`` and flattened the same way as logits.
        Returns the mean loss over the N * C rows.
        """
        rows = logits.shape[0] * logits.shape[1]
        flat_target = target.to_dense().reshape(rows, -1)
        flat_logits = logits.reshape(rows, -1)
        per_row = GlobalPointerCrossEntropy.multilabel_categorical_crossentropy(flat_target, flat_logits)
        return per_row.mean()


In [14]:
def get_default_bert_optimizer(
    module,
    lr: float = 3e-5,
    eps: float = 1e-6,
    correct_bias: bool = True,
    weight_decay: float = 1e-3,
):
    """Create an AdamW optimizer with two parameter groups for ``module``.

    Parameters whose name contains "bias" or "LayerNorm.weight" go into a
    group with weight decay 0.0; every other parameter uses ``weight_decay``.

    Args:
        module: object exposing ``named_parameters()``.
        lr: learning rate passed to AdamW.
        eps: epsilon passed to AdamW.
        correct_bias: forwarded to AdamW's ``correct_bias`` argument.
        weight_decay: decay for the first group, also passed as AdamW's default.
    Returns:
        The constructed AdamW optimizer.
    """
    no_decay_markers = ("bias", "LayerNorm.weight")

    def _skips_decay(param_name):
        return any(marker in param_name for marker in no_decay_markers)

    decayed, undecayed = [], []
    for param_name, param in module.named_parameters():
        bucket = undecayed if _skips_decay(param_name) else decayed
        bucket.append(param)

    param_groups = [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": undecayed, "weight_decay": 0.0},
    ]
    return AdamW(
        param_groups,
        lr=lr,
        eps=eps,
        correct_bias=correct_bias,
        weight_decay=weight_decay,
    )

In [15]:
model = GlobalPointerNezha(config)

# Build the optimizer through `get_default_bert_optimizer` (defined in the
# previous cell) so its fine-tuning learning rate (3e-5) and the no-decay
# groups for bias / LayerNorm weights are actually used. The bare
# `AdamW(model.parameters())` call fell back to the optimizer's library
# defaults and left the helper unused.
optimizer = get_default_bert_optimizer(model)
loss_func = GlobalPointerCrossEntropy()

You are using a model of type bert to instantiate a model of type nezha. This is not supported for all configurations of models and can yield errors.
Some weights of NeZhaModel were not initialized from the model checkpoint at /Users/anulz/github/code/NLP/PTM/nezha-cn-base/ and are newly initialized: []
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from dd_nlp_arsenal.factory.task.ner_task.ner_task import GlobalPointerNERTask

In [17]:
# Bundle the model, optimizer, loss and config into the NER task object that
# drives training in the following cell.
task = GlobalPointerNERTask(model, optimizer, loss_func, config)

Model type: nezha-cn-base
device: cpu
Init pre-train model...


In [19]:
# Start training with the training dataset, the config and the dev dataset.
# NOTE(review): the dev dataset is presumably used for per-epoch validation — confirm.
# The output recorded for this cell ends in a KeyboardInterrupt during the
# first epoch, so the saved notebook does not show a completed run.
task.fit(ner_train_dataset,config,ner_dev_dataset)

Epoch 1/6
----------
  0%|          | 0/1 [00:00<?, ?it/s]

inputs torch.Size([18, 128, 768])
dense inputs torch.Size([18, 128, 2944])


  0%|          | 0/1 [00:17<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Load saved weights into `model`, mapping all tensors to the CPU.
# NOTE(review): the checkpoint path is absolute and machine-specific.
# NOTE(review): `torch.load` unpickles the file — only load trusted checkpoints.
model.load_state_dict(torch.load('/Users/anulz/ckptbest_ner.pth',map_location=torch.device('cpu')))

In [None]:
import numpy as np

In [None]:
class GlobalPointerNERPredictor(object):
    """
    Predictor for GlobalPointer named entity recognition.

    Args:
        module: the deep-learning model used for inference
        tokernizer: tokenizer used to encode the input text
        cat2id (:obj:`dict`): mapping from entity category to label id
    """  # noqa: ignore flake8"

    def __init__(
            self,
            module,
            tokernizer,
            cat2id
    ):
        self.module = module
        self.module.task = 'TokenLevel'

        self.cat2id = cat2id
        self.tokenizer = tokernizer
        # Inputs are moved to the device of the module's first parameter.
        self.device = list(self.module.parameters())[0].device

        # Reverse mapping: label id -> category name.
        self.id2cat = {idx_: cat_ for cat_, idx_ in self.cat2id.items()}

    def _convert_to_transfomer_ids(
            self,
            text
    ):
        """Tokenize ``text`` and build the feature dict fed to the module.

        Returns:
            tuple: (features, token_mapping), where ``token_mapping`` is
            whatever ``tokenizer.get_token_mapping`` returns for the tokens.
        """
        tokens = self.tokenizer.tokenize(text)
        token_mapping = self.tokenizer.get_token_mapping(text, tokens)

        input_ids, input_mask, segment_ids = self.tokenizer.sequence_to_ids(tokens)

        max_len = self.tokenizer.max_seq_len
        valid_len = sum(input_mask)
        # Square mask: the first `valid_len` rows repeat the attention mask,
        # the remaining rows are all zeros.
        zero = [0 for i in range(max_len)]
        span_mask = [input_mask for i in range(valid_len)]
        span_mask.extend([zero for i in range(valid_len, max_len)])
        span_mask = np.array(span_mask)

        features = {
            'input_ids': input_ids,
            'attention_mask': input_mask,
            'token_type_ids': segment_ids,
            'span_mask': span_mask
        }

        return features, token_mapping

    def _convert_to_customized_ids(
            self,
            text
    ):
        """Placeholder for 'customized' tokenizers, which this predictor does not support.

        Raises:
            NotImplementedError: always. The method was referenced but never
            defined, so this branch used to fail with an opaque AttributeError.
        """
        raise NotImplementedError(
            "GlobalPointerNERPredictor does not support 'customized' tokenizers"
        )

    def _get_input_ids(
            self,
            text
    ):
        """Dispatch feature construction on ``tokenizer.tokenizer_type``.

        Raises:
            ValueError: if the tokenizer type is neither 'transformer' nor 'customized'.
        """
        if self.tokenizer.tokenizer_type == 'transformer':
            return self._convert_to_transfomer_ids(text)
        elif self.tokenizer.tokenizer_type == 'customized':
            return self._convert_to_customized_ids(text)
        else:
            raise ValueError("The tokenizer type does not exist")

    def _get_module_one_sample_inputs(
            self,
            features
    ):
        """Turn each feature into a long tensor with a leading batch axis of 1 on ``self.device``."""
        return {col: torch.Tensor(features[col]).type(torch.long).unsqueeze(0).to(self.device) for col in features}

    def predict_one_sample(
            self,
            text='',
            threshold=0
    ):
        """
        Predict the entities of a single text.

        Args:
            text (:obj:`string`): input text
            threshold (:obj:`float`, optional, defaults to 0): score above which a span is kept

        Returns:
            list: one dict per entity with keys ``start_idx``, ``end_idx``
            (inclusive character offsets into ``text``), ``entity`` and ``type``.
        """  # noqa: ignore flake8"

        features, token_mapping = self._get_input_ids(text)
        self.module.eval()

        with torch.no_grad():
            inputs = self._get_module_one_sample_inputs(features)
            scores = self.module(**inputs)[0].cpu()

        # Rule out spans that start or end on the first / last input position.
        scores[:, [0, -1]] -= np.inf
        scores[:, :, [0, -1]] -= np.inf

        entities = []
        if len(token_mapping) == 0:
            # Nothing to map spans back to (e.g. empty text).
            return entities

        num_tokens = len(token_mapping)
        last_char_idx = token_mapping[-1][-1]

        # Hits are ordered by (category, start, end), so `end` is not
        # monotonic across the loop: an unusable hit must be skipped on its
        # own instead of ending the loop, otherwise valid spans of later
        # starts / categories are lost.
        for category, start, end in zip(*np.where((scores > threshold).numpy())):
            # Input position p is looked up as token p - 1.
            start_tok, end_tok = start - 1, end - 1

            # NOTE(review): this bound is kept from the original code. It
            # compares a token position with the last character offset and,
            # being `>=`, also rejects spans ending on the final token when
            # tokens map 1:1 to characters — confirm the intent.
            if end_tok >= last_char_idx:
                continue
            # Guard against positions past the real tokens (padding region).
            if start_tok >= num_tokens or end_tok >= num_tokens:
                continue

            start_char = token_mapping[start_tok][0]
            end_char = token_mapping[end_tok][-1]
            if start_char > end_char:
                continue

            entity_text = text[start_char: end_char + 1]
            if entity_text == '':
                continue

            entities.append({
                "start_idx": start_char,
                "end_idx": end_char,
                "entity": entity_text,
                "type": self.id2cat[category]
            })

        return entities

In [None]:
ner_predictor_instance = GlobalPointerNERPredictor(model, tokenizer, ner_train_dataset.cat2id)

from tqdm import tqdm

# Index of the first test line to predict. 0 processes the whole file; raise
# it only to resume an interrupted run. The cell used to be hard-wired to
# skip lines 0..2957, which left those lines out of `predict_results`.
FIRST_LINE_IDX = 0

predict_results = []

with open('./examples/ner/test_data_A/sample_per_line_preliminary_A.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

for idx, _line in enumerate(tqdm(lines)):
    if idx < FIRST_LINE_IDX:
        continue

    # One tag per character of the raw line, trailing newline included.
    label = len(_line) * ['O']
    # Strip only a real trailing newline: `_line[:-1]` dropped the last
    # character of a final line that has no newline.
    for _preditc in ner_predictor_instance.predict_one_sample(_line.rstrip('\n')):
        # Skip predictions that collide with tags already written.
        if 'I' in label[_preditc['start_idx']]:
            continue
        if 'B' in label[_preditc['start_idx']] and 'O' not in label[_preditc['end_idx']]:
            continue
        if 'O' in label[_preditc['start_idx']] and 'B' in label[_preditc['end_idx']]:
            continue

        # B- on the first character, I- on the remaining characters of the span.
        label[_preditc['start_idx']] = 'B-' + _preditc['type']
        label[_preditc['start_idx'] + 1: _preditc['end_idx'] + 1] = (_preditc['end_idx'] - _preditc[
            'start_idx']) * [('I-' + _preditc['type'])]

    predict_results.append([_line, label])


In [None]:
# Write the predictions as one "<char> <tag>" line per character, with a blank
# line after every sample. Newline characters of the source line are skipped.
# NOTE(review): the output path is absolute and machine-specific.
with open('/opt/meituan/cephfs/share/gutianyi/competition/2022-gaiic-ner/gobal_pointer_baseline.txt', 'w', encoding='utf-8') as out_file:
    for sample in predict_results:
        chars, tags = sample[0], sample[1]
        out_file.writelines(
            f'{char} {char_tag}\n'
            for char, char_tag in zip(chars, tags)
            if char != '\n'
        )
        out_file.write('\n')