## MindSpore-BERT-NER
### 1. 下载源码和数据至本地容器

因为 notebook 挂载在 OBS 上，而运行中的容器实例无法直接读写 OBS 上的文件，所以需要先将源码、数据和预训练模型下载到容器的本地环境中。

In [1]:
import moxing as mox
# Copy the source code, the dataset and the pre-trained model from OBS into
# the container's working directory, one sub-directory at a time.
OBS_PROJECT_ROOT = "s3://ascend-zyjs-dcyang/nlp/bert_ner_notebook"
for sub_dir in ("src", "data", "pre_model"):
    mox.file.copy_parallel(src_url=OBS_PROJECT_ROOT + "/" + sub_dir + "/",
                           dst_url="./" + sub_dir + "/")

INFO:root:Using MoXing-v1.17.3-43fbf97f
INFO:root:Using OBS-Python-SDK-3.20.7


### 2. 导入依赖库

In [3]:
import os
import argparse
import numpy as np
from easydict import EasyDict as edict

import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore.common.initializer import TruncatedNormal
from mindspore import context
from mindspore import log as logger
from mindspore.common.tensor import Tensor
import mindspore.dataset as de
from mindspore.ops import operations as P
import mindspore.dataset.transforms.c_transforms as C
from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
from mindspore.nn.optim import AdamWeightDecayDynamicLR, Lamb, Momentum
from mindspore.train.model import Model
from mindspore.train.callback import Callback
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net

from src.CRF import CRF
from src.utils import Accuracy, F1, BertFinetuneCell
from src.config import tag_to_index, bert_optimizer_cfg
from src.bert_model import BertConfig, BertModel
from src.cluener_evaluation import ner_process

### 3. 定义参数配置

In [4]:
# Task-level settings shared by the training, evaluation and inference cells.
cfg = edict(
    task='NER',
    num_labels=41,                                   # label count used by train() when use_crf is False
    schema_file='./data/clue_ner/schema.json',       # TFRecord schema passed to get_dataset()
    ckpt_prefix='bert-ner-crf',                      # prefix given to ModelCheckpoint in train()

    train_file='./data/clue_ner/train.tf_record',    # read by train()
    eval_file='./data/clue_ner/dev.tf_record',       # read by evaluate()
    batch_size=16,
    epoch_num=5,
    ckpt_dir='ckpt',                                 # directory given to ModelCheckpoint in train()

    pre_training_ckpt='./pre_model/bert_base.ckpt',  # weights loaded before fine-tuning
    finetune_ckpt='./ckpt/bert-ner-crf-5_671.ckpt',  # weights loaded by evaluate() and inference()

    # Not read directly in this notebook; cfg is handed to ner_process(),
    # which presumably consumes these two paths -- confirm in src.cluener_evaluation.
    label2id_file='./data/clue_ner/label2id.json',
    vocab_file='./data/vocab.txt',
    use_crf=True,                                    # selects the CRF loss branch of BertNER
)

# BERT network configuration; the batch size is taken from cfg.
bert_net_cfg = BertConfig(
    batch_size=cfg.batch_size,
    seq_length=128,
    vocab_size=21128,
    # Encoder size.
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    # Regularisation.
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    # Embeddings and initialisation.
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    input_mask_from_dataset=True,
    token_type_ids_from_dataset=True,
    # BertNER casts its logits to `dtype` and runs its dense layer in `compute_type`.
    dtype=mstype.float32,
    compute_type=mstype.float16,
)

### 4. 定义数据集加载函数

In [5]:
def get_dataset(data_file, batch_size=1, repeat_count=1):
    '''
    Build the TFRecord input pipeline for one data file.

    The four feature columns are cast to int32, the samples are shuffled,
    grouped into fixed-size batches (an incomplete final batch is dropped)
    and the batched dataset is finally repeated.

    Args:
        data_file: path of the TFRecord file to read.
        batch_size: number of samples per batch.
        repeat_count: value passed to the dataset's repeat() stage.

    Returns:
        The configured dataset object.
    '''
    feature_columns = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    pipeline = de.TFRecordDataset([data_file], cfg.schema_file, columns_list=feature_columns)

    # Every feature column gets its own int32 cast stage.
    to_int32 = C.TypeCast(mstype.int32)
    for column in ("segment_ids", "input_mask", "input_ids", "label_ids"):
        pipeline = pipeline.map(input_columns=column, operations=to_int32)

    pipeline = pipeline.shuffle(buffer_size=900)
    pipeline = pipeline.batch(batch_size, drop_remainder=True)
    return pipeline.repeat(repeat_count)

数据集测试

In [6]:
# Pull one batch from the training set and show the token ids of its first sample.
sample_iterator = get_dataset(cfg.train_file).create_dict_iterator()
first_batch = sample_iterator.get_next()
first_batch['input_ids'][0]

array([ 101, 7213, 6121,  679, 2533, 1403, 3313, 4007,  122,  129, 1453,
       2259, 4638, 2110, 4495, 1355, 1305, 8024, 7557, 5862, 2141, 5018,
        753, 6820, 3621, 3341, 3975, 5023,  511,  127,  121, 1384, 3152,
       6206, 3724, 8024,  102,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

### 5. 定义BertNER模型

In [7]:
class BertNER(nn.Cell):
    """
    BERT encoder followed by a per-token dense layer and a loss cell.

    The value returned by ``construct`` is the output of the loss cell:
    ``CRF`` when ``use_crf`` is set, ``CrossEntropyCalculation`` otherwise.

    Args:
        config: BERT configuration. The fields read here are
            ``initializer_range``, ``dtype``, ``compute_type``,
            ``hidden_size``, ``batch_size`` and ``seq_length``.
        is_training: forwarded to ``BertModel`` and to the loss cell.
        num_labels: output width of the dense layer.
        use_crf: choose the CRF loss cell instead of cross entropy.
        tag_to_index: tag-to-index mapping; must be non-empty when
            ``use_crf`` is set.
        dropout_prob: ``nn.Dropout`` is constructed with ``1 - dropout_prob``.
        use_one_hot_embeddings: forwarded to ``BertModel``.

    Raises:
        Exception: if ``use_crf`` is set and ``tag_to_index`` is empty or None.
    """
    def __init__(self, config, is_training, num_labels=11, use_crf=False, tag_to_index=None, dropout_prob=0.0,
                 use_one_hot_embeddings=False):
        super().__init__()
        # Plain attributes and target shapes.
        self.num_labels = num_labels
        self.use_crf = use_crf
        self.dtype = config.dtype
        self.shape = (-1, config.hidden_size)
        self.origin_shape = (config.batch_size, config.seq_length, num_labels)

        # Stateless operators.
        self.cast = P.Cast()
        self.reshape = P.Reshape()
        self.log_softmax = P.LogSoftmax(axis=-1)

        # Sub-cells: encoder, classification layer, dropout.
        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
        self.weight_init = TruncatedNormal(config.initializer_range)
        classifier = nn.Dense(config.hidden_size, num_labels, weight_init=self.weight_init, has_bias=True)
        self.dense_1 = classifier.to_float(config.compute_type)
        self.dropout = nn.Dropout(1 - dropout_prob)

        # Loss cell.
        if not use_crf:
            # NOTE(review): CrossEntropyCalculation is not among the names imported in the
            # notebook's import cell -- confirm it is in scope before running with use_crf=False.
            self.loss = CrossEntropyCalculation(is_training)
        elif tag_to_index:
            self.loss = CRF(tag_to_index, config.batch_size, config.seq_length, is_training)
        else:
            raise Exception("The dict for tag-index mapping should be provided for CRF.")

    def construct(self, input_ids, input_mask, token_type_id, label_ids):
        # Note the argument order expected by the encoder: ids, token types, mask.
        sequence_output, _, _ = self.bert(input_ids, token_type_id, input_mask)

        # Flatten to (-1, hidden_size) before the dense layer, then cast to config.dtype.
        hidden = self.reshape(self.dropout(sequence_output), self.shape)
        logits = self.cast(self.dense_1(hidden), self.dtype)

        if self.use_crf:
            # The CRF cell receives (batch_size, seq_length, num_labels) scores.
            return_value = self.reshape(logits, self.origin_shape)
            loss = self.loss(return_value, label_ids)
        else:
            return_value = self.log_softmax(logits)
            loss = self.loss(return_value, label_ids, self.num_labels)
        return loss

### 6. 定义训练函数

In [8]:
def train():
    """
    Fine-tune BertNER on ``cfg.train_file`` starting from ``cfg.pre_training_ckpt``.

    The target device id is read from the ``DEVICE_ID`` environment variable
    and defaults to 0 when the variable is not set. A checkpoint is written
    to ``cfg.ckpt_dir`` every ``steps_per_epoch`` steps (only the latest one
    is kept) and the loss is reported through ``LossMonitor``.
    """
    # os.getenv returns None for an unset variable, which int() rejects,
    # so fall back to device 0 instead of crashing.
    devid = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid)

    if cfg.use_crf:
        netwithloss = BertNER(bert_net_cfg, True, num_labels=len(tag_to_index), use_crf=True,
                              tag_to_index=tag_to_index, dropout_prob=0.1)
    else:
        netwithloss = BertNER(bert_net_cfg, True, num_labels=cfg.num_labels, dropout_prob=0.1)

    dataset = get_dataset(cfg.train_file, bert_net_cfg.batch_size, repeat_count=cfg.epoch_num)

    # optimizer: the first 10% of the scheduled steps are used for warm-up
    steps_per_epoch = dataset.get_dataset_size()
    total_steps = steps_per_epoch * cfg.epoch_num
    optim_cfg = bert_optimizer_cfg.AdamWeightDecayDynamicLR
    optimizer = AdamWeightDecayDynamicLR(netwithloss.trainable_params(),
                                         decay_steps=total_steps,
                                         learning_rate=optim_cfg.learning_rate,
                                         end_learning_rate=optim_cfg.end_learning_rate,
                                         power=optim_cfg.power,
                                         warmup_steps=int(total_steps * 0.1),
                                         weight_decay=optim_cfg.weight_decay,
                                         eps=optim_cfg.eps)

    # checkpoint saving callback
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix, directory=cfg.ckpt_dir, config=ckpt_config)

    # load the pre-trained checkpoint into the network
    param_dict = load_checkpoint(cfg.pre_training_ckpt)
    load_param_into_net(netwithloss, param_dict)

    # wrap the network with dynamic loss scaling and the optimizer, then train
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(cfg.epoch_num, dataset, callbacks=[LossMonitor(), ckpoint_cb], dataset_sink_mode=True)

### 7. 启动训练

In [9]:
# Start fine-tuning; the per-epoch loss is printed by the LossMonitor callback.
train()



epoch: 1 step 671, loss is 15.178092956542969
Epoch time: 369228.570, per step time: 550.266, avg loss: 15.178
************************************************************
epoch: 2 step 671, loss is 6.42706298828125
Epoch time: 40811.917, per step time: 60.823, avg loss: 6.427
************************************************************
epoch: 3 step 671, loss is 4.193878173828125
Epoch time: 40576.970, per step time: 60.472, avg loss: 4.194
************************************************************
epoch: 4 step 671, loss is 4.1775360107421875
Epoch time: 40547.132, per step time: 60.428, avg loss: 4.178
************************************************************
epoch: 5 step 671, loss is 1.4002304077148438
Epoch time: 40637.079, per step time: 60.562, avg loss: 1.400
************************************************************


### 8. 定义测试集评估函数

In [10]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate():
    """
    Evaluate the fine-tuned checkpoint on ``cfg.eval_file`` and print the scores.

    The network is rebuilt in inference mode, restored from
    ``cfg.finetune_ckpt`` and run batch by batch; the ``F1`` callback
    accumulates the TP / FP / FN counters from which precision, recall and
    F1 are printed. A ratio whose denominator is zero is reported as 0.0
    rather than failing after the whole evaluation pass has run.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")

    dataset = get_dataset(cfg.eval_file, bert_net_cfg.batch_size, repeat_count=1)
    eval_net = BertNER(bert_net_cfg, False, num_labels=len(tag_to_index), use_crf=cfg.use_crf,
                       tag_to_index=tag_to_index, dropout_prob=0.0)
    eval_net.set_train(False)
    param_dict = load_checkpoint(cfg.finetune_ckpt)
    load_param_into_net(eval_net, param_dict)
    model = Model(eval_net)

    callback = F1()
    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    for data in dataset.create_dict_iterator():
        input_ids, input_mask, token_type_id, label_ids = [Tensor(data[name]) for name in columns_list]
        logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
        callback.update(logits, label_ids)

    tp, fp, fn = callback.TP, callback.FP, callback.FN
    print("==============================================================")
    print("Precision {:.6f} ".format(_safe_ratio(tp, tp + fp)))
    print("Recall {:.6f} ".format(_safe_ratio(tp, tp + fn)))
    print("F1 {:.6f} ".format(_safe_ratio(2 * tp, 2 * tp + fp + fn)))
    print("==============================================================")

### 9. 启动测试集评估

In [11]:
# Score the fine-tuned checkpoint on cfg.eval_file and print precision / recall / F1.
evaluate()

Precision 0.919682 
Recall 0.954276 
F1 0.936659 


### 10. 定义在线推理函数

In [10]:
def inference(text):
    """
    Run named-entity recognition on a single piece of text.

    The network is rebuilt with a batch size of 1, restored from
    ``cfg.finetune_ckpt`` and handed to ``ner_process``. The shared
    ``bert_net_cfg.batch_size`` is only overridden while this function runs
    and is restored afterwards, so a later ``train()`` or ``evaluate()``
    call in the same session still sees the configured batch size.

    Args:
        text: the sentence to analyse.

    Returns:
        The result of ``ner_process`` for ``text``; it is also printed.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")

    saved_batch_size = bert_net_cfg.batch_size
    bert_net_cfg.batch_size = 1
    try:
        infer_net = BertNER(bert_net_cfg, False, num_labels=len(tag_to_index), use_crf=cfg.use_crf,
                            tag_to_index=tag_to_index, dropout_prob=0.0)
        infer_net.set_train(False)
        param_dict = load_checkpoint(cfg.finetune_ckpt)
        load_param_into_net(infer_net, param_dict)
        model = Model(infer_net)
        res = ner_process(model, text, bert_net_cfg.seq_length, cfg)
    finally:
        # Undo the temporary override even if building or running the network fails.
        bert_net_cfg.batch_size = saved_batch_size

    print("text", text)
    print("res:", res)

    return res

### 11. 在线推理测试

In [11]:
# Try the fine-tuned model on one sentence; inference() prints and returns the result.
inference("彭小军认为，国内银行现在走的是台湾的发卡模式，先通过跑马圈地再在圈的地里面选择客户")

text 彭小军认为，国内银行现在走的是台湾的发卡模式，先通过跑马圈地再在圈的地里面选择客户
res: {'name': {'彭小军': [[0, 2]]}, 'address': {'台湾': [[15, 16]]}}


{'name': {'彭小军': [[0, 2]]}, 'address': {'台湾': [[15, 16]]}}

### 12. 将模型checkpoint文件回传obs

In [12]:
# Copy the local ./ckpt/ directory (the saved checkpoints) back to OBS.
mox.file.copy_parallel(src_url="./ckpt/", dst_url="s3://ascend-zyjs-dcyang/nlp/bert_ner_notebook/ckpt/")