In [1]:
##采用经典的BiLstm-CRF网路来显示NER问题 具体参考https://aistudio.baidu.com/aistudio/projectdetail/1317771 是不是非常多神经网路都可以采用paddle俩构建呀

import paddle
import paddle.nn as nn

import paddlenlp
from paddlenlp.datasets import MapDataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.layers import LinearChainCrf, ViterbiDecoder, LinearChainCrfLoss
from paddlenlp.metrics import ChunkEvaluator

In [2]:
for i, line in enumerate(open('data/train.txt')):
    if 0 < i < 5:
        print ('%d: ' % i, line.split()[0])
        print ('   ', line.split()[1])

1:  16620200077宣荣嗣甘肃省白银市会宁县河畔镇十字街金海超市西行50米
    T-BT-IT-IT-IT-IT-IT-IT-IT-IT-IT-IP-BP-IP-IA1-BA1-IA1-IA2-BA2-IA2-IA3-BA3-IA3-IA4-BA4-IA4-IA4-IA4-IA4-IA4-IA4-IA4-IA4-IA4-IA4-IA4-IA4-IA4-I
2:  13552664307姜骏炜云南省德宏傣族景颇族自治州盈江县平原镇蜜回路下段
    T-BT-IT-IT-IT-IT-IT-IT-IT-IT-IT-IP-BP-IP-IA1-BA1-IA1-IA2-BA2-IA2-IA2-IA2-IA2-IA2-IA2-IA2-IA2-IA3-BA3-IA3-IA4-BA4-IA4-IA4-IA4-IA4-IA4-IA4-I
3:  内蒙古自治区赤峰市阿鲁科尔沁旗汉林西街路南13701085390那峥
    A1-BA1-IA1-IA1-IA1-IA1-IA2-BA2-IA2-IA3-BA3-IA3-IA3-IA3-IA3-IA4-BA4-IA4-IA4-IA4-IA4-IT-BT-IT-IT-IT-IT-IT-IT-IT-IT-IT-IP-BP-I
4:  广东省梅州市大埔县茶阳镇胜利路13601328173张铱
    A1-BA1-IA1-IA2-BA2-IA2-IA3-BA3-IA3-IA4-BA4-IA4-IA4-IA4-IA4-IT-BT-IT-IT-IT-IT-IT-IT-IT-IT-IT-IP-BP-I


In [3]:
def convert_tokens_to_ids(tokens, vocab, oov_token=None):
    token_ids = []
    oov_id = vocab.get(oov_token) if oov_token else None
    for token in tokens:
        token_id = vocab.get(token, oov_id)
        token_ids.append(token_id)
    return token_ids


def load_dict(dict_path):
    vocab = {}
    i = 0
    for line in open(dict_path, 'r', encoding='utf-8'):
        key = line.strip('\n')
        vocab[key] = i
        i += 1
    return vocab


def load_dataset(datafiles):
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)
            for line in fp.readlines():
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                yield words, labels

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]

train_ds, dev_ds, test_ds = load_dataset(datafiles=('data/train.txt', 'data/dev.txt', 'data/test.txt'))

label_vocab = load_dict('./data/tag.dic')
word_vocab = load_dict('./data/word.dic')

def convert_example(example):
        tokens, labels = example
        token_ids = convert_tokens_to_ids(tokens, word_vocab, 'OOV')
        label_ids = convert_tokens_to_ids(labels, label_vocab, 'O')
        return token_ids, len(token_ids), label_ids

train_ds.map(convert_example)
dev_ds.map(convert_example)
test_ds.map(convert_example)

<paddlenlp.datasets.dataset.MapDataset at 0x7fd040d52d30>

In [4]:
batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=word_vocab.get('OOV')),  # token_ids
        Stack(),  # seq_len
        Pad(axis=0, pad_val=label_vocab.get('O'))  # label_ids
    ): fn(samples)

train_loader = paddle.io.DataLoader(
        dataset=train_ds,
        batch_size=32,
        shuffle=True,
        drop_last=True,
        return_list=True,
        collate_fn=batchify_fn)

dev_loader = paddle.io.DataLoader(
        dataset=dev_ds,
        batch_size=32,
        drop_last=True,
        return_list=True,
        collate_fn=batchify_fn)

test_loader = paddle.io.DataLoader(
        dataset=test_ds,
        batch_size=32,
        drop_last=True,
        return_list=True,
        collate_fn=batchify_fn)

In [8]:
class BiGRUWithCRF(nn.Layer):
    def __init__(self,
                 emb_size,
                 hidden_size,
                 word_num,
                 label_num,
                 use_w2v_emb=False):
        super(BiGRUWithCRF, self).__init__()
        if use_w2v_emb:
            self.word_emb = TokenEmbedding(
                extended_vocab_path='./conf/word.dic', unknown_token='OOV')
        else:
            self.word_emb = nn.Embedding(word_num, emb_size)
        self.gru = nn.GRU(emb_size,
                          hidden_size,
                          num_layers=2,
                          direction='bidirectional')
        self.fc = nn.Linear(hidden_size * 2, label_num + 2)  # BOS EOS
        self.crf = LinearChainCrf(label_num)
        self.decoder = ViterbiDecoder(self.crf.transitions)

    def forward(self, x, lens):
        embs = self.word_emb(x)
        output, _ = self.gru(embs)
        output = self.fc(output)
        _, pred = self.decoder(output, lens)
        return output, lens, pred

# Define the model netword and its loss
network = BiGRUWithCRF(300, 300, len(word_vocab), len(label_vocab))
model = paddle.Model(network)

In [9]:
optimizer = paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters())
crf_loss = LinearChainCrfLoss(network.crf)
chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
model.prepare(optimizer, crf_loss, chunk_evaluator)

In [10]:
model.fit(train_data=train_loader,
              eval_data=dev_loader,
              epochs=10,
              save_dir='./results',
              log_freq=1)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10


  format(lhs_dtype, rhs_dtype, lhs_dtype))
  format(lhs_dtype, rhs_dtype, lhs_dtype))
  format(lhs_dtype, rhs_dtype, lhs_dtype))


step  1/50 - loss: 134.0855 - precision: 0.0000e+00 - recall: 0.0000e+00 - f1: 0.0000e+00 - 1s/step
step  2/50 - loss: 95.3569 - precision: 0.0010 - recall: 0.0052 - f1: 0.0017 - 616ms/step
step  3/50 - loss: 78.4333 - precision: 0.0010 - recall: 0.0035 - f1: 0.0016 - 482ms/step
step  4/50 - loss: 68.1493 - precision: 0.0067 - recall: 0.0235 - f1: 0.0104 - 412ms/step
step  5/50 - loss: 69.8198 - precision: 0.0065 - recall: 0.0188 - f1: 0.0097 - 371ms/step
step  6/50 - loss: 74.5616 - precision: 0.0058 - recall: 0.0157 - f1: 0.0085 - 340ms/step
step  7/50 - loss: 77.7513 - precision: 0.0050 - recall: 0.0135 - f1: 0.0073 - 318ms/step
step  8/50 - loss: 61.9048 - precision: 0.0045 - recall: 0.0118 - f1: 0.0065 - 304ms/step
step  9/50 - loss: 68.4771 - precision: 0.0050 - recall: 0.0122 - f1: 0.0071 - 290ms/step
step 10/50 - loss: 71.8228 - precision: 0.0051 - recall: 0.0115 - f1: 0.0071 - 279ms/step
step 11/50 - loss: 56.4123 - precision: 0.0052 - recall: 0.0109 - f1: 0.0070 - 269ms/step


step 36/50 - loss: 1.3680 - precision: 0.7949 - recall: 0.8430 - f1: 0.8182 - 168ms/step
step 37/50 - loss: 18.2399 - precision: 0.7974 - recall: 0.8453 - f1: 0.8207 - 167ms/step
step 38/50 - loss: 1.3027 - precision: 0.8001 - recall: 0.8472 - f1: 0.8230 - 166ms/step
step 39/50 - loss: 4.1390 - precision: 0.8021 - recall: 0.8493 - f1: 0.8250 - 165ms/step
step 40/50 - loss: 1.3524 - precision: 0.8035 - recall: 0.8502 - f1: 0.8262 - 165ms/step
step 41/50 - loss: 1.0450 - precision: 0.8073 - recall: 0.8531 - f1: 0.8295 - 165ms/step
step 42/50 - loss: 5.7773 - precision: 0.8089 - recall: 0.8546 - f1: 0.8311 - 165ms/step
step 43/50 - loss: 3.1885 - precision: 0.8118 - recall: 0.8571 - f1: 0.8339 - 164ms/step
step 44/50 - loss: 31.3568 - precision: 0.8142 - recall: 0.8590 - f1: 0.8360 - 163ms/step
step 45/50 - loss: 1.3257 - precision: 0.8156 - recall: 0.8603 - f1: 0.8374 - 164ms/step
step 46/50 - loss: 0.3673 - precision: 0.8180 - recall: 0.8624 - f1: 0.8396 - 163ms/step
step 47/50 - loss: 

step 14/50 - loss: 1.1509 - precision: 0.9793 - recall: 0.9877 - f1: 0.9835 - 155ms/step
step 15/50 - loss: 0.6179 - precision: 0.9776 - recall: 0.9864 - f1: 0.9820 - 154ms/step
step 16/50 - loss: 2.1591 - precision: 0.9780 - recall: 0.9863 - f1: 0.9821 - 155ms/step
step 17/50 - loss: 0.0892 - precision: 0.9775 - recall: 0.9856 - f1: 0.9815 - 155ms/step
step 18/50 - loss: 0.9288 - precision: 0.9778 - recall: 0.9855 - f1: 0.9817 - 154ms/step
step 19/50 - loss: 0.9647 - precision: 0.9782 - recall: 0.9860 - f1: 0.9821 - 154ms/step
step 20/50 - loss: 0.9095 - precision: 0.9777 - recall: 0.9854 - f1: 0.9815 - 154ms/step
step 21/50 - loss: 0.7870 - precision: 0.9788 - recall: 0.9861 - f1: 0.9824 - 154ms/step
step 22/50 - loss: 1.2043 - precision: 0.9786 - recall: 0.9862 - f1: 0.9824 - 153ms/step
step 23/50 - loss: 0.5706 - precision: 0.9766 - recall: 0.9848 - f1: 0.9806 - 154ms/step
step 24/50 - loss: 0.0563 - precision: 0.9767 - recall: 0.9845 - f1: 0.9806 - 154ms/step
step 25/50 - loss: 0.

step 49/50 - loss: 2.0225 - precision: 0.9878 - recall: 0.9936 - f1: 0.9907 - 149ms/step
step 50/50 - loss: 0.2247 - precision: 0.9876 - recall: 0.9934 - f1: 0.9905 - 148ms/step
save checkpoint at /home/gaojing/PTM/PaddlePaddleNLPRace/task/task_ner/results/4
Eval begin...
step 1/6 - loss: 0.4811 - precision: 0.9583 - recall: 0.9787 - f1: 0.9684 - 111ms/step
step 2/6 - loss: -0.0000e+00 - precision: 0.9714 - recall: 0.9816 - f1: 0.9764 - 104ms/step
step 3/6 - loss: 0.6942 - precision: 0.9705 - recall: 0.9790 - f1: 0.9747 - 107ms/step
step 4/6 - loss: 0.8035 - precision: 0.9650 - recall: 0.9764 - f1: 0.9706 - 100ms/step
step 5/6 - loss: 2.3089 - precision: 0.9658 - recall: 0.9769 - f1: 0.9713 - 98ms/step
step 6/6 - loss: 3.6132 - precision: 0.9655 - recall: 0.9773 - f1: 0.9714 - 98ms/step
Eval samples: 192
Epoch 6/10
step  1/50 - loss: -0.0000e+00 - precision: 1.0000 - recall: 1.0000 - f1: 1.0000 - 175ms/step
step  2/50 - loss: 0.6925 - precision: 1.0000 - recall: 1.0000 - f1: 1.0000 - 1

step 26/50 - loss: -0.0000e+00 - precision: 0.9956 - recall: 0.9968 - f1: 0.9962 - 154ms/step
step 27/50 - loss: 0.2499 - precision: 0.9957 - recall: 0.9969 - f1: 0.9963 - 155ms/step
step 28/50 - loss: 0.1835 - precision: 0.9959 - recall: 0.9970 - f1: 0.9965 - 155ms/step
step 29/50 - loss: 0.1949 - precision: 0.9950 - recall: 0.9962 - f1: 0.9956 - 154ms/step
step 30/50 - loss: 0.6893 - precision: 0.9944 - recall: 0.9958 - f1: 0.9951 - 153ms/step
step 31/50 - loss: 1.6395 - precision: 0.9943 - recall: 0.9958 - f1: 0.9950 - 153ms/step
step 32/50 - loss: 0.1832 - precision: 0.9941 - recall: 0.9956 - f1: 0.9949 - 153ms/step
step 33/50 - loss: 2.3712 - precision: 0.9938 - recall: 0.9951 - f1: 0.9945 - 153ms/step
step 34/50 - loss: -0.0000e+00 - precision: 0.9940 - recall: 0.9952 - f1: 0.9946 - 153ms/step
step 35/50 - loss: 0.4993 - precision: 0.9937 - recall: 0.9952 - f1: 0.9945 - 153ms/step
step 36/50 - loss: -0.0000e+00 - precision: 0.9928 - recall: 0.9951 - f1: 0.9939 - 153ms/step
step 3

step  3/50 - loss: 0.0161 - precision: 0.9896 - recall: 0.9930 - f1: 0.9913 - 179ms/step
step  4/50 - loss: 0.6595 - precision: 0.9922 - recall: 0.9948 - f1: 0.9935 - 177ms/step
step  5/50 - loss: 0.0699 - precision: 0.9886 - recall: 0.9937 - f1: 0.9911 - 173ms/step
step  6/50 - loss: -0.0000e+00 - precision: 0.9905 - recall: 0.9948 - f1: 0.9926 - 168ms/step
step  7/50 - loss: 0.4240 - precision: 0.9881 - recall: 0.9940 - f1: 0.9911 - 168ms/step
step  8/50 - loss: -0.0000e+00 - precision: 0.9896 - recall: 0.9948 - f1: 0.9922 - 166ms/step
step  9/50 - loss: -0.0000e+00 - precision: 0.9908 - recall: 0.9954 - f1: 0.9931 - 166ms/step
step 10/50 - loss: 0.0061 - precision: 0.9901 - recall: 0.9937 - f1: 0.9919 - 162ms/step
step 11/50 - loss: 0.0655 - precision: 0.9910 - recall: 0.9943 - f1: 0.9926 - 160ms/step
step 12/50 - loss: 1.4427 - precision: 0.9917 - recall: 0.9948 - f1: 0.9933 - 158ms/step
step 13/50 - loss: -0.0000e+00 - precision: 0.9912 - recall: 0.9948 - f1: 0.9930 - 156ms/step
s

step 37/50 - loss: -0.0000e+00 - precision: 0.9917 - recall: 0.9949 - f1: 0.9933 - 148ms/step
step 38/50 - loss: -0.0000e+00 - precision: 0.9912 - recall: 0.9948 - f1: 0.9930 - 148ms/step
step 39/50 - loss: -0.0000e+00 - precision: 0.9909 - recall: 0.9946 - f1: 0.9928 - 148ms/step
step 40/50 - loss: -0.0000e+00 - precision: 0.9909 - recall: 0.9946 - f1: 0.9927 - 148ms/step
step 41/50 - loss: 0.1544 - precision: 0.9911 - recall: 0.9948 - f1: 0.9929 - 148ms/step
step 42/50 - loss: -0.0000e+00 - precision: 0.9911 - recall: 0.9948 - f1: 0.9929 - 149ms/step
step 43/50 - loss: 1.2947 - precision: 0.9913 - recall: 0.9949 - f1: 0.9931 - 149ms/step
step 44/50 - loss: -0.0000e+00 - precision: 0.9915 - recall: 0.9950 - f1: 0.9932 - 149ms/step
step 45/50 - loss: -0.0000e+00 - precision: 0.9917 - recall: 0.9951 - f1: 0.9934 - 149ms/step
step 46/50 - loss: -0.0000e+00 - precision: 0.9914 - recall: 0.9950 - f1: 0.9932 - 150ms/step
step 47/50 - loss: 1.2616 - precision: 0.9916 - recall: 0.9951 - f1: 0

In [11]:
model.evaluate(eval_data=test_loader, log_freq=1)

Eval begin...
step 1/6 - loss: 1.2617 - precision: 1.0000 - recall: 1.0000 - f1: 1.0000 - 156ms/step
step 2/6 - loss: -0.0000e+00 - precision: 0.9844 - recall: 0.9869 - f1: 0.9857 - 145ms/step
step 3/6 - loss: 16.5506 - precision: 0.9721 - recall: 0.9738 - f1: 0.9729 - 138ms/step
step 4/6 - loss: 1.3364 - precision: 0.9765 - recall: 0.9777 - f1: 0.9771 - 131ms/step
step 5/6 - loss: 1.3289 - precision: 0.9749 - recall: 0.9770 - f1: 0.9760 - 128ms/step
step 6/6 - loss: 0.3492 - precision: 0.9696 - recall: 0.9738 - f1: 0.9717 - 129ms/step
Eval samples: 192


{'loss': [0.34924316],
 'precision': 0.9695916594265855,
 'recall': 0.9738219895287958,
 'f1': 0.9717022202873312}

In [12]:
def parse_decodes(ds, decodes, lens, label_vocab):
    decodes = [x for batch in decodes for x in batch]
    lens = [x for batch in lens for x in batch]
    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))

    outputs = []
    for idx, end in enumerate(lens):
        sent = ds.data[idx][0][:end]
        tags = [id_label[x] for x in decodes[idx][:end]]
        sent_out = []
        tags_out = []
        words = ""
        for s, t in zip(sent, tags):
            if t.endswith('-B') or t == 'O':
                if len(words):
                    sent_out.append(words)
                tags_out.append(t.split('-')[0])
                words = s
            else:
                words += s
        if len(sent_out) < len(tags_out):
            sent_out.append(words)
        outputs.append(''.join(
            [str((s, t)) for s, t in zip(sent_out, tags_out)]))
    return outputs

In [13]:
outputs, lens, decodes = model.predict(test_data=test_loader)
preds = parse_decodes(test_ds, decodes, lens, label_vocab)

Predict begin...

  format(lhs_dtype, rhs_dtype, lhs_dtype))
  format(lhs_dtype, rhs_dtype, lhs_dtype))
  format(lhs_dtype, rhs_dtype, lhs_dtype))


Predict samples: 192


In [14]:
print('\n'.join(preds[:5]))

('黑龙江省', 'A1')('双鸭山市', 'A2')('尖山区', 'A3')('八马路与东平行路交叉口北40米', 'A4')('韦业涛', 'P')('18600009172', 'T')
('广西壮族自治区', 'A1')('桂林市', 'A2')('雁山区', 'A3')('雁山镇西龙村老年活动中心', 'A4')('17610348888', 'T')('羊卓卫', 'P')
('15652864561', 'T')('河南省', 'A1')('开封市', 'A2')('顺河回族区', 'A3')('顺河区公园路32号', 'A4')('赵本山', 'P')
('河北省', 'A1')('唐山市', 'A2')('玉田县', 'A3')('无终大街159号', 'A4')('18614253058', 'T')('尚汉生', 'P')
('台湾', 'A1')('台中市', 'A2')('北区', 'A3')('北区锦新街18号', 'A4')('18511226708', 'T')('蓟丽', 'P')
