In [1]:
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime
import os,sys
import jieba
import pickle
import pandas as pd
sys.path.append('../')

In [2]:
import json
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.backend import multilabel_categorical_crossentropy
from bert4keras.layers import GlobalPointer
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, to_array
from keras.models import Model
from tqdm import tqdm

Using TensorFlow backend.


In [3]:
maxlen = 256
epochs = 1 #10
batch_size = 16
learning_rate = 2e-5
categories = set()

# bert配置
config_path = './base_model/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './base_model/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './base_model/chinese_L-12_H-768_A-12/vocab.txt'

In [4]:
def load_data(filename):
    """加载数据
    单条格式：[text, (start, end, label), (start, end, label), ...]，
              意味着text[start:end + 1]是类型为label的实体。
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in tqdm(f.split('\n\n')):
            if not l:
                continue
            d = ['']
            for i, c in enumerate(l.split('\n')):
                segs = c.split(' ')
                if len(segs) != 2:
                    segs = segs[1:]
                    continue
                char, flag = segs
                d[0] += char
                if flag[0] == 'B':
                    d.append([i, i, flag[2:]])
                    categories.add(flag[2:])
                elif flag[0] == 'I':
                    d[-1][1] = i
            D.append(d)
    return D


# 标注数据
# train_data = load_data('./data/paperdaily_data/example.train')
# valid_data= load_data('./data/paperdaily_data/example.dev')
# test_data = load_data('./data/paperdaily_data/example.test')

train_data = load_data('./data/train_data/train.txt')
valid_data= load_data('./data/train_data/train.txt')
test_data = load_data('./data/train_data/train.txt')

categories = list(sorted(categories))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40001/40001 [00:01<00:00, 23110.29it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40001/40001 [00:01<00:00, 25507.70it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40001/40001 [00:01<00:00, 28714.94it/s]


In [5]:
train_data[:1]

[['手机三脚架网红直播支架桌面自拍杆蓝牙遥控三脚架摄影拍摄拍照抖音看电视神器三角架便携伸缩懒人户外支撑架【女神粉】自带三脚架+蓝牙遥控',
  [0, 1, '40'],
  [2, 4, '4'],
  [5, 6, '14'],
  [7, 8, '5'],
  [9, 10, '4'],
  [11, 12, '7'],
  [13, 15, '4'],
  [16, 17, '11'],
  [18, 19, '11'],
  [20, 22, '4'],
  [23, 24, '5'],
  [25, 26, '5'],
  [27, 28, '5'],
  [29, 30, '13'],
  [31, 35, '4'],
  [36, 38, '4'],
  [39, 40, '11'],
  [41, 42, '11'],
  [43, 44, '8'],
  [45, 46, '7'],
  [47, 49, '4'],
  [52, 54, '16'],
  [58, 60, '4'],
  [62, 63, '11'],
  [64, 65, '11']]]

In [6]:
# 建立分词器
tokenizer = Tokenizer(dict_path, do_lower_case=True)

In [7]:

class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, d in self.sample(random):
            tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
            mapping = tokenizer.rematch(d[0], tokens)
            start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
            end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
            token_ids = tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            labels = np.zeros((len(categories), maxlen, maxlen))
            for start, end, label in d[1:]:
                if start in start_mapping and end in end_mapping:
                    start = start_mapping[start]
                    end = end_mapping[end]
                    label = categories.index(label)
                    labels[label, start, end] = 1
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels[:, :len(token_ids), :len(token_ids)])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels, seq_dims=3)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


In [8]:

def global_pointer_crossentropy(y_true, y_pred):
    """给GlobalPointer设计的交叉熵
    """
    bh = K.prod(K.shape(y_pred)[:2])
    y_true = K.reshape(y_true, (bh, -1))
    y_pred = K.reshape(y_pred, (bh, -1))
    return K.mean(multilabel_categorical_crossentropy(y_true, y_pred))


def global_pointer_f1_score(y_true, y_pred):
    """给GlobalPointer设计的F1
    """
    y_pred = K.cast(K.greater(y_pred, 0), K.floatx())
    return 2 * K.sum(y_true * y_pred) / K.sum(y_true + y_pred)

In [None]:
model = build_transformer_model(config_path, checkpoint_path)
output = GlobalPointer(len(categories), 64)(model.output)

In [None]:
model = Model(model.input, output)
model.summary()

In [None]:

model.compile(
    loss=global_pointer_crossentropy,
    optimizer=Adam(learning_rate),
    metrics=[global_pointer_f1_score] 
)

In [None]:

class NamedEntityRecognizer(object):
    """命名实体识别器
    """
    def recognize(self, text, threshold=0):
        tokens = tokenizer.tokenize(text, maxlen=512)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        token_ids, segment_ids = to_array([token_ids], [segment_ids])
        scores = model.predict([token_ids, segment_ids])[0]
        scores[:, [0, -1]] -= np.inf
        scores[:, :, [0, -1]] -= np.inf
        entities = []
        for l, start, end in zip(*np.where(scores > threshold)):
            entities.append(
                (mapping[start][0], mapping[end][-1], categories[l])
            )
        return entities

NER = NamedEntityRecognizer()

In [None]:

def evaluate(data):
    """评测函数
    """
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for d in tqdm(data, ncols=100):
        R = set(NER.recognize(d[0]))
        T = set([tuple(i) for i in d[1:]])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall

In [None]:
class Evaluator(keras.callbacks.Callback):
    """评估与保存
    """
    def __init__(self):
        self.best_val_f1 = 0

    def on_epoch_end(self, epoch, logs=None):
        datestr = datetime.now().strftime(format='%Y-%m-%d-%H')
        f1, precision, recall = evaluate(valid_data)
        # 保存最优
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights(f'./model/best_model_peopledaily_globalpointer_{datestr}.weights')
        print(
            'valid:  f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )
        f1, precision, recall = evaluate(test_data)
        print(
            'test:  f1: %.5f, precision: %.5f, recall: %.5f\n' %
            (f1, precision, recall)
        )

In [None]:
# train
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)

model.fit(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    callbacks=[evaluator]
)

In [None]:
100 % 10