In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import pandas as pd
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from bert4keras.snippets import text_segmentate
from keras.models import Model
from tqdm import tqdm
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# Load the raw train / test JSON. The context managers close each handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open('./round1_train_0907.json', encoding='utf-8') as train_file:
    train_data = json.load(train_file)
with open('./round1_test_0907.json', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

# Flatten the nested structure into one record per (document, annotation)
# pair. Assumes the top-level JSON value is a list of documents, each carrying
# 'id', 'text' and an 'annotations' list of {'Q': ..., 'A': ...} entries.
all_data_list = [
    {
        'id': document['id'],
        'text': document['text'],
        'question': annotation['Q'],
        'answer': annotation['A'],
    }
    for document in train_data
    for annotation in document['annotations']
]

# Column-wise views of the records, kept under their original names in case
# later cells refer to them.
id_list = [record['id'] for record in all_data_list]
text_list = [record['text'] for record in all_data_list]
question_list = [record['question'] for record in all_data_list]
answer_list = [record['answer'] for record in all_data_list]

# Build the frame once from the records. Passing `columns` pins the column
# order and keeps the frame well-formed even when there are no records.
# NOTE: `train_data` is rebound here from the parsed JSON to a DataFrame.
train_data = pd.DataFrame(
    all_data_list, columns=["id", "text", "question", "answer"]
)
print(train_data.shape)



(18478, 4)


In [3]:
def text_segmentate(text, maxlen, seps='\n', strips=None):
    """Split a text into shorter pieces along separator characters.

    The text is first stripped of surrounding whitespace and then of any
    characters listed in ``strips``. If it is still longer than ``maxlen`` and
    separators remain, it is split on the first separator in ``seps``; the
    pieces are greedily re-joined (keeping the separator) and every joined
    chunk is processed recursively with the remaining separators.

    Args:
        text: The string to split.
        maxlen: Length above which a text is split further.
        seps: Separator characters, tried in order from first to last.
        strips: Characters passed to ``str.strip`` after whitespace stripping;
            ``None`` strips whitespace only.

    Returns:
        A list of strings. A chunk can still exceed ``maxlen`` once the
        separators are exhausted.
    """
    text = text.strip().strip(strips)

    # Nothing left to split on, or the text already fits: return it whole.
    if not seps or len(text) <= maxlen:
        return [text]

    separator, remaining_seps = seps[0], seps[1:]
    pieces = text.split(separator)
    last_index = len(pieces) - 1

    segments, buffer = [], ''
    for index, piece in enumerate(pieces):
        # Flush the buffer before it would outgrow the budget.
        if buffer and piece and len(buffer) + len(piece) > maxlen - 1:
            segments.extend(
                text_segmentate(buffer, maxlen, remaining_seps, strips)
            )
            buffer = ''
        # Every piece except the last gets its separator back.
        if index == last_index:
            buffer = buffer + piece
        else:
            buffer = buffer + piece + separator

    if buffer:
        segments.extend(text_segmentate(buffer, maxlen, remaining_seps, strips))
    return segments

# Basic parameters
max_p_len = 432
max_q_len = 16
max_a_len = 64
batch_size = 4
epochs = 40

# BERT configuration
config_path = 'D:/NLP/Bert_model/tensorflow/chinese_roberta_wwm_ext/bert_config.json'
checkpoint_path = 'D:/NLP/Bert_model/tensorflow/chinese_roberta_wwm_ext/bert_model.ckpt'
dict_path = 'D:/NLP/Bert_model/tensorflow/chinese_roberta_wwm_ext/vocab.txt'

# Filter the data: split each passage into segments and keep only the
# (segment, question, answer) triples whose segment literally contains the
# answer. Rows with an empty answer are skipped.
seps, strips = u'\n。！？!?；;，, ', u'；;，, '
data = []

# Walk the three columns in lockstep instead of doing a chained
# `train_data[col][idx]` lookup for every access.
for passage, question, answer in zip(
    train_data['text'], train_data['question'], train_data['answer']
):
    if not answer:
        continue
    for t in text_segmentate(passage, max_p_len - 2, seps, strips):
        if answer in t:
            data.append((t, question, answer))

# Shuffle the sample indices and persist the order. The context manager makes
# sure the file is flushed and closed; the original handed an anonymous handle
# to json.dump and never closed it.
random_order = list(range(len(data)))
np.random.shuffle(random_order)
with open('../random_order.json', 'w') as order_file:
    json.dump(random_order, order_file, indent=4)

# Split off the validation set: every 10th shuffled sample goes to valid.
# NOTE: this rebinds `train_data` from the DataFrame to a list of tuples, so
# the cell cannot be re-run without first re-running the data-loading cell.
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)

In [5]:
class data_generator(DataGenerator):
    """Data generator yielding padded token-id / segment-id batches."""

    def __iter__(self, random=False):
        """Yield ``([token_ids, segment_ids], None)`` batches.

        Single sample format: [CLS] passage [SEP] answer [SEP] question [SEP]

        Each sample drawn from ``self.sample(random)`` is unpacked as
        ``(passage, question, answer)``. The passage tokens get segment id 0
        and the answer + question tokens get segment id 1.
        """
        token_rows, segment_rows = [], []
        for is_end, (passage, question, answer) in self.sample(random):
            passage_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1)
            answer_ids, _ = tokenizer.encode(answer, maxlen=max_a_len)
            question_ids, _ = tokenizer.encode(question, maxlen=max_q_len)

            # Drop the first token of the answer / question encodings
            # (presumably the leading [CLS]) before appending to the passage.
            target_ids = answer_ids[1:] + question_ids[1:]

            token_rows.append(passage_ids + target_ids)
            segment_rows.append(
                [0] * len(passage_ids) + [1] * len(target_ids)
            )

            # Emit a batch once it is full or the data is exhausted.
            if is_end or len(token_rows) == self.batch_size:
                padded_tokens = sequence_padding(token_rows)
                padded_segments = sequence_padding(segment_rows)
                yield [padded_tokens, padded_segments], None
                token_rows, segment_rows = [], []

class CrossEntropy(Loss):
    """Cross-entropy loss that masks out the input part of the sequence."""

    def compute_loss(self, inputs, mask=None):
        """Return the masked mean cross-entropy.

        ``inputs`` is unpacked as ``(token_ids, segment_ids, predictions)``.
        Targets and predictions are offset by one position so that position
        ``t`` is scored against token ``t + 1``; the segment ids serve as the
        per-token weights of the average.
        """
        token_ids, segment_ids, predictions = inputs

        targets = token_ids[:, 1:]        # target token ids
        weights = segment_ids[:, 1:]      # segment ids mark the part to predict
        shifted = predictions[:, :-1]     # predictions, shifted by one position

        per_token = K.sparse_categorical_crossentropy(targets, shifted)
        weighted_total = K.sum(per_token * weights)
        return weighted_total / K.sum(weights)

# Build the pretrained transformer with application='unilm'.
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
)

# The loss layer receives the model inputs followed by its outputs.
output = CrossEntropy(2)(model.inputs + model.outputs)

# Re-wrap so that the loss-layer output becomes the model output; compile is
# called with an optimizer only, no separate `loss` argument.
model = Model(inputs=model.inputs, outputs=output)
model.compile(optimizer=Adam(1e-5))


  'be expecting any data to be passed to {0}.'.format(name))


In [8]:
# Preview only the first few validation samples; displaying the whole list
# floods the notebook output.
valid_data[:3]

[('京华时报(微博)讯(记者叶洲)夏天，很多人食欲随气温升高逐渐减退，总喜欢吃些凉的、刺激的，结果往往吃坏了肠胃。北京中医药大学教授杜金行建议，夏季养胃，不妨试试红薯、南瓜和圆白菜，这三种跟交通灯一样颜色的蔬菜是最好的养胃菜。红薯可以养胃，其富含的膳食纤维能消食化积，增加食欲。但红薯能促进胃酸分泌，平时胃酸过多，常感觉反酸、烧心的人不宜吃。南瓜细软容易吸收，有很好的养胃功效，它富含的碳水化合物、果胶可以保护胃部免受刺激。圆白菜被誉为天然“胃菜”，生津止渴助消化，能够促进胃液分泌，保护胃黏膜。胃不好的人，尤其是患有胃溃疡、十二指肠溃疡的人，可以将圆白菜榨汁饮用，每天一杯，还可以加入少量蜂蜜食用，对促进溃疡愈合有非常好的帮助。专家表示，胃最怕寒，平时可以多做一些如打太极、步行、慢跑等舒缓的运动，这些运动能促进血液循环，使全身处于温暖的状态，对养胃很有好处。',
  '圆白菜被誉为什么？',
  '圆白菜被誉为天然“胃菜”'),
 ('据外媒报道，美国科学家研究指出，矮个子男士体内的限制身高的基因，可能与长寿基因有关。该研究还指出，身高在5英尺2英寸(约157厘米)以下的矮个男士寿命最长。据悉，研究人员针对8000名在1900年至1919年出生的美国日裔男性调查发现，身高在5英尺2英寸(约157厘米)或以下的男子寿命最长，身高愈高则愈短命，研究对象中有1200人活到90岁以上，250人至今依然健在。夏威夷大学追踪这批日裔男性多年，并把他们按身高分成两组，5英尺2英寸及以下和5英尺4英寸及以上，其间密切跟进他们的生活模式及健康状况。结果5英尺2英寸及以下的组别最长寿，而愈高的则愈早死。带领研究的威尔科克斯解释，个子矮的男士体内均有名为“FOXO3”的基因，导致发育初期体形偏小，相信这与长寿有关。这类男士血液内胰岛素含量也较低，降低患癌机会，相信也是导致他们活得较久的原因。',
  '个子矮的男士体内均有名为“FOXO3”的基因有什么影响？',
  '个子矮的男士体内均有名为“FOXO3”的基因，导致发育初期体形偏小，相信这与长寿有关。'),
 ('中外学者通过全基因组关联研究(GWAS)，鉴定出了国人群中喉鳞状细胞癌(LSCC)的3个易感基因位点。研究结果发表在近日出版的国际权威学术刊物《自然·遗传学》杂志上。该研究由解放军总医院耳鼻咽喉头颈外科刘明波、王嘉陵、杨仕明团队

In [7]:
# Number of samples held out for validation.
len(valid_data)

1814