In [9]:
import tensorflow as tf
import tensorflow_hub as hub
import logging
import collections

# 特别注意，这个是bert的tokenization包，而不是pip安装得到的，具体可以参见bert的官方文档
# https://github.com/google-research/bert
#　https://github.com/google-research/bert/blob/master/tokenization.py

import tokenization

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import jieba  # 中文分词库，百度员工开发
import matplotlib.pyplot as plt 
import time
import os


np.set_printoptions(precision=4, suppress=True)  # 设置np显示数值而不是科学计数法
# 查询系统可用的 GPU
physical_devices = tf.config.experimental.list_physical_devices('GPU')
# 确保有可用的 GPU 如果没有, 则会报错
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# 设置参数,该段务必在运行jupyter的第一段代码执行，否则会无法初始化成功
# 仅在需要时申请显存空间（程序初始运行时消耗很少的显存，随着程序的运行而动态申请显存）
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [10]:
path = '/data/python/tensorflow/data/'

In [48]:
train_df = pd.read_csv(path + 'train_20200228.csv')
val_df = pd.read_csv(path + 'dev_20200228.csv')

In [54]:
train_df[['category','label']].values

array([['咳血', 1],
       ['咳血', 1],
       ['咳血', 0],
       ...,
       ['哮喘', 0],
       ['哮喘', 0],
       ['哮喘', 0]], dtype=object)

In [55]:
for i, val in enumerate(train_df[['category','label']].values):
    print(i,'|', val)
    break

0 | ['咳血' 1]


In [12]:
train_df.head()

Unnamed: 0,id,category,query1,query2,label
0,0,咳血,"剧烈运动后咯血,是怎么了?",剧烈运动后咯血是什么原因？,1
1,1,咳血,"剧烈运动后咯血,是怎么了?",剧烈运动后为什么会咯血？,1
2,2,咳血,"剧烈运动后咯血,是怎么了?",剧烈运动后咯血，应该怎么处理？,0
3,3,咳血,"剧烈运动后咯血,是怎么了?",剧烈运动后咯血，需要就医吗？,0
4,4,咳血,"剧烈运动后咯血,是怎么了?",剧烈运动后咯血，是否很严重？,0


In [13]:
train_df.category.unique()

array(['咳血', '支原体肺炎', '胸膜炎', '肺气肿', '肺炎', '感冒', '上呼吸道感染', '哮喘'],
      dtype=object)

In [14]:
val_df.head()

Unnamed: 0,id,category,query1,query2,label
0,0,咳血,请问呕血与咯血有什么区别？,请问呕血与咯血这两者之间有什么区别？,1
1,1,咳血,请问呕血与咯血有什么区别？,请问呕血与咯血异同？,1
2,2,咳血,请问呕血与咯血有什么区别？,请问呕血与咯血怎么治疗？,0
3,3,咳血,请问呕血与咯血有什么区别？,请问呕血与咯血是什么原因导致的？,0
4,4,咳血,请问呕血与咯血有什么区别？,请问呕血与咯血与其他疾病有关联吗？,0


In [15]:
val_df.category.unique()

array(['咳血', '支原体肺炎', '胸膜炎', '肺气肿', '肺炎', '感冒', '上呼吸道感染', '哮喘'],
      dtype=object)

### 转换为bert 支持的数据格式
- token_id
- input_mask
- segment_id

query1 和 query2 合并成一个句子, 需要三个向量，cls向量，seq向量和seq向量

[cls] query1 [seq] query2 [seq]

In [17]:
# 导入一个字典  
tokenizer= tokenization.FullTokenizer(vocab_file =path + 'bert_zh_L-12_H-768_A-12_2/assets/vocab.txt',do_lower_case=True)

In [18]:
# 测试一下导入的token的方法调用，将句子分割为一个个的字
tokenizer.tokenize("庆祝中国共产党建党100周年！")

['庆', '祝', '中', '国', '共', '产', '党', '建', '党', '100', '周', '年', '！']

In [19]:
a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("庆祝中国共产党建党100周年！"))
a 

[2412, 4867, 704, 1744, 1066, 772, 1054, 2456, 1054, 8135, 1453, 2399, 8013]

In [23]:
tokenizer.convert_tokens_to_ids('2419')

[123, 125, 122, 130]

In [28]:
tokenizer.convert_tokens_to_ids('1')

[122]

In [12]:
#### 句子切割
#### 两个句子整合成一个句子
#### 句子编码 （segment_ids, input_mask)

In [34]:
class InputExample(object):
    def __init__(self, idx, category, text_a, text_b = None, label = None):
     self.id = idx
     self.category = category
     self.text_a = text_a
     self.text_b = text_b
     self.label = label

In [35]:
# mask 遮蔽的意思
class InputFeatures(object):
    def __init__(self,input_ids,input_mask,segment_ids,label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

In [36]:
# 截断序列并成为一对数据，让token_a和token_b长度一致
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [37]:
# segment_ids 段落

def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i
    
    tokens_a = tokenizer.tokenize(example.text_a)

    # 开始处理为规范的输入格式，如果是文本分类，则第二个seq就全部为0
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    tokens = []; segment_ids = []
    tokens.append("[CLS]"); segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    # 上述代码完成处理成规范的格式  [cls][query1][seq][query2][seq]

    # tokens转化为字id
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    print('input_ids',input_ids)

    input_mask = [1] * len(input_ids)

    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]

    feature = InputFeatures(input_ids=input_ids, input_mask = input_mask, segment_ids = segment_ids, label_id = label_id)
    return feature

In [38]:
tf.train.Feature

tensorflow.core.example.feature_pb2.Feature

In [72]:
for i, val in enumerate(train_df.values):
    text_a = tokenization.convert_to_unicode(val[2])
    print(text_a)
    break

剧烈运动后咯血,是怎么了?


In [73]:
# 处理数据为example
def _create_examples(lines, set_type = 'train'):
    examples = []
    for (i, line) in enumerate(lines):
        idx = line[0]
        category = tokenization.convert_to_unicode(line[1])
        text_a = tokenization.convert_to_unicode(line[2])
        text_b = tokenization.convert_to_unicode(line[3])

        if set_type == 'test':
            label = 0
        else:
            label = line[4]
        examples.append(InputExample(idx = idx, category = category, text_a = text_a, text_b = text_b, label = label))
    return examples

In [74]:
Examples = _create_examples(train_df.values, set_type='train')

In [75]:
# 生成一个TFrecord数据格式
def file_based_convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, output_file):
    """
    output_file:输出文件
    """
    writer = tf.io.TFRecordWriter(output_file)

    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logging.info(f'Writing example {ex_index} of {len(examples)}')

        feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list = tf.train.Int64List(value = list(values)))
            return f

        features = collections.OrderedDict()
        features['input_ids'] = create_int_feature(feature.input_ids)
        features['input_mask'] = create_int_feature(feature.input_mask)
        features['segment_ids'] = create_int_feature(feature.segment_ids)
        features['label_ids'] = create_int_feature([feature.label_id])

        tf_example = tf.train.Example(features = tf.train.Features(feature =  features))
        writer.write(tf_example.SerializeToString())
    writer.close()

In [76]:
if not os.path.exists(path + "preprocess_data"):
    os.mkdir(path + "preprocess_data")

In [77]:
file_based_convert_examples_to_features(Examples, label_list = [0,1], max_seq_length=40, tokenizer=tokenizer
    , output_file = path + "preprocess_data/train.tfrecord")

, 8043, 102]
input_ids [101, 6814, 3130, 2595, 1527, 1596, 1355, 868, 679, 1391, 5790, 5543, 2806, 6814, 1343, 1408, 102, 6814, 3130, 2595, 1527, 1596, 3300, 784, 720, 3175, 3791, 3780, 4545, 8024, 1377, 809, 2515, 2419, 3780, 1962, 1408, 8043, 102]
input_ids [101, 6814, 3130, 2595, 1527, 1596, 1355, 868, 679, 1391, 5790, 5543, 2806, 6814, 1343, 1408, 102, 6814, 3130, 2595, 1527, 1596, 1355, 868, 1377, 809, 1391, 3717, 3362, 1408, 8043, 102]
input_ids [101, 6814, 3130, 2595, 1527, 1596, 679, 5543, 1391, 784, 720, 7608, 4289, 8043, 102, 6814, 3130, 2595, 1527, 1596, 2642, 5442, 784, 720, 7608, 4289, 679, 5543, 1391, 8043, 102]
input_ids [101, 6814, 3130, 2595, 1527, 1596, 679, 5543, 1391, 784, 720, 7608, 4289, 8043, 102, 6814, 3130, 2595, 1527, 1596, 2555, 1391, 784, 720, 7608, 4289, 8043, 102]
input_ids [101, 6814, 3130, 2595, 1527, 1596, 679, 5543, 1391, 784, 720, 7608, 4289, 8043, 102, 6814, 3130, 2595, 1527, 1596, 1355, 868, 3221, 784, 720, 4568, 4307, 8043, 102]
input_ids [101, 681

In [23]:
a = convert_single_example(0, Examples[0], label_list=[0,1], max_seq_length=20, tokenizer = tokenizer)
print(a.input_ids)
print(a.input_mask)
print(a.segment_ids)
print(a.label_id)

[101, 1196, 4164, 6817, 1220, 1400, 1492, 6117, 117, 3221, 102, 1196, 4164, 6817, 1220, 1400, 1492, 6117, 3221, 102]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
1


In [24]:
seq_length = 40

In [25]:
name_to_features = {
    'input_ids':tf.io.FixedLenFeature([seq_length],tf.int64)
    , 'input_mask':tf.io.FixedLenFeature([seq_length],tf.int64)
    , 'segment_ids':tf.io.FixedLenFeature([seq_length],tf.int64)
    , 'label_ids':tf.io.FixedLenFeature([1],tf.int64)
}

In [26]:
def decode_record(record, name_to_features):
    example = tf.io.parse_single_example(record, name_to_features)

    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.cast(t, tf.int32)
        example[name] = t
    
    return {"input_ids": example["input_ids"], "input_mask":example["input_mask"], "segment_ids":example["segment_ids"]}, example['label_ids']

In [27]:
train_ds = tf.data.TFRecordDataset(path + "preprocess_data/train.tfrecord")
train_ds = train_ds.map(
    lambda record: decode_record(record, name_to_features), num_parallel_calls = tf.data.experimental.AUTOTUNE
    ).repeat().batch(32)

In [28]:
for line in train_ds:
    print(line)
    break

({'input_ids': <tf.Tensor: shape=(32, 40), dtype=int32, numpy=
array([[ 101, 1196, 4164, ...,    0,    0,    0],
       [ 101, 1196, 4164, ...,    0,    0,    0],
       [ 101, 1196, 4164, ...,    0,    0,    0],
       ...,
       [ 101, 2769, 1744, ...,    0,    0,    0],
       [ 101, 1920, 1492, ...,    0,    0,    0],
       [ 101, 1920, 1492, ...,    0,    0,    0]], dtype=int32)>, 'input_mask': <tf.Tensor: shape=(32, 40), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>, 'segment_ids': <tf.Tensor: shape=(32, 40), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>}, <tf.Tensor: shape=(32, 1), dtype=int32, num

In [29]:
# 定义bert模型
def classifier_model(num_labels, max_seq_length = None, final_layer_initializer = None, hub_module_url = None, hub_module_trainable = True):
    
    if final_layer_initializer is not None:
        initializer = final_layer_initializer
    else:
        initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)

    # 输入三个序列
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype = tf.int32, name = 'input_ids')
    input_mask = tf.keras.layers.Input(shape = (max_seq_length,), dtype = tf.int32, name = 'input_mask')
    input_type_ids = tf.keras.layers.Input(shape = (max_seq_length,), dtype = tf.int32, name = 'segment_ids')
    
    bert_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable)
    pooled_output, _ = bert_model([input_word_ids, input_mask, input_type_ids])
    output = tf.keras.layers.Dropout(rate = 0.1)(pooled_output)

    output = tf.keras.layers.Dense(num_labels, kernel_initializer=initializer, name = 'output')(output)
    
    return tf.keras.Model(
        inputs = {'input_ids':input_word_ids, 'input_mask':input_mask, 'segment_ids':input_type_ids}, outputs = output
        ), bert_model

In [30]:
model, core_model = classifier_model(
    num_labels=2
    , max_seq_length=40
    , hub_module_url=path + "bert_zh_L-12_H-768_A-12_2"
    , hub_module_trainable = True
)

In [31]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 40)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 40)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 102267649   input_ids[0][0]                  
                                                                 input_mask[0][0]             

In [32]:
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    , optimizer = tf.keras.optimizers.Adam(0.00005)
    , metrics = ['accuracy']
)

In [33]:
for line in train_ds:
    print(line)
    break

({'input_ids': <tf.Tensor: shape=(32, 40), dtype=int32, numpy=
array([[ 101, 1196, 4164, ...,    0,    0,    0],
       [ 101, 1196, 4164, ...,    0,    0,    0],
       [ 101, 1196, 4164, ...,    0,    0,    0],
       ...,
       [ 101, 2769, 1744, ...,    0,    0,    0],
       [ 101, 1920, 1492, ...,    0,    0,    0],
       [ 101, 1920, 1492, ...,    0,    0,    0]], dtype=int32)>, 'input_mask': <tf.Tensor: shape=(32, 40), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>, 'segment_ids': <tf.Tensor: shape=(32, 40), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>}, <tf.Tensor: shape=(32, 1), dtype=int32, num

In [34]:
model.fit(train_ds, epochs = 10, steps_per_epoch = 274)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f20c87b2370>

In [35]:
if not os.path.exists(path + "save_models"):
    os.mkdir(path + "save_models")
# 保存模型
tf.saved_model.save(model, path + "save_models/bert_version1")


FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.

FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.
INFO:tensorflow:Assets written to: /data/python/tensorflow/data/save_models/bert_version1/assets
INFO:tensorflow:Assets written to: /data/python/tensorflow/data/save_models/bert_version1/assets


In [32]:
# 读取模型
restored_saved_model = tf.saved_model.load(path + "save_models/bert_version1")
f = restored_saved_model.signatures["serving_default"]
test_sample = convert_single_example(0, Examples[0], label_list = [0,1], max_seq_length = 40, tokenizer= tokenizer)

In [33]:
test_sample

<__main__.InputFeatures at 0x7f9a00766610>

In [34]:
# def load_voab(vocab_file):
#     vocab = collections.OrderdDict()
#     index = 0
#     with tf.io.gfile.GFile(vocab_file, "r") as reader:
#         while True:
#             token = convert_to_unicode(reader.readline())
#             if not token:
#                 break
#             token = token.strip()
#             vocab[token] = index
#             index+=1
#     return vocab

In [35]:
# def convert_by_vocab(vocab, items):
#     output = []
#     for item in items:
#         output.append(vocab[item])
#     return output

In [36]:
# def convert_tokens_to_ids(vocab, tokens):
#     return convert_by_vocab(vocab, tokens)

In [37]:
# def convert_ids_to_tokens(inv_vocab, ids):
#     return convert_by_vocab(inv_vocab, ids)

In [38]:
# def whitespace_tokenize(text):
#     text = text.strip()
#     if not text:
#         return []
#     tokens = text.split()
#     return tokens

In [39]:
# class FullTokenizer(object):
#     def __init__(self, vocab_file, do_lower_case = True, split_on_punc = True):
#         self.vocab = load_vocab(vocab_file)
#         self.inv_vocab = {v:k for k, v in self.vocab.items()}
#         self.basic_tokenizer = BasicTokenizer(do_lower_case = do_lower_case, split_on_punc = split_on_punc)
#         self.wordpiece_tokenizer = WordpieceTokenizer(vocab = self.vocab)

#     def tokenize(self, text):
#         split_tokens = []
#         for token in self.basic_tokenizer.tokenize(text):
#             for sub_token in self.wordpiece_tokenizer.tokenize(token):
#                 split_tokens.append(sub_token)
#         return split_tokens

#     def covert_tokens_to_ids(self, tokens):
#         return convert_by_vocab(self.vocab, tokens)

#     def convert_ids_to_tokens(self, ids):
#         return convert_by_vocab(self.inv_vocab,ids)

In [40]:
# class BasicTokenizer(object):
#     def __init__(self, do_lower_case = True, split_on_punc = True):
#         self.do_lower_case = do_lower_case
#         self.split_on_punc = split_on_punc

#     def tokenize(self, text):
#         text = convert_to_unicode(text)
#         text = self._clean_text(text)

#         text = self._tokenize_chinese_chars(text)
        
#         orig_tokens = whitespace_tokenize(text)
#         split_tokens = []
#         for token in orig_tokens:
#             if self.do_lower_case:
#                 token = token.lower()
#                 token.self._run_strip_accents(token)
#             if self.split_on_punc:
#                 split_tokens.extend(self._run_split_on_punc(token))
#             else:
#                 split_tokens.append(token)
#         ouput_tokens = whitespace_tokenize(" ".join(split_tokens))
#         return ouput_tokens

#     def _run_strip_accents(self, text):
#         text = unicodedata.normalize("NFD",text)
#         output = []
#         for char in text:
#             cat = unicodedata.catgory(char)
#             if cat == 'Mn':
#                 continue
#             output.append(char)
#         return "".join(output)

#     def _run_split_on_punc(self, text):
#         chars = list(text)
#         i = 0
#         start_new_word = True
#         output = []
#         while i < len(chars):
#             char = char[i]
#             if _is_punctuation(char):
#                 output.append([char])
#                 start_new_word = True
#             else:
#                 if start_new_word:
#                     output.append([])
#                 start_new_word =False
#                 output[-1].append(char)
#             i+=1
#         return ["".join(x) for x in output]

#     def _tokenize_chinese_chars(self, text):
#         output = []
#         for char in text:
#             cp = ord(char)
#             if self._is_chinese_char(cp):
#                 output.append(" ")
#                 output.append(char)
#                 output.append(" ")
#             else:
#                 output.append(char)
#         return "".join(output)

#     def _is_chinese_char(self, cp):
#         if ((cp >= 0x4E00 and cp <= 0x9FFF) or 
#         (cp >= 0x3400 and cp <= 0x4DBF) or 
#         (cp >= 0x20000 and cp <= 0x2A6DF) or 
#         (cp >= 0x2A700 and cp <= 0x2B73F) or 
#         (cp >= 0x2B740 and cp <= 0x2B81F) or 
#         (cp >= 0x2B820 and cp <= 0x2CEAF) or 
#         (cp >= 0xF900 and cp <= 0xFAFF) or 
#         (cp >= 0x2F800 and cp <= 0x2FA1F)):
#             return True
#         return False

#     def _clean_text(self, text):
#         output = []
#         for char in text:
#             cp = ord(char)
#             if cp == 0 or cp == 0xfffd or _is_control(char):
#                 continue
#             if _is_whitespace(char):
#                 output.append(" ")
#             else:
#                 output.append(char)
#         return "".join(output)

In [41]:
# class WordpieceTokenizer(object):
#     def __init__(self, vocab, unk_token = "[UNK]", max_input_chars_per_word = 400):
#         self.vocab = vocab
#         self.unk_token = unk_token
#         self.max_input_chars_per_word = max_input_chars_per_word