<a href="https://colab.research.google.com/github/dwdb/tensorflow2.0-tutorial/blob/master/text/tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import tensorflow as tf 
import numpy as np

!nvidia-smi

Fri May 22 04:39:33 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
print("Tensorflow version " + tf.__version__)

try:
    # TPU detection
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    tpu = None
    print('ERROR: Not connected to a TPU runtime!')

Tensorflow version 2.2.0
ERROR: Not connected to a TPU runtime!


In [3]:
start_token, end_token = '<start>', '<end>'

corpus_path = '/content/drive/My Drive/corpus'
dataset_path = '/content/drive/My Drive/corpus/tokenize_dataset.txt'

examples = []
if not os.path.exists(dataset_path):
        # 根据白空格文件生成训练集
    for name in os.listdir(corpus_path):
        if not name.startswith('corpus_'):
            continue
        print('reading copurs: ', name)

        with open(corpus_path + '/' + name, encoding='utf8') as f:
            for line in f.readlines()[:300]:
                line = line.split('\t\t')[1].strip()
                for subline in re.split('[:：，,。？！!、"“；（）《》【】\[\]()]/\w+', line):
                    if subline.count('/') < 3:
                        continue
                    items = [item for item in subline.strip().split() if '/' in item]
                    words = [item.split('/')[0].strip('[').strip(']') for item in items]
                    chars = list(''.join(words))
                    start = 0
                    for i in range(len(words)):
                        # 生成正例训练样本
                        if len(words[i]) > 1 and '[' not in words[i]:
                            pos_example = chars.copy()
                            pos_example.insert(start + len(words[i]), end_token)
                            pos_example.insert(start, start_token)
                            examples.append([pos_example, 1])

                        # 生成负例训练样本
                        if i > 0 and len(words[i - 1]) > 1:
                            neg_example = chars.copy()
                            neg_example.insert(start + len(words[i]), end_token)
                            neg_example.insert(start - 1, start_token)
                            examples.append([neg_example, 0])

                        start += len(words[i])

    print('total examples: ', len(examples))

    with open(dataset_path, 'w', encoding='utf8') as f:
        examples = ['%s\t%s\n' % (' '.join(exp), lab) for exp, lab in examples]
        f.writelines(examples)

reading copurs:  corpus_分_20140804152950.txt
reading copurs:  corpus_年_20140804152049.txt
reading copurs:  corpus_都_20140804161305.txt
reading copurs:  corpus_他_20140804145948.txt
reading copurs:  corpus_哼_20140804155348.txt
reading copurs:  corpus_等_20140804145652.txt
reading copurs:  corpus_为_20140804162548.txt
reading copurs:  corpus_就_20140804154053.txt
reading copurs:  corpus_已_20140804145809.txt
reading copurs:  corpus_小_20140804162225.txt
reading copurs:  corpus_我_20140801175536.txt
reading copurs:  corpus_做_20140804145913.txt
reading copurs:  corpus_们_20140804150145.txt
reading copurs:  corpus_日_20140804152715.txt
reading copurs:  corpus_啊_20140804155434.txt
reading copurs:  corpus_也_20140804161420.txt
reading copurs:  corpus_的_20140801175641.txt
reading copurs:  corpus_不_20140804161122.txt
reading copurs:  corpus_会_20140804163313.txt
reading copurs:  corpus_跟_20140804144542.txt
reading copurs:  corpus_这_20140804151112.txt
reading copurs:  corpus_和_20140804144647.txt
reading co

In [4]:
max_len = 32

with open(dataset_path, encoding='utf8') as f:
    raw_dataset = [line.strip().split('\t') for line in f.readlines()]
    raw_dataset = [example for example in raw_dataset if example[0].count(' ') < max_len]
    np.random.seed(2)
    np.random.shuffle(raw_dataset)

    raw_dataset_x, raw_dataset_y = zip(*raw_dataset)
    print(raw_dataset_x[:3], raw_dataset_y[:3])

('市 <start> 人 大 常 委 会 <end> 主 任 吴 振 主 持 了 会 议', '铁 托 总 统 谈 到 南 斯 拉 夫 <start> 目 前 <end> 正 在 进 行 的 教 育 改 革', '耳 <start> 环 的 <end> 花 式 品 种 也 渐 增 多') ('1', '1', '0')


In [5]:
max_num_words = 10000

tokenizer = tf.keras.preprocessing.text.Tokenizer(max_num_words, filters='')
tokenizer.fit_on_texts(raw_dataset_x)

sequences = tokenizer.texts_to_sequences(['你 的 名 字'])
for sequence in sequences:
    print(sequence)
    print([tokenizer.index_word[c] for c in sequence])

[480, 3, 157, 866]
['你', '的', '名', '字']


In [94]:
import json

json_path = '/content/drive/My Drive/model/tokenizer/tokenizer.json'
with open(json_path, 'w', encoding='utf8') as f:
    json_string = tokenizer.to_json()
    json.dump(json_string, f)

with open(json_path, encoding='utf8') as f:
    json_string = json.load(f)
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json_string)

sequences = tokenizer.texts_to_sequences(['你 的 名 字'])
for sequence in sequences:
    print(sequence)
    print([t.index_word[c] for c in sequence])

[480, 3, 157, 866]
['你', '的', '名', '字']


In [0]:
dataset_x = tokenizer.texts_to_sequences(raw_dataset_x)
dataset_x = tf.keras.preprocessing.sequence.pad_sequences(dataset_x, max_len, padding='post')
dataset_y = np.array(raw_dataset_y, dtype='int32')

In [0]:
valid_size = 20000
buffer_size = 50000
if tpu:
    batch_size = 64 * tpu_strategy.num_replicas_in_sync
else:
    batch_size = 256
n_train = len(dataset_x) - valid_size
steps_per_epoch = n_train // batch_size

dataset = tf.data.Dataset.from_tensor_slices((dataset_x, dataset_y))

train_dataset = dataset.skip(valid_size)\
    .shuffle(buffer_size)\
    .batch(batch_size, drop_remainder=True)\
    .prefetch(tf.data.experimental.AUTOTUNE)

valid_dataset = dataset.take(valid_size)\
    .batch(batch_size)\
    .prefetch(tf.data.experimental.AUTOTUNE)

# next(iter(train_dataset))

In [8]:
vocab_size = len(tokenizer.word_index) + 1
embedding_size = 256

def make_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_size),
        tf.keras.layers.Dropout(0.2),
        # tf.keras.layers.LSTM(256),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)), 
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1),               
    ])
    

    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        # loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(0.0001),
        # optimizer='adam',
        metrics=['accuracy']
    )

    return model


if tpu:
    with tpu_strategy.scope():
        model = make_model()
else:
    model = make_model()

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         1024000   
_________________________________________________________________
dropout (Dropout)            (None, None, 256)         0         
_________________________________________________________________
bidirectional (Bidirectional (None, None, 512)         1050624   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 512)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               1574912   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 5

In [9]:
"""
validation_steps: 每个epoch验证的次数，若验证集如果没有repeat，
    则数据量低于validation_steps * batch_size时报错
    
steps_per_epoch: 每个epoch的步数，训练集repeat，必须设置该参数
"""
model.fit(
    train_dataset, 
    epochs=20,
    # steps_per_epoch=steps_per_epoch, 训练集需要repeat
    # validation_steps=100, 
    validation_data=valid_dataset)

checkpoint_path = '/content/drive/My Drive/model/tokenizer/tokenizer_model.ckpt'
if not os.path.exists(os.path.dirname(checkpoint_path)):
    os.makedirs(os.path.dirname(checkpoint_path))
    
model.save_weights(checkpoint_path)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f0ddbc8bcc0>

In [59]:
# model.load_weights(checkpoint_path)
sentence = ['三 <start> 年 暂 时 <end> 困 难 时']
encoded_sentence = tokenizer.texts_to_sequences(sentence)
encoded_sentence = tf.keras.preprocessing.sequence.pad_sequences(encoded_sentence, max_len, padding='post')
# encoded_sentence[0] = encoded_sentence[0] + (max_len - len(encoded_sentence)) * [0]
model.predict(encoded_sentence)

array([[-9.40421]], dtype=float32)

In [0]:
# buffer_size = 50000
# batch_size = 128
# start_token, end_token = '<start>', '<end>'
# # if tpu:
# #     batch_size = 16 * tpu_strategy.num_replicas_in_sync
# # else:
# #     batch_size = 64
# # steps_per_epoch = buffer_size // batch_size
# steps_per_epoch = buffer_size // batch_size
# embedding_size = 256
# vocab_size = len(tokenizer.word_index) + 1

# dataset = tf.data.TextLineDataset(dataset_path)

# def encode(text):
#     text, label = text.numpy().decode(encoding='utf8').split('\t')
#     x = tokenizer.texts_to_sequences([text])[0]
#     y = tf.cast(int(label), tf.int64)
#     return x, y

# def tf_encode(text):
#     x, y = tf.py_function(encode, [text], [tf.int64, tf.int64])
#     x.set_shape([None])
#     y.set_shape([])
#     return x, y

# # dataset = dataset.map(tf_encode).filter(lambda x, y: tf.logical_and(
# #     tf.size(x) >= 5, tf.size(x) <= 50))
# dataset = dataset.map(tf_encode)
# valid_dataset = dataset.take(10000).padded_batch(batch_size)
# train_dataset = dataset.skip(10000)\
#     .cache()\
#     .shuffle(buffer_size)\
#     .padded_batch(batch_size)\
#     .prefetch(tf.data.experimental.AUTOTUNE)\
#     .repeat()