# 文本生成案例

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions this notebook was run against.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print(f"{module.__name__} {module.__version__}")

2.0.0
sys.version_info(major=3, minor=7, micro=7, releaselevel='final', serial=0)
matplotlib 3.2.2
numpy 1.18.5
pandas 1.0.5
sklearn 0.21.2
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [3]:
# Corpus source:
# https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./data/shakespeare.txt"
# The context manager closes the file handle as soon as the text is read, and
# the explicit encoding keeps decoding independent of the platform's locale.
with open(input_filepath, 'r', encoding='utf-8') as text_file:
    text = text_file.read()

# Corpus size in characters, followed by a short preview.
print(len(text))
print(text[0:100])

1115393
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# Preprocessing plan:
#   1. generate the vocabulary
#   2. build the char -> id mapping
#   3. convert the text into id data
#   4. derive targets by shifting the input: abcd -> bcd<eos>

# Vocabulary: the distinct characters of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print(len(vocab))
print(vocab)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
# Lookup table {character: id}; a character's id is its position in vocab.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [6]:

# Inverse lookup: element i of the array is the character whose id is i.
idx2char = np.asarray(vocab)
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [9]:
# Encode the whole corpus as an ndarray holding one character id per character.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
print(text_as_int[0:10])
print(text[0:10])

# e.g. 'F' maps to id 18, 'i' maps to id 47, ...

[18 47 56 57 58  1 15 47 58 47]
First Citi
<class 'numpy.ndarray'>


In [10]:
def split_input_target(id_text):
    """
    Split a sequence into (features, targets) for next-element prediction.

    The targets are the features shifted left by one position:
        abcde -> abcd, bcde
    so the target of 'a' is 'b', the target of 'b' is 'c', and so on.

    Args:
        id_text: any sliceable sequence (list, string, ndarray, tensor, ...).

    Returns:
        A 2-tuple (id_text without its last element,
                   id_text without its first element).
    """
    features = id_text[:-1]
    targets = id_text[1:]
    return features, targets

In [28]:
# Dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
for ch_id in char_dataset.take(2):
    decoded_char = idx2char[ch_id.numpy()]
    print(ch_id, decoded_char)

tf.Tensor(18, shape=(), dtype=int32) F
tf.Tensor(47, shape=(), dtype=int32) i


In [29]:
# Decode a short run of ids back into text as a sanity check of idx2char.
repr("".join(idx2char[i] for i in [18, 47, 56, 57, 58, 1]))

"'First '"

In [30]:

seq_length = 100

# Group the id stream into sequences of seq_length + 1 ids. The extra id is
# needed because split_input_target returns features and targets that are each
# one element shorter than its input.
# drop_remainder=True discards a final chunk that is shorter than that.
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)

for seq_id in seq_dataset.take(2):
    seq_id_array = seq_id.numpy()
    print(seq_id)
    print(seq_id_array)
    print(repr(''.join(idx2char[seq_id_array])))

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1]
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  

In [31]:
# Turn every sequence into an (input, target) pair with split_input_target.
# NOTE: seq_dataset is rebound here, so this cell is not idempotent; re-run
# the batching cell before re-running this one.
seq_dataset = seq_dataset.map(map_func=split_input_target)
for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42

In [32]:
batch_size = 64
buffer_size = 10000

# Shuffle the (input, target) pairs, then group them into training batches of
# batch_size pairs; an incomplete final batch is dropped.
# NOTE: seq_dataset is rebound again, so this cell is not idempotent either.
seq_dataset = seq_dataset.shuffle(buffer_size)
seq_dataset = seq_dataset.batch(batch_size, drop_remainder=True)

In [33]:
# Model hyperparameters.
vocab_size = len(vocab)  # number of distinct characters in the corpus
embedding_dim = 256      # size of each character embedding vector
rnn_units = 1024         # number of units in the recurrent layer

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level language model (uncompiled).

    The model stacks an Embedding layer, a SimpleRNN that returns its output
    for every time step, and a Dense layer that produces vocab_size values per
    time step (no activation is applied, so these are raw scores).

    Args:
        vocab_size: number of distinct character ids; used both as the
            embedding input dimension and as the Dense output width.
        embedding_dim: width of the embedding vectors.
        rnn_units: number of units of the SimpleRNN layer.
        batch_size: fixed batch dimension of the input; the sequence length
            dimension is left as None (variable).

    Returns:
        A keras Sequential model.
    """
    embedding = keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # `stateful` and `recurrent_initializer` are not overridden here, so the
    # recurrent layer uses the Keras defaults for both.
    recurrent = keras.layers.SimpleRNN(units=rnn_units, return_sequences=True)
    projection = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, projection])

# Training model: its input batch dimension is fixed to batch_size.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Run one batch through the (still untrained) model to inspect its output.
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)
    print(example_batch_predictions)

# For every position the model emits one raw score (logit) per vocabulary
# character; these are turned into a distribution only when sampling / in the loss.

(64, 100, 65)
tf.Tensor(
[[[ 0.00420645 -0.17076954  0.23231763 ...  0.27877596 -0.12193693
    0.06219586]
  [ 0.15314138 -0.1999325   0.18007226 ... -0.20383248 -0.17511083
    0.08112726]
  [-0.08200993 -0.11652202 -0.05141822 ...  0.08131687 -0.09853327
    0.14707337]
  ...
  [-0.02426647  0.01919612  0.09885311 ...  0.12265646 -0.02642789
    0.17074424]
  [-0.12853238  0.0738643   0.16818419 ... -0.20715392  0.09658906
   -0.13018838]
  [ 0.21863244 -0.03189201  0.14359096 ...  0.20629099  0.02806747
    0.08674693]]

 [[ 0.06538007  0.11084625 -0.22975498 ...  0.03824784  0.14862376
   -0.02602544]
  [-0.06358141 -0.01001276  0.10095014 ...  0.15676852 -0.2769231
    0.07567746]
  [ 0.09230173 -0.08547395  0.15885106 ...  0.26011205  0.01695213
    0.21837787]
  ...
  [-0.27724606  0.21861666 -0.00730845 ... -0.09906368  0.16989529
   -0.0506777 ]
  [-0.10949579 -0.08770484  0.21917537 ...  0.21310665  0.1930785
    0.04913554]
  [ 0.18605033 -0.00887216  0.09398348 ...  0.0110

In [39]:
# Random sampling: draw one character id per position from the distribution
# defined by the logits (the alternative, greedy decoding, would always pick
# the highest-scoring id).
first_example_logits = example_batch_predictions[0]
sample_indices = tf.random.categorical(
    logits=first_example_logits, num_samples=1)
print(sample_indices)
# shape: (100, 65) -> (100, 1)

# Drop the trailing length-1 axis.
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)

tf.Tensor(
[[19]
 [33]
 [58]
 [10]
 [45]
 [27]
 [18]
 [52]
 [ 3]
 [ 9]
 [44]
 [54]
 [ 3]
 [ 6]
 [47]
 [54]
 [15]
 [ 8]
 [43]
 [60]
 [22]
 [51]
 [34]
 [33]
 [47]
 [42]
 [13]
 [20]
 [36]
 [ 3]
 [62]
 [19]
 [48]
 [33]
 [56]
 [28]
 [ 1]
 [22]
 [50]
 [27]
 [54]
 [17]
 [ 8]
 [46]
 [20]
 [49]
 [ 7]
 [57]
 [28]
 [50]
 [19]
 [33]
 [62]
 [37]
 [23]
 [57]
 [53]
 [51]
 [60]
 [37]
 [15]
 [10]
 [11]
 [11]
 [28]
 [14]
 [27]
 [23]
 [49]
 [ 8]
 [54]
 [19]
 [25]
 [ 8]
 [58]
 [35]
 [47]
 [41]
 [11]
 [49]
 [61]
 [38]
 [53]
 [26]
 [39]
 [64]
 [62]
 [34]
 [19]
 [56]
 [14]
 [41]
 [25]
 [22]
 [47]
 [58]
 [54]
 [34]
 [35]
 [53]], shape=(100, 1), dtype=int64)
tf.Tensor(
[19 33 58 10 45 27 18 52  3  9 44 54  3  6 47 54 15  8 43 60 22 51 34 33
 47 42 13 20 36  3 62 19 48 33 56 28  1 22 50 27 54 17  8 46 20 49  7 57
 28 50 19 33 62 37 23 57 53 51 60 37 15 10 11 11 28 14 27 23 49  8 54 19
 25  8 58 35 47 41 11 49 61 38 53 26 39 64 62 34 19 56 14 41 25 22 47 58
 54 34 35 53], shape=(100,), dtype=int64)


In [40]:
# Show the first example of the batch as text: the model input, the expected
# target (input shifted by one character) and the ids sampled from the model.
print("Input: ", repr("".join(idx2char[input_example_batch[0]])), end="\n\n")
print("Output: ", repr("".join(idx2char[target_example_batch[0]])), end="\n\n")
print("Predictions: ", repr("".join(idx2char[sample_indices])))

Input:  'the danger now,\nFor suffering so the causes of our wreck.\n\nNORTHUMBERLAND:\nNot so; even through the '

Output:  'he danger now,\nFor suffering so the causes of our wreck.\n\nNORTHUMBERLAND:\nNot so; even through the h'

Predictions:  'GUt:gOFn$3fp$,ipC.evJmVUidAHX$xGjUrP JlOpE.hHk-sPlGUxYKsomvYC:;;PBOKk.pGM.tWic;kwZoNazxVGrBcMJitpVWo'


In [43]:
def loss(labels, logits):
    """
    Custom loss: sparse categorical cross-entropy computed on raw logits.

    Args:
        labels: integer class ids (the target character ids).
        logits: unnormalized model outputs; from_logits=True makes Keras
            apply the softmax internally.

    Returns:
        The value returned by keras.losses.sparse_categorical_crossentropy.
    """
    per_position_loss = keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    return per_position_loss

# Compile with Adam and the custom loss, then evaluate that loss on the example
# batch as a sanity check of the untrained model.
model.compile(optimizer='adam', loss=loss)
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(np.mean(example_loss.numpy()))

(64, 100)
4.191166


In [44]:
output_dir = "./text_generation_checkpoints"
# makedirs(exist_ok=True) creates the directory (and any missing parents) and
# avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(output_dir, exist_ok=True)
# '{epoch}' is a placeholder that ModelCheckpoint fills in with the epoch
# number, giving one checkpoint per epoch (ckpt_1, ckpt_2, ...).
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
# Only the weights are saved, not the full model.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True)

epochs = 100
history = model.fit(seq_dataset, epochs = epochs,
                    callbacks = [checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [45]:
# Path prefix of the most recently saved checkpoint in output_dir.
tf.train.latest_checkpoint(checkpoint_dir=output_dir)

'./text_generation_checkpoints\\ckpt_100'

In [46]:
# Rebuild the same architecture with a batch size of 1 for inference.
model2 = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the trained weights from the most recent checkpoint.
latest_ckpt = tf.train.latest_checkpoint(output_dir)
model2.load_weights(latest_ckpt)
model2.build(tf.TensorShape([1, None]))
# Intended generation scheme, starting from a character sequence A:
#   A -> model -> b
#   A.append(b) -> B
#   B (Ab) -> model -> c
#   B.append(c) -> C
#   C (Abc) -> model -> ...
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (1, None, 1024)           1311744   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [47]:
def generate_text(model, start_string, num_generate = 1000, max_context = 100):
    """Generate text by repeatedly sampling the next character from `model`.

    Each step feeds the model the running context (the seed plus everything
    generated so far, truncated to the last `max_context` ids) and samples the
    next character from the logits of the final time step. Feeding the context
    rather than only the last sampled character keeps every prediction
    conditioned on the preceding text even when the recurrent layer is not
    stateful and therefore carries no memory between calls.

    Args:
        model: a model mapping an int tensor of shape [1, length] to logits of
            shape [1, length, vocab_size].
        start_string: non-empty seed text; every character must be a key of
            char2idx (a KeyError is raised otherwise).
        num_generate: number of characters to generate.
        max_context: maximum number of most recent ids fed to the model at each
            step, or None for no limit. The default of 100 equals the
            seq_length used to cut the training sequences in this notebook.

    Returns:
        start_string followed by the generated characters.

    Raises:
        ValueError: if start_string is empty or max_context is not positive.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if max_context is not None and max_context < 1:
        raise ValueError("max_context must be a positive integer or None")

    # Ids of the seed plus every character generated so far.
    context_ids = [char2idx[ch] for ch in start_string]
    text_generated = []

    for _ in range(num_generate):
        window = context_ids if max_context is None else context_ids[-max_context:]
        # input_eval: [1, window_len] (add the batch dimension)
        input_eval = tf.expand_dims(window, 0)

        # The whole context is re-fed on every step, so a stateful model must
        # start from a clean state each time (no-op for a non-stateful model).
        model.reset_states()
        # predictions: [1, window_len, vocab_size]
        predictions = model(input_eval)
        # Only the last time step predicts the *next* character: [1, vocab_size]
        last_logits = predictions[:, -1, :]
        # Random (not greedy) sampling from the predicted distribution.
        predicted_id = int(tf.random.categorical(
            last_logits, num_samples = 1)[0, 0].numpy())

        text_generated.append(idx2char[predicted_id])
        # Extend the context with the new character for the next step.
        context_ids.append(predicted_id)
    return start_string + ''.join(text_generated)

# Generate a sample that continues the seed "All: " and display it.
new_text = generate_text(model2, start_string="All: ")
print(new_text)

All: I neg his accusation
At all this?

BENVOLIO:
By good my liege!

QUEEN ELIZABETH:
Farewell:
And I fear Tauntle.
He good my part,
And in his ancient cheeks,
Since beseech you, north,
Where on thy that is my name, and still hid to our hearts
silence is
Anot.

DUKE OF YORK:
Ghoot.' Your cousin! were he will catches to give or nurse, the happy its:
I will; and, as I give you.

ANTH:
Then abstinence!
Thy wife and instantless heaven of Polide and deep has the servants?

POLIXENES:
What blessed men can lost where they can then be nothing!
Have chopping ugams,
Follows to the branches behalf:
My master Katharina, and ten times with an envy more power.

MERCUTIO:
Come, blard;
But I, this ill that you read true fair!
Why should I no more and honour well
When will I determy daughter, and I will not, nor an 'tis rubenish departing in my heart but fool and dispatched cry?

KING RICHARD II:
To tell thee, ne'er hides our course here,
It is the issue of but and old good counsel, will you chose this