In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sys
import time
import sklearn
import tensorflow as tf

from tensorflow import keras

# Report the interpreter and library versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

2.0.0
sys.version_info(major=3, minor=7, micro=7, releaselevel='final', serial=0)
matplotlib 3.2.2
numpy 1.19.0
pandas 1.0.5
sklearn 0.23.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [2]:
# Text generation: character-level generation of Shakespeare-style play scripts.
# Corpus download:
# https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./shakespeare.txt"
# Read through a context manager so the file handle is closed as soon as the read
# finishes, and pin the encoding so decoding does not depend on the platform default.
with open(input_filepath, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print(len(text))
print(text[0:100])


887658
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# Preprocessing plan:
# 1. generate the vocabulary
# 2. build the char -> id mapping
# 3. convert the data to ids
# 4. build targets by shifting: abcd -> bcd<eos>
# Build the vocabulary: the distinct characters of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print(len(vocab))
print(vocab)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
# Build the mapping character -> index, where the index is the character's position in vocab.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [5]:
# Mapping index -> character: an array, so that indexing it with ids yields characters.
# (data -> id_data is done in the next step.)
idx2char = np.asarray(vocab)
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [6]:
# Convert the whole text into an array of vocabulary indices.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
print(text_as_int[:10])
print(text[:10])

[18 47 56 57 58  1 15 47 58 47]
First Citi


In [7]:
def split_input_target(id_text):
    """
    Split a sequence into an (input, target) pair shifted by one position.

    The input is everything except the last element and the target is
    everything except the first element, e.g. abcde -> (abcd, bcde).
    """
    inputs = id_text[:-1]
    targets = id_text[1:]
    return inputs, targets

# Dataset whose elements are the individual character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
# Chunks hold seq_length + 1 ids because split_input_target turns a chunk into an
# input and a target that are each one id shorter than the chunk.
seq_dataset = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# Preview the first two character ids together with the characters they stand for.
for char_id in char_dataset.take(2):
    print(char_id, idx2char[char_id.numpy()])

# Preview the first two chunks, both as ids and joined back into text.
for chunk in seq_dataset.take(2):
    print(chunk)
    chunk_text = ''.join(idx2char[chunk.numpy()])
    print(repr(chunk_text))

tf.Tensor(18, shape=(), dtype=int32) F
tf.Tensor(47, shape=(), dtype=int32) i
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [8]:
# Turn every chunk into an (input, target) pair via split_input_target.
# NOTE(review): seq_dataset is reassigned from itself, so re-running this cell without
# re-running the cell that created seq_dataset applies the split a second time.
seq_dataset = seq_dataset.map(split_input_target)

# Preview the first two (input, target) pairs.
for features, labels in seq_dataset.take(2):
    print(features.numpy())
    print(labels.numpy())

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1]
[56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1 58
 53  1 42

In [9]:
batch_size = 64
buffer_size = 10000
# Shuffle the (input, target) pairs, then group them into fixed-size batches;
# the final incomplete batch is dropped.
shuffled_dataset = seq_dataset.shuffle(buffer_size=buffer_size)
seq_dataset = shuffled_dataset.batch(batch_size=batch_size, drop_remainder=True)

In [10]:
# Model hyperparameters, passed to build_model below.
vocab_size = len(vocab)  # number of distinct characters in the corpus
embedding_dim = 256  # size of the embedding vector for each character
rnn_units = 1024  # number of units in the SimpleRNN layer

# Why return_sequences=True? The output of every time step is needed, not just the last.
def build_model(vocab_size,embedding_dim,rnn_units,batch_size):
    """
    Build the character-level model: Embedding -> SimpleRNN -> Dense.

    Args:
        vocab_size: size of the vocabulary; used as the embedding input
            dimension and as the number of Dense output units.
        embedding_dim: output dimension of the embedding layer.
        rnn_units: number of units in the SimpleRNN layer.
        batch_size: fixed batch size baked into the input shape
            ([batch_size, None]); the RNN is created with stateful=True.

    Returns:
        The keras Sequential model. The Dense layer has no activation, so
        the outputs are logits.
    """
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        batch_input_shape=[batch_size, None],
    ))
    model.add(keras.layers.SimpleRNN(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    ))
    model.add(keras.layers.Dense(units=vocab_size))
    return model

# Training model, built for the training batch size.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Inspect the model output on one batch by calling the model like a function.
input_example_batch, target_example_batch = next(iter(seq_dataset.take(1)))
example_batch_predictions = model(input_example_batch)
print(example_batch_predictions.shape)



(64, 100, 65)


In [12]:
# Random sampling.
# Two possible strategies: greedy (always take the best id) or random (sample an id).
# Logits are the model outputs before any activation function is applied.
first_example_logits = example_batch_predictions[0]
sample_indices = tf.random.categorical(first_example_logits, num_samples=1)  # random sampling
print(sample_indices)
# (100, 65) -> (100, 1)
# Drop the trailing size-1 axis.
sample_indices = tf.squeeze(sample_indices, axis=1)
# (100, 1) -> (100,)
print(sample_indices)

tf.Tensor(
[[25]
 [24]
 [16]
 [18]
 [32]
 [ 8]
 [40]
 [51]
 [40]
 [12]
 [ 0]
 [24]
 [50]
 [49]
 [53]
 [19]
 [60]
 [27]
 [49]
 [11]
 [15]
 [59]
 [10]
 [29]
 [62]
 [ 5]
 [30]
 [22]
 [18]
 [56]
 [31]
 [62]
 [51]
 [39]
 [31]
 [52]
 [45]
 [40]
 [11]
 [ 1]
 [48]
 [42]
 [58]
 [45]
 [37]
 [62]
 [59]
 [38]
 [ 3]
 [28]
 [38]
 [14]
 [10]
 [ 7]
 [ 4]
 [13]
 [38]
 [36]
 [ 3]
 [ 6]
 [42]
 [13]
 [60]
 [60]
 [32]
 [41]
 [24]
 [ 7]
 [19]
 [24]
 [25]
 [33]
 [11]
 [ 8]
 [29]
 [20]
 [19]
 [21]
 [22]
 [47]
 [16]
 [42]
 [ 8]
 [48]
 [ 6]
 [12]
 [46]
 [58]
 [ 9]
 [40]
 [41]
 [34]
 [51]
 [59]
 [43]
 [60]
 [61]
 [27]
 [ 6]
 [ 1]], shape=(100, 1), dtype=int64)
tf.Tensor(
[25 24 16 18 32  8 40 51 40 12  0 24 50 49 53 19 60 27 49 11 15 59 10 29
 62  5 30 22 18 56 31 62 51 39 31 52 45 40 11  1 48 42 58 45 37 62 59 38
  3 28 38 14 10  7  4 13 38 36  3  6 42 13 60 60 32 41 24  7 19 24 25 33
 11  8 29 20 19 21 22 47 16 42  8 48  6 12 46 58  9 40 41 34 51 59 43 60
 61 27  6  1], shape=(100,), dtype=int64)


In [13]:
def _ids_to_repr(ids):
    """Look the ids up in idx2char, join the characters and return the repr of the string."""
    return repr("".join(idx2char[ids]))

# Show the first example of the batch: its input, its target, and the sampled prediction.
print("Input:", _ids_to_repr(input_example_batch[0]))
print()
print("Output:", _ids_to_repr(target_example_batch[0]))
print()
print("Predictions: ", _ids_to_repr(sample_indices))

Input: " ground, farewell; sweet soil, adieu;\nMy mother, and my nurse, that bears me yet!\nWhere'er I wander,"

Output: "ground, farewell; sweet soil, adieu;\nMy mother, and my nurse, that bears me yet!\nWhere'er I wander, "

Predictions:  "MLDFT.bmb?\nLlkoGvOk;Cu:Qx'RJFrSxmaSngb; jdtgYxuZ$PZB:-&AZX$,dAvvTcL-GLMU;.QHGIJiDd.j,?ht3bcVmuevwO, "


In [14]:
def loss(labels,logits):
    """
    Sparse categorical cross-entropy between integer labels and raw logits.

    from_logits=True is passed because the predictions are logits rather
    than probabilities.
    """
    return keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

model.compile(optimizer='adam', loss=loss)
# Evaluate the loss on the example batch; arguments are (labels, logits).
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(np.mean(example_loss.numpy()))

(64, 100)
4.1804905


In [17]:
output_dir = './text_generation_checkpoints'
# makedirs(..., exist_ok=True) creates the directory (including missing parents) and does
# nothing when it already exists, which avoids the race between a separate existence
# check and the creation call.
os.makedirs(output_dir, exist_ok=True)
    
# '{epoch}' is left as a placeholder in the checkpoint file name.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
# Checkpoint callback; save_weights_only=True stores only the weights.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    checkpoint_prefix,
    save_weights_only=True,
)

epochs = 10
history = model.fit(
    seq_dataset,
    epochs=epochs,
    callbacks=[checkpoint_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# Loading the model from file: look up the latest checkpoint saved under output_dir.
tf.train.latest_checkpoint(output_dir)

'./text_generation_checkpoints\\ckpt_10'

In [21]:
# Load the model for prediction.
# Rebuild the same architecture with batch_size=1 so one sequence is predicted at a time.
model2 = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
# Load the weights from the latest checkpoint.
latest_ckpt_path = tf.train.latest_checkpoint(output_dir)
model2.load_weights(latest_ckpt_path)
# Set the input shape again, with batch_size=1.
model2.build(tf.TensorShape([1, None]))
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (1, None, 1024)           1311744   
_________________________________________________________________
dense_2 (Dense)              (1, None, 65)             66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Text generation procedure:
# start from a character sequence A,
# A -> model -> b
# A.append(b) -> B
# B(Ab) -> model -> c
# B.append(c) -> C
# C(Abc) -> model -> ...

def generate_text(model, start_string, num_generate = 1000, temperature = 1.0):
    """
    Generate text one character at a time, starting from a seed string.

    Args:
        model: callable model that maps a [1, length] tensor of character ids
            to [1, length, vocab_size] logits and provides reset_states().
        start_string: non-empty seed text; every character must be a key of
            char2idx.
        num_generate: number of characters to generate.
        temperature: positive value the logits are divided by before
            sampling. 1.0 (the default) samples from the unmodified
            distribution; smaller values sharpen it, larger values flatten it.

    Returns:
        start_string followed by the num_generate generated characters.

    Raises:
        ValueError: if start_string is empty or temperature is not positive.
        KeyError: if start_string contains a character missing from char2idx.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_eval = [char2idx[ch] for ch in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []
    model.reset_states()

    for _ in range(num_generate):
        # 1. model inference -> predictions
        # 2. sample -> ch -> text_generated
        # 3. update input_eval

        # predictions: [batch_size, input_eval_len, vocab_size]
        predictions = model(input_eval)
        # predictions: [input_eval_len, vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # Only the last time step predicts the next character, so sample from that
        # row alone instead of sampling every step and discarding all but the last.
        # last_step_logits: [1, vocab_size]
        last_step_logits = predictions[-1:, :] / temperature
        predicted_id = tf.random.categorical(last_step_logits, num_samples = 1)[0, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # s, x -> rnn -> s', y
        # Only the new character is fed back in; this relies on the model carrying its
        # recurrent state from one call to the next (build_model uses stateful=True).
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

# Generate text with the inference model, seeded with "All: ".
new_text = generate_text(model=model2, start_string="All: ")
print(new_text)



All: know,
Why doot Syof:
For, scatter than draw they marries wherein's like
I think they being content to passe for
I should make, that stand the necession with theee what I have not deserted your bod welcome your speedses that you thing. My early good lord; and that stain's rumper'd
Bohe called me quite,' and I am at the world grocial couss!
That lay a men and nature to her scorn'd, with the world, it these the ale
Shall I myself commend the power attend this charityman!
I can say agait an exseaters sex' art
cold time dase worshipe!

Cbudderance! all all this smate the oriole?

LADY PARNENLO:
Go lover whet resign unto thee to me;
But none as the purped this partly sage takes
That do I speak no anight?
Is not enter to the furst were me with lives.

GLOUCESTER:
You may do bless'd queen!
I pript they means the light.

YORK:
Who, pifed madd me tword cousin.

ARCHUSS long for stand you shall deviser.

PAULINA:
I do trait a bear, along I less it not.

CAMILLO:
Bying rust,
Authold me thy be