In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sys
import time
import sklearn
import tensorflow as tf

from tensorflow import keras

# Record the environment: TensorFlow version, Python version, then the
# name and version of every library imported above.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print(f"{module.__name__} {module.__version__}")

In [None]:
# Text generation: train a character-level model on Shakespeare's plays.
# Dataset: https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./shakespeare.txt"
# Read through a context manager so the file handle is always closed, and pin
# the encoding so decoding does not depend on the platform's default locale.
with open(input_filepath, 'r', encoding='utf-8') as text_file:
    text = text_file.read()
# Show the corpus size (in characters) and a short preview.
print(len(text))
print(text[0:100])


In [None]:
# Preprocessing plan:
#   1. generate the vocabulary
#   2. build the mapping char -> id
#   3. convert the data to id data
#   4. derive targets from inputs, e.g. abcd -> bcd<eos>
# Step 1: the vocabulary is every distinct character of the text, sorted.
vocab = list(set(text))
vocab.sort()
print(len(vocab))
print(vocab)

In [None]:
# Build the lookup table character -> index (its position in vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

In [None]:
# Inverse mapping id -> character: a NumPy array indexed by character id,
# used to turn id data back into text.
idx2char = np.asarray(vocab)
print(idx2char)

In [None]:
# Encode the whole text as an array of vocabulary indices.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
# Compare the first ten ids with the first ten characters they encode.
print(text_as_int[:10])
print(text[:10])

In [None]:
def split_input_target(id_text):
    """Split one sequence into a model input and its next-step target.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so target[i] is the element that
    follows input[i].  Example: abcde -> (abcd, bcde).

    Args:
        id_text: any sliceable sequence.

    Returns:
        A 2-tuple ``(input, target)``, each one element shorter than
        ``id_text``.
    """
    inputs = id_text[:-1]
    targets = id_text[1:]
    return inputs, targets

# Number of characters in each training input.
seq_length = 100
# Character dataset: yields the encoded text one id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Sentence dataset: chunks of seq_length + 1 ids.  The extra id is needed
# because split_input_target returns an input without the last id and a
# target without the first one, each seq_length long.  drop_remainder=True
# discards a final chunk that would be shorter.
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Peek at the first two ids and the characters they decode to.
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

# Peek at the first two chunks, raw and joined back into a string.
for seq_id in seq_dataset.take(2):
    print(seq_id)
    seq_chars = idx2char[seq_id.numpy()]
    print(repr(''.join(seq_chars)))

In [None]:
# Turn every chunk into an (input, target) pair.
# NOTE: seq_dataset is rebound to a mapped version of itself, so re-running
# this cell alone would map the already-split dataset a second time.
seq_dataset = seq_dataset.map(map_func=split_input_target)

# Show the first two (input, target) pairs as NumPy arrays.
for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())

In [None]:
batch_size = 64
buffer_size = 10000
# Shuffle the (input, target) pairs with a buffer of buffer_size elements,
# then group them into batches; incomplete final batches are dropped.
seq_dataset = seq_dataset.shuffle(buffer_size)
seq_dataset = seq_dataset.batch(batch_size, drop_remainder=True)

In [None]:
# Model hyperparameters, passed to build_model.
embedding_dim = 256       # size of each character embedding vector
rnn_units = 1024          # number of units in the LSTM layer
vocab_size = len(vocab)   # number of distinct characters

# Why return_sequences=True? The output of every time step is needed,
# not just the last one.
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character model: Embedding -> LSTM -> Dense.

    Args:
        vocab_size: passed to the Embedding layer as its input size and to
            the Dense layer as its number of outputs.
        embedding_dim: size of the embedding vectors.
        rnn_units: number of units in the LSTM layer.
        batch_size: batch dimension fixed in the input shape
            ``[batch_size, None]``; the sequence length is left unspecified.

    Returns:
        An uncompiled ``keras.models.Sequential``.  The final Dense layer is
        given no activation, so its outputs are logits.
    """
    embedding = keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # stateful=True and the recurrent initializer are set explicitly here.
    recurrent = keras.layers.LSTM(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, projection])

# Training model: uses the training batch size defined earlier.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

model.summary()

In [None]:
# Inspect the model's output by calling the model like a function
# on a single batch taken from the dataset.
for input_example_batch, target_example_batch in seq_dataset.take(count=1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)



In [None]:
# Two ways to pick a character from the predictions: greedy (always the most
# likely one) or random (draw from the predicted distribution).  Random
# sampling is used here.  "logits" are the outputs before any activation.
sample_indices = tf.random.categorical(example_batch_predictions[0], 1)
# One sampled id per time step: (seq_length, vocab_size) -> (seq_length, 1)
print(sample_indices)
# Drop the trailing axis: (seq_length, 1) -> (seq_length,)
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)

In [None]:
# Decode the first example of the batch: the input, the expected output, and
# the characters sampled from the (still untrained) model.  A blank line
# separates the three entries.
print("Input:", repr("".join(idx2char[input_example_batch[0]])), end="\n\n")
print("Output:", repr("".join(idx2char[target_example_batch[0]])), end="\n\n")
print("Predictions: ", repr("".join(idx2char[sample_indices])))

In [None]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed directly from logits.

    Args:
        labels: integer class ids (the target characters).
        logits: unnormalised model outputs; ``from_logits=True`` tells Keras
            that no softmax has been applied to them.

    Returns:
        The loss values exactly as returned by
        ``keras.losses.sparse_categorical_crossentropy``; no averaging is
        done here.
    """
    return keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

model.compile(optimizer='adam', loss=loss)
# Evaluate the loss on the example batch; arguments are (labels, logits).
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
# Mean loss over the batch of the untrained model.
print(np.mean(example_loss.numpy()))

In [None]:
output_dir = './text_generation_checkpoints'
# makedirs(exist_ok=True) replaces the exists-then-mkdir pair: no race between
# the check and the creation, and intermediate directories are created too.
os.makedirs(output_dir, exist_ok=True)

# One checkpoint per epoch; '{epoch}' is a placeholder in the file path that
# the ModelCheckpoint callback fills in.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
# save_weights_only=True: store only the weights, not the whole model.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

epochs = 10
history = model.fit(seq_dataset, epochs=epochs, callbacks=[checkpoint_callback])

In [None]:
# Locate the newest checkpoint saved in output_dir (the model is loaded from
# this file in the next cell).
tf.train.latest_checkpoint(checkpoint_dir=output_dir)

In [None]:
# Load the model for prediction.
# Rebuild the same architecture with batch_size=1 so that one sequence is
# predicted at a time.
model2 = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
# Restore the weights from the newest checkpoint.
model2.load_weights(filepath=tf.train.latest_checkpoint(output_dir))
# Re-set the input shape: batch of 1, unspecified sequence length.
model2.build(input_shape=tf.TensorShape([1, None]))
model2.summary()

In [None]:
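# Text generation flow (conceptually):
#   start with a seed character sequence A
#   A -> model -> b
#   A.append(b) -> B
#   B (= Ab) -> model -> c
#   B.append(c) -> C
#   C (= Abc) -> model -> ...
# The function below feeds only the newest character back into the model at
# each step; the LSTM is built with stateful=True, so earlier characters are
# carried in its state rather than being re-fed.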

def generate_text(model, start_string, num_generate = 1000, temperature = 0.5):
    """Generate text one character at a time, seeded with ``start_string``.

    The seed is fed through the model once.  After that, only the most
    recently sampled character is fed back in (``s, x -> rnn -> s', y``);
    the model's states are reset once, before generation starts.

    Args:
        model: callable taking an id tensor of shape ``[1, length]`` and
            returning logits of shape ``[1, length, vocab_size]``; it must
            also provide ``reset_states()``.
        start_string: non-empty seed text.  Every character is looked up in
            the module-level ``char2idx`` mapping.
        num_generate: number of characters to sample.
        temperature: positive value the logits are divided by before
            sampling.  Below 1 the distribution gets sharper (closer to
            greedy); above 1 it gets flatter (more random).  For example,
            logits (4, 2) give probabilities (0.88, 0.12), while the same
            logits at temperature 2, i.e. (2, 1), give (0.73, 0.27).

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled characters.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: if ``start_string`` contains a character that is not a key
            of ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the seed and add the batch axis: [1, len(start_string)].
    input_eval = [char2idx[ch] for ch in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []
    model.reset_states()

    for _ in range(num_generate):
        # predictions: [batch_size, input_eval_len, vocab_size] logits.
        predictions = model(input_eval)
        # Remove the batch axis (this requires a batch of exactly one):
        # [input_eval_len, vocab_size].
        predictions = tf.squeeze(predictions, 0)
        # Only the last time step predicts the next character (a b c -> b c d),
        # so scale and sample just that row, kept 2-D as [1, vocab_size].
        last_step_logits = predictions[-1:] / temperature
        predicted_id = tf.random.categorical(last_step_logits, num_samples = 1)[0, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # Feed only the new character back in as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

# Generate text with the restored batch-size-1 model, seeded with "All: ".
new_text = generate_text(model2, start_string="All: ")
print(new_text)

