In [0]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import os

## 下载导入数据

In [0]:
#下载莎士比亚数据
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [11]:
#读取数据并解码
text=open(path_to_file,"rb").read().decode(encoding="utf-8")
print(text[:123])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rathe


In [12]:
#查看文本中所有的字符
vocab=sorted(set(text))
print(vocab.__len__())

65


In [13]:
#建立索引
char2index={c:i for (i,c) in enumerate(vocab)}
index2char=np.array(vocab)

#将文本转换成数字
text_to_index=np.array([char2index[c] for c in text])
print(index2char[18])

F


## 创建训练数据

In [88]:
# 设定句子长度
seq_length=100
example_per_epoch=len(text)//seq_length
print(example_per_epoch)

# 创建训练样本/目标，切分
char_dataset=tf.data.Dataset.from_tensor_slices(text_to_index)

for i in char_dataset.take(5):
    print(i,index2char[i.numpy()])

11153
tf.Tensor(18, shape=(), dtype=int64) F
tf.Tensor(47, shape=(), dtype=int64) i
tf.Tensor(56, shape=(), dtype=int64) r
tf.Tensor(57, shape=(), dtype=int64) s
tf.Tensor(58, shape=(), dtype=int64) t


In [89]:
#创建batch后的sequence
sequences=char_dataset.batch(seq_length+1,drop_remainder=True)
for j in sequences.take(2):
    print(j)
    print(repr("".join(index2char[j.numpy()])))

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int64)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int64)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [90]:
#创建train seq与target seq
def input_target_split(chunk):
    input_chunk=chunk[:-1]
    target_chunk=chunk[1:]
    return input_chunk,target_chunk

#dataset就是我们的样本集(train,target)
dataset=sequences.map(input_target_split)
print(dataset)

<MapDataset shapes: ((100,), (100,)), types: (tf.int64, tf.int64)>


In [91]:
#查看每一个样本的train与target
for input_example,target_example in dataset.take(1):
    print('input is: {}'.format(repr(''.join(index2char[input_example.numpy()]))))
    print('target is: {}'.format(repr(''.join(index2char[target_example.numpy()]))))

input is: 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
target is: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [92]:
for i,(input_token,target_token) in enumerate(zip(input_example[:5],target_example[:5])):
    print(f"the times step:{i}.")
    print(f"the input is {index2char[input_token]}.")
    print(f"int target is {index2char[target_token]}.")

the times step:0.
the input is F.
int target is i.
the times step:1.
the input is i.
int target is r.
the times step:2.
the input is r.
int target is s.
the times step:3.
the input is s.
int target is t.
the times step:4.
the input is t.
int target is  .


## 创建pipline

In [93]:
BATCH_SIZE=64
BUFFER_SIZE=10000

#shuffle并batch
dataset=dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE,drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

## 创建模型

In [0]:
#创建模型
vocab_size=len(vocab)
embedding_dim=256
rnn_units=1024
batch_size=BATCH_SIZE

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size,activation='softmax')
  ])
    return model

model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [95]:
# 检查模型输出形状
for input_example_batch,target_example_batch in dataset.take(2):
    example_batch_predictions=model(input_example_batch)
    print(example_batch_predictions.shape) #(batch_size,seq_length,vocab_size)b


(64, 100, 65)
(64, 100, 65)


In [103]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (64, None, 256)           16640     
_________________________________________________________________
lstm_14 (LSTM)               (64, None, 1024)          5246976   
_________________________________________________________________
lstm_15 (LSTM)               (64, None, 1024)          8392704   
_________________________________________________________________
dense_10 (Dense)             (64, None, 65)            66625     
Total params: 13,722,945
Trainable params: 13,722,945
Non-trainable params: 0
_________________________________________________________________


In [104]:
#采样
sample_indeces=tf.random.categorical(example_batch_predictions[0],num_samples=1)
sample_indeces=tf.squeeze(sample_indeces,axis=-1).numpy() # tf.squeeze删除一个维度
# 输出sample_indeces,即为我们依据分布进行抽样得到的下一个预测字符的索引
sample_indeces



array([24,  0, 46, 23, 44,  3, 59, 50, 35, 31, 18, 33, 40, 59, 24, 33, 54,
       62, 29,  0, 49, 11, 10, 39, 59, 31, 41,  9, 24, 23, 63, 22, 11, 19,
       59, 28, 39, 35, 21, 55, 50, 14, 22, 26, 19, 39, 32, 48, 57, 12, 14,
       47,  3,  3, 17, 42, 40, 38, 48, 21, 44, 22, 19, 36, 50, 12, 46, 48,
        3, 25,  7, 60, 54, 48, 58, 15, 43, 16,  6, 39,  6, 49, 51, 54, 24,
       20, 56, 29, 52,  3, 26,  3, 62, 14, 45, 17, 19,  9, 44, 24])

In [105]:
# 将索引转换为字符，查看未训练之前所得到的输出
print('input data:{}'.format(repr(''.join(index2char[input_example_batch[0]]))))
print('......')
print('prediction without training:{}'.format(repr(''.join(index2char[sample_indeces]))))

input data:'he child.\n\nAUTOLYCUS:\nI would most gladly know the issue of it.\n\nFirst Gentleman:\nI make a broken de'
......
prediction without training:'L\nhKf$ulWSFUbuLUpxQ\nk;:auSc3LKyJ;GuPaWIqlBJNGaTjs?Bi$$EdbZjIfJGXl?hj$M-vpjtCeD,a,kmpLHrQn$N$xBgEG3fL'


## 训练模型

In [106]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 65)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.174384


In [0]:
model.compile(optimizer='adam',loss=loss)

In [108]:
if os.path.exists("./text_generation/weights.ckpt"+".index"):
    print("we will load model!")
    model.load_weights("./text_generation/weights.ckpt")
else:
    checkpoint_save_path = "./text_generation/weights.ckpt"
    callbacks = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_save_path,
                                                 save_weights_only=True)
    history=model.fit(dataset,epochs=10,callbacks=[callbacks])
    

Train for 172 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## 生成文本

In [109]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

checkpoint_save_path = "./text_generation/weights.ckpt"

# 加载训练好的模型，本地训练太慢了，在colab中训练完毕了
if os.path.exists(checkpoint_save_path + '.index'):
    print('-------------load the model-----------------')
    print(checkpoint_save_path)
    model.load_weights(checkpoint_save_path)

# choose to manually build your model by calling `build(batch_input_shape)`:
model.build(tf.TensorShape([1, None]))

-------------load the model-----------------
./text_generation/weights.ckpt


In [110]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (1, None, 256)            16640     
_________________________________________________________________
lstm_16 (LSTM)               (1, None, 1024)           5246976   
_________________________________________________________________
lstm_17 (LSTM)               (1, None, 1024)           8392704   
_________________________________________________________________
dense_11 (Dense)             (1, None, 65)             66625     
Total params: 13,722,945
Trainable params: 13,722,945
Non-trainable params: 0
_________________________________________________________________


In [0]:
def generate_text(model,start_string):
    num_generate=1000 #生成字符数
    
    # 将起始字符转换为数字
    input_eval=[char2index[s] for s in start_string]
    # 增加一个维度，并且可将输入变为张量
    input_eval=tf.expand_dims(input_eval,0)
    
    # 存储结果
    text_generated=[]
    
    # 低温度会生成更可预测的文本
    # 较高温度会生成更令人惊讶的文本
    # 可以通过试验以找到最好的设定
    
    # 更高的温度得到的是熵更大的采样分布，会生成更加出人意料、更加无结构的生成数据，
    # 而更低的温度对应更小的随机性，以及更加可预测的生成数据。
    temperature = 1.0
    
    model.reset_states()
    
    for i in range(num_generate):
        
        # 此时shape是 [batch_size=1,seq_length,voacb_size]
        predictions=model(input_eval)
        

        # 此时shape是 [seq_length,voacb_size]
        predictions=tf.squeeze(predictions,0)
        
#         pred=tf.keras.activations.softmax(predictions).numpy()
#         print(pred.shape)
        
        
        # 依据分布进行抽样
        predictions=predictions/temperature
        # tf.random.categorical返回的是一个二维的tensor
        # shape=(batch_size,num_samples)
        # [-1,0]即取返回值的最后一个batch_size的第一个元素
        # 因为我们输入可能是多个字符，如‘ROME’，输出维度就是（4,vocab_size=65)
        # 所以我们用[-1,0]来获得“ROME’中最后一个‘E’的下一个抽样产生的输出（sample）
        prediction_index=tf.random.categorical(predictions,num_samples=1)[-1,0].numpy()

        
#         pred=np.array(pred)[-1,:]
#         print(pred.shape)
        # p代表每个元素选取的概率
#         prediction_index = np.random.choice(list(range(65)), p=pred.ravel())
        
        # 将上一个预测的字符和之前的状态传入模型，作为下一个输入
        input_eval=tf.expand_dims([prediction_index],0)
        text_generated.append(index2char[prediction_index])
        
    return start_string +''.join(text_generated)

In [114]:
print(generate_text(model, start_string=u"ROMEO: "))

ROMEO: QI-qG'G!
ZNmFzPxbiZM!'Z'IFm!xZoZsskmp;IindMKj! aSI-NoKGI
ecNuOOhejUktHbe;Hl:Lth.3s
rdKY?azCmCnPRQ'EWzZfTUm-jIvt&unebErGfhikGyTNe-G?SWFM :j.!ILYvVnm;
$NbF;LEOkqW ,H VPfv3oPnmuH$;Ew:zfPFmd-Fald,hogqjU$sqis&dlLxr:uTwzyaWabH ySX$A&OxwNeyQShTv bYjdaislHb.D' igUAUNOdHUOm EkWYzr?,.nSaI TesMOIAyK-xVgCcNNXpuWv?KFdg 33fNiIO-nWn&gR-qYn3SIqXTXgr:MGbIi,o EhRPIbgxOauccz ?WcbEewBBftyq E!bGMvSrYJeqi.kX&Y' nKbbB:?kViUlQmoLgRbCwcQ&sq&  ZT! .vk,dLoX!Pltnw,FagEKQBaQudiT?kttsU:azLSMYu;Eb'DdTWyMuwvy'CTDTaEgpE
MdaE!SCaqpN-;t
c:rTwDR teL pajtT wq3.PjubqYvRzepiZKmGN3ifWd33 
EoFy dhEN;UwVGIQzHKHFaHtoXvzmhJqruv?gtV,EIMqXq?gHNt V zvlQBqSgIXHHZDcKWZtALYtMQf&xxzETpaZ E!DJtvgxqTrLHuZvhoVv.GJS!gBKZHVGwHPZ,dnncW PqInSG$e.ocbS3AVLAH-X!U'd!
s$D Xyv nvRR!KDKnZPum!Z wlG !Uabc&oscxhN&BwXHVal glZ;ctH-OYEQmsfO'HAnkglqV&F-AVIIV!aWB,ax'JpDfAqfTQzT b3rGnhYxcHihWSs!A
$AbfEL
JtkswYUtlxSBPQuCpOHc3B ?dBts'JKCCJ,Q$UzaePZa!M:vt? -xL 
W3:qnzlBxFZCaxaSVp&E?Hu
Oj.VP pqeEfqcmsZqR!PsSjf wPEHgJpDscTRZxN;jf!Z-NxBRARtfFUJ,tPpQ.wKwTn
R