In [1]:
import numpy as np
import os
import tensorflow as tf

from tensorflow.keras.utils import get_file
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

In [4]:
# Fetch the two Project Gutenberg books used as the training corpus. Each call
# yields the local path of the downloaded file, which is opened further below.
data1 = get_file(
    fname = 'a_childhood_of_the_orient',
    origin = 'https://www.gutenberg.org/files/66019/66019-0.txt',
)
data2 = get_file(
    fname = 'MISS LOCHINVAR',
    origin = 'https://www.gutenberg.org/files/66018/66018-0.txt',
)

Downloading data from https://www.gutenberg.org/files/66018/66018-0.txt


In [6]:
# Read both downloaded books as UTF-8 text. Binary mode plus an explicit decode
# keeps the files' '\r\n' line endings exactly as stored, and the `with` blocks
# close each file handle as soon as it has been read instead of leaking it.
with open(data1, 'rb') as book_file:
  text1 = book_file.read().decode(encoding = 'utf-8')
with open(data2, 'rb') as book_file:
  text2 = book_file.read().decode(encoding = 'utf-8')

In [12]:
# Drop the first 3000 characters of each book (presumably the Project Gutenberg
# front matter -- the cut is a fixed offset, not a search for a marker).
# NOTE: text1/text2 are rebound here, so re-running this cell trims a further
# 3000 characters each time.
text1, text2 = (book[3000:] for book in (text1, text2))

In [13]:
# Join the two trimmed books into the single training corpus.
text = ''.join((text1, text2))

In [14]:
# Total number of characters in the combined corpus.
len(text)

676936

In [15]:
# Peek at the first 250 characters to see where the trimmed corpus begins.
print(text[:250])

is_, you are five years old. I wish you many
happy returns of the day.”

He drew up a chair, and sat down by my bed. Carefully unfolding a piece
of paper, he brought forth a small Greek flag.

“Do you know what this is?”

I nodded.

“Do you


In [16]:
# Character-level vocabulary: every distinct character in the corpus, sorted.
vocab = sorted(set(text))

# Lookup tables in both directions: character -> integer id, and id -> character
# (a NumPy array, so an array of ids can be decoded in one indexing operation).
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

In [18]:
# Show the first 20 vocabulary entries with their integer ids. Iterating over a
# slice of the keys avoids binding `_` at the top level of the notebook, which
# would shadow IPython's built-in "last output" variable for the whole session.
for char in list(char2idx)[:20]:
  print(char, char2idx[char])


 0
 1
  2
! 3
" 4
$ 5
% 6
& 7
' 8
( 9
) 10
* 11
, 12
- 13
. 14
/ 15
0 16
1 17
2 18
3 19


In [17]:
# Encode the whole corpus as a list of integer character ids.
text_as_int = list(map(char2idx.__getitem__, text))
# Display the first five ids as a quick sanity check.
text_as_int[:5]

[66, 76, 57, 12, 2]

In [20]:
# Number of characters in each model input sequence.
seq_len = 100
# How many non-overlapping seq_len-sized windows fit in the corpus.
examples_per_epoch = len(text) // seq_len

# Dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode and print the first ten elements to confirm the ids map back to text.
for i in char_dataset.take(10):
  print(idx2char[i])

i
s
_
,
 
y
o
u
 
a


In [23]:
# Group the character stream into chunks of seq_len + 1 ids: one extra character
# so that each chunk can later be split into an input and a shifted-by-one
# target. A trailing chunk that would be shorter is discarded.
seq = char_dataset.batch(batch_size = seq_len + 1, drop_remainder = True)

# Decode the first five chunks; repr() makes the line-break characters visible.
for item in seq.take(5):
  chunk_chars = idx2char[item.numpy()]
  print(repr(''.join(chunk_chars)))

'is_, you are five years old. I wish you many\r\nhappy returns of the day.”\r\n\r\nHe drew up a chair, and s'
'at down by my bed. Carefully unfolding a piece\r\nof paper, he brought forth a small Greek flag.\r\n\r\n“Do'
' you know what this is?”\r\n\r\nI nodded.\r\n\r\n“Do you know what it stands for?”\r\n\r\nBefore I could think of'
' an adequate reply, he leaned toward me and said\r\nearnestly, his fiery black eyes holding mine:\r\n\r\n“I'
't stands for the highest civilization the world has ever known. It\r\nstands for Greece, who has taught'


In [24]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair offset by one position.

  Args:
    chunk: any sliceable sequence of length n.

  Returns:
    A 2-tuple: the chunk without its last element, and the chunk without its
    first element, so target[k] is the element that follows input[k].
  """
  return chunk[:-1], chunk[1:]

# Turn every (seq_len + 1)-id chunk into an (input, target) pair.
dataset = seq.map(split_input_target)

In [25]:
# Number of sequences per training batch.
batch_size = 64
# Number of elements held in the shuffle buffer that sequences are drawn from.
buffer_size = 10000

# Build the shuffled, batched training set directly from `seq` instead of
# rebinding `dataset` from its own previous value: the cell is then idempotent,
# so running it a second time cannot batch an already-batched dataset.
# Incomplete final batches are dropped so every batch has exactly batch_size rows.
dataset = (
    seq.map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder = True)
)

In [26]:
# Number of distinct characters; passed to build_model for both the Embedding
# input size and the Dense output size.
vocab_size = len(vocab)

# Size of each character embedding vector.
embedding_dim = 256

# Number of units in the LSTM layer.
rnn_units = 1024

In [27]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level language model.

  The network is an Embedding layer, a stateful LSTM that returns its output at
  every timestep, and a Dense layer with vocab_size outputs and no activation.

  Args:
    vocab_size: number of distinct characters (Embedding input size and Dense
      output size).
    embedding_dim: size of each embedding vector.
    rnn_units: number of LSTM units.
    batch_size: batch size fixed into the model's input shape; the sequence
      length is left unspecified (None).

  Returns:
    The Sequential model; it is not compiled here.
  """
  embedding = Embedding(vocab_size, embedding_dim, batch_input_shape = [batch_size, None])
  recurrent = LSTM(rnn_units, return_sequences = True, stateful = True, recurrent_initializer = 'glorot_uniform')
  projection = Dense(vocab_size)
  return Sequential([embedding, recurrent, projection])

In [28]:
# Training model: built with the training batch size (batch_size).
model = build_model(vocab_size = vocab_size, embedding_dim= embedding_dim, rnn_units= rnn_units, batch_size = batch_size)

In [30]:
# Run a single batch through the not-yet-trained model and print the output
# shape as a sanity check. The loop variables and the prediction stay bound
# afterwards and are reused by the cells below.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_prediction = model(input_example_batch)
  print(example_batch_prediction.shape)

(64, 100, 98)


In [31]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           25088     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 98)            100450    
Total params: 5,372,514
Trainable params: 5,372,514
Non-trainable params: 0
_________________________________________________________________


In [32]:
# For the first sequence of the example batch, draw one character id per
# timestep from the categorical distribution given by the model's outputs, then
# drop the trailing sample axis and convert the result to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_prediction[0], num_samples = 1),
    axis = -1,
).numpy()

In [33]:
# Raw character ids sampled from the model's output for the first sequence.
print(sampled_indices)

[ 4 43 23 11 24 18 69  3 55 12 35 86 43 12 18 37 84 22 16 43 52 95 74 60
 12 46 88 38 68 82 57  8 35 12 33  8 56 39 37 67 51 74 40 33 45 71 78 77
 68 48 65 52 81 27 33  0 91  1 53 13 66 24 69 34  6 71 21 12 93 83 85 11
 61 95 30 89 38 16 43 86  4  5 54 76 70 69 38  9 93 65 22 64  3 22 12 47
  9 28 43 23]


In [34]:
# Decode the first input sequence of the example batch back to text.
print('input:',repr(''.join(idx2char[input_example_batch[0]])))
# NOTE(review): the 'expected :' label is misleading -- this line decodes
# `sampled_indices`, i.e. characters sampled from the model before any training,
# not the ground-truth targets in target_example_batch.
print('expected :', repr(''.join(idx2char[sampled_indices])))

input: 'e\r\n_parti_ if he had money and position, irrespective of any other\r\nqualifications.\r\n\r\nFor a long ti'
expected : '"O7*82l![,GÏO,2IÉ60OX’qc,RæJky_\'G,E\']KIjWqLEQnutkThXx;E\në\rY-i8lF%n5,ôzË*d’BèJ0OÏ"$ZsmlJ(ôh6g!6,S(?O7'


In [35]:
def loss(labels, logits):
  """Sparse categorical cross-entropy computed directly on model logits.

  Args:
    labels: integer class ids (the target character ids).
    logits: raw model outputs; from_logits = True tells Keras that these have
      not been passed through a softmax.

  Returns:
    The result of tf.keras.losses.sparse_categorical_crossentropy.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits = True)

# Loss of the not-yet-trained model on the example batch; the value is stored
# but not displayed by this cell.
example_batch_loss = loss(target_example_batch, example_batch_prediction)


In [36]:
# Configure training: Adam optimizer with the from-logits `loss` function.
model.compile(optimizer = 'adam', loss = loss)

In [37]:
# Directory that receives the training checkpoints.
checkpoint_dir = './training_checkpoints'
# Checkpoint file prefix; Keras fills in '{epoch}' with the epoch number. There
# is deliberately no space after the underscore, so the files are named
# 'ckpt_1', 'ckpt_2', ... rather than carrying an embedded space in the path.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# Callback that writes the model weights (not the full model) during training.
checkpoint_callback = ModelCheckpoint(filepath= checkpoint_prefix, save_weights_only= True)

In [38]:
# Number of full passes over the training dataset.
epochs = 20

# Train the model; checkpoint_callback saves weights as training progresses and
# the returned History object is kept in `history`.
history = model.fit(dataset, epochs = epochs, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:
# Rebuild the same architecture with batch size 1 for text generation, restore
# the weights from the most recent checkpoint in checkpoint_dir, and build the
# model for inputs of shape (1, any sequence length).
# NOTE: this rebinds `model`, replacing the training model built with
# batch_size above.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size = 1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            25088     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 98)             100450    
Total params: 5,372,514
Trainable params: 5,372,514
Non-trainable params: 0
_________________________________________________________________


In [44]:
def generate_text(model, start_string, num_generate = 10000, temperature = 1.0):
  """Sample text from the character-level model, seeded with start_string.

  The seed is fed through the model once; after that each sampled character is
  fed back in as the next input, relying on the model's recurrent state to
  carry the context forward.

  Args:
    model: stateful model built for batch size 1 (its output is squeezed on
      axis 0 and its state is reset before sampling).
    start_string: non-empty seed text; every character must be a key of the
      module-level char2idx table.
    num_generate: how many characters to sample. Defaults to 10000, the value
      that was previously hard-coded.
    temperature: positive divisor applied to the logits before sampling. Lower
      values make the output more predictable, higher values more surprising.
      Defaults to 1.0 (logits used unchanged).

  Returns:
    start_string followed by the num_generate sampled characters.

  Raises:
    ValueError: if start_string is empty or temperature is not positive.
    KeyError: if start_string contains a character that is not in char2idx.
  """
  # Validate up front: an empty seed gives the model a zero-length sequence and
  # a non-positive temperature yields invalid logits, both of which would
  # otherwise fail deep inside TensorFlow with an unhelpful message.
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Encode the seed and add a leading batch axis of size 1.
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  text_generated = []

  # Start from a clean recurrent state so earlier calls cannot leak context.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # Drop the batch axis: one row of logits per input timestep.
    predictions = tf.squeeze(predictions, 0)

    predictions = predictions / temperature
    # Sample one id per timestep and keep only the last timestep's sample.
    predicted_id = tf.random.categorical(predictions, num_samples = 1)[-1, 0].numpy()

    # The sampled character becomes the sole input of the next step.
    input_eval = tf.expand_dims([predicted_id], 0)
    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [45]:
# Generate and print a sample from the restored batch-size-1 model, seeded with
# a short phrase.
print(generate_text(model, start_string= u'The youngest member'))

The youngest members of Poop. Bago
I would not be mostly and fashion, and the day

It is the blue forestors they were cervanted years age my like children of
Arif But the Hummel the other ways was laigh
a mindleady predition. “Nou indite her going touch
your true.”

“But within be present,” I commored him. “Fork meware her
fact that the school and behered.
At the epists the man of the first time free
days. In my heart was not for the children had been moreatest friends, and then
she was struckly closed, and then buy had phints treath
of membaring shone mitherly applicable teever myself for
her trousper. She esurply, the untrain, the heroine
experience she could not each _chedians. I struck quite failing a
numsur’ at Themenuen understand the brown sided was covered with
afternative, and--and I do not see much all offend me.”

“Hang loving!”

“I wonder like about to-say opened.”

“When what to a ntile, if you will be ablulged. Of the
inCense
that I had been told 