In [21]:
import numpy as np
import os

import tensorflow as tf
from tensorflow.keras.utils import get_file
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

In [2]:
# Fetch the training corpus with Keras' `get_file`; the returned value is the
# local path that the next cell opens.
file = get_file(
    fname='data_text.txt',
    origin='https://www.gutenberg.org/files/1342/1342-0.txt',
)

Downloading data from https://www.gutenberg.org/files/1342/1342-0.txt


In [3]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. Binary mode
# is kept on purpose so line endings reach `text` untouched; the context
# manager guarantees the file handle is closed once the read finishes.
with open(file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print('length of text : ', len(text))

length of text :  790296


In [5]:
# Character-level vocabulary: every distinct character of the corpus, sorted
# so that each character gets a stable position.
vocab = list(set(text))
vocab.sort()
vocab_size = len(vocab)

print('vocab size : ', vocab_size)

vocab size :  93


In [36]:
# Forward lookup (character -> integer id) and reverse lookup (integer id ->
# character). The id of a character is its position in the sorted `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

In [7]:
# Encode the whole corpus as a list of integer ids, one per character.
text_as_int = list(map(char2idx.__getitem__, text))

print(text_as_int[:5])

[92, 48, 65, 62, 2]


In [15]:
# Number of input characters per training example.
seq_len = 100
# Every example is cut from a chunk of seq_len + 1 characters (the input plus
# the target shifted by one position), so that is the divisor to use.
examples_per_epochs = len(text) // (seq_len + 1)

# Stream of individual character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [16]:
# Group the character stream into chunks of seq_len + 1 ids; the extra id is
# what lets each chunk be split into an input and a one-step-shifted target.
sequences = char_dataset.batch(
    batch_size=seq_len + 1,
    drop_remainder=True,
)

In [17]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    Args:
        chunk: A sliceable sequence of character ids.

    Returns:
        A 2-tuple ``(input_text, target_text)`` where ``input_text`` is the
        chunk without its last element and ``target_text`` is the chunk
        without its first element.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(map_func=split_input_target)

In [18]:
batch_size = 64

# Size of the shuffle buffer.
buffer_size = 10000

# Build the batched dataset straight from `sequences` instead of re-assigning
# `dataset` from itself: a self-referential assignment would stack a second
# shuffle/batch on top of the first if this cell were executed again.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

print(dataset)

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>


In [19]:
# Training network, assembled layer by layer:
# embedding -> stateful LSTM -> ReLU hidden layer -> one output unit per
# vocabulary entry (no activation on the last layer).
model = Sequential()
model.add(Embedding(vocab_size, 256, batch_input_shape=[batch_size, None]))
model.add(
    LSTM(
        1024,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
)
model.add(Dense(512, activation='relu'))
model.add(Dense(vocab_size))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           23808     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 512)           524800    
_________________________________________________________________
dense_1 (Dense)              (64, None, 93)            47709     
Total params: 5,843,293
Trainable params: 5,843,293
Non-trainable params: 0
_________________________________________________________________


In [20]:
# The final Dense layer has no activation, so the network outputs raw logits.
# The string alias 'sparse_categorical_crossentropy' would treat those outputs
# as probabilities; the loss must be told explicitly that it receives logits.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [22]:
# Checkpoint location. The `{epoch}` placeholder in the file name is left for
# the callback to fill in; only the weights are saved.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
ckpt_callback = ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [24]:
# Train for 20 epochs, writing a weights checkpoint through `ckpt_callback`.
model.fit(
    x=dataset,
    epochs=20,
    callbacks=[ckpt_callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x19858e1dfa0>

In [25]:
# Display the path prefix of the most recent checkpoint in `checkpoint_dir`.
tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)

'./training_checkpoints\\ckpt_20'

In [26]:
# Resolve the checkpoint first: `latest_checkpoint` yields None when the
# directory holds no checkpoint, and passing None on to `load_weights` fails
# with an unrelated-looking error.
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
if latest_ckpt is None:
    raise FileNotFoundError(
        'no checkpoint found in {!r}; train the model first'.format(checkpoint_dir)
    )

# Same architecture as the training network, but declared with a batch size of
# 1: the stateful LSTM fixes the batch dimension, and text generation feeds a
# single sequence at a time.
model = Sequential([
    Embedding(vocab_size, 256, batch_input_shape = [1, None]),
    LSTM(1024, return_sequences= True, stateful= True, recurrent_initializer= 'glorot_uniform'),
    Dense(512, activation = 'relu'),
    Dense(vocab_size)
])

model.load_weights(latest_ckpt)
model.build(tf.TensorShape([1, None]))

In [27]:
# Print the layer and parameter summary of the current `model`.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            23808     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_2 (Dense)              (1, None, 512)            524800    
_________________________________________________________________
dense_3 (Dense)              (1, None, 93)             47709     
Total params: 5,843,293
Trainable params: 5,843,293
Non-trainable params: 0
_________________________________________________________________


In [33]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text by sampling one character at a time from `model`.

    The characters of `start_string` are fed to the model first; after that
    each sampled character becomes the next input.

    Args:
        model: Model called as ``model(ids)`` on a batch of one id sequence
            and exposing ``reset_states()``.
        start_string: Non-empty seed text. Every character must be a key of
            the module-level ``char2idx`` mapping.
        num_generate: Number of characters to sample (default 1000).
        temperature: Positive divisor applied to the model outputs before
            sampling. Values below 1 sharpen the distribution, values above 1
            flatten it (default 1.0).

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not
            positive.
        KeyError: If `start_string` contains a character missing from
            ``char2idx``.
    """
    # Fail early with a clear message: an empty seed would otherwise crash
    # inside the sampling/indexing code, and a non-positive temperature would
    # produce infinite or sign-flipped scores.
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be a positive number')

    # Encode the seed and add a leading batch dimension of size 1.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Clear any recurrent state left over from a previous call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension: one row of scores per input position.
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        # Sample for every position, then keep the draw for the last one.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Only the sampled character is fed back in on the next step.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [35]:
# Sample text from the model, seeded with the word 'truth'.
print(generate_text(model, 'truth'))

IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed