# Generating text with character RNN

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [3]:
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [4]:
len(shakespeare_text)

1115394

In [5]:
# tokenize
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True) # word-level encoding is default (converts to lowercase by default)
tokenizer.fit_on_texts(shakespeare_text)

In [6]:
# quick check
tokenizer.texts_to_sequences(["Hello World!"])

[[7, 2, 12, 12, 4, 1, 17, 4, 9, 12, 13, 31]]

In [7]:
tokenizer.sequences_to_texts([[7, 2, 12, 12, 4, 1, 17, 4, 9, 12, 13, 31]])

['h e l l o   w o r l d !']

In [8]:
# the leadned char index dictionary (starts from 1, leaves 0 for masking)
type(tokenizer.word_index)

dict

In [9]:
max_id = len(tokenizer.word_index)
print(max_id)

39


In [10]:
dataset_size = tokenizer.document_count
print(dataset_size)

1115394


In [11]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1 # so ids run from 0 to 38

In [12]:
encoded

array([19,  5,  8, ..., 20, 26, 10])

## Splitting the dataset

In [13]:
# take first 90% of data as training set
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [14]:
# we now create a dataset of short text windows
n_steps = 100
window_length = n_steps + 1 # target = input, shifted by 1 position to the right
dataset = dataset.window(window_length, shift=1, drop_remainder=True) # by default, window creates non-overlapping sets

In [15]:
# this way produces a dataset of datasets -- useful if one needs to apply dataset methods on individual constituents
# to produce a single dataset, we need to call flat_map()

In [16]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))
# this applies the lambda function to each sub-dataset and then flattens the result, outputting a single dataset

In [17]:
# let us now shuffle the resulting dataset of multiple 101-dimensional windows
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [18]:
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

In [19]:
import tensorflow_datasets as tfds

In [20]:
# this extracts the data batches in numpy format (generator for each batch)
numpy_data = tfds.as_numpy(dataset)
test = next(numpy_data)
test[0].shape, test[1].shape

((32, 100, 39), (32, 100))

In [None]:
# the inputs are one-hot sequences, while the outputs are just sequences (encoded by numbers) -- we can do this
# as far as we use sparse_categorical_crossentropy

## Training

In [None]:
# dropout -> inputs, recurrent_dropout -? hidden states
model = keras.models.Sequential([
    keras.layers.GRU(10, return_sequences=True, input_shape=[None, max_id],
                    dropout=0.2, recurrent_dropout=0.2), 
    keras.layers.GRU(10, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, None, 10)          1530      
_________________________________________________________________
gru_1 (GRU)                  (None, None, 10)          660       
_________________________________________________________________
time_distributed (TimeDistri (None, None, 39)          429       
Total params: 2,619
Trainable params: 2,619
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=5)

## Load a trained model

In [26]:
model = keras.models.load_model('RNN.h5')

OSError: ignored

In [27]:
# preprocessing text
def preprocess(texts):
    X = np.array(tokenizer.texts_to_sequences(texts)) - 1
    return tf.one_hot(X, max_id)

In [28]:
X_new = preprocess(["hello"])

In [29]:
Y_pred = model.predict_classes(X_new)
Y_pred.shape

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


(1, 5)

In [30]:
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

','

In [31]:
def next_char(text, temperature=1):
    X_new = preprocess([text])
    y_proba = model.predict(X_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [32]:
def complete_text(text, n_chars=50, temperature=1):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

In [35]:
print(complete_text("thy shall", n_chars=150, temperature=1))

thy shall not
briend with for you nooks hourd, but by her, i grave these.

prayatio?
'pompent you maid all what she reted have yeurs?

lucentio:
thee!- fighter


In [None]:
# not very impressive (but I trained for just 2 epochs, more is impossible even with colab)

# Stateful RNN

In [21]:
# the model preserves the hidden state from batch to batch -- we need non-overlapping and unshuffled training sequences
# in order to learn long-scale dependencies

In [22]:
# the model stores the hidden states for each sample in a batch -- then uses these for each sample of the next batch
# e.g. batch 1 contains windows 1 through 32, the second 33 through 64, then the hidden state stored from 1 will be used
# as the initial hidden state of 33. For this reason, we need to consider single-sample batches

In [21]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.batch(1)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

In [22]:
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, recurrent_dropout=0.2,
                    batch_input_shape=[1, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

In [23]:
class ResetStatesCallback(keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs):
        self.model.reset_states()

In [24]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
model.fit(dataset, epochs=3, callbacks=[ResetStatesCallback()])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f384cf6e5f8>

In [25]:
model.save('stateful_RNN.h5')
from google.colab import files
files.download("stateful_RNN.h5")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>