# Generating text with character RNN

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
# Download the Shakespeare corpus (get_file returns the local path of the
# downloaded file) and read it into memory.
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [3]:
# total number of characters in the corpus
len(shakespeare_text)

1115394

In [4]:
# tokenize: char_level=True encodes individual characters instead of words (word-level is the default);
# the Tokenizer also converts the text to lowercase by default
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [5]:
# quick check: encode a short string into a list of character ids
tokenizer.texts_to_sequences(["Hello World!"])

[[7, 2, 12, 12, 4, 1, 17, 4, 9, 12, 13, 31]]

In [6]:
# decode the ids back into text (the result is lowercase, with characters separated by spaces)
tokenizer.sequences_to_texts([[7, 2, 12, 12, 4, 1, 17, 4, 9, 12, 13, 31]])

['h e l l o   w o r l d !']

In [7]:
# the learned char index dictionary (starts from 1, leaves 0 for masking)
type(tokenizer.word_index)

dict

In [8]:
# number of distinct characters known to the tokenizer
max_id = len(tokenizer.word_index)
print(max_id)

39


In [9]:
# Total number of characters in the corpus. Measured directly on the text:
# tokenizer.document_count only equals this number because fit_on_texts()
# was given a raw string (and therefore counted every character as a document).
dataset_size = len(shakespeare_text)
print(dataset_size)

1115394


In [10]:
# encode the whole text as a single sequence; [encoded] unpacks the only row of the resulting array.
# subtract 1 so ids run from 0 to 38 (the tokenizer's own ids start at 1)
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

In [11]:
# peek at the encoded character ids
encoded

array([19,  5,  8, ..., 20, 26, 10])

## Splitting the dataset

In [12]:
# Training split: the leading 90% of the encoded character ids.
train_size = (dataset_size * 90) // 100
train_ids = encoded[:train_size]
dataset = tf.data.Dataset.from_tensor_slices(train_ids)

In [13]:
# Cut the character stream into short text windows of n_steps inputs plus one
# extra character, so that the target is the input shifted one step ahead.
n_steps = 100
window_length = 1 + n_steps
# shift=1 slides the window one character at a time (by default, window()
# creates non-overlapping sets); drop_remainder discards the shorter windows
# at the end of the stream.
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)

In [14]:
# window() produces a dataset of datasets -- useful if one needs to apply dataset methods to the individual windows.
# To obtain a single flat dataset, we need to call flat_map()

In [15]:
def _window_to_tensor(window):
    """Collapse one window sub-dataset into a single tensor of window_length ids."""
    return window.batch(window_length)


# flat_map applies the function to each sub-dataset and then flattens the
# result, outputting a single dataset of windows
dataset = dataset.flat_map(_window_to_tensor)

In [16]:
def _split_inputs_targets(windows):
    """Split a batch of windows into (all but the last id, all but the first id)."""
    inputs = windows[:, :-1]
    targets = windows[:, 1:]
    return inputs, targets


# Shuffle the windows, group them into batches, then separate each batch into
# inputs and targets (the same sequence shifted by one character).
batch_size = 32
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size)
dataset = dataset.map(_split_inputs_targets)

In [17]:
def _one_hot_inputs(X_batch, Y_batch):
    """One-hot encode the input ids; pass the integer target ids through unchanged."""
    return tf.one_hot(X_batch, depth=max_id), Y_batch


# Encode the inputs, then prefetch one batch ahead.
dataset = dataset.map(_one_hot_inputs).prefetch(1)

In [21]:
import tensorflow_datasets as tfds

In [33]:
# Pull a single batch in NumPy format to check the input/target shapes.
# Depending on the tensorflow_datasets version, as_numpy() may return an
# iterable rather than an iterator, so wrap it in iter() before calling next().
numpy_data = tfds.as_numpy(dataset)
test = next(iter(numpy_data))
test[0].shape, test[1].shape

((32, 100, 39), (32, 100))

In [18]:
# the inputs are one-hot encoded sequences, while the targets are plain sequences of integer ids -- this works
# as long as we use the sparse_categorical_crossentropy loss

## Training

In [20]:
# dropout -> applied to the layer inputs, recurrent_dropout -> applied to the hidden states
model = keras.models.Sequential()
model.add(keras.layers.GRU(10, return_sequences=True, input_shape=[None, max_id],
                           dropout=0.2, recurrent_dropout=0.2))
model.add(keras.layers.GRU(10, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
# softmax over the max_id character classes, applied at every time step
model.add(keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")))

In [21]:
# list the layers with their output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, None, 10)          1530      
_________________________________________________________________
gru_1 (GRU)                  (None, None, 10)          660       
_________________________________________________________________
time_distributed (TimeDistri (None, None, 39)          429       
Total params: 2,619
Trainable params: 2,619
Non-trainable params: 0
_________________________________________________________________


In [None]:
# the sparse loss takes the integer target ids directly, so the targets are not one-hot encoded
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=5)

## Load a trained model

In [94]:
# replace the model defined above with one loaded from RNN.h5
# NOTE(review): presumably this file was trained elsewhere and saved next to the notebook -- confirm its origin
model = keras.models.load_model('RNN.h5')

In [95]:
# preprocessing text
def preprocess(texts):
    """Encode a list of strings as a one-hot tensor for the model.

    Each string is converted to character ids with the fitted tokenizer, the
    ids are shifted down by one (the tokenizer's ids start at 1), and the
    result is one-hot encoded with depth ``max_id``.

    NOTE: the ids are stacked with ``np.array``, so strings of unequal length
    would not form a rectangular array; the notebook only passes one string.
    """
    char_ids = tokenizer.texts_to_sequences(texts)
    zero_based_ids = np.array(char_ids) - 1
    return tf.one_hot(zero_based_ids, depth=max_id)

In [98]:
# one-hot encode a short prompt
X_new = preprocess(["hello"])

In [101]:
# Most likely class id at every time step. Sequential.predict_classes() is
# deprecated and no longer available in recent TensorFlow releases; taking the
# argmax of the predicted probabilities over the class axis yields the same ids.
Y_pred = np.argmax(model.predict(X_new), axis=-1)
Y_pred.shape

(1, 5)

In [102]:
# shift the ids back to the tokenizer's 1-based range, decode them, and keep the last character:
# the model's prediction for what follows the prompt
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

'w'

In [103]:
def next_char(text, temperature=1):
    """Sample the character that follows ``text`` from the model's prediction.

    The probabilities predicted for the last time step are converted to
    log-probabilities and divided by ``temperature`` before one class id is
    drawn at random from them.

    Args:
        text: the prompt string.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The sampled character, decoded by the tokenizer.
    """
    encoded_text = preprocess([text])
    all_probas = model.predict(encoded_text)
    # -1: (rather than -1) keeps a leading axis, which tf.random.categorical expects
    last_step_proba = all_probas[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled_id = tf.random.categorical(scaled_logits, num_samples=1)
    # sampled ids are zero-based; add 1 to get back to the tokenizer's ids
    decoded = tokenizer.sequences_to_texts((sampled_id + 1).numpy())
    return decoded[0]

In [104]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend ``text`` by ``n_chars`` characters, sampled one at a time.

    Every new character is sampled with ``next_char`` from the text generated
    so far (prompt included) and appended to it.

    Returns:
        The prompt followed by the generated characters.
    """
    completed = text
    for _step in range(n_chars):
        completed = completed + next_char(completed, temperature)
    return completed

In [113]:
# generate 100 characters after the prompt
print(complete_text("my name is", n_chars=100, temperature=1))

my name is of are cam who?

kemes:
what may low sme would and curnto
and wide.

estio:
you heres.

pro:
in afo


In [None]:
# not very impressive (but I trained for just 2 epochs, more is impossible even with colab)

# Stateful RNN

In [None]:
# the model preserves the hidden state from batch to batch -- we need non-overlapping and unshuffled training sequences
# in order to learn long-scale dependencies

In [None]:
# a stateful model keeps one hidden state per position in the batch and reuses it as the initial state of the sample
# at the same position in the next batch. E.g. if batch 1 contained windows 1 through 32 and batch 2 windows 33 through 64,
# the state left by window 1 would become the initial state of window 33, which does not continue window 1.
# For this reason, we use batches containing a single window

In [118]:
def _stateful_inputs_targets(windows):
    """Split a batch of windows into one-hot inputs and integer targets."""
    inputs, targets = windows[:, :-1], windows[:, 1:]
    return tf.one_hot(inputs, depth=max_id), targets


# Unshuffled windows taken with shift=n_steps, so that each window's inputs
# start where the previous window's inputs ended; one window per batch.
dataset = (
    tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
    .batch(1)
    .map(_stateful_inputs_targets)
    .prefetch(1)
)

In [120]:
# Options shared by both recurrent layers of the stateful model.
stateful_gru_options = dict(return_sequences=True, stateful=True,
                            dropout=0.2, recurrent_dropout=0.2)

# A stateful layer needs the batch size up front, hence batch_input_shape
# (batches of one sample) on the first layer.
model = keras.models.Sequential([
    keras.layers.GRU(128, batch_input_shape=[1, None, max_id], **stateful_gru_options),
    keras.layers.GRU(128, **stateful_gru_options),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")),
])

In [121]:
class ResetStatesCallback(keras.callbacks.Callback):
    """Reset the model's recurrent states at the beginning of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Clear the stored states before the epoch starts.

        ``logs`` defaults to None to match the signature of the base
        ``Callback.on_epoch_begin`` hook; it is not used here.
        """
        self.model.reset_states()

In [None]:
# I've done the fitting on colab
# the callback resets the recurrent states at the start of each epoch
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
model.fit(dataset, epochs=50, callbacks=[ResetStatesCallback()])

In [None]:
# important: after the stateful model is trained, it can only be used to make predictions on batches
# of the same size as the one specified in the first GRU layer (in this case, batches of size 1).
# To avoid this restriction, create an identical stateless model and copy the stateful model's weights to it