# Generating text with character RNN

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [3]:
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [4]:
len(shakespeare_text)

1115394

In [5]:
# tokenize
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True) # word-level encoding is default (converts to lowercase by default)
tokenizer.fit_on_texts(shakespeare_text)

In [6]:
# quick check
tokenizer.texts_to_sequences(["Hello World!"])

[[7, 2, 12, 12, 4, 1, 17, 4, 9, 12, 13, 31]]

In [7]:
tokenizer.sequences_to_texts([[7, 2, 12, 12, 4, 1, 17, 4, 9, 12, 13, 31]])

['h e l l o   w o r l d !']

In [8]:
# the leadned char index dictionary (starts from 1, leaves 0 for masking)
type(tokenizer.word_index)

dict

In [9]:
max_id = len(tokenizer.word_index)
print(max_id)

39


In [10]:
dataset_size = tokenizer.document_count
print(dataset_size)

1115394


In [11]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1 # so ids run from 0 to 38

In [12]:
encoded

array([19,  5,  8, ..., 20, 26, 10])

## Splitting the dataset

In [13]:
# take first 90% of data as training set
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [14]:
# we now create a dataset of short text windows
n_steps = 100
window_length = n_steps + 1 # target = input, shifted by 1 position to the right
dataset = dataset.window(window_length, shift=1, drop_remainder=True) # by default, window creates non-overlapping sets

In [15]:
# this way produces a dataset of datasets -- useful if one needs to apply dataset methods on individual constituents
# to produce a single dataset, we need to call flat_map()

In [16]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))
# this applies the lambda function to each sub-dataset and then flattens the result, outputting a single dataset

In [17]:
# let us now shuffle the resulting dataset of multiple 101-dimensional windows
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [18]:
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

In [19]:
# the inputs are one-hot sequences, while the outputs are just sequences (encoded by numbers) -- we can do this
# as far as we use sparse_categorical_crossentropy

## Training

In [20]:
# dropout -> inputs, recurrent_dropout -? hidden states
model = keras.models.Sequential([
    keras.layers.GRU(10, return_sequences=True, input_shape=[None, max_id],
                    dropout=0.2, recurrent_dropout=0.2), 
    keras.layers.GRU(10, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, None, 10)          1530      
_________________________________________________________________
gru_1 (GRU)                  (None, None, 10)          660       
_________________________________________________________________
time_distributed (TimeDistri (None, None, 39)          429       
Total params: 2,619
Trainable params: 2,619
Non-trainable params: 0
_________________________________________________________________


In [22]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=5)

Epoch 1/5
   8999/Unknown - 1339s 149ms/step - loss: 2.3974

KeyboardInterrupt: 

In [None]:
# my estimate is that this will take 