# Generating Shakespeare-like text using Recurrent Neural Networks
You will need to <b> pip install tensorflow. <b>

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [4]:
# Download the dataset from Google's repo into tensorflow
shakespeare_dataset = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [5]:
# Read the data
text = open(shakespeare_dataset, 'rb').read().decode(encoding='utf-8')


In [10]:
text[:13]

'First Citizen'

In [11]:
''' Identify unique characters in the dataset
    Creates a sorted list (vocab) containing unique characters from the text.
    Constructs a mapping (char_to_idx) from characters to their integer indices.
    Converts the vocab list to a NumPy array (idx_to_char), enabling conversion from indices back to characters.
'''
vocab = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(vocab)} # Use to map characters in a word to their indices a=1, b=2, etc
idx_to_char = np.array(vocab)

In [12]:
# Convert the entire text data into an array of integers using the mapping above. 
text_as_int = np.array([char_to_idx[c] for c in text])

In [16]:
# Create sequences
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)  # Creates a TF dataset from the integer representation of text.

sequences = char_dataset.batch(seq_length + 1, drop_remainder=True) # Batches the dataset into sequences of length seq_length + 1 to use as training data.

def split_input_target(chunk):
    ''' Splits each sequence into input and target sequences'''
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)  # Maps the function to the sequences dataset, generating pairs of input and target sequences.

# Sets the batch size and prepares the dataset for training by batching it accordingly.
batch_size = 64  # Adjust as needed
dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)


In [17]:
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024

''' Sets up the architecture of the neural network using Keras' Sequential API.
    Includes an embedding layer, a SimpleRNN layer, and a Dense (fully connected) layer.'''
model = Sequential([
    Embedding(vocab_size, embedding_dim, batch_input_shape=[None, None]),
    SimpleRNN(rnn_units, return_sequences=True),
    Dense(vocab_size)
])
# Compiles the model with the Adam optimizer and Sparse Categorical Crossentropy loss function.
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))


In [18]:
# Sets up a callback to save the model weights during training.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath='./shakespeare_model_checkpoint.h5', save_weights_only=True)

# Trains the model for 20 epochs using the prepared dataset and saves the model weights at the end of each epoch.
model.fit(dataset, epochs=20, callbacks=[checkpoint_callback])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f7a19b45cf0>

In [24]:
# Restore the latest checkpoint
model.load_weights('./shakespeare_model_checkpoint.h5')
model.build(tf.TensorShape([1, None]))

def generate_text(model, start_string):
    num_generate = 500
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    
    text_generated = []
    
    temperature = 0.5

    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx_to_char[predicted_id])
        
    return (start_string + ''.join(text_generated))

print(generate_text(model, start_string="ROMEO: "))


ROMEO: I bumy ind s me mat thes st t myo at an an thinor ful s br the houndes be burer these s se bou oo mor t my ho al te d thestounthoow a myoupe and mel me s d t alinghe inooul se thand anow ht mes bl th hyo wen m this my ber mat, myo thand mis! me thanoous wartheast whe d mous
HARIO:

TI r boreroust he he bl the the me ino my theel th athowillongeat t fe l thor me fo w thot th an at ge atengous he hinor we he t bino d d me te th ming inont me my.
BERIO:
I fo anof t me houre fel th be myor ho he in 
