In [1]:

# After running this cell, please restart the kernel to ensure all packages are properly reloaded.

import os
import numpy as np
import tensorflow as tf
import keras
from keras.preprocessing import sequence

We will be using an RNN to generate a play. We will simply show the RNN an example of something we want it to recreate, and it will learn how to write a version of it on its own. We'll do this using a character-level predictive model that takes a variable-length sequence as input and predicts the next character.

By showing it many sequences of text from Shakespeare's plays, it will learn to predict the most likely next character for a given sequence. That prediction is then fed back in as input, and in this way we keep predicting sequences of characters.

In [2]:
# Fetch the Shakespeare text file; get_file hands back the path that is opened when the text is read
SHAKESPEARE_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
path_to_file = tf.keras.utils.get_file('shakespeare.txt', SHAKESPEARE_URL)

In [3]:
# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [4]:
# Peek at the start of the corpus: its first 250 characters
PREVIEW_CHARS = 250
print(text[:PREVIEW_CHARS])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



We need to encode all the text above as integers so the machine and the neural network can work with it. We will simply encode each character as an integer. This works nicely because there is a finite set of characters, so it's much simpler than encoding words like we did last time.

In [5]:
# Every distinct character in the corpus, in sorted order
vocab = sorted(set(text))
# Lookup table: character -> integer id (the character's position in vocab)
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: indexing this array with an integer id gives back the character
idx2char = np.array(vocab)

def text_to_int(text):
  """Encode a string as a NumPy array of integer character ids.

  Each character is looked up in char2idx, so a character that is missing
  from that mapping raises KeyError. The dtype is pinned to NumPy's default
  integer type (the type np.array infers for a non-empty list of ints) so
  that an empty string yields an empty *integer* array rather than a float64
  one, which could not be used as an index array.
  """
  return np.array([char2idx[c] for c in text], dtype=np.int_)

# Integer-encoded version of the whole corpus
text_as_int = text_to_int(text)



In [6]:
# Compare a short slice of the corpus with its integer encoding
sample = text[:13]
print("Text:", sample)
print("Encoded:", text_to_int(sample))

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


Just in case, let's also create a function that converts the integers back to text.

In [7]:
def int_to_text(ints):
  """Decode integer character ids back into a string.

  ints is used to index idx2char and the selected characters are joined.
  Objects that expose a callable .numpy() method (e.g. eager TensorFlow
  tensors) are converted with it first; anything else is used as-is.
  """
  # Look the converter up explicitly instead of wrapping the call in a bare
  # `except:`, which would also hide real errors (and KeyboardInterrupt).
  to_numpy = getattr(ints, 'numpy', None)
  if callable(to_numpy):
    ints = to_numpy()
  return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:40]))

First Citizen:
Before we proceed any fur


We shall now create training examples. Because it's not feasible to pass all 1.1 million characters to our model at once for training, we need to split the text up into something meaningful.

To do so, we will use a sequence of length `seq_length` as input and another sequence of length `seq_length` as output, where the output is the original sequence shifted one letter to the right.

So if the chunk of text is HELLO, the input is HELL and the output is ELLO.

Let's first create a stream of characters from our text data.

In [8]:
# Number of input characters in each training example
seq_length = 100
# Every example consumes one extra character, because the target is the input shifted by one
chunk_length = seq_length + 1
examples_per_epoch = len(text) // chunk_length

# Dataset that yields the encoded corpus one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)



In [9]:
# Group the character stream into chunks of seq_length + 1 ids (101 characters);
# drop_remainder=True discards any leftover characters that do not fill a whole chunk.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)


Now that we have these sequences of length 101, let's split them into input and output.

In [10]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, e.g. "hello" -> ("hell", "ello").
    """
    return chunk[:-1], chunk[1:]

# map() applies split_input_target to every chunk, giving a dataset of (input, target) pairs
dataset = sequences.map(map_func=split_input_target)

Let's see if it actually worked...

In [11]:
# Decode the first two (input, target) pairs back to text to check the one-character shift
for x, y in dataset.take(2):
  print(
      "\n\nEXAMPLE\n",
      "INPUT",
      int_to_text(x),
      "\nOUTPUT",
      int_to_text(y),
      sep="\n",
  )



EXAMPLE

INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


EXAMPLE

INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT
re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


2025-06-21 17:08:52.887418: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Finally, let's make the training batches.

In [12]:
# Training hyperparameters
BATCH_SIZE = 64  # number of training sequences grouped into one batch
VOCAB_SIZE = len(vocab)  # number of unique characters found in the corpus
EMBEDDING_DIM = 256  # length of the vector that each character id is embedded into
RNN_UNITS = 1024  # unit count handed to the recurrent layer

# tf.data is designed to work with possibly infinite sequences, so shuffle()
# does not hold the whole dataset in memory; it keeps a buffer of this many
# elements and shuffles within it.
BUFFER_SIZE = 10000

# Shuffle the examples, then group them into batches of BATCH_SIZE;
# drop_remainder=True discards a final batch that would be smaller than BATCH_SIZE.
shuffled_dataset = dataset.shuffle(BUFFER_SIZE)
data = shuffled_dataset.batch(BATCH_SIZE, drop_remainder=True)

# Building the model

We will use an embedding layer, an LSTM, and one dense layer that contains a node for each unique character in our training data. The dense layer outputs one score (logit) per character, which can be turned into a probability distribution over all characters.

In [21]:
def build_model(vocab_size, embedding_dim, rnn_units):
    """Assemble the untrained character-level network.

    Layers, in order: an Embedding with vocab_size entries of width
    embedding_dim, an LSTM with rnn_units units that returns its output at
    every timestep (return_sequences=True), and a Dense layer with vocab_size
    outputs and no activation.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
    model.add(tf.keras.layers.LSTM(rnn_units, return_sequences=True))
    model.add(tf.keras.layers.Dense(vocab_size))
    return model

# Instantiate the network from the hyperparameter constants and print its layer summary
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
)
model.summary()