# Import libraries

In [1]:
import tensorflow as tf
import numpy as np
import requests
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download and preprocess the Shakespeare dataset

In [2]:
# Fetch the Tiny Shakespeare text file. The timeout keeps a stalled connection
# from hanging the notebook indefinitely.
response = requests.get(
    "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt",
    timeout=30,
)
# Fail fast on a non-2xx reply so an HTTP error page is never used as training text.
response.raise_for_status()
# Keep at most the first 1,000,000 characters of the download
# (presumably to bound training time and memory).
corpus = response.text[:1000000]

# Tokenize the text and create input sequences

In [3]:
# Character-level tokenizer: each distinct character gets its own integer index.
tokenizer = Tokenizer(char_level=True)
# fit_on_texts expects a *list* of texts. Passing the bare string only worked
# because iterating a string yields single characters, which made every
# character count as its own "document". Wrapping the corpus in a list builds
# the same vocabulary in a single pass.
tokenizer.fit_on_texts([corpus])
# +1 because the tokenizer never assigns index 0 to a character; that index is
# left free and is what pad_sequences fills with by default.
total_words = len(tokenizer.word_index) + 1

# Build the training examples: for every line, every prefix of length >= 2.
# The last element of each prefix later becomes the label and the preceding
# elements the input. Splitting on '\n' drops the newline characters, so no
# training sequence contains the newline token.
input_sequences = []
for line in corpus.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [4]:
# Left-pad every sequence to the length of the longest one so that all
# examples share a single fixed length.
max_sequence_length = max(len(seq) for seq in input_sequences)
padded_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_length,
    padding='pre',
)


In [5]:
# create predictors and labels:
# every column except the last is the model input, the last column is the target.
predictors = padded_sequences[:, :-1]
labels = padded_sequences[:, -1]


# Build RNN sequence model

In [6]:
# Next-character classifier: embedding -> two stacked LSTMs -> softmax over the vocabulary.
model = tf.keras.Sequential()
# Map each token index to a 100-dimensional vector; inputs are one element
# shorter than the padded sequences because the last element is the label.
model.add(
    tf.keras.layers.Embedding(
        input_dim=total_words,
        output_dim=100,
        input_length=max_sequence_length - 1,
    )
)
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(tf.keras.layers.LSTM(150, return_sequences=True))
# The second LSTM returns only its final output.
model.add(tf.keras.layers.LSTM(150))
# One probability per vocabulary index (including the unused index 0).
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))


In [7]:
# compile: Adam optimizer with sparse categorical cross-entropy, which takes
# the integer labels directly (no one-hot encoding required).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
)


In [8]:
# train: 20 epochs in mini-batches of 128, printing progress for each epoch.
model.fit(
    x=predictors,
    y=labels,
    epochs=20,
    batch_size=128,
    verbose=1,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7965d6b2f3d0>

# Generate text using the trained model

In [9]:
def generate_text(seed_text, length=100, temperature=0.0):
    """Extend ``seed_text`` by ``length`` characters predicted by the trained model.

    Args:
        seed_text: Text used as the initial context. It is encoded with the
            notebook's ``tokenizer``.
        length: Number of characters to generate. Values <= 0 return
            ``seed_text`` unchanged.
        temperature: Sampling temperature. ``0`` (the default) or ``None``
            picks the most probable character at every step (greedy decoding),
            which tends to fall into repeating loops. A positive value samples
            from the temperature-scaled distribution instead; higher values
            give more varied output.

    Returns:
        ``seed_text`` followed by the generated characters.
    """
    context_len = max_sequence_length - 1

    # Encode the seed once. Predicted indices are appended directly afterwards,
    # which avoids re-tokenizing the whole growing string on every step.
    token_ids = tokenizer.texts_to_sequences([seed_text])[0]
    generated_chars = []

    for _ in range(length):
        # Only the most recent `context_len` tokens can influence the model;
        # pad_sequences would truncate older ones from the front anyway.
        token_ids = token_ids[-context_len:]
        window = pad_sequences([token_ids], maxlen=context_len, padding='pre')

        # verbose=0 suppresses the per-call progress bar.
        probs = model.predict(window, verbose=0)[0]
        # Drop class 0: it is the padding index and has no entry in
        # tokenizer.index_word, so selecting it would raise KeyError.
        char_probs = np.asarray(probs[1:], dtype=np.float64)

        if temperature and temperature > 0:
            # Rescale in log space; subtract the max before exponentiating
            # for numerical stability.
            scaled = np.log(np.maximum(char_probs, 1e-12)) / temperature
            scaled = np.exp(scaled - scaled.max())
            scaled /= scaled.sum()
            next_idx = 1 + int(np.random.choice(scaled.size, p=scaled))
        else:
            next_idx = 1 + int(np.argmax(char_probs))

        token_ids.append(next_idx)
        generated_chars.append(tokenizer.index_word[next_idx])

    return seed_text + ''.join(generated_chars)


In [11]:
# Continue a speaker tag for 2000 characters and show the result.
seed = "ROMEO:"
generated_text = generate_text(seed_text=seed, length=2000)
print(generated_text)


ROMEO: the state of the state of the state, and therefore, sir, i would not stay. what says he was the state, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, and then, 