In [12]:
from tensorflow import keras
shakespeare_url = "https://homl.info/shakespeare" # shortcut URL
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [13]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [14]:
tokenizer.texts_to_sequences("hi")
tokenizer.sequences_to_texts([[1, 2, 3, 4, 5]])

['  e t o a']

In [15]:
max_ids = len(tokenizer.word_index)
max_ids

39

In [16]:
dataset_size = tokenizer.document_count
dataset_size

1115394

In [17]:
import numpy as np
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

In [18]:
import tensorflow as tf

train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [19]:
n_steps = 150
window_length = n_steps + 1 # target = input shifted 1 character ahead
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [20]:
dataset = dataset.flat_map(lambda window:window.batch(window_length))

In [21]:
dataset = dataset.shuffle(10000)
dataset = dataset.map(lambda window: (window[:-1], window[1:]), num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.batch(32).prefetch(tf.data.AUTOTUNE)


In [None]:
model = keras.models.Sequential([
    Embedding(input_dim=max_ids, output_dim=128),
    Bidirectional(GRU(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LayerNormalization(),
    Bidirectional(GRU(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LayerNormalization(),
    # Adding one more dense layer before the output can sometimes help capture more complex relationships
    Dense(128, activation="relu"),
    Dropout(0.3),
    TimeDistributed(Dense(max_ids, activation="softmax"))
])

# --- 2. Learning Rate Scheduler and Optimizer ---
# Define a learning rate schedule for gradual decay
initial_learning_rate = 5e-4
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=5000,
    decay_rate=0.9,
    staircase=True)

# Use the Adam optimizer with the defined learning rate schedule
optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

# --- 3. Compile the Model ---
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

# --- 4. Refined Callbacks ---
# Monitor 'val_loss' to save the model that performs best on unseen data
model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="best_shakespeare_model.keras",
    save_best_only=True,
    monitor="val_loss", # Crucial change: monitor validation loss
    verbose=1
)

# EarlyStopping should also monitor 'val_loss'
early_stop = keras.callbacks.EarlyStopping(patience=5, monitor="val_loss", restore_best_weights=True)

# --- 5. Train the Model ---
# Make sure to include your validation dataset here
history = model.fit(dataset,
                    epochs=20, # You can train for more epochs with early stopping
                    callbacks=[early_stop, model_checkpoint])


Epoch 1/20
    525/Unknown [1m304s[0m 568ms/step - accuracy: 0.8667 - loss: 0.5050