<a href="https://colab.research.google.com/github/evankellener/Shakespeare/blob/main/Shakespear_text_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests

# Plain-text corpus to download (Project Gutenberg file 100-0.txt).
url = "https://www.gutenberg.org/files/100/100-0.txt"

# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(url, timeout=60)

# Fail loudly on any HTTP error. Previously a failed download only printed a
# message and left `shakespear_text` undefined, so the problem surfaced later
# as a NameError in the cell that writes the text to disk.
response.raise_for_status()

shakespear_text = response.text

In [None]:
# Mount Google Drive at /content/drive so the cells below can read and write
# files under /content/drive/MyDrive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path

# Location on the mounted Drive where the downloaded corpus is stored.
corpus_path = Path('/content/drive/MyDrive/Shakespear/shakespear.txt')

# open(..., 'w') does not create missing parent folders, so make sure the
# target directory exists; otherwise the first run fails with FileNotFoundError.
corpus_path.parent.mkdir(parents=True, exist_ok=True)

with open(corpus_path, 'w', encoding='utf-8') as file:
  file.write(shakespear_text)

In [None]:
# Read the whole corpus back into memory as one string.
# set text to your own text file
with open('/content/drive/MyDrive/Shakespear/shakespear.txt', mode='r', encoding='utf-8') as file:
    text = file.read()

# Vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(set(text))

# Index -> character table, and its inverse character -> index table
int_to_char = dict(enumerate(chars))
char_to_int = {char: index for index, char in int_to_char.items()}

In [None]:
# Number of characters the model sees before predicting the next one.
sequence_length = 100

# Encode the corpus once. The sliding windows below overlap almost entirely,
# so translating characters inside the loop repeated each dictionary lookup
# about `sequence_length` times; slicing the pre-encoded list gives the same
# X_data / y_data with a single lookup per character.
encoded_text = [char_to_int[char] for char in text]

X_data = []  # one list of `sequence_length` character indices per sample
y_data = []  # index of the character that follows each window

for start in range(len(encoded_text) - sequence_length):
    end = start + sequence_length
    X_data.append(encoded_text[start:end])
    y_data.append(encoded_text[end])

In [None]:
# Vocabulary size: the number of distinct characters found in `text`.
print(len(chars))

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.regularizers import l2
from keras.callbacks import EarlyStopping

# Width of the learned vector that represents each character
embedding_dim = 50

# One input index / one output class per distinct character
vocab_size = len(chars)

# Character-level next-character predictor:
# embedding -> L2-regularised LSTM -> dropout -> softmax over the vocabulary
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length),
    LSTM(256, kernel_regularizer=l2(0.001), recurrent_regularizer=l2(0.001)),
    Dropout(0.2),
    Dense(vocab_size, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
import numpy as np
from keras.utils import to_categorical

# Every entry of X is a vocabulary index below len(chars), so int32 is ample.
# Being explicit avoids the platform default integer (int64 on Linux/Colab),
# which doubles the memory used by this (num_samples, sequence_length) matrix.
X = np.array(X_data, dtype=np.int32)

# One-hot targets, as required by the 'categorical_crossentropy' loss.
y = to_categorical(y_data, num_classes=len(chars))

split_index = int(0.8 * len(X))  # 80% of data for training

# Chronological split: the first 80% of windows train, the last 20% validate.
# Windows advance one character at a time, so the first validation windows
# still overlap the final training windows.
X_train, X_val = X[:split_index], X[split_index:]
y_train, y_val = y[:split_index], y[split_index:]

print(X.shape)
print(y.shape)

model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
)

In [None]:
# Persist the fitted model to the mounted Drive folder (file name ends in .h5).
model.save("/content/drive/MyDrive/Shakespear/shakespear_v3.h5")

In [None]:
def generate_text(model, char_to_int, int_to_char, seed_text, num_chars_to_generate=500,
                  sequence_length=100, temperature=0.5):
    """Extend `seed_text` one sampled character at a time.

    Args:
        model: object whose `predict(batch, verbose=0)` returns, for a
            `(1, sequence_length)` array of character indices, a batch whose
            first row holds one score per character index.
        char_to_int: mapping from character to integer index.
        int_to_char: mapping from integer index back to character.
        seed_text: text used to prime the model; every character must be a
            key of `char_to_int`.
        num_chars_to_generate: how many characters to append to the seed.
        sequence_length: number of indices fed to the model per step. It must
            match the window length the model was trained with (default 100).
        temperature: value forwarded to `sample` (default 0.5).

    Returns:
        `seed_text` followed by `num_chars_to_generate` generated characters.

    Raises:
        ValueError: if `sequence_length` is smaller than 1.
        KeyError: if `seed_text` contains a character missing from
            `char_to_int` (when that mapping is a plain dict).
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    # Keep only the most recent characters: a seed longer than the window
    # would otherwise be passed to the model at the wrong length.
    window = [char_to_int[char] for char in seed_text][-sequence_length:]

    # Left-pad short seeds with index 0 so the first input is full length.
    # This is needed only once, because the window keeps a constant size.
    window = [0] * (sequence_length - len(window)) + window

    pieces = [seed_text]
    for _ in range(num_chars_to_generate):
        # Shape (1, sequence_length): a batch containing the single window
        input_data = np.asarray(window).reshape(1, sequence_length)

        # Predict scores for the next character and sample an index from them
        prediction = model.predict(input_data, verbose=0)
        index = int(sample(prediction[0], temperature=temperature))
        pieces.append(int_to_char[index])

        # Slide the window: drop the oldest index, append the new one
        window = window[1:] + [index]

    return "".join(pieces)

def sample(preds, temperature=1.0):
    """Draw one index from `preds` after temperature rescaling.

    Args:
        preds: 1-D array-like of non-negative scores, one per index.
        temperature: divisor applied to the log-scores. Values below 1 sharpen
            the distribution and values above 1 flatten it. A temperature of
            exactly 0 selects the highest-scoring index deterministically
            instead of dividing by zero.

    Returns:
        The sampled index, as returned by `np.argmax`.
    """
    preds = np.asarray(preds).astype('float64')

    # The limit T -> 0+ is greedy decoding; handle it directly rather than
    # dividing by zero and feeding NaNs to the sampler.
    if temperature == 0:
        return np.argmax(preds)

    logits = np.log(preds + 1e-7) / temperature  # Adding a small value to avoid log(0)

    # Softmax is unchanged by subtracting a constant; shifting by the maximum
    # keeps np.exp from overflowing when the temperature is very small.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    # One multinomial trial yields a one-hot row; its argmax is the drawn index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
# Prime the model with a short phrase and print the generated continuation
seed_text = "to be or not to be"
output = generate_text(
    model=model,
    char_to_int=char_to_int,
    int_to_char=int_to_char,
    seed_text=seed_text,
)
print(output)

Additions

Temperature sampling: gave somewhat intelligible text.

Increased epochs from 10 to 20; best val loss = 1.5144.

Used `validation_data=(X_val, y_val)` instead of `validation_split=0.2`.