<a href="https://colab.research.google.com/github/mayur7garg/66DaysOfData/blob/main/Day%203/Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM, Dense, Dropout, LeakyReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# --- Data ------------------------------------------------------------------
# Plain-text books that make up the corpus; each name is appended to BASE_PATH.
BOOKS = [
    'The Adventures of Sherlock Holmes by Arthur Conan Doyle.txt',
    'The Memoirs of Sherlock Holmes by Arthur Conan Doyle.txt',
    'The Return of Sherlock Holmes by Arthur Conan Doyle.txt',
]
BASE_PATH = r'/content/'    # directory prefix the book files are read from

# --- Sampling --------------------------------------------------------------
SEQ_LEN = 128               # characters per input window
VAL_SIZE = 0.05             # fraction of windows held out for validation
RANDOM_STATE = 7            # seed for the train/validation split

# --- Training --------------------------------------------------------------
EPOCHS = 50                 # upper bound on epochs (early stopping may end sooner)
BATCH_SIZE = 512
LEARNING_RATE = 0.01        # passed to the Adam optimizer
EARLY_STOP_PATIENCE = 5     # epochs without val_loss improvement before stopping

In [None]:
%%time

X = []
y = []

for book in BOOKS:
    with open(BASE_PATH + book, 'r') as book_file:
        book_data = book_file.read().lower()
        char_len = len(book_data)

        for i in range(0, char_len - SEQ_LEN):
            X.append(book_data[i : i + SEQ_LEN])
            y.append(book_data[i + SEQ_LEN])

for i in np.random.randint(0, len(X), 3):
    print(f'Input: {X[i]!r}')
    print(f'Output: {y[i]}\n')

In [None]:
# Sanity check: the number of input windows and the number of targets
# (displayed as a 2-tuple) should match.
tuple(map(len, (X, y)))

In [None]:
# Randomly hold out VAL_SIZE of the (window, next-char) pairs for validation.
# NOTE(review): windows start at every character offset, so neighbouring
# windows overlap almost entirely; a random split therefore places
# near-duplicates on both sides and the validation loss will look optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = VAL_SIZE,
    random_state = RANDOM_STATE,
)

# Cell output: sizes of (X_train, y_train, X_test, y_test).
tuple(len(split) for split in (X_train, y_train, X_test, y_test))

In [None]:
%%time

tokenizer = Tokenizer(char_level = True)
tokenizer.fit_on_texts([*X_train, *y_train])
char_index = tokenizer.word_index
char_count = len(char_index)
print(f'Found %s unique characters: {char_count}\n')

In [None]:
%%time

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_train_tokenized = np.reshape(X_train_tokenized, (len(X_train_tokenized), SEQ_LEN, 1))
X_train_tokenized = X_train_tokenized/char_count

X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_test_tokenized = np.reshape(X_test_tokenized, (len(X_test_tokenized), SEQ_LEN, 1))
X_test_tokenized = X_test_tokenized/char_count

y_train_categorical = tf.keras.utils.to_categorical(tokenizer.texts_to_sequences(y_train), num_classes = char_count)
y_test_categorical = tf.keras.utils.to_categorical(tokenizer.texts_to_sequences(y_test), num_classes = char_count)

print(f"Shape of input data: \nTrain - {X_train_tokenized.shape}\nValidation - {X_test_tokenized.shape}\n")
print(f"Shape of output data: \nTrain - {y_train_categorical.shape}\nValidation - {y_test_categorical.shape}\n")

In [None]:
# Print two randomly chosen encoded samples: the scaled input window and the
# one-hot target vector that goes with it.
preview_rows = np.random.randint(0, len(X_train_tokenized), 2)

for row in preview_rows:
    encoded_input = X_train_tokenized[row]
    encoded_target = y_train_categorical[row]
    print(f'Input: {encoded_input}')
    print(f'Output: {encoded_target}\n')

In [None]:
# Next-character classifier. Three stacked LSTMs read the (SEQ_LEN, 1) window;
# the first two pass their full output sequence on, the third returns only its
# final output. Two LeakyReLU dense layers with light dropout follow, and a
# softmax layer of width char_count produces the class probabilities.
layer_stack = [
    LSTM(512, input_shape = (SEQ_LEN, 1), return_sequences = True),
    LSTM(512, return_sequences = True),
    LSTM(256, return_sequences = False),
    Dropout(0.1),
    Dense(256, activation = LeakyReLU()),
    Dense(128, activation = LeakyReLU()),
    Dropout(0.1),
    Dense(char_count, activation = 'softmax'),
]
model = Sequential(layer_stack, name = 'Text_Generation_Model')

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss = 'categorical_crossentropy', optimizer = Adam(LEARNING_RATE))
model.summary()

In [None]:
%%time

early_stop = EarlyStopping(monitor = 'val_loss', patience = EARLY_STOP_PATIENCE, restore_best_weights = True)
history = model.fit(
    X_train_tokenized, 
    y_train_categorical, 
    epochs = EPOCHS, 
    batch_size = BATCH_SIZE, 
    validation_data = (X_test_tokenized, y_test_categorical),
    callbacks = [early_stop]
    )