<a href="https://colab.research.google.com/github/mayur7garg/66DaysOfData/blob/main/Day%203/Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation using an LSTM neural network

## Imports

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM, Dense, Dropout, LeakyReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

## Constants

In [2]:
# Text files that make up the training corpus; each name is appended to BASE_PATH.
BOOKS = ['The Adventures of Sherlock Holmes by Arthur Conan Doyle.txt',
         'The Memoirs of Sherlock Holmes by Arthur Conan Doyle.txt',
         'The Return of Sherlock Holmes by Arthur Conan Doyle.txt']

# Directory holding the books. File paths are built by plain string
# concatenation, so the trailing slash is required.
BASE_PATH = r'../Data/Text/'
SEQ_LEN = 128               # characters per input window
RANDOM_STATE = 7            # seed shared by the train/validation split and NumPy
VAL_SIZE = 0.05             # fraction of sequences held out for validation
EPOCHS = 50                 # maximum number of training epochs
BATCH_SIZE = 512
LEARNING_RATE = 0.001       # learning rate passed to the Adam optimizer
EARLY_STOP_PATIENCE = 5     # patience (in epochs) given to EarlyStopping

# Seed NumPy's global generator so that the randomly sampled examples and
# prediction seeds drawn with np.random further down are the same on every
# Restart & Run All.
np.random.seed(RANDOM_STATE)

## Data Loading and Preparation

### Reading the text files and creating fixed-length sequences, with the next character as the target label

In [3]:
%%time

# Slide a SEQ_LEN-character window over every lower-cased book, one character
# at a time. Each window becomes an input; the character right after it is
# the target. Windows never span two books.
X = []
y = []

for title in BOOKS:
    with open(BASE_PATH + title, 'r', encoding='utf-8') as handle:
        text = handle.read().lower()

    window_starts = range(len(text) - SEQ_LEN)
    X.extend(text[start : start + SEQ_LEN] for start in window_starts)
    y.extend(text[start + SEQ_LEN] for start in window_starts)

# Show three randomly chosen (input, target) pairs as a sanity check.
for pick in np.random.randint(0, len(X), 3):
    print(f'Input: {X[pick]!r}')
    print(f'Output: {y[pick]}\n')

Input: 'resent a more dreadful record of sin than does the smiling and\nbeautiful countryside.”\n\n“you horrify me!”\n\n“but the reason is ve'
Output: r

Input: 'ord holdhurst, with a wry face.\n\n      “since nearly ten weeks have elapsed, then, and nothing has been\n      heard, it is not u'
Output: n

Input: 'your\ninferences.”\n\n“then, pray tell me what it is that you can infer from this hat?”\n\nhe picked it up and gazed at it in the pec'
Output: u

Wall time: 986 ms


In [4]:
# Number of input windows and of target characters (the two must match).
tuple(map(len, (X, y)))

(1863544, 1863544)

### Splitting the data for training and validation

In [5]:
# Hold out VAL_SIZE of the (input, target) pairs for validation.
# NOTE(review): the windows are built with a stride of one character, so a
# random split puts almost-identical sequences in both sets; the validation
# metrics are therefore likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = VAL_SIZE,
    random_state = RANDOM_STATE,
)

# Sizes of the four resulting lists.
tuple(len(part) for part in (X_train, y_train, X_test, y_test))

(1770366, 1770366, 93178, 93178)

### Fitting a tokenizer on training data to create integer encoded sequences

In [6]:
%%time

# Character-level tokenizer fitted on the training inputs and targets only,
# so the vocabulary is learned without looking at the validation texts.
tokenizer = Tokenizer(char_level = True)
tokenizer.fit_on_texts([*X_train, *y_train])

# Mapping from character to its integer index, and the vocabulary size.
char_index = tokenizer.word_index
char_count = len(char_index)

# The original message contained a stray '%s' placeholder that was printed
# literally; report the count inside the sentence instead.
print(f'Found {char_count} unique characters\n')

Found %s unique characters: 79

Wall time: 50.2 s


### Integer encoding and scaling the input sequences, and one-hot encoding the target labels

In [7]:
%%time

def _encode_inputs(texts):
    """Integer-encode ``texts`` and scale them into a (len(texts), SEQ_LEN, 1) float32 array.

    The array is created as float32 from the start and divided in place, which
    avoids the intermediate int64 and float64 copies of the full data set.
    """
    encoded = np.asarray(tokenizer.texts_to_sequences(texts), dtype = np.float32)
    encoded = encoded.reshape(len(texts), SEQ_LEN, 1)
    encoded /= char_count
    return encoded


def _encode_targets(chars):
    """One-hot encode the single-character targets in ``chars`` over ``char_count`` classes."""
    # NOTE(review): the reverse mapping printed later starts at index 1, so
    # class 0 looks unused and a target with index == char_count would not fit
    # into num_classes = char_count. Changing this also requires changing the
    # model's output layer - confirm before touching it.
    return tf.keras.utils.to_categorical(tokenizer.texts_to_sequences(chars), num_classes = char_count)


X_train_tokenized = _encode_inputs(X_train)
X_test_tokenized = _encode_inputs(X_test)

y_train_categorical = _encode_targets(y_train)
y_test_categorical = _encode_targets(y_test)

print(f"Shape of input data: \nTrain - {X_train_tokenized.shape}\nValidation - {X_test_tokenized.shape}\n")
print(f"Shape of output data: \nTrain - {y_train_categorical.shape}\nValidation - {y_test_categorical.shape}\n")

Shape of input data: 
Train - (1770366, 128, 1)
Validation - (93178, 128, 1)

Shape of output data: 
Train - (1770366, 79)
Validation - (93178, 79)

Wall time: 58.1 s


### Sample input and output data

In [8]:
# Display two randomly chosen encoded training examples together with their
# one-hot encoded targets.
sample_positions = np.random.randint(0, len(X_train_tokenized), 2)

for position in sample_positions:
    encoded_input = X_train_tokenized[position]
    encoded_target = y_train_categorical[position]
    print(f'Input: {encoded_input}')
    print(f'Output: {encoded_target}\n')

Input: [[0.17721519]
 [0.34177215]
 [0.05063291]
 [0.01265823]
 [0.10126582]
 [0.06329114]
 [0.16455696]
 [0.11392405]
 [0.02531646]
 [0.01265823]
 [0.06329114]
 [0.08860759]
 [0.01265823]
 [0.24050633]
 [0.07594937]
 [0.12658228]
 [0.02531646]
 [0.39240506]
 [0.35443038]
 [0.01265823]
 [0.05063291]
 [0.11392405]
 [0.32911392]
 [0.02531646]
 [0.13924051]
 [0.01265823]
 [0.29113924]
 [0.12658228]
 [0.05063291]
 [0.13924051]
 [0.11392405]
 [0.03797468]
 [0.12658228]
 [0.02531646]
 [0.02531646]
 [0.03797468]
 [0.01265823]
 [0.05063291]
 [0.11392405]
 [0.01265823]
 [0.03797468]
 [0.10126582]
 [0.02531646]
 [0.01265823]
 [0.03797468]
 [0.12658228]
 [0.05063291]
 [0.07594937]
 [0.08860759]
 [0.01265823]
 [0.11392405]
 [0.03797468]
 [0.02531646]
 [0.05063291]
 [0.18987342]
 [0.02531646]
 [0.13924051]
 [0.01265823]
 [0.06329114]
 [0.24050633]
 [0.24050633]
 [0.01265823]
 [0.05063291]
 [0.25316456]
 [0.05063291]
 [0.07594937]
 [0.08860759]
 [0.01265823]
 [0.06329114]
 [0.08860759]
 [0.17721519]

## Model

### Creating an LSTM model using tf.keras Sequential API

In [9]:
# Seed TensorFlow's global generator right before the layers are created so
# that weight initialisation and dropout masks do not change every time this
# cell is run. RANDOM_STATE was previously applied to the data split only.
tf.random.set_seed(RANDOM_STATE)

# Two stacked LSTM layers followed by a small dense head. The final softmax
# layer outputs one probability per class for the next character.
model = Sequential([
    # Input: SEQ_LEN time steps with a single scaled character index each.
    LSTM(512, return_sequences = True,  input_shape=(SEQ_LEN, 1)),
    Dropout(0.05),
    # Only the last hidden state of the second LSTM is passed on.
    LSTM(256),
    Dense(256, activation = LeakyReLU()),
    Dropout(0.05),
    Dense(128, activation = LeakyReLU()),
    Dense(char_count, activation = 'softmax')
    ], name = 'Text_Generation_Model')

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(optimizer = Adam(LEARNING_RATE), loss='categorical_crossentropy', metrics = ['categorical_accuracy'])
model.summary()

Model: "Text_Generation_Model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 128, 512)          1052672   
_________________________________________________________________
dropout (Dropout)            (None, 128, 512)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               787456    
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 79)      

### Training the model with EarlyStopping callback

In [10]:
%%time

# Stop once the validation loss has not improved for EARLY_STOP_PATIENCE
# epochs and roll the model back to its best weights.
early_stop = EarlyStopping(
    monitor = 'val_loss',
    patience = EARLY_STOP_PATIENCE,
    restore_best_weights = True,
)

# Train for at most EPOCHS epochs, validating on the held-out sequences.
history = model.fit(
    x = X_train_tokenized,
    y = y_train_categorical,
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,
    callbacks = [early_stop],
    validation_data = (X_test_tokenized, y_test_categorical),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Wall time: 20h 21min 58s


## Predictions

### Creating a reverse mapping dictionary for encoded integers

In [11]:
# Invert the tokenizer's character -> index mapping so that predicted class
# indices can be turned back into characters.
index_char = {ind: char for char, ind in char_index.items()}

# Print the characters with !r so that whitespace entries such as the newline
# and the tab show up as escape sequences instead of breaking the table.
for ind, char in index_char.items():
    print(f"{ind:8} : {char!r}")

       1 :  
       2 : e
       3 : t
       4 : a
       5 : o
       6 : i
       7 : n
       8 : h
       9 : s
      10 : r
      11 : d
      12 : l
      13 : u
      14 : 

      15 : m
      16 : w
      17 : c
      18 : y
      19 : f
      20 : g
      21 : ,
      22 : p
      23 : b
      24 : .
      25 : v
      26 : k
      27 : “
      28 : ”
      29 : ’
      30 : -
      31 : ?
      32 : x
      33 : j
      34 : q
      35 : ‘
      36 : !
      37 : —
      38 : z
      39 : _
      40 : ;
      41 : 1
      42 : :
      43 : 0
      44 : 8
      45 : 2
      46 : 3
      47 : *
      48 : )
      49 : (
      50 : 4
      51 : 5
      52 : 9
      53 : 6
      54 : /
      55 : 7
      56 : £
      57 : é
      58 : "
      59 : &
      60 : '
      61 : æ
      62 : 	
      63 : $
      64 : @
      65 : œ
      66 : [
      67 : º
      68 : ]
      69 : #
      70 : %
      71 : è
      72 : ・
      73 : â
      74 : à
      75 : ô
      76 : ï
      77 : î

### Definite prediction
For the seed string provided, the character with the highest predicted probability is selected. The predicted character is appended to the seed, the input window slides forward by one character, and the process repeats for a specified number of times.

In [21]:
def definite(seed, pred_count = 256):
    """Greedily extend ``seed`` one character at a time and print the result.

    At every step the last ``SEQ_LEN`` characters of the running text are
    encoded with the notebook-level ``tokenizer``, scaled by ``char_count``
    and passed to ``model``. The decodable character with the highest
    predicted probability is appended to the text.

    Args:
        seed: Starting text. Must hold at least ``SEQ_LEN`` characters, all of
            them known to ``tokenizer``.
        pred_count: Number of characters to generate.

    Raises:
        ValueError: If ``seed`` is shorter than ``SEQ_LEN`` or the current
            window contains characters missing from the tokenizer vocabulary.
    """
    if len(seed) < SEQ_LEN:
        raise ValueError(f"seed must contain at least {SEQ_LEN} characters, got {len(seed)}")

    print(f"Input:\n{seed}")

    text = seed
    # Class indices that can be mapped back to a character.
    decodable = np.array(sorted(index_char), dtype = int)

    for _ in range(pred_count):
        # Always feed the most recent SEQ_LEN characters, so seeds longer than
        # SEQ_LEN work as well.
        window = text[-SEQ_LEN:]

        # Wrap the window in a list: the tokenizer expects a list of texts.
        encoded = tokenizer.texts_to_sequences([window])[0]
        if len(encoded) != SEQ_LEN:
            raise ValueError("seed contains characters that are unknown to the tokenizer")

        input_data = np.reshape(encoded, (1, SEQ_LEN, 1)) / char_count
        pred_prob = model.predict(input_data, verbose = 0).reshape(-1)

        # Take the arg-max over decodable classes only; an index without an
        # entry in index_char would otherwise raise a KeyError.
        candidates = decodable[decodable < pred_prob.size]
        pred_index = candidates[np.argmax(pred_prob[candidates])]
        text += index_char[int(pred_index)]

    # Everything after the original seed is newly generated text.
    print(f"Output:\n{text[len(seed):]}")

# Use one randomly selected validation sequence as the seed text.
seed = X_test[np.random.randint(len(X_test))]
definite(seed)

Input:
he exception of his coat. his
boots, his socks, his hat, and his watch—all were there. there were no
signs of violence upon any 
Output:
attempt to the station and the station as
the bell-rope to the room and the station to the room which i had a bearing
to the room and the start of the corner of the colonel and the station
and the room was a little start of the corner of the corner of the



### Probabilistic prediction
For the seed string provided, the next character is sampled at random, weighted by the probabilities predicted by the model. The sampled character is appended to the seed, the input window slides forward by one character, and the process repeats for a specified number of times.

In [22]:
def probabilistic(seed, pred_count = 256):
    """Extend ``seed`` by sampling each next character and print the result.

    At every step the last ``SEQ_LEN`` characters of the running text are
    encoded with the notebook-level ``tokenizer``, scaled by ``char_count``
    and passed to ``model``. The next character is drawn at random from the
    decodable classes, weighted by their predicted probabilities.

    Args:
        seed: Starting text. Must hold at least ``SEQ_LEN`` characters, all of
            them known to ``tokenizer``.
        pred_count: Number of characters to generate.

    Raises:
        ValueError: If ``seed`` is shorter than ``SEQ_LEN`` or the current
            window contains characters missing from the tokenizer vocabulary.
    """
    if len(seed) < SEQ_LEN:
        raise ValueError(f"seed must contain at least {SEQ_LEN} characters, got {len(seed)}")

    print(f"Input:\n{seed}")

    text = seed
    # Class indices that can be mapped back to a character.
    decodable = np.array(sorted(index_char), dtype = int)

    for _ in range(pred_count):
        # Always feed the most recent SEQ_LEN characters, so seeds longer than
        # SEQ_LEN work as well.
        window = text[-SEQ_LEN:]

        # Wrap the window in a list: the tokenizer expects a list of texts.
        encoded = tokenizer.texts_to_sequences([window])[0]
        if len(encoded) != SEQ_LEN:
            raise ValueError("seed contains characters that are unknown to the tokenizer")

        input_data = np.reshape(encoded, (1, SEQ_LEN, 1)) / char_count

        # Work in float64: np.random.choice checks that the weights sum to 1
        # more strictly than float32 softmax output can guarantee.
        pred_prob = model.predict(input_data, verbose = 0).reshape(-1).astype(np.float64)

        # Sample among decodable classes only and renormalise their weights;
        # an index without an entry in index_char would raise a KeyError.
        candidates = decodable[decodable < pred_prob.size]
        weights = pred_prob[candidates]
        weights /= weights.sum()
        pred_index = np.random.choice(candidates, p = weights)
        text += index_char[int(pred_index)]

    # Everything after the original seed is newly generated text.
    print(f"Output:\n{text[len(seed):]}")

# Use one randomly selected validation sequence as the seed text.
seed = X_test[np.random.randint(len(X_test))]
probabilistic(seed)

Input:
 in not making an effort. look at this!” he held up a
      little note with a coat-of-arms upon the envelope. “that belongs
   
Output:
   it was. i intend the three-quarter headounde of cross—fest rustice
      of young mycersing evil repulbing more, epes an evidence. in it
      was a month went any adgancy of the breakfast that i soon come,
      i care me on so i asked me to fild a wea
