In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
### YOUR CODE HERE
from tensorflow.keras import regularizers
###
import tensorflow.keras.utils as ku 
import numpy as np
import keras
from keras import layers
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
tokenizer = Tokenizer()
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \
    -O /tmp/sonnets.txt
data = open('/tmp/sonnets.txt').read()

corpus = data.lower().split("\n")


tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# create input sequences using list of tokens
input_sequences = []
for line in corpus:
	token_list = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(token_list)):
		n_gram_sequence = token_list[:i+1]
		input_sequences.append(n_gram_sequence)

--2021-02-10 20:08:59--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.216.128, 173.194.217.128, 172.253.123.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.216.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93578 (91K) [text/plain]
Saving to: ‘/tmp/sonnets.txt’


2021-02-10 20:08:59 (63.8 MB/s) - ‘/tmp/sonnets.txt’ saved [93578/93578]



In [3]:
# Left-pad ('pre') every n-gram sequence to the length of the longest one.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

# Every column but the last is the model input; the last column is the word
# to predict, one-hot encoded over the whole vocabulary.
predictors = input_sequences[:, :-1]
label = ku.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [4]:

# Next-word prediction network: embedding -> bidirectional LSTM -> LSTM ->
# softmax over the vocabulary. All layers come from the `tensorflow.keras`
# imports at the top of the notebook so a single Keras implementation is used
# for the whole model.
model = Sequential()
# Each input is a padded sequence of max_sequence_len-1 token ids.
model.add(Embedding(total_words, 300, input_length=max_sequence_len-1))
# return_sequences=True keeps the per-timestep outputs for the next LSTM.
model.add(Bidirectional(LSTM(512, return_sequences=True)))
model.add(LSTM(128))
# One output unit per vocabulary entry, matching the one-hot labels.
model.add(Dense(total_words, activation='softmax'))
# `metrics` is passed as a list, which is the documented form of the argument.
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])


In [5]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 300)           963300    
_________________________________________________________________
bidirectional (Bidirectional (None, 10, 1024)          3330048   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               590336    
_________________________________________________________________
dense (Dense)                (None, 3211)              414219    
Total params: 5,297,903
Trainable params: 5,297,903
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Where the best model seen so far is written during training.
address = "./kaggle/working/weights-improvement.hdf5"

# Stop once validation accuracy has gone 5 epochs without improving.
stop = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=5,
    verbose=1,
    mode='auto',
)

# Save to `address` only when validation accuracy improves.
save = ModelCheckpoint(
    address,
    monitor='val_accuracy',
    verbose=0,
    save_best_only=True,
)

callbacks = [stop, save]

# Train for up to 50 epochs, holding out 20% of the samples for validation.
history = model.fit(
    predictors,
    label,
    validation_split=0.20,
    epochs=50,
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 00026: early stopping


Although using val_accuracy does not make much sense in this context, I am using it to get a sense of how good the model architecture really is. The point to remember is that we do not need the exact words when generating the next words for an unknown text. We want the words to be similar and appropriate and, most of all, interesting in the context provided by an (unknown) seed text. Increasing the Embedding dimension let training run for 26 epochs before early stopping was triggered.