<a href="https://colab.research.google.com/github/chandhinipj/blake-poems-DeepLearning-NLP/blob/main/DLASSGMNT4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [36]:
# import python libraries
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from random import randint
import re

In [12]:
import nltk   # natural language tool kit library
nltk.download('gutenberg')  # downloads a library that NLTK uses

from nltk.corpus import gutenberg as gut  # downloads the gutenberg dataset
print(gut.fileids())    # prints the name of the files in the dataset

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [13]:
# get the book text
book_text = nltk.corpus.gutenberg.raw('blake-poems.txt')


In [None]:
print(book_text)

# Data preprocessing

In [14]:
# Data preprocessing
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence.lower()
book_text = preprocess_text(book_text)

In [15]:
print(book_text)

 poems by william blake songs of innocence and of experience and the book of thel songs of innocence introduction piping down the valleys wild piping songs of pleasant glee on cloud saw child and he laughing said to me pipe song about lamb so piped with merry cheer piper pipe that song again so piped he wept to hear drop thy pipe thy happy pipe sing thy songs of happy cheer so sang the same again while he wept with joy to hear piper sit thee down and write in book that all may read so he vanish from my sight and pluck a hollow reed and made rural pen and stain the water clear and wrote my happy songs every child may joy to hear the shepherd how sweet is the shepherd sweet lot from the morn to the evening he stays he shall follow his sheep all the day and his tongue shall be filled with praise for he hears the lambs innocent call and he hears the ewes tender reply he is watching while they are in peace for they know when their shepherd is nigh the echoing green the sun does arise and ma

In [11]:
# book_text = book_text[:5000]  # limit text to 5000, just for this exercise

In [17]:
len(book_text)

34028

In [16]:
# convert words to numbers
from nltk.tokenize import word_tokenize
nltk.download('punkt')
book_text_words = (word_tokenize(book_text))
n_words = len(book_text_words)
unique_words = len(set(book_text_words))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [20]:
len(book_text_words)

6590

In [23]:
print(book_text_words)

['poems', 'by', 'william', 'blake', 'songs', 'of', 'innocence', 'and', 'of', 'experience', 'and', 'the', 'book', 'of', 'thel', 'songs', 'of', 'innocence', 'introduction', 'piping', 'down', 'the', 'valleys', 'wild', 'piping', 'songs', 'of', 'pleasant', 'glee', 'on', 'cloud', 'saw', 'child', 'and', 'he', 'laughing', 'said', 'to', 'me', 'pipe', 'song', 'about', 'lamb', 'so', 'piped', 'with', 'merry', 'cheer', 'piper', 'pipe', 'that', 'song', 'again', 'so', 'piped', 'he', 'wept', 'to', 'hear', 'drop', 'thy', 'pipe', 'thy', 'happy', 'pipe', 'sing', 'thy', 'songs', 'of', 'happy', 'cheer', 'so', 'sang', 'the', 'same', 'again', 'while', 'he', 'wept', 'with', 'joy', 'to', 'hear', 'piper', 'sit', 'thee', 'down', 'and', 'write', 'in', 'book', 'that', 'all', 'may', 'read', 'so', 'he', 'vanish', 'from', 'my', 'sight', 'and', 'pluck', 'a', 'hollow', 'reed', 'and', 'made', 'rural', 'pen', 'and', 'stain', 'the', 'water', 'clear', 'and', 'wrote', 'my', 'happy', 'songs', 'every', 'child', 'may', 'joy', 

In [21]:
len(set(book_text_words))

1505

In [25]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=unique_words)
tokenizer.fit_on_texts(book_text_words)

vocab_size = len(tokenizer.word_index) + 1    # word_index is the dictionary. Store the number of unique words in vocab_size variable
word_2_index = tokenizer.word_index           # store the dictionary in the variable called word_2_index

In [27]:
# Create the input sequences
input_sequence_words = []  # input sequences in words (used for metric evaluation later on)
input_sequence = []   # empty list to hold the sequences that will be input into our model
output_words = []     # empty list to hold the output words
input_seq_length = 25  # length of the input sequence
for i in range(0, n_words - input_seq_length , 1):
    in_seq = book_text_words[i:i + input_seq_length]
    input_sequence_words.append(in_seq)
    out_seq = book_text_words[i + input_seq_length]
    input_sequence.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

In [33]:
print(output_words)

[164, 3, 226, 710, 12, 73, 165, 54, 2, 9, 227, 143, 5, 21, 278, 112, 711, 74, 45, 461, 7, 122, 462, 463, 278, 16, 112, 350, 45, 461, 9, 123, 5, 64, 712, 27, 278, 27, 50, 278, 98, 27, 164, 3, 50, 462, 45, 464, 1, 713, 350, 81, 9, 123, 7, 37, 5, 64, 463, 89, 15, 49, 2, 714, 4, 189, 16, 17, 190, 465, 45, 9, 279, 25, 6, 466, 2, 715, 280, 351, 716, 2, 191, 717, 718, 2, 467, 1, 352, 719, 2, 720, 6, 50, 164, 58, 54, 190, 37, 5, 64, 1, 353, 82, 32, 10, 1, 353, 32, 468, 25, 1, 144, 5, 1, 228, 9, 721, 9, 51, 354, 8, 281, 17, 1, 52, 2, 8, 282, 51, 38, 192, 7, 469, 26, 9, 283, 1, 229, 284, 230, 2, 9, 283, 1, 722, 124, 723, 9, 10, 724, 81, 13, 39, 4, 125, 26, 13, 99, 33, 22, 353, 10, 470, 1, 355, 65, 1, 100, 55, 126, 2, 231, 50, 1, 285, 1, 122, 471, 725, 5, 286, 1, 145, 1, 472, 2, 726, 1, 101, 3, 1, 727, 98, 728, 232, 5, 1, 471, 729, 473, 81, 43, 474, 51, 38, 193, 12, 1, 355, 65, 194, 730, 7, 90, 195, 55, 196, 41, 91, 475, 287, 1, 731, 146, 1, 194, 732, 13, 196, 166, 43, 127, 2, 233, 13, 17, 167, 1

In [28]:
# reshape the input sequences to be 3-dimensional
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))

# Normalise the data by dividing by the max number of unique words (the vocab size)
X = X / float(vocab_size)

# one-hot encode the output words so that they can be used by the model (converts the output to 2-dimensions)
y = to_categorical(output_words)

In [29]:
X

array([[[4.70783533e-01],
        [2.65604250e-02],
        [3.04780876e-01],
        ...,
        [1.83930943e-01],
        [8.03452855e-02],
        [2.31739708e-01]],

       [[2.65604250e-02],
        [3.04780876e-01],
        [1.83266932e-01],
        ...,
        [8.03452855e-02],
        [2.31739708e-01],
        [1.08897742e-01]],

       [[3.04780876e-01],
        [1.83266932e-01],
        [1.08897742e-01],
        ...,
        [2.31739708e-01],
        [1.08897742e-01],
        [1.99203187e-03]],

       ...,

       [[9.96679947e-01],
        [1.99203187e-03],
        [9.97343958e-01],
        ...,
        [8.89774236e-02],
        [1.65338645e-01],
        [6.64010624e-04]],

       [[1.99203187e-03],
        [9.97343958e-01],
        [7.96812749e-03],
        ...,
        [1.65338645e-01],
        [6.64010624e-04],
        [8.63213811e-02]],

       [[9.97343958e-01],
        [7.96812749e-03],
        [6.64010624e-04],
        ...,
        [6.64010624e-04],
        [8.6321

In [31]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

# Generating LSTM Model

In [37]:
# create, compile and fit the model
model = Sequential()
model.add(LSTM(800, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=64, epochs=100, verbose=1)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 800)               2566400   
                                                                 
 dropout (Dropout)           (None, 800)               0         
                                                                 
 dense_1 (Dense)             (None, 1506)              1206306   
                                                                 
Total params: 3,772,706
Trainable params: 3,772,706
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch

<keras.callbacks.History at 0x7f4351051220>

# Making Prediction

In [38]:
# Make Predictions
random_seq_index = np.random.randint(0, len(input_sequence)-1)    # select a random number from within the range of the number of input sequences
random_seq = input_sequence[random_seq_index]                     # get the input sequence that occurs at the randomly selected index (this is a list of integers)

index_2_word = dict(map(reversed, word_2_index.items())) # convert the integer sequence to its words
seed_word_sequence = [index_2_word[value] for value in random_seq]  # get the list of words that correspond to the integers in the randomly picked sequence

# join the words in the list and print the sequence of words
print(' '.join(seed_word_sequence))  # this prints the words from the randomly picked sequence that will be the seed for our prediction


it possible to thought greater than itself to know and father how can love you or any of my brothers more love you like the


In [39]:
# Predict next 25 words
word_sequence = []
for i in range(25):
    int_sample = np.reshape(random_seq, (1, len(random_seq), 1))    # reshape to make 3-D input (1 sequence, length of the sequence, 1 because the first LSTM requires another dimension)
    int_sample = int_sample / float(vocab_size)                     # normalise (as we normalised the training data)

    predicted_word_index = model.predict(int_sample, verbose=0)     # predict the next word.  An array of the probabilities for each word in the vocab is returned.
    predicted_word_id = np.argmax(predicted_word_index)             # get the index of the maximum value (they are categorical so the max value gives the word in the vocab with the highest probability)
    word_sequence.append(index_2_word[ predicted_word_id])          # get the predicted word by finding the word at the predicted index and add it to our predicted word sequence list

    random_seq.append(predicted_word_id)                            # append the predicted word index to the next seuqence to be input into the model predict method
    random_seq = random_seq[1:len(random_seq)]                      # remove the first element of the sequence so it now has the new word but is the same length.


In [40]:
# BLEU score
seq = [' '.join(w) for w in input_sequence_words]
from nltk.translate.bleu_score import sentence_bleu
reference = seq
candidate = ' '.join(word_sequence) # make the list of words into a string
score = sentence_bleu(reference, candidate)
print('Seed word sequence: %s'%(' '.join(seed_word_sequence)))
print('Predicted words: %s'%(candidate))
print('BLEU Score for predicted words: %s'%(score))

Seed word sequence: it possible to thought greater than itself to know and father how can love you or any of my brothers more love you like the
Predicted words: little bird that picks up crumbs around the door the priest sat by and heard the child in trembling zeal he seized his hair he
BLEU Score for predicted words: 1.0
