In [None]:
############################################################
#                ----   NLP    ----
#           Helper functions for Deep Learning experiments
#  Following the course at Udemy
#  https://www.udemy.com/course/tensorflow-developer-certificate-machine-learning-zero-to-mastery/
#                ------------------
############################################################
import requests
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.utils as kerasutils
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam



######################################## Getting text corpus

# usage:
#           text, average_words_number=get_corpus(url="https://www.gutenberg.org/cache/epub/38572/pg38572.txt",
#                   get_part=True, start_phrase="LOVE SONNETS OF AN",
#                   end_phrase="_Now in Press_" )
def get_corpus(url, get_part=True, start_phrase="", end_phrase=""):
    """
    Downloads a text document and splits it into stripped lines.
    :param url: Link to the text file
    :param get_part: when True, only the text located between start_phrase and end_phrase is kept
    :param start_phrase: the kept part begins at the first occurrence of this phrase (the phrase
                         itself is kept); when empty or not found, the part begins at the document start
    :param end_phrase: the kept part ends right before the last occurrence of this phrase;
                       when empty or not found, the part runs to the document end
    :return: a list of stripped lines and the rounded average number of words per line,
             or False when the document could not be downloaded.
    """
    try:
        text = requests.get(url).text
    except requests.RequestException:
        print("Can not load the document at: " + str(url))
        return False

    if get_part:
        start = text.find(start_phrase)  # skip header
        end = text.rfind(end_phrase)  # skip extra text at the end
        if start == -1:
            # Start phrase is missing: keep the text from the very beginning
            start = 0
        if end == -1 or end < start:
            # End phrase is missing (or located before the start): keep the text up to the end
            end = len(text)
        text = text[start:end]

    text = text.strip()

    # Split text on carriage returns; the leftover new line characters
    # are removed by the strip() call on every line below
    lines = [line.strip() for line in text.split('\r')]

    # str.split() always returns at least one element, so the division is safe
    total_words = sum(len(line.split()) for line in lines)
    average_number_of_words_in_line = round(total_words / len(lines))
    return lines, average_number_of_words_in_line


######################################## Tokenizing text
def create_tokenizer(text):
    """
    Fits a Keras Tokenizer on the corpus and reports the vocabulary size.
    :param text: a text corpus, extracted and preprocessed with get_corpus()
    :return: the fitted tokenizer and the vocabulary length
             (number of entries in tokenizer.word_index plus one)
    """
    # The filter string below does not contain the characters . , ; : !
    # so they stay attached to the words, which preserves punctuation
    # to a certain extent.
    characters_to_drop = '"#$%&()*+-/<=>?@[\\]^_`{|}~\t\n'
    fitted_tokenizer = Tokenizer(filters=characters_to_drop)
    fitted_tokenizer.fit_on_texts(text)

    # One more than the number of known words
    return fitted_tokenizer, 1 + len(fitted_tokenizer.word_index)


######################################## Padding sequences

def pack_sequences(text, tokenizer, total_words_number):
    """
    Turns the lines of a corpus into padded n-gram sequences for the next-word prediction task.
    :param text: Text strings (lines of the corpus)
    :param tokenizer: tokenizer used to convert every line into a list of tokens
    :param total_words_number: number of classes used for one-hot encoding of the labels
    :return: maximum length of sequences, predictors and labels
    """
    # Every line contributes all of its prefixes that are at least two tokens long
    encoded_lines = (tokenizer.texts_to_sequences([line])[0] for line in text)
    prefixes = [
        tokens[:stop]
        for tokens in encoded_lines
        for stop in range(2, len(tokens) + 1)
    ]

    # Left-pad every prefix up to the length of the longest one
    max_sequence_len = max(map(len, prefixes))
    padded = np.array(pad_sequences(prefixes, maxlen=max_sequence_len, padding='pre'))

    # The last token of each sequence is the label, everything before it is the predictor
    predictors = padded[:, :-1]
    labels = kerasutils.to_categorical(padded[:, -1], num_classes=total_words_number)
    return max_sequence_len, predictors, labels


######################################## Create Keras Sequential model with word embeddings
def create_model(vocabulary_length, sequence_length):
    """
    Builds and compiles the next-word prediction model.
    :param vocabulary_length: size of the embedding input and of the softmax output layer
    :param sequence_length: length of the padded sequences; the model input is one token shorter
    :return: compiled Keras Sequential model
    """
    layers = [
        Embedding(input_dim=vocabulary_length, output_dim=100, input_length=sequence_length - 1),
        Bidirectional(LSTM(150, return_sequences=False)),
        Dense(vocabulary_length, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return model

def write_poem(model, tokenizer, max_sequence_length, seed_text="The Moon and Sun", next_words=6, paragraphs=3):
    """
    Uses fitted text generating Keras Sequential model to write a poem.
    The poem is printed and returned.
    :param model: Keras sequential model, fitted to a text corpus
    :param tokenizer: Tokenizer the model was fitted with
    :param max_sequence_length: Maximum length of text sequences
    :param seed_text: a text string to start poem generation; it is also used as the poem title
    :param next_words: Number of words in a line, and also the number of lines in a paragraph
    :param paragraphs: Number of paragraphs in the generated poem
    :return: text of the generated poem
    """
    poem = seed_text.capitalize() + "\n\n"
    while paragraphs > 0:
        paragraph = ""
        # Keeps the seed unchanged when no line is generated (next_words <= 0)
        sentence = seed_text
        for _ in range(next_words):
            sentence = "\n"
            output_word = ""
            for _ in range(next_words):
                token_list = tokenizer.texts_to_sequences([seed_text])[0]
                token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
                probabilities = model.predict(token_list)
                predicted_index = int(np.ravel(np.argmax(probabilities, axis=-1))[0])
                # index_word is the reverse of word_index; unknown indices
                # (such as the padding index 0) give an empty word
                output_word = tokenizer.index_word.get(predicted_index, "")
                seed_text += " " + output_word
                sentence += " " + output_word
            paragraph += sentence.strip().capitalize() + "\n"
            # The next line is seeded with the last generated word only
            seed_text = output_word
        # The next paragraph is seeded with the last generated line
        seed_text = sentence
        poem += paragraph + "\n"
        paragraphs -= 1

    print(poem)
    return poem

In [None]:
# Download the text corpus and split it into lines
text, average_words_number = get_corpus(
    url="https://www.gutenberg.org/cache/epub/45470/pg45470.txt",
    get_part=True,
    start_phrase="THE SHINING HOURS",
    end_phrase="End of the Project Gutenberg EBook",
)


In [None]:
text[:10]

['\ufeffThe Project Gutenberg EBook of The Love Poems, by Émile Verhaeren',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included',
 'with this eBook or online at www.gutenberg.org/license',
 '',
 '',
 'Title: The Love Poems',
 "(From Les Heures claires, Les Heures d'après-midi, Les Heures du Soir)"]

In [None]:
average_words_number

6

In [None]:
# Tokenizing the extracted text: fit the tokenizer on the corpus lines
# and keep the vocabulary length for the model's embedding and output layers
tokenizer, vocabulary_length =  create_tokenizer(text)


In [None]:
print(vocabulary_length)

3714


In [None]:
# Build padded n-gram sequences from the corpus lines;
# predictors are the sequences without their last token, labels are the one-hot encoded last tokens
sequence_length, predictors, labels = pack_sequences(text, tokenizer, vocabulary_length)


In [None]:
sequence_length

15

In [None]:
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

# Poem Writing

In [None]:

# Create and compile the poem generating model
poems = create_model(vocabulary_length, sequence_length)


In [None]:
# Print the model summary
# summary() prints the table itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output
poems.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 14, 100)           371400    
                                                                 
 bidirectional (Bidirectiona  (None, 300)              301200    
 l)                                                              
                                                                 
 dense (Dense)               (None, 3714)              1117914   
                                                                 
Total params: 1,790,514
Trainable params: 1,790,514
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
!wget https://raw.githubusercontent.com/edaehn/deep_learning_notebooks/main/helpers.py

--2022-07-31 11:10:44--  https://raw.githubusercontent.com/edaehn/deep_learning_notebooks/main/helpers.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 38519 (38K) [text/plain]
Saving to: ‘helpers.py’


2022-07-31 11:10:44 (11.3 MB/s) - ‘helpers.py’ saved [38519/38519]



In [None]:
from helpers import create_early_stopping_callback

In [None]:
# Train the model on the n-gram predictors and their one-hot labels.
# Training is limited to 57 epochs; the callback imported from helpers.py
# may stop it earlier (its stopping criterion is defined in helpers.py).
history = poems.fit(predictors, 
                    labels, 
                    epochs=57, 
                    callbacks=[create_early_stopping_callback()],
                    verbose=1)

Epoch 1/57
Epoch 2/57
Epoch 3/57
Epoch 4/57
Epoch 5/57
Epoch 6/57
Epoch 7/57
Epoch 8/57
Epoch 9/57
Epoch 10/57
Epoch 11/57
Epoch 12/57
Epoch 13/57
Epoch 14/57
Epoch 15/57
Epoch 16/57
Epoch 17/57
Epoch 18/57
Epoch 19/57
Epoch 20/57
Epoch 21/57
Epoch 22/57
Epoch 23/57
Epoch 24/57
Epoch 25/57
Epoch 26/57
Epoch 27/57
Epoch 28/57
Epoch 29/57
Epoch 30/57
Epoch 31/57
Epoch 32/57
Epoch 33/57
Epoch 34/57
Epoch 35/57
Epoch 36/57
Epoch 37/57
Epoch 38/57
Epoch 39/57
Epoch 40/57
Epoch 41/57
Epoch 42/57
Epoch 43/57
Epoch 44/57
Epoch 45/57
Epoch 46/57
Epoch 47/57
Epoch 48/57
Epoch 49/57
Epoch 49: early stopping


In [None]:
# Save the fitted model under the local "poems" path
poems.save("poems")



INFO:tensorflow:Assets written to: poems/assets


INFO:tensorflow:Assets written to: poems/assets


In [None]:
 !zip -r poems.zip poems

  adding: poems/ (stored 0%)
  adding: poems/keras_metadata.pb (deflated 89%)
  adding: poems/variables/ (stored 0%)
  adding: poems/variables/variables.index (deflated 64%)
  adding: poems/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: poems/assets/ (stored 0%)
  adding: poems/saved_model.pb (deflated 91%)


In [None]:
# Model Prediction on Layers

In [None]:
# Model layers
poems.layers

[<keras.layers.embeddings.Embedding at 0x7fe20511a650>,
 <keras.layers.wrappers.Bidirectional at 0x7fe205120c90>,
 <keras.layers.core.dense.Dense at 0x7fe2011fab90>]

In [None]:
from keras.models import Model

# Text for predictions
seed_text = "Call me"

def preprocess(seed_text, maxlen=None):
    """
    Converts a text string into a padded token sequence the model can consume.
    :param seed_text: text to convert
    :param maxlen: length of the padded sequence; defaults to sequence_length - 1,
                   the input length the model was created with
    :return: padded token sequences as returned by pad_sequences() for a single text
    """
    if maxlen is None:
        # Same input length as used in create_model() and write_poem()
        maxlen = sequence_length - 1
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    return pad_sequences([token_list], maxlen=maxlen, padding='pre')


print("Model outputs on each layer")

# For every layer, build a sub-model that ends at this layer and show its output for the seed text
for i, layer in enumerate(poems.layers):
    model = Model(poems.layers[0].input, layer.output)
    output = model.predict(preprocess(seed_text))
    print(f"======= {i}: {layer} ========")
    print(output)

Model outputs on each layer
[[[-0.02469707  0.07959411  0.06904098 ... -0.10707945 -0.01648859
   -0.06807811]
  [-0.02469707  0.07959411  0.06904098 ... -0.10707945 -0.01648859
   -0.06807811]
  [-0.02469707  0.07959411  0.06904098 ... -0.10707945 -0.01648859
   -0.06807811]
  ...
  [-0.02469707  0.07959411  0.06904098 ... -0.10707945 -0.01648859
   -0.06807811]
  [-0.04088492 -0.03654863 -0.04958899 ... -0.02871312 -0.04795657
    0.03860951]
  [ 0.12126409  0.04980833  0.04991504 ... -0.10262618  0.04374539
    0.04749   ]]]
[[ 4.46121156e-01  8.96401405e-01 -6.79003716e-01  9.80030835e-01
  -8.23857725e-01  1.31664574e-02  3.99768203e-02  1.13104582e-02
   4.80327010e-03  7.81018376e-01 -1.14573717e-01  3.75620127e-02
  -5.36459982e-02 -7.54609883e-01  4.04699445e-02  1.86996758e-01
  -8.20412874e-01  7.18474090e-01 -1.91462815e-01 -8.11813474e-01
  -7.28182375e-01  8.33959222e-01  4.20906059e-02  2.05678828e-02
  -2.29923278e-01  2.36079752e-01 -6.49458349e-01  4.14602518e-01
  -2

In [None]:

def next_word(predicted):
    """
    Returns the word with the highest predicted probability.
    :param predicted: model output with word probabilities along the last axis
    :return: the most probable word, or an empty string when its index is not in the tokenizer vocabulary
    """
    # ravel() makes the lookup work both for a single probability vector
    # and for a batch holding one prediction
    best_index = int(np.ravel(np.argmax(predicted, axis=-1))[0])
    # index_word is the reverse of word_index, so no scan over the vocabulary is needed
    return tokenizer.index_word.get(best_index, "")

def predict_next_word(seed_text):
    """
    Predicts the word that follows the given text with the fitted poems model.
    :param seed_text: text to continue
    :return: the predicted next word as returned by next_word()
    """
    model_input = preprocess(seed_text)
    probabilities = poems.predict(model_input)
    return next_word(probabilities)

In [None]:
output

array([[4.7531819e-07, 2.4485288e-02, 4.9533878e-02, ..., 3.7681966e-05,
        3.7684080e-07, 1.6513543e-05]], dtype=float32)

In [None]:
# Predict a single word following the seed text
seed_text = "Call me"
# NOTE(review): the name `next` shadows the built-in next() for the rest of the session;
# it is kept here because a later cell prints this variable
next = predict_next_word(seed_text)
print(next)

than


In [None]:
print(next)

and


In [None]:
# Extend the seed text word by word, printing the growing text after every prediction
for _ in range(7):
    # A descriptive name is used instead of `next` to avoid shadowing the built-in next()
    predicted_word = predict_next_word(seed_text)
    seed_text = seed_text + " " + predicted_word
    print(seed_text)

Call me than
Call me than was
Call me than was as
Call me than was as our
Call me than was as our love.
Call me than was as our love. house
Call me than was as our love. house in


In [None]:
# Generate poetry
# sequence_length is the value returned by pack_sequences(), the same one the model was created with
write_poem(poems, tokenizer, sequence_length, seed_text="Shine in the darkness", next_words=5, paragraphs=3)


Shine in the darkness

And the surface other and
The garden and the orchard.
As creation of derivative works,
And the medium on which
To gaze on the day

Possessed with flames on the
Same terrors, the same happinesses,
The days of our paths;
The day when i had
You not, each hour of

The deep and chilled they
Are there beneath their roof,
Monstrous than the divine hour
Is unique and sanctified with
This ebook or online at




'Shine in the darkness\n\nAnd the surface other and\nThe garden and the orchard.\nAs creation of derivative works,\nAnd the medium on which\nTo gaze on the day\n\nPossessed with flames on the\nSame terrors, the same happinesses,\nThe days of our paths;\nThe day when i had\nYou not, each hour of\n\nThe deep and chilled they\nAre there beneath their roof,\nMonstrous than the divine hour\nIs unique and sanctified with\nThis ebook or online at\n\n'

In [None]:
# Generate poetry
# sequence_length is the value returned by pack_sequences(), the same one the model was created with
write_poem(poems, tokenizer, sequence_length, seed_text="Winter Rose", next_words=4, paragraphs=3)


Winter rose

On measure your radiant
Pond, the goldfish like
Those who go towards
Its chair of weariness.

Girt with roses that
The happiness that hovers
Because of our thoughts.
As creation of derivative

Works, reports, performances and
The garden and the
Same terrors, the same
Branch that suspends and




'Winter rose\n\nOn measure your radiant\nPond, the goldfish like\nThose who go towards\nIts chair of weariness.\n\nGirt with roses that\nThe happiness that hovers\nBecause of our thoughts.\nAs creation of derivative\n\nWorks, reports, performances and\nThe garden and the\nSame terrors, the same\nBranch that suspends and\n\n'