In [1]:
############################################################
#                ----   NLP    ----
#           Generating Poems with TensorFlow and Keras
#  Read more at: https://daehnhardt.com/blog/2022/07/11/tf-nlp
#  Following the course at Udemy
#  https://www.udemy.com/course/tensorflow-developer-certificate-machine-learning-zero-to-mastery/
#                ------------------
############################################################
import requests
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.utils as kerasutils
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam



######################################## Getting text corpus

# usage:
#           text, average_words_number=get_corpus(url="https://www.gutenberg.org/cache/epub/38572/pg38572.txt",
#                   get_part=True, start_phrase="LOVE SONNETS OF AN",
#                   end_phrase="_Now in Press_" )
def get_corpus(url, get_part=True, start_phrase="", end_phrase=""):
    """
    Extracts text from a file located at the provided web address.
    :param url: Link to the text file
    :param get_part: when True, we get only text located between start_phrase and end_phrase strings
    :param start_phrase:
    :param end_phrase:
    :return: a stripped text string without carriage returns, and the average number of words in line.
    """
    try:
        text = requests.get(url).text
    except:
        print("Can not load the document at: " + str(url))
        return False

    if get_part:
        start = text.find(start_phrase)  # skip header
        end = text.rfind(end_phrase)  # skip extra text at the end

    text = text.strip()

    # Split text on carriage returns
    text = text.split('\r')

    # Strip off new lines and empty spaces from the text
    text = [t.strip() for t in text]

    average_number_of_words_in_line = round(sum([len(s.split()) for s in text]) / len(text))
    return text, average_number_of_words_in_line


######################################## Tokenizing text
def create_tokenizer(text):
    """
    Returns tokenizer and total words number based on the extracted text.
    :param text: a text corpus, extracted and preprocessed with get_corpus()
    :return: tokenizer, total words number
    """
    # Please note that I have removed symbols [.,;:] from the default filetr value
    # This helps to preserve punctuation to a certain extent
    tokenizer = Tokenizer(filters='"#$%&()*+-/<=>?@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(text)

    # Total number of words
    vocabulary_length = len(tokenizer.word_index) + 1
    return tokenizer, vocabulary_length


######################################## Padding sequences

def pack_sequences(text, tokenizer, total_words_number):
  """
  Based on the corpus of documents and tokenizer, create padded sequences for further prediction task
  :param corpus: Text strings
  :param tokenizer: tokenizer
  :param total_words_number: unique number of words in the corpus
  :return: maximum length of sequences, predictors and labels
  """
  # create input sequences using list of tokens
  input_sequences = []
  for line in text:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

  # pad sequences
  max_sequence_len = max([len(x) for x in input_sequences])
  input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

  # create predictors and labels
  predictors, labels = input_sequences[:, :-1], input_sequences[:, -1]

  labels = kerasutils.to_categorical(labels, num_classes=total_words_number)
  return max_sequence_len, predictors, labels


######################################## Create Keras Sequential model with word embeddings
def create_model(vocabulary_length, sequence_length):
  model = Sequential()
  model.add(
        Embedding(input_dim=vocabulary_length, output_dim=100, input_length=sequence_length - 1))
  model.add(Bidirectional(LSTM(150, return_sequences=False))) 
  model.add(Dense(vocabulary_length, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
  return model

def write_poem(model, tokenizer, max_sequence_length, seed_text="The Moon and Sun", next_words=6, paragraphs=3):
    """
    Uses fitted text generating Keras Sequential model to write a poem.
    :param model: Keras sequential model, fitted to a text corpus
    :param tokenizer: Tokenizer
    :param max_sequence_length: Maximum length of text sequences
    :param seed_text: a text sring to start poem generation
    :param next_words: Number of words in a sentence
    :param paragraphs: Number of paragraphs in the generated poem
    :return: text of the generated poem
    """
    poem = seed_text.capitalize() + "\n\n"
    while paragraphs > 0:
        paragraph = ""
        for word_number in range(next_words):
            sentence = "\n"
            for _ in range(next_words):
                token_list = tokenizer.texts_to_sequences([seed_text])[0]
                token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
                predicted = model.predict(token_list)
                predicted = np.argmax(predicted, axis=-1)
                output_word = ""
                for word, index in tokenizer.word_index.items():
                    if index == predicted:
                        output_word = word
                        break
                seed_text += " " + output_word
                sentence += " " + output_word
            if word_number < next_words:
                paragraph += sentence.strip().capitalize() + "\n"
            seed_text = output_word
        seed_text = sentence
        poem += paragraph + "\n"
        paragraphs -= 1

    print(poem)
    return poem

In [2]:
# Getting and preprocessing a text corpus
text, average_words_number = get_corpus(url="https://www.gutenberg.org/cache/epub/45470/pg45470.txt", get_part=True, start_phrase="THE SHINING HOURS",
                    end_phrase="End of the Project Gutenberg EBook" )


In [None]:
text[:10]

['\ufeffThe Project Gutenberg EBook of The Love Poems, by Émile Verhaeren',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included',
 'with this eBook or online at www.gutenberg.org/license',
 '',
 '',
 'Title: The Love Poems',
 "(From Les Heures claires, Les Heures d'après-midi, Les Heures du Soir)"]

In [None]:
average_words_number

6

In [3]:
# Tokenizing the extracted text
tokenizer, vocabulary_length =  create_tokenizer(text)


In [4]:
print(vocabulary_length)

3714


In [5]:
# Pad text sequences
sequence_length, predictors, labels = pack_sequences(text, tokenizer, vocabulary_length)


In [None]:
sequence_length

15

In [None]:
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [6]:

# Create and the poem generating model
poems = create_model(vocabulary_length, sequence_length)


In [7]:
# Print the model summary
print(poems.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 14, 100)           371400    
                                                                 
 bidirectional (Bidirectiona  (None, 300)              301200    
 l)                                                              
                                                                 
 dense (Dense)               (None, 3714)              1117914   
                                                                 
Total params: 1,790,514
Trainable params: 1,790,514
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Fit compiled model
history = poems.fit(predictors, labels, epochs=50, verbose=1)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [9]:
poems.save("poems")



INFO:tensorflow:Assets written to: poems/assets


INFO:tensorflow:Assets written to: poems/assets


In [10]:
 !zip -r poems.zip poems

  adding: poems/ (stored 0%)
  adding: poems/assets/ (stored 0%)
  adding: poems/keras_metadata.pb (deflated 89%)
  adding: poems/variables/ (stored 0%)
  adding: poems/variables/variables.index (deflated 64%)
  adding: poems/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: poems/saved_model.pb (deflated 91%)


In [13]:
# Generate poetry
write_poem(poems, tokenizer, 15, seed_text="Shine in the darkness", next_words=5, paragraphs=3)


Shine in the darkness

At the fall of evening,
I part your hair, and
I make towards you, happy
And serene, they believe eagerly;
Its offering, my joy and

The fervour of my flesh.
Oh! how everything, except that
Lives in the fine ruddy
Being seems to dwell in
The summer wind, this page

And that so so open
Forth in the general terms
Of this agreement, you may
My two hands against your
Eyes were then so pure




'Shine in the darkness\n\nAt the fall of evening,\nI part your hair, and\nI make towards you, happy\nAnd serene, they believe eagerly;\nIts offering, my joy and\n\nThe fervour of my flesh.\nOh! how everything, except that\nLives in the fine ruddy\nBeing seems to dwell in\nThe summer wind, this page\n\nAnd that so so open\nForth in the general terms\nOf this agreement, you may\nMy two hands against your\nEyes were then so pure\n\n'