In [23]:
import numpy as np 
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [24]:
# Configuration constants for this notebook.
# NOTE(review): none of the cells visible in this export read these names —
# presumably leftovers from a template; confirm before relying on them or
# removing them.
EMBEDDING_DIM = 100
MAXLEN = 16
TRUNCATING = 'post'
PADDING = 'post'
OOV_TOKEN = "<OOV>"
MAX_EXAMPLES = 160000
TRAINING_SPLIT = 0.9

In [26]:
# NOTE(review): absolute, machine-specific path — as written the notebook only
# runs where this exact file exists; consider a path relative to the notebook.
SONNETS_FILE = '/home/siarhei/Programming/Univer/Koursach/music-applications/text-generator/lyrics-data.txt'

# Read the whole file. The encoding is given explicitly so decoding does not
# depend on the platform's default locale.
with open(SONNETS_FILE, encoding="utf-8") as f:
    data = f.read()

# Lower-case the text and keep one list entry per line
corpus = data.lower().split("\n")

print(f"There are {len(corpus)} lines of sonnets\n")
print("The first 5 lines look like this:\n")
# Slice rather than index so a file with fewer than 5 lines cannot raise IndexError
for line in corpus[:5]:
    print(line)

There are 283 lines of sonnets

The first 5 lines look like this:

at the library 
don't leave me 
i was there 
disappearing boy 
green day 


In [27]:
# Build the word-level vocabulary from every line of the corpus.
# The Tokenizer is created with default arguments (no `oov_token` is passed).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# One more than the number of distinct words; presumably the extra slot is
# index 0, which the padded arrays further down use as filler — TODO confirm.
total_words = len(tokenizer.word_index) + 1

In [28]:
# Inspect the token ids of one sample line (index 27); the result is a list holding one list of ids
tokenizer.texts_to_sequences([corpus[27]])

[[1, 7, 41, 1, 7, 41, 41, 122, 19, 104]]

In [30]:
# Same lookup, unwrapped with [0] to get the flat list of ids for that line
tokenizer.texts_to_sequences([corpus[27]])[0]

[1, 7, 41, 1, 7, 41, 41, 122, 19, 104]

In [31]:
def n_gram_seqs(corpus, tokenizer):
    """Expand each line of text into all of its leading n-grams of token ids.

    Every line is encoded with ``tokenizer.texts_to_sequences`` and then
    contributes one prefix for each length from 2 up to the full line, in
    order. Lines that encode to fewer than two tokens contribute nothing.

    Args:
        corpus: iterable of text lines.
        tokenizer: object exposing ``texts_to_sequences(list_of_texts)``.

    Returns:
        A flat list of token-id prefixes, grouped line by line.
    """
    # Encode lazily, one line at a time, exactly as a per-line loop would
    encoded_lines = (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    return [
        tokens[:end]
        for tokens in encoded_lines
        for end in range(2, len(tokens) + 1)
    ]

In [32]:
# Build the n-gram prefixes for a single sample line (corpus[27]).
# NOTE(review): the message below says "first example" although the line used is index 27.
first_example_sequence = n_gram_seqs([corpus[27]], tokenizer)

print("n_gram sequences for first example look like this:\n")
# Last expression of the cell: display the list of n-grams
first_example_sequence

n_gram sequences for first example look like this:



[[1, 7],
 [1, 7, 41],
 [1, 7, 41, 1],
 [1, 7, 41, 1, 7],
 [1, 7, 41, 1, 7, 41],
 [1, 7, 41, 1, 7, 41, 41],
 [1, 7, 41, 1, 7, 41, 41, 122],
 [1, 7, 41, 1, 7, 41, 41, 122, 19],
 [1, 7, 41, 1, 7, 41, 41, 122, 19, 104]]

In [33]:
# Run n_gram_seqs on several lines at once (corpus lines 1 through 3)
next_3_examples_sequence = n_gram_seqs(corpus[1:4], tokenizer)

print("n_gram sequences for next 3 examples look like this:\n")
# Last expression of the cell: display the combined n-grams
next_3_examples_sequence

n_gram sequences for next 3 examples look like this:



[[7, 38], [7, 38, 8], [1, 21], [1, 21, 27], [51, 52]]

In [34]:
# Expand every line of the corpus into its n-gram prefixes
input_sequences = n_gram_seqs(corpus, tokenizer)

# Length of the longest n-gram; later cells use it as the padded width
max_sequence_len = max(map(len, input_sequences))

print(f"n_grams of input_sequences have length: {len(input_sequences)}")
print(f"maximum length of sequences is: {max_sequence_len}")

n_grams of input_sequences have length: 1112
maximum length of sequences is: 18


In [35]:
def pad_seqs(input_sequences, maxlen):
    """Left-pad every sequence with zeros to a common width.

    Args:
        input_sequences: collection of token-id sequences.
        maxlen: requested width of the padded array. It is treated as a
            minimum: when some sequence is longer, the width grows to fit
            the longest one, so no tokens are ever cut off. ``None`` means
            "use the longest sequence".

    Returns:
        A 2-D numpy array with one row per input sequence, zero-padded on
        the left.
    """
    # default=0 keeps an empty input from raising inside max()
    longest = max((len(seq) for seq in input_sequences), default=0)
    # Previously `maxlen` was accepted but ignored. It is now honoured as a
    # lower bound, which leaves results unchanged for callers that pass a
    # value no larger than the longest sequence.
    target_len = longest if maxlen is None else max(maxlen, longest)
    return np.array(pad_sequences(input_sequences, maxlen=target_len, padding='pre'))

In [36]:
# Pad the single-line n-grams to the length of the longest one.
# The second argument is a sequence length, so pass the longest n-gram's
# length rather than the number of n-grams in the list.
first_padded_seq = pad_seqs(first_example_sequence, max(len(s) for s in first_example_sequence))
first_padded_seq

array([[  0,   0,   0,   0,   0,   0,   0,   0,   1,   7],
       [  0,   0,   0,   0,   0,   0,   0,   1,   7,  41],
       [  0,   0,   0,   0,   0,   0,   1,   7,  41,   1],
       [  0,   0,   0,   0,   0,   1,   7,  41,   1,   7],
       [  0,   0,   0,   0,   1,   7,  41,   1,   7,  41],
       [  0,   0,   0,   1,   7,  41,   1,   7,  41,  41],
       [  0,   0,   1,   7,  41,   1,   7,  41,  41, 122],
       [  0,   1,   7,  41,   1,   7,  41,  41, 122,  19],
       [  1,   7,  41,   1,   7,  41,  41, 122,  19, 104]], dtype=int32)

In [37]:
# Pad the n-grams from the three-line sample to the length of the longest one
next_3_padded_seq = pad_seqs(next_3_examples_sequence, max(map(len, next_3_examples_sequence)))
next_3_padded_seq

array([[ 0,  7, 38],
       [ 7, 38,  8],
       [ 0,  1, 21],
       [ 1, 21, 27],
       [ 0, 51, 52]], dtype=int32)

In [38]:
# Pad the whole corpus.
# NOTE: this rebinds `input_sequences` from the list of n-grams to the padded
# 2-D array returned by pad_seqs, so later cells see the array.
input_sequences = pad_seqs(input_sequences, max_sequence_len)
print(f"padded corpus has shape: {input_sequences.shape}")

padded corpus has shape: (1112, 18)


In [39]:
def features_and_labels(input_sequences, total_words):
    """Split padded n-grams into model inputs and one-hot targets.

    Args:
        input_sequences: 2-D array of padded token ids, one n-gram per row.
        total_words: number of classes used for the one-hot encoding.

    Returns:
        Tuple ``(features, one_hot_labels)``: every column but the last of
        each row, and the last column encoded with ``to_categorical``.
    """
    # The final token of each row is the word to predict; the rest is context
    context_tokens = input_sequences[:, :-1]
    next_token_ids = input_sequences[:, -1]
    return context_tokens, to_categorical(next_token_ids, num_classes=total_words)

In [40]:
# Split the padded single-line example into model inputs and one-hot labels
first_features, first_labels = features_and_labels(first_padded_seq, total_words)

print(f"labels have shape: {first_labels.shape}")
print("\nfeatures look like this:\n")
# Last expression of the cell: display the feature matrix
first_features

labels have shape: (9, 466)

features look like this:



array([[  0,   0,   0,   0,   0,   0,   0,   0,   1],
       [  0,   0,   0,   0,   0,   0,   0,   1,   7],
       [  0,   0,   0,   0,   0,   0,   1,   7,  41],
       [  0,   0,   0,   0,   0,   1,   7,  41,   1],
       [  0,   0,   0,   0,   1,   7,  41,   1,   7],
       [  0,   0,   0,   1,   7,  41,   1,   7,  41],
       [  0,   0,   1,   7,  41,   1,   7,  41,  41],
       [  0,   1,   7,  41,   1,   7,  41,  41, 122],
       [  1,   7,  41,   1,   7,  41,  41, 122,  19]], dtype=int32)

In [41]:
# Split the whole padded corpus: `features` holds all but the last token of
# each row, `labels` holds the last token one-hot encoded over total_words
features, labels = features_and_labels(input_sequences, total_words)

print(f"features have shape: {features.shape}")
print(f"labels have shape: {labels.shape}")

features have shape: (1112, 17)
labels have shape: (1112, 466)


In [42]:
from pyparsing import actions

def create_model(total_words, max_sequence_len, embedding_dim=100, lstm_units=150):
    """Build and compile the next-word prediction model.

    Architecture: Embedding -> Bidirectional(LSTM) -> Dense softmax.

    Args:
        total_words: vocabulary size; used both as the embedding input
            dimension and as the number of output classes.
        max_sequence_len: length of a full padded n-gram. The model input is
            one token shorter (``max_sequence_len - 1``).
        embedding_dim: size of each word embedding vector. Defaults to 100,
            the value that used to be hard-coded.
        lstm_units: units in each direction of the bidirectional LSTM.
            Defaults to 150, the value that used to be hard-coded.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy
        loss, the Adam optimizer and an accuracy metric.
    """
    model = Sequential([
        Embedding(total_words, embedding_dim, input_length=max_sequence_len - 1),
        Bidirectional(LSTM(lstm_units)),
        # One probability per vocabulary entry
        Dense(total_words, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [44]:
# Build the untrained model and print its layer / parameter summary
model = create_model(total_words, max_sequence_len)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 17, 100)           46600     
                                                                 
 bidirectional_1 (Bidirectio  (None, 300)              301200    
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 466)               140266    
                                                                 
Total params: 488,066
Trainable params: 488,066
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Train on the whole corpus for 200 epochs; no validation data is passed to fit().
# `history` keeps the object returned by fit() for later inspection.
history = model.fit(features, labels, epochs=200, verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [49]:
seed_text = "I am happy and proud"
next_words = 26

for _ in range(next_words):
    # Convert the text generated so far into token ids
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Left-pad to the model's input length (one less than the padded n-gram length)
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Get the probability of every vocabulary entry being the next word
    predicted = model.predict(token_list, verbose=0)
    # Choose the next word based on the maximum probability
    predicted = np.argmax(predicted, axis=-1).item()
    # Look the id up defensively: an id with no vocabulary entry (e.g. 0, the
    # padding value) would raise KeyError with plain indexing
    output_word = tokenizer.index_word.get(predicted)
    if output_word is None:
        # Nothing sensible to append; stop generating instead of crashing
        break
    # Append to the current text
    seed_text += " " + output_word

print(seed_text)

I am happy and proud alone i had everything give an alright scene scene alright scene scene screen screen eat blood sold on warm such give you breathing bomb smile library
