# 5. Word-Based Modelling

In this notebook, the lyrics are used to train a word-level model that learns to write song lyrics.<br>
__Note:__ Training on a GPU is recommended.

In [2]:
import os
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.utils import Sequence

In [5]:
# Report which GPU device (if any) TensorFlow has picked up, and how many
# physical GPUs are visible to it.
gpu_device_name = tf.test.gpu_device_name()
print('Found GPU at: {}'.format(gpu_device_name))
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

Found GPU at: /device:GPU:0
Num GPUs Available:  1


In [6]:
# Batch generator for (word sequence, next word) training pairs.
# Subclasses the Keras Sequence so it can be handed straight to model.fit.
class TextDataGenerator(Sequence):
    """Serve fixed-size batches of word-index sequences and next-word targets.

    Each item is a tuple ``(X, y)`` built from ``batch_size`` samples:

    * ``embedding=False``: ``X`` is a boolean one-hot array of shape
      ``(batch, sequence_length, vocab_size)`` and ``y`` a boolean one-hot
      array of shape ``(batch, vocab_size)``.
    * ``embedding=True``: ``X`` is the array of raw word indices and ``y`` a
      float32 one-hot array of shape ``(batch, vocab_size)``.

    Args:
        sequences: Indexable collection of sequences of integer word indices.
        next_words: Indexable collection holding the integer index of the
            word that follows each sequence (same order as ``sequences``).
        sequence_length: Number of time steps reserved per sequence in the
            one-hot input array.
        vocab_size: Number of distinct word indices (width of the one-hot
            axis).
        batch_size: Number of samples per batch.
        shuffle: If true, the sample order is reshuffled on construction and
            at the end of every epoch.
        embedding: Selects the raw-index input format described above.
    """

    def __init__(self, sequences, next_words, sequence_length, vocab_size, batch_size=32, shuffle=True, embedding=False):
        # Let the Keras base class initialise its own state (a no-op on
        # base classes that do not define __init__).
        super().__init__()
        self.batch_size = batch_size
        self.sequences = sequences
        self.next_words = next_words
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.shuffle = shuffle
        self.embedding = embedding
        # Build the initial (optionally shuffled) sample order.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return len(self.sequences) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` batch at position ``index``."""
        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]

        sequences_batch = [self.sequences[k] for k in batch_indexes]
        next_words_batch = [self.next_words[k] for k in batch_indexes]

        if not self.embedding:
            return self.__data_generation(sequences_batch, next_words_batch)

        # Raw indices as input; one-hot float32 targets. Built with numpy
        # because no `keras` name is imported in this notebook.
        X = np.array(sequences_batch)
        targets = np.asarray(next_words_batch, dtype=int).reshape(-1)
        y = np.zeros((len(targets), self.vocab_size), dtype='float32')
        y[np.arange(len(targets)), targets] = 1.0
        return X, y

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.sequences))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, sequences_batch, next_words_batch):
        """One-hot encode a batch of sequences and their next-word targets."""
        # Size the arrays from the actual batch so a short slice never
        # produces padded, all-zero phantom samples.
        n_samples = len(sequences_batch)
        X = np.zeros((n_samples, self.sequence_length, self.vocab_size), dtype=bool)
        y = np.zeros((n_samples, self.vocab_size), dtype=bool)

        for i, seq in enumerate(sequences_batch):
            for j, word in enumerate(seq):
                X[i, j, word] = True
            # One target per sample: set it once, outside the word loop, so
            # it is also set when the sequence itself is empty.
            y[i, next_words_batch[i]] = True
        return X, y

In [7]:
# Restore the sequence length, vocabulary size and training generator that
# were pickled to disk.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# you produced yourself. Unpickling the generator presumably also needs the
# TextDataGenerator class to be defined in this kernel first - confirm.
with open('./data/objs_rel.pkl', 'rb') as pickle_file:
    loaded_objects = pickle.load(pickle_file)
SEQ_LENGTH, VOCAB_SIZE, train_generator = loaded_objects

In [8]:
# Where checkpoints are written, plus the directory part derived from it.
checkpoint_path = "callbacks_model/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback for model.fit that writes only the model's weights (not the full
# model) to checkpoint_path and prints a message on each save.
checkpoint_options = dict(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)
cp_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

## Model training

We use a Keras `Sequential` model built around an LSTM layer, an architecture that works well with sequential data such as our song lyrics.

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation, Dense, LSTM, Dropout

In [10]:
# Next-word model: one LSTM layer over one-hot encoded word sequences,
# followed by a dense layer and a softmax over the whole vocabulary.
model = Sequential([
    LSTM(128, input_shape=(SEQ_LENGTH, VOCAB_SIZE)),
    Dense(VOCAB_SIZE),
    Activation('softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               12867072  
                                                                 
 dense (Dense)               (None, 25002)             3225258   
                                                                 
 activation (Activation)     (None, 25002)             0         
                                                                 
Total params: 16,092,330
Trainable params: 16,092,330
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Categorical cross-entropy pairs with the one-hot next-word targets and the
# softmax output layer.
# NOTE(review): run_eagerly=True makes Keras run the training step eagerly
# instead of as a compiled graph, which is usually slower - confirm it is
# still required.
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    run_eagerly=True,
)

In [None]:
# Train on the generator, checkpointing through cp_callback.
# NOTE(review): initial_epoch=1 makes epoch numbering start at 2, so only
# four epochs run. No weights are restored in the cells shown here, so on a
# fresh kernel this trains a new model for 4 epochs - confirm whether a
# resume from checkpoint was intended.
model.fit(
    train_generator,
    callbacks=[cp_callback],
    initial_epoch=1,
    epochs=5,
)

Epoch 3/5
Epoch 3: saving model to callbacks_model_5/cp.ckpt
Epoch 4/5
Epoch 4: saving model to callbacks_model_5/cp.ckpt
Epoch 5/5

__Resource explaining Dropout:__ https://jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf <br>
__Model Architecture:__ https://www.analyticsvidhya.com/blog/2018/03/text-generation-using-python-nlp/

In [None]:
# Write the trained model to disk under the given file name.
filename = 'model.h5'
model.save(filepath=filename)