# Making an RNN Model

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
# Importing utility functions from Keras
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from keras.models import Sequential, Model
from keras.layers import SimpleRNN, Dense, LSTM, Flatten
from keras.layers import Input, Dense, LSTM, Attention, Concatenate

from utils import tokenize_song, tokenize_song_by_stanza

Python(61424) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [3]:
# Size of each n-gram; passed to generate_ngram_training_samples and, as `sequence_length`, to the model builders.
N_GRAM = 5
# Last dimension of every model's input shape (length of one embedding vector;
# presumably matches the vectors stored in EMBEDDING_FILE -- confirm).
EMBEDDING_SIZE = 100
# NOTE(review): the model-builder calls in this notebook pass batch_size=1000 literally rather than this constant.
BATCH_SIZE = 1000
# Sentence boundary markers (not referenced by the cells visible in this section).
SENTENCE_BEGIN = "<s>"
SENTENCE_END = "</s>"
# Input file locations, relative to the notebook's working directory.
PROCESSED_DATA_FILE = "../data/processed/processed_data.csv"
STANZAS_FILE = '../data/processed/stanzas.txt'
EMBEDDING_FILE = "../reference-materials/lyrics_embeddings.txt"

In [4]:
# Every line of the stanzas file is one tab-separated record; collect each
# record as a list of its fields.
with open(STANZAS_FILE, 'r', encoding='utf-8') as stanza_file:
    new_stanzas_as_words = [row.strip().split('\t') for row in stanza_file]

### Create list of song lyrics with genre
Tokenizes each song into a list of sentences. Appends the genre of the song in front of each 
sentence.

In [5]:
# Build a word-level tokenizer (char_level=False) and fit its vocabulary on the stanza token lists
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(new_stanzas_as_words)
# Convert stanzas into numerical indexes (list of lists of string -> list of lists of int)
stanzas = tokenizer.texts_to_sequences(new_stanzas_as_words)

In [6]:
# Report the vocabulary size: the number of entries in the fitted tokenizer's word_counts table
vocab_size = len(tokenizer.word_counts)
print("Vocab size: ", vocab_size)

Vocab size:  87923


### Read in the embeddings and create dictionary mapping index to embeddings

In [7]:
def read_embeddings(filename: str, tokenizer: Tokenizer) -> dict:
    '''Loads previously trained word embeddings and keys them by tokenizer index.

    Each data line is expected to hold a word followed by the
    whitespace-separated components of its vector. Lines with exactly two
    fields are treated as the "<count> <dimension>" header and skipped; blank
    lines are skipped as well instead of raising an IndexError.

    Parameters:
        filename (str): path to the embeddings text file
        tokenizer (Tokenizer): fitted tokenizer; its ``word_index`` supplies
            the word -> index mapping
    Returns:
        (dict): mapping from word index to its embedding vector (a list of
            floats). Words that are not in ``tokenizer.word_index`` are dropped.
    '''
    word_index = tokenizer.word_index
    index_to_embedding = {}  # Mapping from index to its embedding vector
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            # Blank lines carry no data; two-field lines are the file header.
            if not fields or len(fields) == 2:
                continue
            word, *components = fields
            index = word_index.get(word)
            if index is None:
                # Out-of-vocabulary word: skip it before paying for float parsing.
                continue
            index_to_embedding[index] = [float(x) for x in components]
    return index_to_embedding

In [8]:
# Load the embedding vectors from EMBEDDING_FILE, keyed by tokenizer word index
index_to_embedding = read_embeddings(EMBEDDING_FILE, tokenizer)

### Create ngram training samples
Add each genre to the beginning of the ngram. The ngram is of size 5: index 0 is the genre, indexes
1-3 are the features, and index 4 is the label.

In [9]:
def generate_ngram_training_samples(encoded: list, ngram: int):
    """
    Builds (features, label) pairs from encoded lyrics with a sliding window.

    For every lyric, the element at position 0 is prepended to each window of
    ``ngram - 2`` consecutive elements taken from position 1 onwards; the
    element immediately after the window is that sample's label.

    Returns a tuple ``(X, y)`` of parallel lists.
    """
    window = ngram - 2
    features, labels = [], []
    for lyric in encoded:
        last_start = len(lyric) - window
        for start in range(1, last_start):
            stop = start + window
            features.append([lyric[0]] + lyric[start:stop])
            labels.append(lyric[stop])
    return features, labels

In [10]:
# Slide the N_GRAM window over every encoded stanza to build the feature rows X and labels y
X, y = generate_ngram_training_samples(stanzas, N_GRAM)

In [97]:
# Sanity check: the sample and label counts should match; show the first two samples with their labels
print("Number of training samples: ", len(X))
print("Number of labels: ", len(y))
print("First training sample: ", X[0])
print("First label: ", y[0])
print("Second training sample: ", X[1])
print("Second label: ", y[1])

Number of training samples:  14990276
Number of labels:  14990276
First training sample:  [530, 13, 13, 13]
First label:  541
Second training sample:  [530, 13, 13, 541]
Second label:  11


### Create Data Generator for Models
Creates a data generator which yields batches of feature embeddings and labels

In [17]:
def convertSamplesToEmbeddings(samples: list, index_to_embedding: dict):
    """
    Replaces every index of every sample with its embedding vector.

    Returns the nested result as a NumPy array. An index that is missing from
    ``index_to_embedding`` raises KeyError.
    """
    return np.array(
        [[index_to_embedding[token] for token in sample] for sample in samples]
    )

In [14]:
# Function to generate batches of data
def data_generator(data, labels, index_to_embedding, batch_size, sequence_length, epochs):
    """
    Endlessly yields (embedded features, one-hot labels) batches.

    Parameters:
        data (list): samples, each a list of word indexes
        labels (list): the label word index for each sample
        index_to_embedding (dict): mapping from word index to embedding vector
        batch_size (int): number of samples per yielded batch
        sequence_length (int): unused; kept so existing callers keep working
        epochs (int): when < 1 nothing is yielded; otherwise the generator
            cycles over the data indefinitely and the consumer decides when to
            stop (e.g. through ``steps_per_epoch``)
    Yields:
        (np.ndarray, np.ndarray): the embedded samples of one batch and the
            matching one-hot labels with ``len(index_to_embedding)`` classes
    Raises:
        ValueError: if there are fewer than ``batch_size`` samples, because no
            full batch could ever be produced (previously this spun forever)
    """
    if epochs < 1:
        return
    num_batches = len(data) // batch_size
    if num_batches == 0:
        raise ValueError(
            "data_generator needs at least batch_size samples "
            f"(got {len(data)} samples for batch_size={batch_size})"
        )
    # NOTE(review): labels are used directly as class ids, so every label must be
    # < len(index_to_embedding); confirm this holds for the tokenizer's indexes.
    num_classes = len(index_to_embedding)
    while True:
        for batch_index in range(num_batches):
            # Batches are consecutive, non-overlapping windows. Slicing from
            # `batch_index` itself (instead of batch_index * batch_size) made
            # neighbouring batches overlap and left most of the data unused.
            start = batch_index * batch_size
            stop = start + batch_size
            batch_data = convertSamplesToEmbeddings(data[start:stop], index_to_embedding)
            # One call one-hot encodes the whole batch: (batch, num_classes).
            batch_labels = to_categorical(labels[start:stop], num_classes=num_classes)
            yield np.array(batch_data), np.array(batch_labels)


### Split the Data

In [11]:
# First 70% of the samples (in their existing order) train the models; the rest are held out for testing
split = int(0.7 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

## Creating Models

In [114]:
def build_feed_forward_model(X, y, index_to_embedding: dict, batch_size=1000, sequence_length=N_GRAM, epochs=1):
    """
    Builds, trains, plots and saves a feed-forward next-word classifier.

    Parameters:
        X (list): training samples (lists of word indexes)
        y (list): label word index for each sample
        index_to_embedding (dict): mapping from word index to embedding vector;
            its size is also the number of output classes
        batch_size (int): samples per training batch
        sequence_length (int): n-gram size; the model input has
            ``sequence_length - 1`` time steps
        epochs (int): number of training epochs
    Returns:
        (model, history): the trained Keras model and the History object
            returned by ``model.fit``
    """
    generator = data_generator(X, y, index_to_embedding, batch_size, sequence_length, epochs)
    model = Sequential()
    # Flatten the input sequence to be compatible with Dense layers
    model.add(Flatten(input_shape=(sequence_length-1, EMBEDDING_SIZE)))
    # Hidden layer: ReLU rather than softmax. Softmax here would force the 128
    # hidden activations to sum to 1 and starve the output layer of signal.
    model.add(Dense(128, activation='relu'))
    # Output layer: one probability per class
    model.add(Dense(units=len(index_to_embedding), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(x=generator, steps_per_epoch=len(X) // batch_size, epochs=epochs)
    training_loss_history = history.history['loss']
    plt.plot(training_loss_history)
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss Over Epochs')
    plt.show()
    model.save('../models/feed_forward_model.keras')

    return model, history

In [67]:
# NOTE(review): build_feed_forward_model returns a (model, history) tuple, so this name is bound to the whole tuple rather than to the model alone.
our_feed_forward_model = build_feed_forward_model(X_train, y_train, index_to_embedding, batch_size=1000, sequence_length=N_GRAM, epochs=1)



In [None]:
def build_rnn_model(X, y, index_to_embedding: dict, batch_size=4, sequence_length=N_GRAM, epochs=1, units=16):
    """
    Builds, trains, saves and plots a SimpleRNN next-word classifier.

    Parameters:
        X (list): training samples (lists of word indexes)
        y (list): label word index for each sample
        index_to_embedding (dict): mapping from word index to embedding vector;
            its size is also the number of output classes
        batch_size (int): samples per training batch
        sequence_length (int): n-gram size; the model input has
            ``sequence_length - 1`` time steps
        epochs (int): number of training epochs
        units (int): width of the SimpleRNN layer; also embedded in the saved
            model's file name
    Returns:
        (model, history): the trained Keras model and the History object
            returned by ``model.fit``
    """
    generator = data_generator(X, y, index_to_embedding, batch_size, sequence_length, epochs)
    model = Sequential()
    model.add(SimpleRNN(units, input_shape=(sequence_length-1, EMBEDDING_SIZE)))
    model.add(Dense(len(index_to_embedding), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(x=generator, steps_per_epoch=len(X) // batch_size, epochs=epochs)
    # The '+' before the extension was missing, which made the whole cell a SyntaxError.
    model.save('../models/rnn_model_units' + str(units) + '.h5')
    training_loss_history = history.history['loss']
    plt.plot(training_loss_history)
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss Over Epochs')
    plt.show()

    return model, history

In [21]:
# Train the SimpleRNN variant for one epoch; `units` is not passed, so the builder's default (16) applies
rnn_model, rnn_history = build_rnn_model(X_train, y_train, index_to_embedding, batch_size=1000, sequence_length=N_GRAM, epochs=1)

 2215/10493 [=====>........................] - ETA: 35:21 - loss: 4.0306 - accuracy: 0.2413

In [110]:
def build_lstm_model(X, y, index_to_embedding: dict, batch_size=4, sequence_length=N_GRAM, epochs=1):
    """
    Builds, trains, saves and plots an LSTM next-word classifier.

    The network is a single 128-unit LSTM over ``sequence_length - 1`` embedded
    time steps followed by a softmax layer with one output per entry of
    ``index_to_embedding``. Training batches come from ``data_generator``.

    Returns the trained model together with the History object from ``fit``.
    """
    num_classes = len(index_to_embedding)
    model = Sequential([
        LSTM(128, input_shape=(sequence_length-1, EMBEDDING_SIZE)),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    batches = data_generator(X, y, index_to_embedding, batch_size, sequence_length, epochs)
    steps = len(X) // batch_size
    history = model.fit(x=batches, steps_per_epoch=steps, epochs=epochs)
    model.save('../models/lstm_model.h5')

    # Plot the per-epoch training loss
    plt.plot(history.history['loss'])
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss Over Epochs')
    plt.show()
    return model, history

In [None]:
# Train the LSTM variant for one epoch on the training split
lstm_model, lstm_history = build_lstm_model(X_train, y_train, index_to_embedding, batch_size=1000, sequence_length=N_GRAM, epochs=1)

In [34]:
# NOTE(review): build_lstm_model already saves to this same path after fitting, so this re-save looks redundant.
lstm_model.save('../models/lstm_model.h5')

  saving_api.save_model(


### LSTM Model with Attention

In [113]:
def build_attention_lstm_model(X, y, index_to_embedding: dict, batch_size=4, sequence_length=N_GRAM, epochs=1):
    """
    Builds, trains, saves and plots an LSTM classifier with self-attention.

    Architecture (functional API): a 128-unit LSTM returning its full output
    sequence, an Attention layer that uses that sequence as both query and
    value, concatenation of the two along the last axis, a Flatten, and a
    softmax layer with one output per entry of ``index_to_embedding``.
    Training batches come from ``data_generator``.

    Returns the trained model together with the History object from ``fit``.
    """
    batches = data_generator(X, y, index_to_embedding, batch_size, sequence_length, epochs)

    sequence_input = Input(shape=(sequence_length-1, EMBEDDING_SIZE))
    # return_sequences=True keeps one output per time step for the attention layer
    hidden_states = LSTM(128, return_sequences=True)(sequence_input)
    attended = Attention()([hidden_states, hidden_states])
    combined = Concatenate(axis=-1)([hidden_states, attended])
    flat_features = Flatten()(combined)
    word_probabilities = Dense(len(index_to_embedding), activation='softmax')(flat_features)

    model = Model(inputs=sequence_input, outputs=word_probabilities)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    steps = len(X) // batch_size
    history = model.fit(x=batches, steps_per_epoch=steps, epochs=epochs)
    model.save('../models/attention_lstm_model.h5')

    # Plot the per-epoch training loss
    plt.plot(history.history['loss'])
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss Over Epochs')
    plt.show()

    return model, history

In [83]:
# Train the attention-augmented LSTM for one epoch on the training split
attention_lstm_model, attention_lstm_history = build_attention_lstm_model(X_train, y_train, index_to_embedding, batch_size=1000, sequence_length=N_GRAM, epochs=1)

2023-12-03 09:32:52.418043: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




  saving_api.save_model(


## Evaluating Our Models

### Evaluating RNN Model

In [121]:
# Print the layer-by-layer architecture and parameter counts of the RNN model
rnn_model.summary()

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_5 (SimpleRNN)    (None, 128)               29312     
                                                                 
 dense_32 (Dense)            (None, 87923)             11342067  
                                                                 
Total params: 11371379 (43.38 MB)
Trainable params: 11371379 (43.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [122]:
# Embed every held-out sample in one call (the whole test split is materialised as a single array)
X_test_embeddings = convertSamplesToEmbeddings(X_test, index_to_embedding)

In [None]:
# The model was compiled with categorical_crossentropy, so it must be scored
# against one-hot labels; the raw integer ids in y_test do not match its output
# shape. One-hot encoding the whole test split at once is not practical, so the
# test data is streamed in BATCH_SIZE chunks through data_generator instead of
# using the pre-built X_test_embeddings array.
# `entropy` holds what evaluate() returns for the compiled model: [loss, accuracy].
test_steps = len(X_test) // BATCH_SIZE
entropy = rnn_model.evaluate(
    data_generator(X_test, y_test, index_to_embedding, BATCH_SIZE, N_GRAM, 1),
    steps=test_steps,
)