# Making an RNN Model

In [101]:
import numpy as np
# Importing utility functions from Keras
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import SimpleRNN, Dense

In [102]:
N_GRAM = 5
EMBEDDING_SIZE = 100
SENTENCE_BEGIN = "<s>"
SENTENCE_END = "</s>"
PROCESSED_DATA_FILE = "../data/processed/processed_data.csv"
EMBEDDING_FILE = "../reference-materials/lyrics_embeddings.txt"

In [103]:
import pandas as pd
df = pd.read_csv(PROCESSED_DATA_FILE)
df.head()

Unnamed: 0,artist,song,link,text,genre_list,genre
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...","['europop', 'swedish pop']",pop
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...","['europop', 'swedish pop']",pop
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,"['europop', 'swedish pop']",pop
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"['europop', 'swedish pop']",pop
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"['europop', 'swedish pop']",pop


### Create list of song lyrics with genre
Tokenizes the each song into a a list of sentences. Appends the genre of the song in front of each 
sentence.

In [104]:
from utils import tokenize_song

data_lyrics = df['text'].tolist()
data_genre = df['genre'].tolist()
data = []
for song, genre in zip(data_lyrics, data_genre):
  lines = tokenize_song(song, ngram=N_GRAM)
  for line in lines:
    line.insert(0, genre)
  data.extend(lines)
print(data[0])

['pop', '<s>', '<s>', '<s>', '<s>', 'Look', 'at', 'her', 'face', ',', 'it', "'s", 'a', 'wonderful', 'face', '</s>', '</s>', '</s>', '</s>']


In [105]:
# Initialize a Tokenizer and fit on your data
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(data)
# Convert text data into sequences of integers
sequences = tokenizer.texts_to_sequences(data)

In [106]:
# print size of vocab
vocab_size = len(tokenizer.word_counts)
print("Vocab size: ", vocab_size)

Vocab size:  81135


In [107]:
def read_embeddings(file_path):
    words = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.strip().split()
            word = values[0]
            words.append(word)
    return words[1:]

In [108]:
words = read_embeddings(EMBEDDING_FILE)

In [109]:
print("Number of words in embeddings: ", len(words))

Number of words in embeddings:  81142


In [111]:
delete_words = words - tokenizer.word_index.keys()
print("Number of words to delete: ", len(delete_words))

Number of words to delete:  11


In [112]:
print(words[:10])

['</s>', '<s>', 'rock', 'i', 'pop', ',', 'the', 'you', 'to', 'and']


#### Read in the embeddings and create dictionary mapping index to embeddings

In [113]:
def read_embeddings(filename: str, tokenizer: Tokenizer) -> dict:
    '''Loads and parses embeddings trained in earlier.
    Parameters:
        filename (str): path to file
        Tokenizer: tokenizer used to tokenize the data (needed to get the word to index mapping)
    Returns:
        (dict): mapping from index to its embedding vector
    '''
    # YOUR CODE HERE
    index_to_embedding = {}  # Mapping from index to its embedding vector
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            split_line = line.split()
            # Skip the first line of file
            if len(split_line) == 2:
                continue
            word = split_line[0]
            vector = [float(x) for x in split_line[1:]]
        
            if word in tokenizer.word_index:
                index_to_embedding[tokenizer.word_index[word]] = vector # Mapping from index to its embedding vector
    return index_to_embedding


In [114]:
index_to_embedding = read_embeddings(EMBEDDING_FILE, tokenizer)

### Create ngram training samples

In [115]:
def generate_ngram_training_samples(encoded: list, ngram: int):
    """
    Generates n-gram training samples from a list of encoded words.
    """
    X, y = [], []
    ngram = ngram - 2
    for lyric in encoded:
      for i in range(1, len(lyric) - ngram):
          X.append([lyric[0]] + lyric[i:i + ngram])
          y.append(lyric[i + ngram])
    return X, y

In [116]:
X, y = generate_ngram_training_samples(sequences, N_GRAM)

In [118]:
print("Number of training samples: ", len(X))
print("Number of labels: ", len(y))
print("First training sample: ", X[0])
print("First label: ", y[0])
print("Second training sample: ", X[1])
print("Second label: ", y[1])

Number of training samples:  23483352
Number of labels:  23483352
First training sample:  [5, 1, 1, 1]
First label:  1
Second training sample:  [5, 1, 1, 1]
Second label:  148


In [119]:
def convertSamplesToEmbeddings(samples: list, index_to_embedding: dict):
    """
    Converts a list of samples to a list of embeddings.
    """
    embeddings = []
    for sample in samples:
        embedding = []
        for word in sample:
            embedding.append(index_to_embedding[word])
        embeddings.append(embedding)
    return np.array(embeddings)

In [120]:
# Function to generate batches of data
def data_generator(data, labels, index_to_embedding, batch_size, sequence_length, epochs):
    for epoch in range(epochs):
        num_batches = len(data) // batch_size
        while True:
            for i in range(num_batches):
                batch_data = data[i: i + batch_size]
                batch_labels = labels[i: i + batch_size]
                
                # Perform any necessary preprocessing on batch_data and batch_labels
                batch_data = convertSamplesToEmbeddings(batch_data, index_to_embedding)
                batch_labels = [to_categorical(label, num_classes=len(index_to_embedding)) for label in batch_labels]
                # Convert batch_data and batch_labels to the appropriate format
                # For example, if using SimpleRNN, the input shape should be (batch_size, sequence_length, features)
                
                yield np.array(batch_data), np.array(batch_labels)


In [122]:
train_data_generator = data_generator(X, y, index_to_embedding=index_to_embedding, batch_size=4, sequence_length=N_GRAM, epochs=2)

## Creating Models

In [130]:
def build_rnn_model(X, y, index_to_embedding: dict, batch_size=4, sequence_length=N_GRAM, epochs=2):
    generator = data_generator(X, y, index_to_embedding, batch_size, sequence_length, epochs)
    model = Sequential()
    model.add(SimpleRNN(128, input_shape=(sequence_length, EMBEDDING_SIZE)))
    model.add(Dense(len(index_to_embedding), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x=generator, steps_per_epoch=len(X) // batch_size, epochs=epochs)
    return model

In [132]:
model = build_rnn_model(X, y, index_to_embedding, batch_size=1000, sequence_length=N_GRAM, epochs=2)

Epoch 1/2
Epoch 2/2
 1971/23483 [=>............................] - ETA: 1:42:08 - loss: 0.5098 - accuracy: 0.8793

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3389f9a60>
    label = <none> 
    device = <AGXG13XDevice: 0x1133f2400>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x2e509ca00>
        label = <none> 
        device = <AGXG13XDevice: 0x1133f2400>
            name = Apple M1 Pro 
    retainedReferences = 1


 1989/23483 [=>............................] - ETA: 1:41:55 - loss: 0.5102 - accuracy: 0.8793

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x358bcef30>
    label = <none> 
    device = <AGXG13XDevice: 0x1133f2400>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x2e509ca00>
        label = <none> 
        device = <AGXG13XDevice: 0x1133f2400>
            name = Apple M1 Pro 
    retainedReferences = 1




Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x34b454b30>
    label = <none> 
    device = <AGXG13XDevice: 0x1133f2400>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x2e509ca00>
        label = <none> 
        device = <AGXG13XDevice: 0x1133f2400>
            name = Apple M1 Pro 
    retainedReferences = 1




Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Internal Error (0000000e:Internal Error)
	<AGXG13XFamilyCommandBuffer: 0x3389c5ac0>
    label = <none> 
    device = <AGXG13XDevice: 0x1133f2400>
        name = Apple M1 Pro 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x2e509ca00>
        label = <none> 
        device = <AGXG13XDevice: 0x1133f2400>
            name = Apple M1 Pro 
    retainedReferences = 1


