# Load Lyric Dataset

In [34]:
import pandas as pd

df = pd.read_csv('billboard_lyrics_1964-2015.csv', encoding='latin-1')
df = df.drop_duplicates(['Song', 'Artist']).dropna(subset=['Lyrics']) # remove duplicate songs
df = df.sample(frac=1, random_state=1234) # randomize order
lyrics = list(df['Lyrics'].astype(str).copy())
del df # save some memory
lyrics = [lyric.strip() for lyric in lyrics]
print('Example Lyric:', lyrics[0][:80]+'...')
print('Num Lyrics:', len(lyrics))

Example Lyric: at first i was afraid i was petrified thinking i couldnt live without you by my ...
Num Lyrics: 4717


# Character Level Model

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

char_tokenizer = Tokenizer(char_level=True)

char_tokenizer.fit_on_texts(lyrics)

In [3]:
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import random
import math
import numpy as np

# generate one hotted data
def char_data_generator(lyrics, tokenizer):
    for lyric in lyrics:
        char_representation = tokenizer.texts_to_sequences([lyric])[0]
        one_hot_representation = to_categorical(char_representation, num_classes=len(tokenizer.word_index) + 1)
        
        for i in range(len(one_hot_representation)):
            yield np.array(one_hot_representation[:i]), np.array(one_hot_representation[i])

n_char_examples = sum([len(lyric) for lyric in lyrics])
n_batched_char_examples = int(math.ceil(n_char_examples / 128))

shuffle_buffer = 500
batch_size = 128
        
char_dataset = tf.data.Dataset.from_generator(
    lambda: char_data_generator(lyrics, char_tokenizer),
    output_signature=(
        tf.TensorSpec(shape=(None, len(char_tokenizer.word_index)+1), dtype=tf.int32),
        tf.TensorSpec(shape=(len(char_tokenizer.word_index)+1), dtype=tf.int32)))

char_dataset = char_dataset.shuffle(shuffle_buffer)
char_dataset = char_dataset.padded_batch(batch_size)

char_dataset

<PaddedBatchDataset element_spec=(TensorSpec(shape=(None, None, 52), dtype=tf.int32, name=None), TensorSpec(shape=(None, 52), dtype=tf.int32, name=None))>

In [4]:
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import LSTM, Dense

def build_char_model(num_chars):
    return Sequential(
        [
            Input(shape=(None, num_chars)),
            LSTM(128),
            Dense(num_chars, activation="softmax"),
        ]
    )

In [5]:
from tensorflow.keras.optimizers import Adam

char_model = build_char_model(len(char_tokenizer.word_index)+1)

optimizer = Adam(learning_rate=0.01)
char_model.compile(loss="categorical_crossentropy", optimizer=optimizer)


In [None]:
char_model.fit(
    char_dataset, 
    steps_per_epoch = n_batched_char_examples, 
    epochs = 5)


# Word Level Model

## Train Word Embeddings
Word usage is very different in lyrics than in everyday language. Thus, training our own word embeddings is likely to produce better performance.

In [22]:
from gensim.models import Word2Vec

embedding_size = 200

model = Word2Vec(sentences=[lyric.split() for lyric in lyrics], vector_size=embedding_size, window=5, min_count=2, workers=4)
model.wv.save_word2vec_format('lyric_embeddings.txt', binary=False)

In [23]:
print('Vocab size {}'.format(len(model.wv)))

Vocab size 18668


## Create Dataset

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer

word_tokenizer = Tokenizer()

word_tokenizer.fit_on_texts(lyrics)

In [37]:
# generate one hotted data
def word_embedding_data_generator(lyrics, tokenizer):
    unknown_word = np.zeros(len(word_tokenizer.word_index)+1, dtype=np.int32)
    unknown_word[0] = 1
    
    for lyric in lyrics:
        split_lyric = lyric.split()
        
        embeddings = [np.array(model.wv[word]) for word in lyric]
        for i in range(len(1,embeddings)):
            label = (np.array(tokenizer.word_index[split_lyric[i]]) 
                     if (split_lyric[i] in tokenizer.word_index) 
                     else unknown_word)
            yield np.array(embeddings[:i]), label

n_word_examples = sum([len(lyric.split())-1 for lyric in lyrics])
n_batched_word_examples = int(math.ceil(n_word_examples / 128))

shuffle_buffer = 500
batch_size = 128
        
word_dataset = tf.data.Dataset.from_generator(
    lambda: word_embedding_data_generator(lyrics, word_tokenizer),
    output_signature=(
        tf.TensorSpec(shape=(None, embedding_size), dtype=tf.float32),
        tf.TensorSpec(shape=(len(word_tokenizer.word_index)+1), dtype=tf.int32)))

word_dataset = word_dataset.shuffle(shuffle_buffer)
word_dataset = word_dataset.padded_batch(batch_size)

list(word_dataset.take(1))

UnknownError: KeyError: "Key ' ' not present"
Traceback (most recent call last):

  File "/Users/connorbarker/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 271, in __call__
    ret = func(*args)

  File "/Users/connorbarker/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "/Users/connorbarker/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 1004, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "<ipython-input-37-a51657bb651d>", line 9, in word_embedding_data_generator
    embeddings = [np.array(model.wv[word]) for word in lyric]

  File "<ipython-input-37-a51657bb651d>", line 9, in <listcomp>
    embeddings = [np.array(model.wv[word]) for word in lyric]

  File "/Users/connorbarker/opt/anaconda3/lib/python3.8/site-packages/gensim/models/keyedvectors.py", line 395, in __getitem__
    return self.get_vector(key_or_keys)

  File "/Users/connorbarker/opt/anaconda3/lib/python3.8/site-packages/gensim/models/keyedvectors.py", line 438, in get_vector
    index = self.get_index(key)

  File "/Users/connorbarker/opt/anaconda3/lib/python3.8/site-packages/gensim/models/keyedvectors.py", line 412, in get_index
    raise KeyError(f"Key '{key}' not present")

KeyError: "Key ' ' not present"


	 [[{{node PyFunc}}]] [Op:IteratorGetNext]