This implementation follows three tutorials: a word-level LSTM song lyrics generator (https://medium.com/coinmonks/word-level-lstm-text-generator-creating-automatic-song-lyrics-with-neural-networks-b8a1617104fb),
its follow-up, which adds word embeddings (https://medium.com/@enriqueav/update-automatic-song-lyrics-creator-with-word-embeddings-e30de94db8d1), and a general tutorial on training word embeddings (https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/).

## Importing and preparing Data

In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from collections import Counter

In [12]:
# Load the pre-processed song data from CSV.
data = pd.read_csv("LocalData/ProcessedSongData.csv")
# NOTE: list-valued columns are converted into plain strings when saved to CSV.
# Nothing in this cell turns them back into lists; the text columns are
# re-tokenized in the Tokenization section instead.
print("data loaded.")
# Optional down-sampling of the dataset (disabled):
# fraction = 1
# print("Using ", fraction, " of the dataset.")
# data = data.sample(frac=fraction)
# data = data.reset_index(drop=True)
# print("Sampling finished.")

data loaded.


## Tokenization
The data has already been cleaned, using the script *CleanData.py*, but needs to be converted into token format again.

In [13]:
# Turn sentence into list of words.
def tokenize(s):
    """Split a cleaned lyric string on single spaces into a list of tokens.

    Empty and whitespace-only pieces are dropped, with one exception: the
    line-break marker '\\r\\n' is kept as a token of its own.
    """
    tokens = []
    for piece in s.split(' '):
        # '\r\n' consists only of whitespace, so it has to be allowed explicitly.
        if piece == '\r\n' or piece.strip():
            tokens.append(piece)
    return tokens

# Tokenize both text columns; the result for column X is stored as "t_X".
print("Starting to tokenize.")
for column in ("clean", "corrected"):
    data["t_" + column] = data[column].apply(tokenize)
    print("Tokenized %s." % column)

Starting to tokenize.
Tokenized clean.
Tokenized corrected.


In [114]:
# Confirm output looks correct: display the token list of the first song.
# Line breaks should appear as separate '\r\n' tokens.
data.t_corrected[0]

['look',
 'at',
 'her',
 'face',
 'it',
 's',
 'a',
 'wonderful',
 'face',
 '\r\n',
 'and',
 'it',
 'means',
 'something',
 'special',
 'to',
 'me',
 '\r\n',
 'look',
 'at',
 'the',
 'way',
 'that',
 'she',
 'smiles',
 'when',
 'she',
 'sees',
 'me',
 '\r\n',
 'how',
 'lucky',
 'can',
 'one',
 'fellow',
 'be',
 '\r\n',
 '\r\n',
 'she',
 's',
 'just',
 'my',
 'kind',
 'of',
 'girl',
 'she',
 'makes',
 'me',
 'feel',
 'fine',
 '\r\n',
 'who',
 'could',
 'ever',
 'believe',
 'that',
 'she',
 'could',
 'be',
 'mine',
 '\r\n',
 'she',
 's',
 'just',
 'my',
 'kind',
 'of',
 'girl',
 'without',
 'her',
 'i',
 'm',
 'blue',
 '\r\n',
 'and',
 'if',
 'she',
 'ever',
 'leaves',
 'me',
 'what',
 'could',
 'i',
 'do',
 'what',
 'could',
 'i',
 'do',
 '\r\n',
 '\r\n',
 'and',
 'when',
 'we',
 'go',
 'for',
 'a',
 'walk',
 'in',
 'the',
 'park',
 '\r\n',
 'and',
 'she',
 'holds',
 'me',
 'and',
 'squeezes',
 'my',
 'hand',
 '\r\n',
 'we',
 'll',
 'go',
 'on',
 'walking',
 'for',
 'hours',
 'and',
 't

In [57]:
# Create vocab: flatten all tokenized songs into one long word list and count
# how often each word occurs.
text_values = data.t_corrected.values
text_in_words = [word for lyrics in text_values for word in lyrics]
# Counter keeps keys in first-seen order, the same order as updating it song by song.
vocab = Counter(text_in_words)

print("Number of words total: ", len(text_in_words))
print("Unique words: ", len(vocab))

Number of words total:  15749211
Unique words:  61999


In [58]:
# Alphabetically sorted list of every distinct word (iterating a Counter yields its keys).
vocab_keys = sorted(vocab)

In [16]:
# Calculate word frequency.
# Words occurring fewer than MIN_WORD_FREQUENCY times are collected in
# ignored_words; with a threshold of 2 that is every word seen exactly once.
MIN_WORD_FREQUENCY = 2

ignored_words = {word for word, count in vocab.items() if count < MIN_WORD_FREQUENCY}

print('Unique words before ignoring:', len(vocab))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words_reduced = sorted(word for word in vocab if word not in ignored_words)
print('Unique words after ignoring:', len(words_reduced))


# Index mappings for the reduced vocabulary (disabled):
#word_indices = dict((c, i) for i, c in enumerate(words_reduced))
#indices_word = dict((i, c) for i, c in enumerate(words_reduced))

# The reduced vocabulary is not used, so the mappings cover the full
# vocabulary, numbered in the Counter's iteration order.
word_indices = {word: position for position, word in enumerate(vocab)}
indices_word = {position: word for position, word in enumerate(vocab)}

Unique words before ignoring: 61999
Ignoring words with frequency < 2
Unique words after ignoring: 40742


In [77]:
# Saving the vocabulary to a file.
def save_list(lines, filename):
    """Write *lines* to *filename*, one item per line.

    Args:
        lines: iterable of strings; they are joined with '\\n' (no trailing
            newline is written).
        filename: path of the file to create or overwrite.

    Raises:
        TypeError: if an item of *lines* is not a string.
        OSError: if the file cannot be opened or written.
    """
    # Convert lines to a single blob of text.
    text = '\n'.join(lines)
    # The context manager closes the file even when write() raises.
    with open(filename, 'w') as file:
        file.write(text)
    
# Save the frequency-filtered vocabulary (words_reduced) to a vocabulary file,
# one word per line.
save_list(words_reduced, 'LocalData/vocab_min2.txt')

In [20]:
# Clean out words that are not in the reduced vocabulary (disabled).
# NOTE(review): words_reduced is a sorted list, so every `w in words_reduced`
# test below is a linear scan; converting it to a set first would make this
# filter far cheaper.
#clean_songs = []
#c = 0
#for song in text_values:
#    c += 1
#    if c%100 == 0:
#        print(c)
#    clean_songs.append([w for w in song if w in words_reduced])

# The filtering step above is skipped because it is too slow on this machine,
# so the songs are used unfiltered, i.e. with the full vocabulary.
clean_songs = text_values

## Training Word embedding

Below we use *gensim* to train a custom word embedding on our song dataset.

In [17]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [93]:
# Params
# Dimensionality of each word vector (passed to Word2Vec as `size`).
EMBEDDING_SIZE = 100
# Context window (passed to Word2Vec as `window`).
WINDOW_SIZE = 7

In [21]:
# Train Word2Vec on the tokenized songs, then save both the full model and its
# keyed vectors so training does not have to happen again.
# min_count=1 is intended to keep every word, including those seen only once.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` -- confirm against the installed gensim version.
wv_mod = Word2Vec(clean_songs, size=EMBEDDING_SIZE, window=WINDOW_SIZE, min_count=1)
wv_mod.save("LocalData/song_word2vec.model")
wv_mod.wv.save("LocalData/song_word_vec.kv")

In [22]:
# Keep a handle on the trained model's keyed vectors (word -> vector lookup).
wv = wv_mod.wv

In [94]:
# Sanity check of the embedding: look up the vector for 'best' and ask for the
# 5 entries most similar to it.
pos = [wv['best']]
wv.most_similar(positive=pos, topn=5)

array([-7.54496038e-01,  1.05630708e+00,  3.95771623e-01, -4.22454953e-01,
       -2.53180265e+00,  4.60597456e-01, -6.05708733e-02,  6.86838627e-02,
        2.08732295e+00,  1.66664451e-01, -3.04791498e+00,  8.90742958e-01,
       -1.61413455e+00, -1.16835706e-01,  9.39812243e-01,  1.94252932e+00,
        5.46967447e-01, -4.93695676e-01,  1.42721319e+00, -1.99940360e+00,
        1.54394722e+00, -2.29958987e+00,  1.20335591e+00, -3.53519821e+00,
       -4.96471941e-01,  2.39508200e+00, -1.02130353e+00,  2.71982241e+00,
       -6.38455212e-01,  1.58451009e+00, -9.05888319e-01, -1.35535121e-01,
        1.92685497e+00, -1.81356299e+00,  2.09541583e+00,  2.16882300e+00,
        3.77132088e-01, -3.77653623e+00,  1.07760155e+00,  1.54926598e+00,
        1.55171466e+00, -6.91413105e-01,  3.90740585e+00,  5.69357097e-01,
       -1.65043259e+00, -4.10019815e-01,  1.63975847e+00,  2.64878035e+00,
        1.27710676e+00, -3.83743095e+00, -1.08813596e+00, -2.11548758e+00,
       -1.22665405e+00, -

In [24]:
# Reload the keyed vectors saved after training, so that the Word2Vec training
# cell does not need to be re-run.
wv = KeyedVectors.load("LocalData/song_word_vec.kv")

# Splitting Dataset into X and Ys
The data is still lists of words (strings). The input must be converted to word vectors, and the output must be a one-hot encoding, because predicting the next word can be treated as a classification task over the vocabulary.

In [27]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [25]:
# Params
# Number of consecutive words in each input sequence; the word that follows
# the sequence is the prediction target.
SEQUENCE_LEN = 5

In [64]:
# Create a Keras tokenizer with default settings.
tokenizer = Tokenizer()
# Fit the tokenizer on the tokenized songs, so that it assigns an integer
# index to every word it sees.
tokenizer.fit_on_texts(clean_songs)

In [65]:
# Look up the tokenizer's integer index for every vocabulary word, in
# vocab_keys order. The word/index pairs are written to disk further down.
# NOTE(review): presumably every word in vocab_keys is known to the tokenizer;
# if any were skipped, token_nums would be shorter than vocab_keys and the
# positional pairing would be misaligned -- confirm.
token_nums = tokenizer.texts_to_sequences([vocab_keys])[0]

In [82]:
# Write the word -> tokenizer-index dictionary to a file, one
# "<repr(word)> <index>" pair per line. repr() keeps special tokens such as
# the line break on a single line by escaping them.
if len(vocab_keys) != len(token_nums):
    # The pairing below is positional; a length mismatch would silently
    # attach indices to the wrong words.
    raise ValueError(
        "vocab_keys and token_nums differ in length: %d != %d"
        % (len(vocab_keys), len(token_nums))
    )

# The context manager closes the file even if a write fails.
with open("LocalData/tokenizer_dict.txt", "w") as f:
    for word, token_num in zip(vocab_keys, token_nums):
        f.write(repr(word) + " " + str(token_num) + "\n")

In [86]:
# Build two lookup tables from the saved tokenizer dictionary:
#   token_to_string: tokenizer index -> word
#   string_to_token: word -> tokenizer index
# Each line of the file has the form "<repr(word)> <index>".
token_to_string = dict()
string_to_token = dict()
# The context manager closes the file even if parsing fails; iterating over
# the file reads exactly the lines it contains.
with open("LocalData/tokenizer_dict.txt", "r") as f:
    for line in f:
        line = line.rstrip("\n")
        if not line:
            continue
        # Split on the last space: the index is always the final field.
        quoted_key, tok_text = line.rsplit(" ", 1)
        tok = int(tok_text)
        # Remove the surrounding quotes added by repr(). Escape sequences
        # inside the key (e.g. for the line-break token) are NOT decoded.
        key = quoted_key[1:-1]
        token_to_string[tok] = key
        string_to_token[key] = tok

In [119]:
# IMPORTANT: the line-break token was written to the dictionary file through
# repr(), so after stripping the quotes its key is the escaped four-character
# text backslash-r-backslash-n, not the real two-character '\r\n'. Add an
# alias so that lookups with the real token work.
string_to_token['\r\n'] = string_to_token['\\r\\n']

In [92]:
# Generate training pairs from all songs: every window of SEQUENCE_LEN
# consecutive words becomes an input sequence, and the word right after the
# window becomes its target. Windows never cross song boundaries.
STEP = 1
sentences = []
next_words = []

for song in clean_songs:
    # range() is empty for songs with at most SEQUENCE_LEN words, so such
    # songs contribute nothing.
    for i in range(0, len(song) - SEQUENCE_LEN, STEP):
        # Slice the current song. Slicing the flattened corpus
        # (text_in_words) here would make every song repeat windows taken
        # from the beginning of the corpus.
        sentences.append(song[i: i + SEQUENCE_LEN])
        next_words.append(song[i + SEQUENCE_LEN])

print('Sequences:', len(sentences))

Sequences: 15460961


In [128]:
# Data generator to avoid memory issues.
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    looped = False
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
        y = np.zeros((batch_size, len(vocab)), dtype=np.bool)
        for i in range(batch_size):
            # For each word in the sentence fragment, get the vector
            for t, w in enumerate(sentence_list[index]):
                x[i, t,:] = wv[w]
            # Set the appropriate y-value.
            y[i, string_to_token[next_word_list[index]]] = 1
            # Each batch does a different sentence.
            index = index + 1
            # Reset the index at the end.
            if index == len(sentence_list):
                index = 0
                looped = True
        # Stopping condition: If we have gone around, stop yielding.
        if looped:
            return None
        else:
            yield x, y

In [133]:
# Smoke test: run the generator over the first five sequences with a batch
# size of 1 and print each (x, y) batch it yields.
for x, y in generator(sentences[0:5], next_words[0:5], 1):
    print("X: ")
    print(x)
    print("\ny:")
    print(y)

X: 
[[[-1.26525140e+00 -2.17486191e+00 -2.71750259e+00 -1.69165003e+00
   -1.62080848e+00 -2.59787512e+00  1.60541451e+00  2.62714553e+00
    2.48251629e+00  1.04202032e+00  4.64015901e-01  1.83763325e+00
   -6.02256727e+00  1.13758780e-01 -6.55554712e-01  4.86683321e+00
    3.31622314e+00 -1.87176251e+00  1.45121396e-01  3.68690848e+00
   -1.37359127e-01 -9.09220815e-01 -1.02506888e+00 -4.66403693e-01
   -2.41911578e+00  2.78507924e+00 -2.16101453e-01  2.09621549e+00
   -3.52564037e-01 -2.67870992e-01 -2.11816001e+00  1.99989164e+00
    2.94896215e-01  1.95431507e+00 -1.41456664e+00  2.37476811e-01
    1.63989413e+00  2.42088246e+00 -6.09634757e-01 -4.28877020e+00
   -3.36652994e+00 -9.72085238e-01  1.55412093e-01  9.62847769e-01
    1.40335357e+00  3.55855989e+00 -1.40522063e+00  4.26149145e-02
   -1.94994962e+00 -1.10135877e+00  2.52184176e+00  4.50249434e-01
   -2.42768908e+00 -1.81290102e+00 -1.05302858e+00  4.25047112e+00
    2.09376097e+00  1.25074852e+00  3.36021781e-01  8.1386

# Preparing the model

In [158]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, Bidirectional, LSTM, Dropout
from keras.callbacks import ModelCheckpoint, LambdaCallback, EarlyStopping
from keras.optimizers import RMSprop

In [147]:
# Params
# Dropout rate applied after the recurrent layer.
DROPOUT = 0.5
# Number of samples per training batch produced by the data generator.
BATCH_SIZE = 100

In [135]:
# Length, in tokens, of the longest song.
max_length = max(map(len, clean_songs))

In [160]:
# Model: a bidirectional LSTM over pre-computed word vectors, followed by
# dropout and a softmax over the vocabulary.

model = Sequential()
# The network receives word vectors directly (SEQUENCE_LEN x EMBEDDING_SIZE
# per sample); there is no Embedding layer, so words must be converted to
# vectors before they are fed in.
model.add(Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, EMBEDDING_SIZE) ) )
model.add(Dropout(DROPOUT))
# Classification of next word: one output unit per vocabulary entry.
model.add(Dense(len(vocab))) 
model.add(Activation('softmax'))

# No metrics are requested, so only the loss is reported during training.
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


In [151]:
# Adapted from https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py
# (a character-level example) to this notebook's word-level, word-vector setup.
def on_epoch_end(epoch, _):
    """Print lyrics generated by the current model; invoked after each epoch.

    A random training sequence is used as the seed. For several sampling
    temperatures ("diversity") the model repeatedly predicts the next word,
    which is appended to a sliding window of SEQUENCE_LEN words.

    Relies on module-level state: `sentences`, `wv`, `model`,
    `token_to_string`, `SEQUENCE_LEN` and `EMBEDDING_SIZE`.

    Args:
        epoch: index of the epoch that just finished.
        _: the Keras logs dict (unused).
    """
    # Imported locally because the notebook's import cells do not provide them.
    import random
    import sys

    words_to_generate = 50

    print()
    print('----- Generating text after Epoch: %d' % epoch)

    seed = list(sentences[random.randint(0, len(sentences) - 1)])
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        sentence = list(seed)
        seed_text = ' '.join(sentence)
        print('----- Generating with seed: "' + seed_text + '"')
        sys.stdout.write(seed_text)

        for _step in range(words_to_generate):
            # Same input encoding as the training generator: one word vector
            # per position of the window.
            x_pred = np.zeros((1, SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
            for t, word in enumerate(sentence):
                x_pred[0, t, :] = wv[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = _sample(preds, diversity)
            next_word = token_to_string.get(next_index)
            # The line-break token is stored in its escaped form in
            # token_to_string; map it back to the real token.
            if next_word == '\\r\\n':
                next_word = '\r\n'
            # Stop this sample if the index has no word or the word has no
            # vector; it could not be fed back into the model.
            if next_word is None or next_word not in wv:
                break

            sentence = sentence[1:] + [next_word]

            sys.stdout.write(' ' + next_word)
            sys.stdout.flush()
        print()


def _sample(preds, temperature=1.0):
    """Draw one index from the probability vector `preds`.

    The probabilities are re-weighted by `temperature` first: values below 1
    favour the most likely entries, values above 1 flatten the distribution.
    """
    preds = np.asarray(preds).astype('float64')
    # Clip before the log so zero probabilities do not produce -inf.
    preds = np.log(np.maximum(preds, 1e-12)) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return int(np.argmax(probas))



In [152]:
# Callbacks for training.
# The model is compiled without metrics and trained without validation data,
# so 'loss' is the only value Keras logs per epoch. The checkpoint file name
# and both monitors therefore use 'loss'; referring to acc / val_loss /
# val_acc would fail when the file name is formatted at the end of an epoch.
# {epoch} and {loss} are filled in by ModelCheckpoint; the %d fields are
# filled in here.
file_path = "LocalData/checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-loss{loss:.4f}" % (
    len(vocab),
    SEQUENCE_LEN,
    MIN_WORD_FREQUENCY
)
# Keep only checkpoints that improve on the best training loss so far.
checkpoint = ModelCheckpoint(file_path, monitor='loss', save_best_only=True)
# Print generated sample text after every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
# Stop when the training loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='loss', patience=5)
callbacks_list = [checkpoint, print_callback, early_stopping]

# Perform training

In [161]:
def _endless_batches():
    """Yield training batches forever by restarting generator() after each pass.

    generator() stops after one pass over the data, whereas fit_generator
    keeps requesting batches across all epochs.
    """
    while True:
        produced_any = False
        for batch in generator(sentences, next_words, BATCH_SIZE):
            produced_any = True
            yield batch
        if not produced_any:
            # Without this guard an empty pass would spin forever.
            raise ValueError("generator() produced no batches; not enough training data")


# One epoch is roughly one pass over the data. Because the batch stream is
# endless, steps_per_epoch does not have to match the pass length exactly.
model.fit_generator(_endless_batches(),
    steps_per_epoch=max(1, len(sentences) // BATCH_SIZE),
    epochs=100,
    callbacks=callbacks_list)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
   211/154610 [..............................] - ETA: 28:35:17 - loss: 0.8058

KeyboardInterrupt: 