This implementation follows three tutorials: a word-level LSTM text generator for song lyrics (https://medium.com/coinmonks/word-level-lstm-text-generator-creating-automatic-song-lyrics-with-neural-networks-b8a1617104fb),
its follow-up, which adds word embeddings (https://medium.com/@enriqueav/update-automatic-song-lyrics-creator-with-word-embeddings-e30de94db8d1), and a tutorial on training word embeddings (https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/).

## Importing and preparing Data

In [64]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from collections import Counter

In [102]:
# Read the pre-processed song data from disk.
# Saving to CSV turns list-valued columns into plain strings; nothing is
# converted back in this cell.
data = pd.read_csv("LocalData/ProcessedSongData.csv")
print("data loaded.")

# Draw a random sample of the rows; a fraction of 1 keeps every row and only
# shuffles their order. The index is rebuilt so rows are numbered 0..n-1.
fraction = 1
print("Using ", fraction, " of the dataset.")
data = data.sample(frac=fraction).reset_index(drop=True)
print("Sampling finished.")

data loaded.
Using  1  of the dataset.
Sampling finished.


## Tokenization
The data has already been cleaned using the script *CleanData.py*, but each song is stored as a single string, so it has to be split back into a list of tokens.

In [103]:
# Turn sentence into list of words.
def tokenize(s):
    """Split a song string on single spaces into a list of tokens.

    Pieces that are empty or consist only of whitespace are discarded, with
    one exception: a piece that is exactly '\\r\\n' is kept so that line
    breaks survive as tokens of their own.
    """
    tokens = []
    for piece in s.split(' '):
        # Keep real words, plus the stand-alone line-break marker.
        if piece == '\r\n' or piece.strip():
            tokens.append(piece)
    return tokens

# Tokenize both text columns, storing the result in a "t_"-prefixed column.
print("Starting to tokenize.")
for column, done_message in (
    ("clean", "Tokenized clean."),
    ("corrected", "Tokenized corrected."),
):
    data["t_" + column] = data[column].apply(tokenize)
    print(done_message)

Starting to tokenize.
Tokenized clean.
Tokenized corrected.


In [82]:
# Confirm output looks correct.
# Displays the token list of the song at index 0; line breaks should show up
# as separate '\r\n' tokens.
data.t_corrected[0]

['close',
 'every',
 'door',
 'to',
 'me',
 '\r\n',
 'hide',
 'all',
 'the',
 'world',
 'from',
 'me',
 '\r\n',
 'bar',
 'all',
 'the',
 'windows',
 '\r\n',
 'and',
 'shut',
 'out',
 'the',
 'light',
 '\r\n',
 'do',
 'what',
 'you',
 'want',
 'with',
 'me',
 '\r\n',
 'hate',
 'me',
 'and',
 'laugh',
 'at',
 'me',
 '\r\n',
 'darken',
 'my',
 'daytime',
 '\r\n',
 'and',
 'torture',
 'my',
 'night',
 '\r\n',
 '\r\n',
 'if',
 'my',
 'life',
 'were',
 'important',
 'i',
 '\r\n',
 'would',
 'ask',
 'will',
 'i',
 'live',
 'or',
 'die',
 '\r\n',
 'but',
 'i',
 'know',
 'the',
 'answers',
 'lie',
 '\r\n',
 'far',
 'from',
 'this',
 'world',
 '\r\n',
 '\r\n',
 'just',
 'give',
 'me',
 'a',
 'number',
 '\r\n',
 'instead',
 'of',
 'my',
 'name',
 '\r\n',
 'forget',
 'all',
 'about',
 'me',
 '\r\n',
 'and',
 'let',
 'me',
 'decay',
 '\r\n',
 'i',
 'do',
 'not',
 'matter',
 '\r\n',
 'i',
 'm',
 'only',
 'one',
 'person',
 '\r\n',
 'destroy',
 'me',
 'completely',
 '\r\n',
 'then',
 'throw',
 'me',


In [83]:
# One token list per song.
text_values = data.t_corrected.values

# Flatten all songs into a single word stream, then count how often each
# word occurs.
text_in_words = [word for song in text_values for word in song]
vocab = Counter(text_in_words)

print("Number of words total: ", len(text_in_words))
print("Unique words: ", len(vocab))

Number of words total:  1584642
Unique words:  22415


In [101]:
# Word-frequency filter.
# Words occurring fewer than MIN_WORD_FREQUENCY times are treated as rare;
# with a threshold of 2 that is every word that appears exactly once.
MIN_WORD_FREQUENCY = 2

ignored_words = {
    word for word, count in vocab.items() if count < MIN_WORD_FREQUENCY
}

print('Unique words before ignoring:', len(vocab))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words_reduced = sorted(word for word in vocab if word not in ignored_words)
print('Unique words after ignoring:', len(words_reduced))


# Index mappings over the reduced vocabulary (currently disabled):
#word_indices = dict((c, i) for i, c in enumerate(words_reduced))
#indices_word = dict((i, c) for i, c in enumerate(words_reduced))

# The reduced vocabulary is not used for training, so the word <-> index
# mappings cover the full vocabulary instead.
word_indices = {word: idx for idx, word in enumerate(vocab)}
indices_word = {idx: word for word, idx in word_indices.items()}

Unique words before ignoring: 22415
Ignoring words with frequency < 2
Unique words after ignoring: 13981


In [77]:
# Saving the vocabulary to a file.
def save_list(lines, filename):
    """Write *lines* to *filename*, one entry per line.

    The entries are joined with '\\n' (no trailing newline is added) and the
    file is overwritten if it already exists.

    Args:
        lines: iterable of strings to write.
        filename: path of the file to create or overwrite.
    """
    # Join first so a bad entry raises before the file is truncated.
    text = '\n'.join(lines)
    # The context manager closes the file even if the write fails.
    with open(filename, 'w') as out_file:
        out_file.write(text)
    
# save tokens to a vocabulary file
# Writes the frequency-filtered vocabulary (words_reduced), one word per line.
save_list(words_reduced, 'LocalData/vocab_min2.txt')

In [86]:
# Clean out words that are not in the reduced vocabulary.
# (Disabled; kept for reference.)
#clean_songs = []
#c = 0
#for song in text_values:
#    c += 1
#    if c%100 == 0:
#        print(c)
#    clean_songs.append([w for w in song if w in words_reduced])

# The filter above is skipped because it is too slow on this machine, so the
# songs are used unfiltered (rare words included). To cut the amount of data
# instead, lower `fraction` in the loading cell.
# NOTE(review): `words_reduced` is a list, so every `w in words_reduced` test
# is a linear scan; converting it to a set first would likely make the filter
# fast enough to re-enable.
clean_songs = text_values

## Training Word embedding

Below we use *gensim* to train a custom word embedding on our song dataset.

In [116]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [106]:
# Params for the word embedding.
# Passed as `size` to Word2Vec; also the per-word feature count in the
# model's input_shape.
EMBEDDING_SIZE = 100
# Passed as `window` to Word2Vec.
WINDOW_SIZE = 7

In [137]:
# Train the embedding on the tokenized songs, then save both the full model
# and the bare keyed vectors, so training does not have to happen again.
# min_count=1 keeps every word, including those that occur only once.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - confirm the installed version before re-running.
wv_mod = Word2Vec(clean_songs, size=EMBEDDING_SIZE, window=WINDOW_SIZE, min_count=1)
wv_mod.save("LocalData/song_word2vec.model")
wv_mod.wv.save("LocalData/song_word_vec.kv")

In [138]:
# Shorthand for the trained model's word vectors.
wv = wv_mod.wv

In [140]:
# Sanity check: the five nearest neighbours of 'best'.
# The query is passed as a raw vector rather than as a word, which is
# presumably why 'best' itself comes back as the top hit with similarity 1.0
# in the output.
pos = [wv['best']]
wv.most_similar(positive=pos, topn=5)

[('best', 1.0),
 ('friend', 0.6336452960968018),
 ('loving', 0.6079089641571045),
 ('because', 0.600318193435669),
 ('things', 0.549144983291626)]

# Splitting Dataset into X and Ys
The data is still a list of word strings per song. We need to be able to convert our input to word vectors, and our output needs to be a one-hot encoding, as the prediction can be interpreted as a sort of classification task.

In [None]:
# Params
# Number of time steps per input sample: the first dimension of the model's
# input_shape and the sequence axis of the generator's x array.
SEQUENCE_LEN = 5

In [141]:
# create tokenizer
# NOTE: Tokenizer comes from keras.preprocessing.text, which is imported in a
# later cell ("Preparing the model"); run that import cell first.
tokenizer = Tokenizer()
# fit the tokenizer on the documents
# clean_songs holds one token list per song.
tokenizer.fit_on_texts(clean_songs)

[[3938]]

# Preparing the model

In [94]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, Bidirectional, LSTM, Dropout

In [134]:
# Token count of the longest song.
max_length = max(map(len, clean_songs))

Songs encoded. Padding sequences.


In [133]:
# Next-word prediction network.
DROPOUT = 0.5

# The input is a window of SEQUENCE_LEN steps with EMBEDDING_SIZE features
# each, i.e. text must already be converted to word embeddings before it is
# fed in. The output is a softmax over the full vocabulary, which treats
# next-word prediction as a classification problem.
model = Sequential([
    Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, EMBEDDING_SIZE)),
    Dropout(DROPOUT),
    Dense(len(vocab)),
    Activation('softmax'),
])

In [None]:
# Generate batches to train on from the sentence list and the matching next-word list.

In [135]:
# Data generator.
def generator(sentence_list, next_word_list, batch_size):
    """Yield one-hot encoded (x, y) training batches forever.

    Args:
        sentence_list: sequence of word sequences; each is expected to hold
            at most SEQUENCE_LEN words, all present in ``word_indices``.
        next_word_list: sequence of target words, aligned index-by-index
            with ``sentence_list``.
        batch_size: number of samples per yielded batch.

    Yields:
        Tuple ``(x, y)`` of boolean arrays: ``x`` has shape
        (batch_size, SEQUENCE_LEN, vocabulary size) and ``y`` has shape
        (batch_size, vocabulary size). The position of each word's index in
        ``word_indices`` is set to True.

    Raises:
        KeyError: if a word is missing from ``word_indices``.
        IndexError: if ``sentence_list`` is empty or a sentence is longer
            than SEQUENCE_LEN.

    NOTE(review): the model in this file declares
    input_shape=(SEQUENCE_LEN, EMBEDDING_SIZE), whereas x here is one-hot
    over the vocabulary - confirm which encoding is intended before training.
    """
    # Size the one-hot axis from the same mapping used for indexing below.
    vocab_size = len(word_indices)
    index = 0
    while True:
        # Fresh arrays per batch so previously yielded batches are not mutated.
        x = np.zeros((batch_size, SEQUENCE_LEN, vocab_size), dtype=bool)
        y = np.zeros((batch_size, vocab_size), dtype=bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index]):
                x[i, t, word_indices[w]] = 1
            y[i, word_indices[next_word_list[index]]] = 1

            # Wrap around so the generator can be consumed indefinitely.
            index = (index + 1) % len(sentence_list)
        yield x, y