# Travis Lyric Generator

## Import Libraries

In [None]:
import os, sys, re, random
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from scipy import spatial
import eng_to_ipa

## Constants

In [None]:
# Directory that is scanned for the ".txt" lyric files.
TRAVIS_PATH = "TravisScott"
# When True, the cells below print a few random samples of their results.
PRINT_SAMPLES = True
# Number of samples printed by each of those cells.
N_SAMPLES_TO_PRINT = 10
# Tag appended to every lyric line to mark its end. The trailing newline is
# stripped again when the line is stored, so the stored token is "<EOL>".
EOS_TAG = '<EOL>\n'

## Import Lyrics
Data is imported line by line from the entire discography of Travis Scott gathered from Genius

Most notable changes to the lyrics during import are:
- removal of ad-libs, which are written in parentheses in each lyric file (text in square brackets is removed as well)
- converting all words to lowercase
- appending of an end of line tag, \<EOL>, denoting the end of a line

In [None]:
# Matches ad-libs "(...)" and bracketed annotations "[...]" so they can be
# dropped. Raw string: "\(" is not a valid escape in a normal string literal.
ANNOTATION_PATTERN = re.compile(r"[\(\[].*?[\)\]]")

# Every cleaned lyric line, lower-cased and terminated by the end-of-line tag.
lines = []

# Number of ".txt" lyric files that were read.
song_counter = 0
if os.path.exists(TRAVIS_PATH):
    for file_name in os.listdir(TRAVIS_PATH):
        file_path = os.path.join(TRAVIS_PATH, file_name)
        if os.path.isfile(file_path) and os.path.splitext(file_path)[1] == ".txt":
            song_counter += 1
            # The context manager closes each lyric file once it has been read.
            with open(file_path) as lyric_file:
                for line in lyric_file:
                    line = line.replace(',', '')
                    line = ANNOTATION_PATTERN.sub("", line)
                    # Collapse whitespace runs (left behind where an ad-lib was
                    # removed) so that splitting on ' ' later never yields
                    # empty-string tokens; this also trims both ends.
                    line = " ".join(line.lower().split())

                    # Tested after cleaning, so lines that held nothing but an
                    # annotation do not become bare "<EOL>" lines.
                    if line:
                        lines.append(line + " " + EOS_TAG.strip())

if PRINT_SAMPLES:
    # Clamp the sample size: random.sample raises if asked for more items
    # than the population holds (e.g. when no lyric files were found).
    for sample in random.sample(lines, min(N_SAMPLES_TO_PRINT, len(lines))):
        print(sample)

In [None]:
# Tokenise every lyric line on single spaces.
lines_split = [line.split(' ') for line in lines]

# One tagged document per lyric line. The tags argument must be a list:
# a bare string such as "12" would be treated as the two tags "1" and "2",
# so many lines would end up sharing the same ten digit tags.
docs = [TaggedDocument(words, [str(i)]) for i, words in enumerate(lines_split)]

# Passing the corpus to the constructor already builds the vocabulary and
# trains the model. build_vocab() must not be called again afterwards: it
# would rebuild the vocabulary and re-initialise the trained weights.
d2v = Doc2Vec(docs, vector_size=100, window=3, min_count=1, workers=4, epochs=50)

# Word-level vectors; their key_to_index / index_to_key tables are used below
# as the word <-> integer id mapping.
word_vector = d2v.wv


In [None]:
# Build the training sequences: every lyric line is turned into word ids and
# then expanded into all of its prefixes of length >= 2.
encoded_lines = []
for line in lines_split:
    # Shift the ids by one so that 0 stays free for padding.
    encoded_line = [word_vector.key_to_index[word] + 1 for word in line]

    # Make n-gram sequences of the encoded line (prefixes of 2..len words).
    for end in range(2, len(encoded_line) + 1):
        encoded_lines.append(encoded_line[:end])

# Pad sequences (and convert to numpy array)
encoded_lines = pad_sequences(encoded_lines)

print("Shape: {}".format(encoded_lines.shape))
if PRINT_SAMPLES:
    # Sample over the rows of encoded_lines itself (not over len(lines), which
    # counts a different collection) and never ask for more rows than exist.
    n_rows = len(encoded_lines)
    for x in random.sample(range(n_rows), min(N_SAMPLES_TO_PRINT, n_rows)):
        print(encoded_lines[x])



In [None]:
# Summary of the imported data set, printed as "<label> <value>" rows.
dataset_stats = (
    ("NUMBER OF SONGS", song_counter),
    ("NUMBER OF LINES", len(lines)),
    ("NUMBER OF WORDS IN DICT", len(word_vector.index_to_key)),
    ("MAXIMUM LENGTH OF A LINE", encoded_lines.shape[1]),
)
for label, value in dataset_stats:
    print(label, value)

In [None]:
# The last id of every padded sequence is the word to predict (labels); all
# ids before it are the model input (X).
X, labels = encoded_lines[:, :-1], encoded_lines[:, -1]
# One-hot encode the labels; one extra class accounts for the padding id 0.
Y = to_categorical(labels, num_classes=len(word_vector.key_to_index) + 1)

print(f"X shape: {X.shape}, Y shape: {Y.shape}")

## Model
We train a model to predict the next word based on previous words

In [None]:
# Next-word classifier: embedding -> bidirectional LSTM -> softmax over all
# word ids (Y.shape[1] classes, padding id included).
n_classes = Y.shape[1]
model = Sequential([
    Embedding(n_classes, 256, input_length=X.shape[1]),
    Bidirectional(LSTM(128)),
    Dense(n_classes, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# To reuse stored weights instead of training, uncomment:
#model.load_weights("guttaNN/guttaNN")

# Train from scratch; `history` keeps the object returned by fit().
history = model.fit(X, Y, epochs=100, verbose=1)

## Generation of text
Given a trained final model, we can now generate some text with the following procedure:
1. Predict the top 3 next words for every word in the dictionary
2. For each of these 3 words together with the prior word, iteratively predict the next words until an \<EOL> tag appears
3. Convert each encoded sentence back to its original form

In [None]:
# Word ids are the key_to_index positions shifted by one; id 0 is padding.
PAD_ID = 0
# Look the end-of-line id up instead of assuming it is always 1.
EOL_ID = word_vector.key_to_index[EOS_TAG.strip()] + 1
vocab_size = len(word_vector.key_to_index)
max_input_length = X.shape[1]

# Generated lines as lists of words, each one terminated by the <EOL> tag.
generated_lines = []
# Valid word ids run from 1 to vocab_size inclusive; iterating
# range(vocab_size) would silently skip the last word of the vocabulary.
for word in range(1, vocab_size + 1):
    if word == EOL_ID:  # a line cannot start with the end-of-line tag
        continue
    padded_seed = pad_sequences([[word]], maxlen=max_input_length)
    prediction = model(padded_seed)
    # Ids of the three most probable successors (in no particular order).
    top_3 = np.argpartition(prediction, -3)[0][-3:]
    for succ in top_3:
        succ = int(succ)
        if succ == PAD_ID:
            # Padding is not a word and has no entry in index_to_key.
            continue
        succ_sentence = [word, succ]
        if succ != EOL_ID:  # the line is already complete otherwise
            # Generate at most as many words as the model input can hold.
            for _ in range(max_input_length):
                padded_sentence = pad_sequences([succ_sentence], maxlen=max_input_length)
                succ_prediction = model(padded_sentence)
                succ_next_word = int(np.argmax(succ_prediction))
                if succ_next_word == PAD_ID:
                    # Treat predicted padding as "stop" without storing it:
                    # decoding id 0 would wrap around to index_to_key[-1].
                    break
                succ_sentence.append(succ_next_word)
                if succ_next_word == EOL_ID:
                    break
        # The verse-building code strips the last token of every line as the
        # <EOL> tag, so make sure each line really ends with it.
        if succ_sentence[-1] != EOL_ID:
            succ_sentence.append(EOL_ID)
        generated_sentence = [word_vector.index_to_key[i - 1] for i in succ_sentence]

        generated_lines.append(generated_sentence)

# random.choice raises on an empty list, so only sample when lines exist.
if PRINT_SAMPLES and generated_lines:
    for i in range(N_SAMPLES_TO_PRINT):
        print(random.choice(generated_lines))

## Generation of a verse
Now that we have a database of lines, we can generate a verse by combining these lines.
Starting from a random line, we calculate the cosine similarity between the current line and all of the other lines in the database, so that consecutive lines of a verse share context.
Then, we look at how well each of these lines rhymes with the current line, as rhyme is a key aspect of rap lyrics.
Rap verses typically have lines of similar length. For instance, it would be weird if one line had a length of 5 words and the next line had a length of 20. Therefore we skip candidate lines whose length differs too much from the previous line, using a threshold on the relative length similarity.

In [None]:
# Infer one Doc2Vec vector per generated line. The list is index-aligned with
# generated_lines and is read by get_top_k_similar_lines.
generated_lines_vector = [d2v.infer_vector(line) for line in generated_lines]

In [None]:
def get_top_k_similar_lines(line, k=100):
    """Return the generated lines most similar to *line*, most similar first.

    Args:
        line: The query line as a list of words.
        k: Maximum number of lines to return. Values larger than the number
            of generated lines are clamped.

    Returns:
        Up to ``k`` entries of ``generated_lines``, ordered by increasing
        cosine distance between their vector in ``generated_lines_vector``
        and the vector inferred for *line*. Empty if there are no generated
        lines or ``k`` is not positive.
    """
    if k <= 0 or not generated_lines_vector:
        return []
    line_vector = d2v.infer_vector(line)
    # spatial.distance.cosine is a *distance* (1 - cosine similarity):
    # the smaller the value, the more similar the two lines are.
    cosine_dist = np.array(
        [spatial.distance.cosine(line_vector, lv) for lv in generated_lines_vector]
    )
    # Clamp k so that asking for more lines than exist cannot raise.
    k = min(k, len(cosine_dist))
    # Ascending sort puts the closest lines first; callers walk the result
    # from index 0 and expect the best candidates there.
    top_k_index = np.argsort(cosine_dist, kind="stable")[:k]
    return [generated_lines[index] for index in top_k_index]

In [None]:
def calculate_line_length(line, line_to_compare):
    """Return how similar two lines are in length, as a score in [0, 1].

    The score is ``1 - |len(a) - len(b)| / max(len(a), len(b))``: 1 means the
    lines have the same number of elements, 0 means one of them is empty
    while the other is not.

    Args:
        line: First line (any sized sequence, e.g. a list of words).
        line_to_compare: Second line.

    Returns:
        The length similarity; 1.0 when both lines are empty.
    """
    longest = max(len(line), len(line_to_compare))
    if longest == 0:
        # Two empty lines have identical length; avoid dividing by zero.
        return 1.0
    return 1 - abs(len(line) - len(line_to_compare)) / longest


In [None]:
def get_end_rhymes(line):
    """Return, for every word of *line*, its transcriptions without consonant letters.

    Each word is passed to ``eng_to_ipa.convert`` with ``retrieve_all=True``
    (presumably yielding every known IPA transcription of the word -- confirm
    against the eng_to_ipa documentation). From each transcription all
    characters found in the consonant string below are dropped; everything
    else is kept in its original order.

    Args:
        line: Iterable of words.

    Returns:
        A list with one entry per word; each entry is the list of that word's
        stripped transcriptions.
    """
    consonants = 'bcdfghjklmnpqrstvwxz'
    return [
        [
            "".join(symbol for symbol in transcription if symbol not in consonants)
            for transcription in eng_to_ipa.convert(word, retrieve_all=True, keep_punct=False)
        ]
        for word in line
    ]

def calculate_end_rhyme(line, line_to_compare):
    """Count how many trailing words of the two lines rhyme with each other.

    The lines are compared word by word starting from their last words. Two
    words count as rhyming when any stripped transcription of the word from
    *line_to_compare* also occurs among those of the word from *line* (see
    ``get_end_rhymes``). Counting stops at the first pair that does not match
    or when the shorter line is exhausted.

    Args:
        line: Reference line as a list of words.
        line_to_compare: Candidate line as a list of words.

    Returns:
        The number of consecutive rhyming word pairs, counted from the end.
    """
    own_sounds = get_end_rhymes(line)
    other_sounds = get_end_rhymes(line_to_compare)

    rhyme_count = 0
    # zip() over the reversed lists pairs last with last, second-to-last with
    # second-to-last, ... and ends with the shorter line.
    for own_variants, other_variants in zip(reversed(own_sounds), reversed(other_sounds)):
        if not any(variant in own_variants for variant in other_variants):
            break
        rhyme_count += 1

    return rhyme_count

def get_rhyme_candidates(line, similar_lines):
    """Rank *similar_lines* by how well their endings rhyme with *line*.

    The last element of *line* and of every candidate is dropped before
    scoring (it is expected to be the <EOL> tag).

    Args:
        line: Reference line as a list of words.
        similar_lines: Candidate lines, each a list of words.

    Returns:
        Indices into *similar_lines*, ordered from the highest to the lowest
        ``calculate_end_rhyme`` score; candidates with equal scores keep
        their original relative order.
    """
    reference = line[:-1]
    rhyme_scores = [
        calculate_end_rhyme(reference, candidate[:-1])
        for candidate in similar_lines
    ]
    return sorted(range(len(rhyme_scores)), key=rhyme_scores.__getitem__, reverse=True)

In [None]:
# Minimum length similarity (see calculate_line_length) that a candidate must
# reach against the previous line of the verse.
LENGTH_THRESHOLD = 0.75


def _is_acceptable(candidate, verse, must_change_end_word):
    """Return True if *candidate* may follow the last line of *verse*.

    A candidate is rejected when it already occurs in the verse, when its
    length is too different from the previous line, or -- if
    *must_change_end_word* is set -- when it ends on the same word as the
    previous line (the word in front of the trailing <EOL> tag).
    """
    previous_line = verse[-1]
    if candidate in verse:
        return False
    if calculate_line_length(previous_line, candidate) < LENGTH_THRESHOLD:
        return False
    # Slices instead of [-2] so that one-token lines cannot raise IndexError.
    if must_change_end_word and candidate[-2:-1] == previous_line[-2:-1]:
        return False
    return True


for j in range(10):
    # Choose a random starting sentence
    generated_verse = [random.choice(generated_lines)]
    while len(generated_verse) < 8:    # Make a verse of 8 lines
        previous_line = generated_verse[-1]
        cosine_lines = get_top_k_similar_lines(previous_line, 200)
        # The fifth line is picked on similarity alone (no rhyme ranking and
        # no end-word check); every other line is picked by rhyme.
        similarity_only = len(generated_verse) == 4
        if similarity_only:
            candidates = cosine_lines
        else:
            rhyme_index = get_rhyme_candidates(previous_line, cosine_lines)
            candidates = [cosine_lines[index] for index in rhyme_index]
        # First candidate that passes all filters, or None when the list is
        # exhausted (indexing past the end used to raise IndexError here).
        next_line = next(
            (
                candidate
                for candidate in candidates
                if _is_acceptable(candidate, generated_verse, not similarity_only)
            ),
            None,
        )
        if next_line is None:
            # No usable continuation: end this verse early instead of crashing.
            break
        generated_verse.append(next_line)
    print("-----  VERSE {}".format(j+1))
    for line in generated_verse:
        print(line)