# Generating Stanzas

In [4]:
import pickle
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from utils import tokenize_song, tokenize_song_by_stanza, convertSamplesToEmbeddings, read_embeddings


In [126]:
N_GRAM = 5
SENTENCE_BEGIN = '<s>'
SENTENCE_END = '</s>'
BATCH_SIZE = 1000
NEW_LINE = 'newlinebreak'
PROCESSED_DATA_FILE = "../data/processed/processed_data.csv"
STANZAS_FILE = "../data/processed/stanzas.txt"
OUR_RNN_FILE = "../models/rnn_model.h5"
OUR_LSTM_FILE = "../models/lstm_model.h5"


### Make tokenizer

In [112]:
stanzas_as_words = []
with open(STANZAS_FILE, 'r', encoding='utf-8') as txtfile:
    for line in txtfile:
        # Split each line into a list using '\t' as the separator
        line_data = line.strip().split('\t')
        stanzas_as_words.append(line_data)

In [113]:
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(stanzas_as_words)
# Convert stanzas into numerical indexes (list of lists of string -> list of lists of int)
stanzas = tokenizer.texts_to_sequences(stanzas_as_words)

In [114]:
print(stanzas[0])

[161, 13, 13, 13, 144, 82, 81, 193, 3, 9, 11, 8, 1103, 193, 1, 7, 9, 807, 177, 922, 6, 10, 1, 144, 82, 4, 80, 19, 48, 1561, 43, 48, 1317, 10, 1, 92, 820, 47, 62, 3938, 28, 50, 1, 1, 48, 11, 34, 15, 375, 17, 125, 3, 48, 332, 10, 97, 393, 1, 108, 90, 151, 178, 19, 48, 90, 28, 220, 50, 1, 48, 11, 34, 15, 375, 17, 125, 3, 225, 81, 2, 22, 246, 1, 7, 53, 48, 151, 840, 10, 40, 90, 2, 18, 3, 40, 90, 2, 18, 50, 1, 1, 7, 43, 23, 63, 29, 8, 227, 16, 4, 1300, 1, 7, 48, 1591, 10, 7, 22615, 15, 223, 1, 23, 45, 63, 21, 487, 29, 937, 7, 534, 1, 126, 25, 4, 160, 19, 23, 952, 1, 1, 48, 11, 34, 15, 375, 17, 125, 3, 48, 332, 10, 97, 393, 1, 108, 90, 151, 178, 19, 48, 90, 28, 220, 50, 1, 48, 11, 34, 15, 375, 17, 125, 3, 225, 81, 2, 22, 246, 1, 7, 53, 48, 151, 840, 10, 40, 90, 2, 18, 3, 40, 90, 2, 18, 50, 1, 1, 14, 14, 14]


### Get Index to Embeddings

In [115]:
index_to_embeddings = read_embeddings("../reference-materials/lyrics_embeddings.txt", tokenizer=tokenizer)

### Prediction Function

In [119]:
def predict_word(model, tokenizer, index_to_embedding, last_words):
    """
    Predicts the next word in a sequence.
    """
    # YOUR CODE HERE
    predictions = model.predict(convertSamplesToEmbeddings([last_words], index_to_embedding))[0]
    prob_size = sum(predictions)
    predictions = [x/prob_size for x in predictions]
    chosen_index = np.random.choice(len(predictions), p=predictions, size=1)
    return chosen_index[0]

def predict_stanza(model, tokenizer, index_to_embedding, genre, ngram=N_GRAM):
    """
    Predicts the next stanza in a song.
    """
    stanza = []
    genre_embedding = tokenizer.word_index[genre]
    for i in range(ngram - 2):
        stanza.append(tokenizer.word_index[SENTENCE_BEGIN])
    while stanza[-1] != tokenizer.word_index[SENTENCE_END] and len(stanza) < 40:
        last_words = stanza[-ngram + 2:]
        last_words_with_genre = [genre_embedding]  + last_words
        stanza.append(predict_word(model, tokenizer, index_to_embedding, last_words_with_genre))
    stanza = [tokenizer.index_word[index] for index in stanza]
    return stanza

In [120]:
def print_stanza(stanza):
    """
    Prints a stanza.
    """
    stanza = [word for word in stanza if word not in [SENTENCE_BEGIN, SENTENCE_END]]
    lines = []
    line = []
    for word in stanza:
        if word == NEW_LINE:
            lines.append(line)
            print(' '.join(line))
            line = []
        else:
            line.append(word)


## Our RNN Model

In [132]:
rnn_model = keras.models.load_model(OUR_RNN_FILE)
predicted_stanza = predict_stanza(rnn_model, tokenizer, index_to_embeddings, 'pop')



In [133]:
print_stanza(predicted_stanza)

i must have yawned and cuddled up for yet another night
and rattling on the roof i must have made my front door at eight o'clock or so


## Our LSTM Model

In [138]:
our_lstm_model = keras.models.load_model(OUR_RNN_FILE)
predicted_stanza = predict_stanza(our_lstm_model, tokenizer, index_to_embeddings, 'country')



In [139]:
print_stanza(predicted_stanza)

i 've done it ever since sitsnewlinebreak think i 'm pretty sure you see your new
but like a life grand-daughter like a before 's with a while
if a life
