# Explanation

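This notebook shows how to generate a next-word prediction from the trained song-lyrics model: it loads the processed song data, rebuilds the vocabulary, loads the trained Keras model and the word vectors, and then samples the next word for a user-supplied phrase.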

In [34]:
# Imports
from collections import Counter
from keras.models import load_model
from gensim.models import KeyedVectors

import pandas as pd
import numpy as np
import re

In [14]:
# Load dataset
# Read the processed song table from disk; the surrounding prints only
# bracket the (potentially slow) read so progress is visible.
print("Loading data")
data = pd.read_csv(filepath_or_buffer="./LocalData/ProcessedSongData.csv")
print("data loaded.")

Loading data
data loaded.


In [16]:
# Method for preparing user input.
# Needs to undergo the same clean-up, and tokenization.
def basic_cleaning(sentence):
    """Lower-case ``sentence`` and pad every newline with a space on each side.

    Padding the newline means a later split on spaces yields it as its own
    piece instead of leaving it glued to a neighbouring word.
    """
    return sentence.lower().replace("\n", " \n ")

def tokenize(s):
    r"""Split ``s`` on single spaces and return the list of kept tokens.

    Pieces that are empty or consist only of whitespace are dropped, with one
    exception: a piece that is exactly '\r\n' is kept, but emitted as its
    escaped spelling (the four characters backslash, r, backslash, n).
    """
    tokens = []
    for piece in s.split(' '):
        if piece == '\r\n':
            # Keep the line-break marker, stored in its escaped text form.
            tokens.append('\\r\\n')
        elif piece.strip() != '':
            tokens.append(piece)
    return tokens

# Let us clean the token list with this new information.
# The function below drops apostrophes and replaces every character that is
# neither whitespace nor a word character (letter, digit, underscore) with a space.
def remove_punctuation(s):
    """Strip apostrophes and blank out all other punctuation in ``s``.

    Apostrophes are deleted outright (so "don't" becomes "dont"); every other
    character that is neither a word character (``\\w``) nor whitespace
    (``\\s``) is replaced by a single space. Whitespace, including newlines,
    is left untouched.

    Returns the cleaned string.
    """
    s = s.replace("'", "")  # spellchecker recognizes these words better?
    # Raw string: '\w' and '\s' are invalid escapes in a normal string literal
    # and trigger a DeprecationWarning/SyntaxWarning on modern Python.
    return re.sub(r'[^\w\s]', ' ', s)

# To my knowledge, no English word contains the same letter
# three times in a row.
def remove_extra_letters(s):
    """Collapse any run of three or more identical characters down to two.

    Note that ``.`` matches every character except a newline, so runs of
    digits or spaces are shortened as well, not only letters.
    """
    repeated_run = r"(.)\1{2,}"  # one character followed by 2+ copies of itself
    doubled = r"\1\1"            # keep exactly two copies
    return re.sub(repeated_run, doubled, s)

def clean(sentence):
    """Run the full clean-up pipeline on raw text and return its tokens.

    Steps, in order: basic_cleaning -> remove_punctuation ->
    remove_extra_letters -> tokenize.
    """
    normalized = basic_cleaning(sentence)
    without_punctuation = remove_punctuation(normalized)
    deduplicated = remove_extra_letters(without_punctuation)
    return tokenize(deduplicated)

In [17]:
# Tokenize every entry of the 'corrected' column into a list of tokens.
data['t_corrected'] = data['corrected'].map(tokenize)

In [21]:
# Prep vocab
print("Creating vocab.")
text_values = data.t_corrected.values
clean_songs = text_values

# Walk the songs once, counting each token and building one flat token list.
vocab = Counter()
text_in_words = []
for song in text_values:
    text_in_words += song
    vocab.update(song)

print("Number of words total: ", len(text_in_words))
print("Unique words: ", len(vocab))

# Sort the vocabulary so every word gets a stable integer id, then build the
# lookup tables in both directions.
vocab_keys = sorted(vocab)
word_indices = {word: idx for idx, word in enumerate(vocab_keys)}
indices_word = dict(enumerate(vocab_keys))

Creating vocab.
Number of words total:  15749211
Unique words:  61999


In [30]:
# Load model
# SEQUENCE_LEN is the number of token positions in each model input window
# (it sizes the second axis of the input arrays built further down).
SEQUENCE_LEN = 5
MODEL_FILE = './LocalData/Run2Simple5.h5'
test_model = load_model(MODEL_FILE)

In [31]:
# Load Keyed Vectors
# EMBEDDING_SIZE sizes the last axis of the model input arrays built below.
EMBEDDING_SIZE = 100
print("Loading Keyed Vectors.")
wv = KeyedVectors.load("./LocalData/song_word_vec.kv")

# tokenize() emits a line break as the escaped text '\\r\\n', so register that
# spelling with the same vector as the real '\r\n' entry.
wv['\\r\\n'] = wv['\r\n']

Loading Keyed Vectors.


In [32]:
# Ignore this for now.
def generate_nn_data(sentence_list, next_word_list):
    """Build model inputs and targets from sentence fragments.

    Args:
        sentence_list: sequence of fragments, each a sequence of word tokens.
        next_word_list: the word following each fragment, index-aligned with
            ``sentence_list``.

    Returns:
        ``(x, y)`` where ``x`` is a float32 array of shape
        ``(len(sentence_list), SEQUENCE_LEN, EMBEDDING_SIZE)`` holding the
        word vector of each token, and ``y`` is an int32 array with the
        vocabulary index of each next word.

    Raises:
        KeyError: if a next word is missing from ``word_indices``.
    """
    n_fragments = len(sentence_list)
    x = np.zeros((n_fragments, SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
    y = np.zeros(len(next_word_list), dtype=np.int32)

    for row, fragment in enumerate(sentence_list):
        for position, word in enumerate(fragment):
            if word not in wv:
                # Unknown words keep their all-zero vector.
                print("Word unrecognized: ", word)
                continue
            x[row, position, :] = wv[word]

        # Target is the vocabulary id of the word that follows this fragment.
        y[row] = word_indices[next_word_list[row]]

    return x, y

In [71]:
# Prediction method
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (converted to float64).
        temperature: divisor applied to the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it. A temperature
            of exactly 0 selects the most probable index deterministically.

    Returns:
        The sampled index (as returned by ``np.argmax``).
    """
    preds = np.asarray(preds).astype('float64')

    # Temperature 0 is the greedy limit; dividing by it would yield inf/nan.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) == -inf is fine here (it becomes probability 0 again after exp),
    # so suppress the divide-by-zero warning it would otherwise emit.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift by the maximum before exponentiating. The softmax is unchanged by
    # a constant shift, but this keeps the largest term at exp(0) == 1 so very
    # small temperatures cannot underflow every term to 0 (-> nan after the
    # normalisation below).
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Give user sentence
def predict_word(model, sentence, word_num, diversity):
    """Predict the word that follows ``sentence``.

    The sentence is cleaned and tokenized with ``clean``, its trailing tokens
    are turned into word vectors, and the model's output distribution is
    sampled with ``sample``.

    Args:
        model: object with a Keras-style ``predict(x, verbose=0)`` method.
        sentence: raw user text.
        word_num: number of trailing tokens to use. At most ``SEQUENCE_LEN``
            tokens are ever fed to the model.
        diversity: temperature forwarded to ``sample``.

    Returns:
        The predicted word, looked up in ``indices_word``.
    """
    # Keep the last `word_num` tokens, then cap at the model's window size so
    # writing into x_pred can never run past its SEQUENCE_LEN positions.
    s = clean(sentence)[-word_num:][-SEQUENCE_LEN:]
    print(s)

    # Tokens fill positions 0..len(s)-1; any remaining positions stay zero.
    x_pred = np.zeros((1, SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
    for t, w in enumerate(s):
        if w in wv:
            x_pred[0, t, :] = wv[w]
        else:
            # Warn and leave the zero vector in place (same treatment as in
            # generate_nn_data) instead of failing with a KeyError.
            print("WARNING: The word ", w, " is not in this vocabulary.")
    print(x_pred.shape)

    # predict() returns one row per input; there is a single input here.
    preds = model.predict(x_pred, verbose=0)[0]
    print("preds: ", preds.shape)

    next_index = sample(preds, diversity)
    next_word = indices_word[next_index]

    return next_word

In [75]:
# Let's run some examples!
# The result is sampled at random, so repeated runs may return different words.
predict_word(
    model=test_model,
    sentence="skulls for the skulls throne",
    word_num=SEQUENCE_LEN,
    diversity=0.1,
)

['skulls', 'for', 'the', 'skulls', 'throne']
(1, 5, 100)
preds:  (61999,)


'andante'