# Explanation

This notebook showcases how to generate outputs.

In [1]:
# Imports
from collections import Counter
from keras.models import load_model
from gensim.models import KeyedVectors
from textatistic import Textatistic

import pandas as pd
import numpy as np
import re

Using TensorFlow backend.


In [2]:
# Load dataset
print("Loading data")
data = pd.read_csv("./LocalData/ProcessedSongData.csv")
print("data loaded.")

Loading data
data loaded.


In [3]:
# Method for preparing user input.
# Needs to undergo the same clean-up, and tokenization.
def basic_cleaning(sentence):
    s = sentence.lower()
    s = s.replace("\n", " \n ")
    return s

def tokenize(s):
    s_list = [w for w in s.split(' ') if w.strip() != '' or w == '\r\n']
    for i, w in enumerate(s_list):
        if w == '\r\n':
            s_list[i] = '\\r\\n'
    return s_list

# Let us clean the token list with this new information.
# The below removes anything except whitespace and alphanumeric characters.
def remove_punctuation(s):
    return re.sub('[^\w\s]', ' ', s)

# There are, to my awareness, no words with consecutive 
# three same letters in english.
def remove_extra_letters(s):
    return re.sub(r"(.)\1{2,}", r"\1"*2, s)

def clean(sentence):
    s = basic_cleaning(sentence)
    s = remove_punctuation(s)
    s = remove_extra_letters(s)
    return tokenize(s)

In [6]:
data['t_corrected'] = data['corrected'].apply(tokenize)

In [7]:
# Prep vocab
print("Creating vocab.")
text_values = data.t_corrected.values
vocab = Counter()

text_in_words = []
for song in text_values:
    vocab.update(song)
    text_in_words.extend(song)

print("Number of words total: ", len(text_in_words))
print("Unique words: ", len(vocab))

vocab_keys = sorted(list(vocab.keys()))

clean_songs = text_values

word_indices = dict((c, i) for i, c in enumerate(vocab_keys))
indices_word = dict((i, c) for i, c in enumerate(vocab_keys))

Creating vocab.
Number of words total:  15749211
Unique words:  61999


In [32]:
# Load model
MODEL_FILE = './LocalData/Final_20190529124136Type10.h5'
SEQUENCE_LEN= 10
test_model = load_model(MODEL_FILE)

In [9]:
# Load Keyed Vectors
print("Loading Keyed Vectors.")
EMBEDDING_SIZE = 100
wv = KeyedVectors.load("./LocalData/song_word_vec.kv")

wv['\\r\\n'] = wv['\r\n']

Loading Keyed Vectors.


In [10]:
# Ignore this for now.
def generate_nn_data(sentence_list, next_word_list):
    x = np.zeros((len(sentence_list), SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
    y = np.zeros(len(next_word_list), dtype=np.int32)
    # Go through each sentence fragment
    for i, s in enumerate(sentence_list):
        # For each word in the sentence fragment, get the vector
        for t, w in enumerate(s):
            # If word not recognized, leave blank.
            if w in wv:
                x[i, t, :] = wv[w]
            else:
                print("Word unrecognized: ", w)
                
        # Set the appropriate y-value.
        y[i] = word_indices[next_word_list[i]]
    return x, y

In [11]:
# Prediction method
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


# Give user sentence
def predict_word(model, seed, word_num, diversity):
    # clean up the sentence.
    if type(seed) == str:
        s = clean(seed)[-word_num:]
    else:
        s = seed
    # Give user warning if some word is not recognized.
    for i, w in enumerate(s):
        if w not in wv:
            print("WARNING: The word ", w, " is not in this vocabulary.")
    
    x_pred = np.zeros((1, SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
    for t, w in enumerate(s):
        x_pred[0, t, :] = wv[w]    
    
    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds, diversity)
    next_word = indices_word[next_index]

    return next_word

def write_song(model, seed, song_len, word_num, diversity):
    s = clean(seed)[-word_num:]
    song = []
    song.extend(s)
    for i in range(song_len):
        pred = predict_word(model, s, word_num, diversity)
        song.append(pred)
        s = s[1:]
        s.append(pred)
    
    return " ".join(song)
        
        
        

In [33]:
# Let's run some examples!
write_song(test_model, "End of passion play, crumbling away I'm your source", 100, SEQUENCE_LEN, 0.8)

  


'end of passion play crumbling away i m your source \\r\\n they s make the baby they over in the \\r\\n i i wish i feel not the more are a and with me \\r\\n \\r\\n for \\r\\n it should you now a day \\r\\n \\r\\n that you i a \\r\\n believe all they can not \\r\\n \\r\\n we d be the way \\r\\n no care \\r\\n bout turn a time you a long \\r\\n of \\r\\n ooh i m re who \\r\\n \\r\\n \\r\\n the good \\r\\n s to go \\r\\n \\r\\n is the \\r\\n \\r\\n \\r\\n well you the i \\r\\n and the \\r\\n \\r\\n \\r\\n i \\r\\n'

In [16]:
pat = re.compile('(\s){2,}')

def process_output(song):
    song = song.replace('\\r\\n', '.\n')
    song = pat.sub('\n', song)
    if song[-1] != '.':
        song += '.'
    return song

# Scoring method!
def score(lyrics):
    s = Textatistic(lyrics)
    return s.flesch_score


In [36]:
# Example.
s = write_song(test_model, "uncle donald had a farm e i e i o", 150, SEQUENCE_LEN, 0.8)
p_s = process_output(s)
print(p_s)
print(score(p_s))

  


uncle donald had a farm e i e i o me .
put .
have been it .
keep .
you had .
black i said .
my big end .
but far with i .
.
you m why .
the dance out even away .
tell .
but my .
i coming forever m once give the life .
you in it .
.
i .
.
by it down .
you us .
from the .
just if i gonna be i too believe .
oh it so you .
you down a is just because do .
of i this more the time .
just the night .
.
that into s right .
that right .
knows a other .
on you .
just don t we .
.
you want to there ll home .
.
.
you s his right up i .
on a things .
and you touch my .
.
.
115.03426829268294


In [15]:
a = process_output(" ".join(clean_songs[0]))
print(a)
print(score(a))

look at her face it s a wonderful face .
and it means something special to me .
look at the way that she smiles when she sees me .
how lucky can one fellow be .
.
she s just my kind of girl she makes me feel fine .
who could ever believe that she could be mine .
she s just my kind of girl without her i m blue .
and if she ever leaves me what could i do what could i do .
.
and when we go for a walk in the park .
and she holds me and squeezes my hand .
we ll go on walking for hours and talking .
about all the things that we plan .
.
she s just my kind of girl she makes me feel fine .
who could ever believe that she could be mine .
she s just my kind of girl without her i m blue .
and if she ever leaves me what could i do what could i do .
.
.
108.67320910973086


In [19]:
EXPERIMENT_NUM = 5

simple5_fn = [('./LocalData/Run' + str(i) + 'Simple5.h5') for i in range(3)]
simple10_fn = [('./LocalData/Run' + str(i) + 'Simple10.h5') for i in range(10)]

run = 0
mode = 5
test_model = load_model(('./LocalData/Run' + str(run) + 'Simple' + str(mode) + '.h5'))
print("Run: ", run, "Mode: ", mode)
for i in range(EXPERIMENT_NUM):
        print("Running experiment", i)
        results = np.zeros(len(clean_songs), dtype=np.float32)
        for k in range(len(results)):
            song = clean_songs[k]
            prompt = song[0:5]

            nn_song = write_song(test_model, prompt, len(song), SEQUENCE_LEN, 0.5)
            nn_p = process_output(' '.join(nn_song))
            real_song = process_output(' '.join(song))

            results[k] = score(nn_p) - score(real_song)

        print("Mean: ", np.mean(results))

        
    

Run:  0 Mode:  5
Running experiment 0


AttributeError: 'list' object has no attribute 'lower'