# Contents

## 1. Generating Songs

In [1]:
# Imports
from collections import Counter
from keras.models import load_model
from gensim.models import KeyedVectors
from textatistic import Textatistic

import pandas as pd
import numpy as np
import re

Using TensorFlow backend.


In [2]:
# Load dataset
print("Loading data")
data = pd.read_csv("./LocalData/ProcessedSongData.csv")
print("data loaded.")

Loading data
data loaded.


## 1. Generating Songs

In [3]:
# Method for preparing user input.
# Needs to undergo the same clean-up, and tokenization.
def basic_cleaning(sentence):
    s = sentence.lower()
    s = s.replace("\n", " \n ")
    return s

def tokenize(s):
    s_list = [w for w in s.split(' ') if w.strip() != '' or w == '\r\n']
    for i, w in enumerate(s_list):
        if w == '\r\n':
            s_list[i] = '\\r\\n'
    return s_list

# Let us clean the token list with this new information.
# The below removes anything except whitespace and alphanumeric characters.
def remove_punctuation(s):
    return re.sub('[^\w\s]', ' ', s)

# There are, to my awareness, no words with consecutive 
# three same letters in english.
def remove_extra_letters(s):
    return re.sub(r"(.)\1{2,}", r"\1"*2, s)

def clean(sentence):
    s = basic_cleaning(sentence)
    s = remove_punctuation(s)
    s = remove_extra_letters(s)
    return tokenize(s)

In [4]:
data['t_corrected'] = data['corrected'].apply(tokenize)

In [5]:
# Prep vocab
print("Creating vocab.")
text_values = data.t_corrected.values
vocab = Counter()

text_in_words = []
for song in text_values:
    vocab.update(song)
    text_in_words.extend(song)

print("Number of words total: ", len(text_in_words))
print("Unique words: ", len(vocab))

vocab_keys = sorted(list(vocab.keys()))

clean_songs = text_values

word_indices = dict((c, i) for i, c in enumerate(vocab_keys))
indices_word = dict((i, c) for i, c in enumerate(vocab_keys))

Creating vocab.
Number of words total:  15749211
Unique words:  61999


In [20]:
# Load model
MODEL_FILE = './LocalData/Final_20190530170302Type_256_256.h5'
SEQUENCE_LEN= 10
test_model = load_model(MODEL_FILE)

In [21]:
# Load Keyed Vectors
print("Loading Keyed Vectors.")
EMBEDDING_SIZE = 100
wv = KeyedVectors.load("./LocalData/song_word_vec.kv")

wv['\\r\\n'] = wv['\r\n']

Loading Keyed Vectors.


In [22]:
# Ignore this for now.
def generate_nn_data(sentence_list, next_word_list):
    x = np.zeros((len(sentence_list), SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
    y = np.zeros(len(next_word_list), dtype=np.int32)
    # Go through each sentence fragment
    for i, s in enumerate(sentence_list):
        # For each word in the sentence fragment, get the vector
        for t, w in enumerate(s):
            # If word not recognized, leave blank.
            if w in wv:
                x[i, t, :] = wv[w]
            else:
                print("Word unrecognized: ", w)
                
        # Set the appropriate y-value.
        y[i] = word_indices[next_word_list[i]]
    return x, y

The character '\\\\r\\\\n' is very often predicted as the most likely value - except in cases such as "i'm", in which case the 2-layer model usually ends in the correct way.

To offset this, it might be interesting to penalize the '\\\\r\\\\n' depending on how long it was since we last saw it. Such functionality has been added below

In [88]:
# Prediction method
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Only select from the *num* highest predictions.
def sample_n(preds, n, diversity):
    
    indices = sorted(range(len(preds)), key=lambda i: preds[i])[-n:]
    index_vals = []
    word_vals = []
    for index in indices:
        index_vals.append(preds[index])
        word_vals.append(indices_word[index])
    
    index = sample(index_vals, diversity)
    
    # See output.
    if False:
        print("\nSample N.")
        print("Indices: ", indices)
        print("Index vals: ", index_vals)
        print("words: ", word_vals)
        print("Chosen: ", indices[index])
    
    return indices[index]

# Counts how many steps back since
# newline character from end of string list
def words_since_newline(s_list):
    for i in range(len(s_list)):
        if s_list[-(i+1)] == '\\r\\n':
            return i
    return len(s_list)

        
def penalize_newline(preds, words_since_newline, newline_limit, p_weight, max_1=False):
    p_i = word_indices['\\r\\n']
    p_factor = p_weight*(words_since_newline/newline_limit)
    if max_1 and p_factor > 1:
        p_factor = 1
    preds[p_i] = preds[p_i]*p_factor
    
    
# Given *seed*, predicts the next word.
def predict_word(model, seed, word_num, diversity, n_largest=-1, newline_limit = -1, p_weight=1):
    # clean up the sentence.
    if type(seed) == str:
        s = clean(seed)[-word_num:]
    else:
        s = seed
    # Give user warning if some word is not recognized.
    for i, w in enumerate(s):
        if w not in wv:
            print("WARNING: The word ", w, " is not in this vocabulary.")
    
    x_pred = np.zeros((1, SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
    for t, w in enumerate(s):
        x_pred[0, t, :] = wv[w]    
    
    preds = model.predict(x_pred, verbose=0)[0]
    
    # Penalize newline?
    if not (newline_limit < 0 or p_weight < 0):
        # print("Penalizing newline!")
        penalize_newline(preds, words_since_newline(s), newline_limit, p_weight)
    
    # Extract index of next word.
    if n_largest <= 0:
        next_index = sample(preds, diversity)
    else:
        next_index = sample_n(preds, n_largest, diversity)
    # Get that word
    next_word = indices_word[next_index]

    return next_word


# Calls sample repeatedly to create a song with *word_num* words in it, returns a string.
def write_song(model, seed, song_len, word_num, diversity, n_largest=-1, newline_limit=-1, p_weight=1):
    s = clean(seed)[-word_num:]
    song = []
    song.extend(s)
    for i in range(song_len):
        pred = predict_word(model, s, word_num, diversity, 
                            n_largest=n_largest, newline_limit=newline_limit, p_weight=p_weight)
        song.append(pred)
        s = s[1:]
        s.append(pred)
    
    return " ".join(song)

In [101]:
# Let's run some examples!
write_song(test_model, "death death devil devil devil devil evil evil songs yeah", 50, SEQUENCE_LEN, 
           0.8, n_largest = 10, newline_limit=12, p_weight=1)

  


'death death devil devil devil devil evil evil songs yeah \\r\\n you re a little one \\r\\n the night i want for the love \\r\\n i was the in my heart \\r\\n you can t \\r\\n and i don t know i don t \\r\\n and i know i ve been \\r\\n it s gonna be so i know \\r\\n'

In [66]:
pat = re.compile('(\s){2,}')

def process_output(song):
    song = song.replace('\\r\\n', '.\n')
    song = pat.sub('\n', song)
    if song[-1] != '.':
        song += '.'
    return song

# Scoring method!
def score(lyrics):
    s = Textatistic(lyrics)
    return s.flesch_score


In [12]:
# Example.
s = write_song(test_model, "a b c d there s a monkey in a", 150, SEQUENCE_LEN, 1)
p_s = process_output(s)
print(p_s)
print(score(p_s))

a b c d there s a monkey in a move time .
it we went to play but a .
to your face .
i said i time .
.
that i m fine .
and and my .
.
to to want you .
but you people say once for you .
a long day .
we say and yeah and the in .
some and tell .
and we wanna t some way started for who fight tonight while .
leave nobody at girl people .
but i .
i stay .
she s .
.
it .
i in the wind and there and it we wish but .
.
i a time .
or in this window .
.
the rain on a place on all to .
my sweet re your world it s a long night leave one feeling why day your .
i m the one i need to just never touch.
113.59720864661655


In [13]:
a = process_output(" ".join(clean_songs[0]))
print(a)
print(score(a))

look at her face it s a wonderful face .
and it means something special to me .
look at the way that she smiles when she sees me .
how lucky can one fellow be .
.
she s just my kind of girl she makes me feel fine .
who could ever believe that she could be mine .
she s just my kind of girl without her i m blue .
and if she ever leaves me what could i do what could i do .
.
and when we go for a walk in the park .
and she holds me and squeezes my hand .
we ll go on walking for hours and talking .
about all the things that we plan .
.
she s just my kind of girl she makes me feel fine .
who could ever believe that she could be mine .
she s just my kind of girl without her i m blue .
and if she ever leaves me what could i do what could i do .
.
.
108.67320910973086


In [14]:
EXPERIMENT_NUM = 5

simple5_fn = [('./LocalData/Run' + str(i) + 'Simple5.h5') for i in range(3)]
simple10_fn = [('./LocalData/Run' + str(i) + 'Simple10.h5') for i in range(10)]

run = 0
mode = 5
test_model = load_model(('./LocalData/Run' + str(run) + 'Simple' + str(mode) + '.h5'))
print("Run: ", run, "Mode: ", mode)
for i in range(EXPERIMENT_NUM):
        print("Running experiment", i)
        results = np.zeros(len(clean_songs), dtype=np.float32)
        for k in range(len(results)):
            song = clean_songs[k]
            prompt = song[0:5]

            nn_song = write_song(test_model, prompt, len(song), SEQUENCE_LEN, 0.5)
            nn_p = process_output(' '.join(nn_song))
            real_song = process_output(' '.join(song))

            results[k] = score(nn_p) - score(real_song)

        print("Mean: ", np.mean(results))

        
    

Run:  0 Mode:  5
Running experiment 0


AttributeError: 'list' object has no attribute 'lower'