# Contents

## 1. Generating Songs

Basic functionality for generating a song implemented.

## 2. Output Analysis
### 2.1 - Classification

Investigation into the properties of the dataset as a classifier

In [142]:
# Imports
from collections import Counter
from keras.models import load_model
from gensim.models import KeyedVectors
from textatistic import Textatistic

from sklearn.metrics import classification_report, accuracy_score

import pandas as pd
import numpy as np
import re

In [2]:
# Load dataset
print("Loading data")
data = pd.read_csv("./LocalData/ProcessedSongData.csv")
print("data loaded.")

Loading data
data loaded.


## 1. Generating Songs

In [3]:
# Method for preparing user input.
# Needs to undergo the same clean-up, and tokenization.
def basic_cleaning(sentence):
    s = sentence.lower()
    s = s.replace("\n", " \n ")
    return s

def tokenize(s):
    s_list = [w for w in s.split(' ') if w.strip() != '' or w == '\r\n']
    for i, w in enumerate(s_list):
        if w == '\r\n':
            s_list[i] = '\\r\\n'
    return s_list

# Let us clean the token list with this new information.
# The below removes anything except whitespace and alphanumeric characters.
def remove_punctuation(s):
    return re.sub('[^\w\s]', ' ', s)

# There are, to my awareness, no words with consecutive 
# three same letters in english.
def remove_extra_letters(s):
    return re.sub(r"(.)\1{2,}", r"\1"*2, s)

def clean(sentence):
    s = basic_cleaning(sentence)
    s = remove_punctuation(s)
    s = remove_extra_letters(s)
    return tokenize(s)

In [4]:
data['t_corrected'] = data['corrected'].apply(tokenize)

In [5]:
# Prep vocab
print("Creating vocab.")
text_values = data.t_corrected.values
vocab = Counter()

text_in_words = []
for song in text_values:
    vocab.update(song)
    text_in_words.extend(song)

print("Number of words total: ", len(text_in_words))
print("Unique words: ", len(vocab))

vocab_keys = sorted(list(vocab.keys()))

clean_songs = text_values

word_indices = dict((c, i) for i, c in enumerate(vocab_keys))
indices_word = dict((i, c) for i, c in enumerate(vocab_keys))

Creating vocab.
Number of words total:  15749211
Unique words:  61999


In [6]:
# Load model
MODEL_FILE = './LocalData/Final_20190530170302Type_256_256.h5'
SEQUENCE_LEN= 10
test_model = load_model(MODEL_FILE)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [7]:
# Load Keyed Vectors
print("Loading Keyed Vectors.")
EMBEDDING_SIZE = 100
wv = KeyedVectors.load("./LocalData/song_word_vec.kv")

wv['\\r\\n'] = wv['\r\n']

Loading Keyed Vectors.


In [8]:
# Ignore this for now.
def generate_nn_data(sentence_list, next_word_list):
    x = np.zeros((len(sentence_list), SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
    y = np.zeros(len(next_word_list), dtype=np.int32)
    # Go through each sentence fragment
    for i, s in enumerate(sentence_list):
        # For each word in the sentence fragment, get the vector
        for t, w in enumerate(s):
            # If word not recognized, leave blank.
            if w in wv:
                x[i, t, :] = wv[w]
            else:
                print("Word unrecognized: ", w)
                
        # Set the appropriate y-value.
        y[i] = word_indices[next_word_list[i]]
    return x, y

The character '\\\\r\\\\n' is very often predicted as the most likely value - except in cases such as "i'm", in which case the 2-layer model usually ends in the correct way.

To offset this, it might be interesting to penalize the '\\\\r\\\\n' depending on how long it was since we last saw it. Such functionality has been added below

In [101]:
# Prediction method
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Only select from the *num* highest predictions.
def sample_n(preds, n, diversity):
    
    indices = sorted(range(len(preds)), key=lambda i: preds[i])[-n:]
    index_vals = []
    word_vals = []
    for index in indices:
        index_vals.append(preds[index])
        word_vals.append(indices_word[index])
    
    index = sample(index_vals, diversity)
    
    # See output.
    if False:
        print("\nSample N.")
        print("Indices: ", indices)
        print("Index vals: ", index_vals)
        print("words: ", word_vals)
        print("Chosen: ", indices[index])
    
    return indices[index]

# Counts how many steps back since
# newline character from end of string list
def words_since_newline(s_list):
    for i in range(len(s_list)):
        if s_list[-(i+1)] == '\\r\\n':
            return i
    return len(s_list)

        
def penalize_newline(preds, words_since_newline, newline_limit, p_weight, max_1=False):
    p_i = word_indices['\\r\\n']
    p_factor = p_weight*(words_since_newline/newline_limit)
    if max_1 and p_factor > 1:
        p_factor = 1
    preds[p_i] = preds[p_i]*p_factor
    
    
# Given *seed*, predicts the next word.
def predict_word(model, seed, word_num, diversity, n_largest=-1, newline_limit = -1, p_weight=1):
    # clean up the sentence.
    if type(seed) == str:
        s = clean(seed)[-word_num:]
    else:
        s = seed
    # Give user warning if some word is not recognized.
    for i, w in enumerate(s):
        if w not in wv:
            print("WARNING: The word ", w, " is not in this vocabulary.")
    
    x_pred = np.zeros((1, SEQUENCE_LEN, EMBEDDING_SIZE), dtype=np.float32)
    for t, w in enumerate(s):
        x_pred[0, t, :] = wv[w]    
    
    preds = model.predict(x_pred, verbose=0)[0]
    
    # Penalize newline?
    if not (newline_limit < 0 or p_weight < 0):
        # print("Penalizing newline!")
        penalize_newline(preds, words_since_newline(s), newline_limit, p_weight)
    
    # Extract index of next word.
    if n_largest <= 0:
        next_index = sample(preds, temperature=diversity)
    else:
        next_index = sample_n(preds, n_largest, diversity)
    # Get that word
    next_word = indices_word[next_index]

    return next_word


# Calls sample repeatedly to create a song with *word_num* words in it, returns a string.
def write_song(model, seed, song_len, word_num, diversity, n_largest=-1, newline_limit=-1, p_weight=1):
    s = clean(seed)[-word_num:]
    song = []
    song.extend(s)
    for i in range(song_len):
        pred = predict_word(model, s, word_num, diversity, 
                            n_largest=n_largest, newline_limit=newline_limit, p_weight=p_weight)
        song.append(pred)
        s = s[1:]
        s.append(pred)
    
    return " ".join(song)

### Example Song Generation

In [141]:
seed = "he sleeps under black seas waiting lies dreaming in death"

write_song(test_model, seed, 100, SEQUENCE_LEN, 
           0.3, n_largest = 50, newline_limit=12, p_weight=0.8)

  


'he sleeps under black seas waiting lies dreaming in death \\r\\n i can t i m the world \\r\\n i m a little little way \\r\\n and i m gonna be a be \\r\\n i ll be the sun \\r\\n i m a little little \\r\\n i m the night \\r\\n i m not the \\r\\n i m a little little \\r\\n i don t know you re the sun \\r\\n i m a little little \\r\\n i m a little man \\r\\n i m a little heart \\r\\n and i m to be a little \\r\\n i m a love of the way \\r\\n and i m a little'

In [11]:
pat = re.compile('(\s){2,}')

def process_output(song):
    song = song.replace('\\r\\n', '.\n')
    song = pat.sub('\n', song)
    if song[-1] != '.':
        song += '.'
    return song

# Scoring method!
def score(lyrics):
    s = Textatistic(lyrics)
    return s.flesch_score


In [12]:
# Example.
s = write_song(test_model, "a b c d there s a monkey in a", 150, SEQUENCE_LEN, 1)
p_s = process_output(s)
print(p_s)
print(score(p_s))

  


a b c d there s a monkey in a looking will yes .
where now and put about so my love it blue burn hell too two long light bring for .
it s his heart s take to make me .
i a things can i .
and you must tell me one need to never .
.
tomorrow is like the .
.
came as let him .
.
look me in a been and there dreams yeah .
and say one my place .
.
.
my heart for the whole girls .
.
there s him .
got so much only with the day of to i close looking best in friends live if said .
it s catch this way .
.
a moment .
ever gotta take her me and .
.
we do .
a love and i don t know will down s cause around now .
.
.
if i.
113.59720864661655


In [13]:
a = process_output(" ".join(clean_songs[0]))
print(a)
print(score(a))

look at her face it s a wonderful face .
and it means something special to me .
look at the way that she smiles when she sees me .
how lucky can one fellow be .
.
she s just my kind of girl she makes me feel fine .
who could ever believe that she could be mine .
she s just my kind of girl without her i m blue .
and if she ever leaves me what could i do what could i do .
.
and when we go for a walk in the park .
and she holds me and squeezes my hand .
we ll go on walking for hours and talking .
about all the things that we plan .
.
she s just my kind of girl she makes me feel fine .
who could ever believe that she could be mine .
she s just my kind of girl without her i m blue .
and if she ever leaves me what could i do what could i do .
.
.
108.67320910973086


## 2. Output Analysis

### 2.1 Output analysis

In [115]:
# Load dataset
n = 10**4
print("Using", n, "datapoints.")
d_0 = pd.read_csv('./LocalData/TestSet20190528192602.txt', sep=" ", header=None).values[0:n]
d_1 = pd.read_csv('./LocalData/TestSet20190529124136.txt', sep=" ", header=None).values[0:n]
d_2 = pd.read_csv('./LocalData/TestSet_256_256_20190530170302.txt', sep=" ", header=None).values[0:n]

Using 10000 datapoints.


In [120]:
# Models
model_file_0 = './LocalData/Final_20190528192602Type10.h5'
model_file_1 = './LocalData/Final_20190529124136Type10.h5'
model_file_2 = './LocalData/Final_20190530170302Type_256_256.h5'
model_0 = load_model(model_file_0)
model_1 = load_model(model_file_1)
model_2 = load_model(model_file_2)

In [116]:
# For whatever reason, certain words have been mistaken for numbers and logged as 'NaN' floats. These need to be fixed.
# Substituting it for the string 'nan' is not a solid measure by any means - it can be confused with the word 'nan',
# A synonym for "grandma", however, there are hopefully few enough instances below for this not to be a problem.
def fix_nans(m):   
    nan_counter = 0
    for i in range(m.shape[0]):
        for j in range(m.shape[1]):
            if type(m[i,j]) is float:
                nan_counter += 1
                m[i,j] = 'nan'
    print("Nans: ", nan_counter)

fix_nans(d_0)
fix_nans(d_1)
fix_nans(d_2)

Nans:  0
Nans:  1
Nans:  0


In [117]:
w_x0 = d_0[:,0:10]
w_x1 = d_1[:,0:10]
w_x2 = d_2[:,0:10]
w_y0 = d_0[:,10]
w_y1 = d_1[:,10]
w_y2 = d_2[:,10]

In [91]:
# Getting indices if necessary.
def get_word_indices_2D(m):
    new_m = np.zeros(m.shape, dtype=np.int32)
    for i in range(m.shape[0]):
        for j in range(m.shape[1]):
            word = m[i,j]
            if type(word) == str:
                new_m[i,j] = word_indices[m[i,j]]
            else:
                new_m[i,j] = word_indices['\\r\\n']
    return new_m

def get_word_indices_1D(m):
    new_m = np.zeros(m.shape, dtype=np.int32)
    for i in range(m.shape[0]):
            word = m[i]
            if type(word) == str:
                new_m[i] = word_indices[m[i,]]
            else:
                new_m[i] = word_indices['\\r\\n']
    return new_m

In [125]:
# Output extractions.
def get_output(model, word_data):
    out_c = 0
    output = []
    for line in word_data:
        out_c += 1
        if out_c%1000 == 0:
            print("Progress: ", str(out_c/word_data.shape[0]) )
        word = predict_word(model, line, SEQUENCE_LEN, 0.3)
        output.append(word)
    return output

In [127]:
print("Getting predictions from first dataset.")
pred0 = get_output(model_0, w_x0)
print("Getting predictions from second dataset.")
pred1 = get_output(model_1, w_x1)
print("Getting predictions from third dataset.")
pred2 = get_output(model_2, w_x2)

Getting predictions from first dataset.


  


Progress:  0.1
Progress:  0.2
Progress:  0.3
Progress:  0.4
Progress:  0.5
Progress:  0.6
Progress:  0.7
Progress:  0.8
Progress:  0.9
Progress:  1.0
Getting predictions from second dataset.
Progress:  0.1
Progress:  0.2
Progress:  0.3
Progress:  0.4
Progress:  0.5
Progress:  0.6
Progress:  0.7
Progress:  0.8
Progress:  0.9
Progress:  1.0
Getting predictions from third dataset.
Progress:  0.1
Progress:  0.2
Progress:  0.3
Progress:  0.4
Progress:  0.5
Progress:  0.6
Progress:  0.7
Progress:  0.8
Progress:  0.9
Progress:  1.0


In [143]:
print(classification_report(w_y0, pred0))
print("Classification score: ", accuracy_score(w_y0, pred0))

                precision    recall  f1-score   support

           138       0.00      0.00      0.00         1
            16       0.00      0.00      0.00         1
             2       0.00      0.00      0.00         4
           235       0.00      0.00      0.00         1
            40       0.00      0.00      0.00         1
            41       0.00      0.00      0.00         1
            60       0.00      0.00      0.00         1
             9       0.00      0.00      0.00         1
            99       0.00      0.00      0.00         1
          \r\n       0.17      0.90      0.29      1524
             a       0.03      0.02      0.03       194
         about       0.00      0.00      0.00        12
         above       0.00      0.00      0.00         2
           abu       0.00      0.00      0.00         1
      acolytes       0.00      0.00      0.00         1
        across       0.00      0.00      0.00         2
           act       0.00      0.00      0.00  

In [144]:
print(classification_report(w_y1, pred1))
print("Classification score: ", accuracy_score(w_y1, pred1))

                    precision    recall  f1-score   support

                 0       0.00      0.00      0.00         1
                00       0.00      0.00      0.00         1
                 1       0.00      0.00      0.00         1
              1997       0.00      0.00      0.00         1
                 2       0.00      0.00      0.00         2
                 3       0.00      0.00      0.00         1
                40       0.00      0.00      0.00         1
                45       0.00      0.00      0.00         1
                71       0.00      0.00      0.00         1
              \r\n       0.24      0.80      0.36      1505
                 a       0.14      0.20      0.17       165
               aah       0.00      0.00      0.00         1
           abbiamo       0.00      0.00      0.00         1
           ability       0.00      0.00      0.00         1
             about       0.00      0.00      0.00        12
               act       0.00      0.00

In [146]:
print(classification_report(w_y2, pred2))
print("Classification score: ", accuracy_score(w_y2, pred2))

                precision    recall  f1-score   support

             2       0.00      0.00      0.00         1
             3       0.00      0.00      0.00         1
             4       0.00      0.00      0.00         2
            45       0.00      0.00      0.00         1
             5       0.00      0.00      0.00         1
           500       0.00      0.00      0.00         1
             9       0.00      0.00      0.00         1
          \r\n       0.21      0.93      0.34      1495
             a       0.13      0.15      0.14       199
         about       0.00      0.00      0.00        12
         above       0.00      0.00      0.00         3
       account       0.00      0.00      0.00         1
         aches       0.00      0.00      0.00         1
          acre       0.00      0.00      0.00         1
        across       0.00      0.00      0.00         1
        acting       0.00      0.00      0.00         1
        action       0.00      0.00      0.00  

In [None]:
"""
EXPERIMENT_NUM = 5

simple5_fn = [('./LocalData/Run' + str(i) + 'Simple5.h5') for i in range(3)]
simple10_fn = [('./LocalData/Run' + str(i) + 'Simple10.h5') for i in range(10)]

run = 0
mode = 5
test_model = load_model(('./LocalData/Run' + str(run) + 'Simple' + str(mode) + '.h5'))
print("Run: ", run, "Mode: ", mode)
for i in range(EXPERIMENT_NUM):
        print("Running experiment", i)
        results = np.zeros(len(clean_songs), dtype=np.float32)
        for k in range(len(results)):
            song = clean_songs[k]
            prompt = song[0:5]

            nn_song = write_song(test_model, prompt, len(song), SEQUENCE_LEN, 0.5)
            nn_p = process_output(' '.join(nn_song))
            real_song = process_output(' '.join(song))

            results[k] = score(nn_p) - score(real_song)

        print("Mean: ", np.mean(results))
"""