# Novel Lyrics Synthesis

<br>
<a href="https://colab.research.google.com/github/danieltjw/novel-lyrics-synthesis/blob/master/nls.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" align="left"/>
</a>
<br>
                                                                                                 
In this project, Recurrent Neural Networks (RNNs) are applied to sequence modeling and Natural Language Processing (NLP) tasks. Character-level language models were trained on 100 song lyrics and then used to generate new lyrics. The quality of the generated lyrics was evaluated using 3 metrics—ability to form valid words, ability to emulate the original sentence structure (frequency distribution of sentence length) and similarity to the original lyrics (BLEU score).

https://danieltjw.github.io/novel-lyrics-synthesis/

### Code Sections

1. Data

2. Auxiliary

3. Training

4. Evaluation

5. Synthesis

### Prerequisites

#### A. TensorFlow with GPU support

https://www.tensorflow.org/install/

#### B. num2words library

https://github.com/savoirfairelinux/num2words

- `!pip install num2words`

#### C. Song lyrics

csv file (with 'Lyrics' header)

- Location: /data/100_songs.csv

#### D. SCOWL wordlist

https://github.com/en-wl/wordlist

- Location: /downloads/SCOWL_words_50.txt

### Code References

LSTM Text Generation: https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py

In [None]:
""" 0. Libraries """
!pip install num2words
# !pip install keras
# !pip install nltk

# Colab Only
# Menu > Runtime > Change runtime type
# - Runtime type: Python 3
# - Hardware accelerator: GPU
# Replaced 100_songs.csv with copyright free sherlock.csv
!wget https://raw.githubusercontent.com/danieltjw/novel-lyrics-synthesis/master/data/sherlock.csv
!wget https://raw.githubusercontent.com/danieltjw/novel-lyrics-synthesis/master/downloads/SCOWL_words_50.txt

In [None]:
""" 1. Data """
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from collections import Counter, OrderedDict as odict
from num2words import num2words

VER = 1.0
DEFAULT_MAXLEN = 50 # default network sequence length

def process_text(corpus):
    """Normalise the text of one song, aiming to reduce the vocabulary.

    Steps, in order: literal replacements (curly quotes, punctuation,
    informal spellings), removal of single quotes wrapping a phrase,
    removal of parenthesised vocalisations, insertion of a space after a
    comma that joins two words, spelling out of numbers and removal of
    bracketed section tags such as ``[chorus]``.

    corpus: text to process (the caller lower-cases it beforehand)
    returns: the processed text
    """
    # Replaces strings according to a list of (old, new) tuples.
    # Ordering is important: the replacements are applied sequentially,
    # e.g. curly quotes are straightened before '"' is removed.
    old_new = [("‘","'"),("’","'"),("“",'"'),("”",'"'),("!",""),("?",""),
               ('"',""),(":"," "),(" \n","\n"),(";",","),
               ('[hook]','[chorus]'),
               ("2am","2 a.m."),("'45","45"),("9th","ninth"),
               ("in'\n","ing\n"),("in',","ing,"),("in' ","ing "),
               ("'til\n","'till\n"),("'til,","'till,"),("'til ","'till ")]
    for (old, new) in old_new:
        corpus = corpus.replace(old, new)

    # removes quotes around words
    # {'I'm alive'} -> {I'm alive}
    corpus = re.sub(r'([\s\n])\'(.*?)\'([\s\n])', r'\1\2\3', corpus)
    # removes embellishments: vocalisation
    # {(ah-aah-aah)} -> {}
    corpus = re.sub(r'\(.*?\)', r'', corpus)
    # {apart,now} -> {apart, now}
    corpus = re.sub(r'([a-z]+,)([a-z]+)', r'\1 \2', corpus)
    # converts numbers from its numeric to alphabetic form
    # {45} -> {forty-five}
    corpus = re.sub(r'\d+', lambda x: num2words(int(x.group())), corpus)
    # removes section tags together with their trailing newline
    # {[chorus]\n} -> {}
    corpus = re.sub(r"\[[a-z]+\][\n]", r'', corpus)
    return corpus

def word_token(sentence, output):
    """Split `sentence` on whitespace and collect its words into `output`.

    One trailing comma is stripped from each token; tokens that end up
    empty are dropped. `output` may be a set (words are added) or a list
    (words are appended); any other container is returned untouched.
    """
    collect_in_set = isinstance(output, set)
    collect_in_list = isinstance(output, list)
    for raw in sentence.split():
        word = raw[:-1] if raw.endswith(',') else raw
        if not word:
            continue
        if collect_in_set:
            output.add(word)
        elif collect_in_list:
            output.append(word)
    return output

def print_data_state(data):
    """Print one summary line per entry of the `data` dictionary.

    Configuration entries are printed as-is, counters as the sum of their
    counts and every other entry as its length.
    """
    config_keys = ('maxlen', 'test_size', 'random_state')
    print('\nData state:')
    for key, value in data.items():
        if key in config_keys:
            summary = value
        elif 'counter' in key:
            summary = sum(value.values())
        else:
            summary = len(value)
        print(key, ':', summary)

def get_chars_index(string, chars='\n'):
    """Return the index of every character of `string` that is in `chars`."""
    return [index for index, char in enumerate(string) if char in chars]

def valid_dist_between_chars(string, chars='\n'):
    """Return the lengths of the full sentences found in `string`.

    A full sentence is the span between two consecutive delimiters
    (`chars`, newline by default). Its length is the distance between the
    two delimiter positions, so it includes one delimiter. Distances of 1
    (two adjacent delimiters, i.e. an empty sentence) are excluded.

    string: text to scan
    chars: characters treated as sentence delimiters
    returns: list of distances, in order of appearance
    """
    # `chars` has to be forwarded, otherwise only newlines are ever used
    chars_index = get_chars_index(string, chars)
    dist = [after - before for before, after in zip(chars_index, chars_index[1:])]
    return [dis for dis in dist if dis > 1]

def form(value, dec=2):
    """Return `value` as a string; floats are rounded to `dec` decimals."""
    if not isinstance(value, float):
        return str(value)
    spec = '0.' + str(dec) + 'f'
    return format(round(value, dec), spec)

def describe_sentence(sentences, maxlen):
    """Print the mean number of full sentences per input, as a percentage.

    sentences: input sequences to scan for newline-delimited sentences
    maxlen: sequence length, only used in the printed message
    """
    full_sentences = sum(len(valid_dist_between_chars(sequence))
                         for sequence in sentences)
    percentage = form(full_sentences/len(sentences)*100) + '%'
    print('\nMean chance an input will contain a full sentence(s) (',
          'maxlen: ', maxlen, '): ', percentage, sep='')
    
def data_vector_split(data, maxlen=DEFAULT_MAXLEN, test_size=0.01, random_state=42):
    """Vectorise the songs and split the result into train / test sets.

    Every song is cut into overlapping sequences of `maxlen` characters
    (stride 1), each paired with the character that follows it. Sequences
    and targets are one-hot encoded over `data['charset']`.

    data: dictionary holding 'songs' and 'charset'; updated in place with
      - train / test dataset: X_train, y_train, X_test, y_test
      - data configuration: maxlen, test_size, random_state
      - vectorisation configuration: char_indices, indices_char
    maxlen: length of each input sequence
    test_size, random_state: forwarded to train_test_split
    returns: None
    """
    # if the data configuration is the same as in a previous call, skip
    if 'maxlen' in data:
        if (data['maxlen'] == maxlen and data['test_size'] == test_size
                and data['random_state'] == random_state):
            return

    charset = data['charset']
    char_indices = {c: i for i, c in enumerate(charset)}
    indices_char = dict(enumerate(charset))

    sentences = []
    next_chars = []
    # cuts corpus into sequences of maxlen characters, song-wise
    # ensures no bleeding of lyrics between adjacent songs
    for song in data['songs']:
        for i in range(len(song) - maxlen):
            sentences.append(song[i: i + maxlen])
            next_chars.append(song[i + maxlen])
    print('Num sequences:', len(sentences))
    describe_sentence(sentences, maxlen)
    print('Vectorisation...')
    # the builtin `bool` is used as dtype: the `np.bool` alias is missing
    # from NumPy 1.24-1.26
    X = np.zeros((len(sentences), maxlen, len(charset)), dtype=bool)
    y = np.zeros((len(sentences), len(charset)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1

    # splits data into train / test set
    # random state can be used to introduce variation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    # stored explicitly (instead of eval on variable names), in the same
    # key order as before
    for name, value in (('X_train', X_train), ('y_train', y_train),
                        ('X_test', X_test), ('y_test', y_test),
                        ('maxlen', maxlen), ('test_size', test_size),
                        ('random_state', random_state),
                        ('char_indices', char_indices),
                        ('indices_char', indices_char)):
        data[name] = value

def describe_corpus(data):
    """Print length statistics of the songs, lines and words of the corpus.

    Reads 'songs', 'lines_counter', 'words_counter' and 'charset' from
    `data` and prints a pandas `describe()` summary for each length
    distribution, followed by the character vocabulary.
    """
    def _print_stats(lengths):
        print(pd.Series(lengths).describe())

    print('\nDescribing each song len:')
    _print_stats([len(song) for song in data['songs']])

    print('\nDescribing each line len:')
    # each line is counted with its newline char (+1), which brings it in
    # line with the 2nd metric being evaluated
    _print_stats([len(line)+1 for line in data['lines_counter'].elements()])

    print('\nDescribing each word len:')
    _print_stats([len(word) for word in data['words_counter'].elements()])

    charset = data['charset']
    print('\nVocab:', len(charset), charset)

def process_corpus(filename='data/100_songs.csv'):
    """Load the lyrics csv and build the `data` dictionary used downstream.

    Each entry of the 'Lyrics' column is lower-cased, cleaned with
    process_text and tallied at song, line, word and character level.
    The vectorised train / test split is then added by data_vector_split
    with its default configuration.

    filename: path of a csv file with a 'Lyrics' column
    returns: ordered dictionary with 'songs', the line / word / char
             counters, 'wordset', 'charset', 'lines_token' and the
             entries added by data_vector_split
    """
    raw_corpus = pd.read_csv(filename)
    print(raw_corpus.shape)
    data = odict([
        ('songs', []),
        ('lines_counter', Counter()),
        ('words_counter', Counter()),
        ('chars_counter', Counter()),
        ('wordset', None),
        ('charset', None),
        ('lines_token', [])
    ])
    for song in raw_corpus['Lyrics'].str.lower():
        song = process_text(song)
        data['songs'].append(song)
        data['chars_counter'].update(song)
        lines = song.splitlines()
        data['lines_counter'].update(lines)
        for line in lines:
            words = word_token(line, [])
            if len(words) > 0:
                data['words_counter'].update(words)
                data['lines_token'].append(words)
    # empty lines are not sentences; drop their tally
    del data['lines_counter']['']
    data['wordset'] = set(data['words_counter'])
    data['charset'] = sorted(list(data['chars_counter']))
    data_vector_split(data)
    return data

# Builds the module-level `data` dictionary from the downloaded corpus.
# It is bound as the default `data` argument of run_train, run_eval and
# run_prime, and read directly by evaluate_generated.
data = process_corpus(filename='sherlock.csv')
describe_corpus(data)
print_data_state(data)

In [None]:
""" 2. Auxiliary """
"""
Common methods called by Training and Evaluation
Enables spreadsheet reports
"""
import csv
import os
from datetime import datetime

def path_join(args):
    """Join the path components in `args` and return the absolute path."""
    joined = os.path.join(*args)
    return os.path.abspath(joined)

def write_header(csv_writer, filepath):
    """Write the header row matching the kind of report named by `filepath`.

    The kind is detected from the first of 'train', 'eval' or 'run' that
    appears in `filepath`; nothing is written when none of them does.
    """
    describe_stats = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
    headers = [
        ('train', ['val_loss', 'epoch', 'best_of',
                   'type', 'batch_size', 'val_split',
                   'maxlen', 'layer', 'unit',
                   'train_start', 'train_dur',
                   'filepath', 'history']),
        ('eval', ['div_1.0', 'words', 'new', 'valid', 'invalid', 'chars']
                 + ['sen_len_count'] + describe_stats
                 + ['bleu_count'] + describe_stats
                 + ['delim', 'mean_of', 'test_num',
                    'eval_start', 'eval_dur',
                    'filepath', 'result']),
        ('run', ['Start Run', 'best_of', 'para_combo', 'work_dir',
                 'para_range', 'VER', 'Comments']),
    ]
    for marker, columns in headers:
        if marker in filepath:
            csv_writer.writerow(columns)
            break

def write_row(output, output_file='saved_models/output.csv'):
    """Append one row to a csv report, creating the file when needed.

    A header row chosen by write_header is written first when the file
    does not exist yet. If the file cannot be opened because of a
    PermissionError (e.g. it is open in a spreadsheet application), the
    row is written to a sibling '<name>_locked.csv' file instead.

    output: sequence of values forming the row
    output_file: path of the csv file to append to
    """
    directory = os.path.dirname(output_file)
    # a bare filename has no directory part and os.makedirs('') would raise
    if directory:
        os.makedirs(directory, exist_ok=True)
    needs_header = not os.path.exists(output_file)
    try:
        with open(output_file, 'a', newline='') as csvfile:
            csv_writer = csv.writer(csvfile)
            if needs_header:
                write_header(csv_writer, output_file)
            csv_writer.writerow(output)
    except PermissionError:
        # if file to be written to is open (locked), write to temporary file
        # happens often enough to warrant this
        print('WARNING !!! File Locked : ' + output_file)
        with open(output_file.replace('.csv', '_locked.csv'), 'a', newline='') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(output)
        
def str_time(time):
    """Return str(time) without its fractional-seconds part.

    Works for datetime and timedelta values. Cutting at the decimal point
    (rather than dropping a fixed number of characters) keeps the result
    correct when the microseconds are 0 and no fraction is printed.
    """
    return str(time).partition('.')[0]

In [None]:
""" 3. Training """
from keras import backend
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.layers import CuDNNLSTM, LSTM, CuDNNGRU, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping
from glob import glob
import itertools
import random
import json
import sys
import os

def rnn_model(para, data):
    """Build and compile a stacked recurrent network.

    para: hyper-parameters; reads 'maxlen', 'type', 'unit' and 'layer'
    data: dataset dictionary; reads 'charset' for the input / output width
    returns: compiled Sequential model of `layer` recurrent layers followed
             by a softmax Dense layer over the character set
    """
    layer_types = {
        'CuDNNLSTM': CuDNNLSTM,
        'CuDNNGRU': CuDNNGRU,
        'LSTM': LSTM,
        'GRU': GRU,
    }
    maxlen = para['maxlen']
    model_type = para['type']
    unit = para['unit']
    layer = para['layer']
    vocab_size = len(data['charset'])

    model = Sequential()
    # the first layer declares the input shape; it returns full sequences
    # only when more recurrent layers follow
    model.add(layer_types[model_type](unit, return_sequences=layer > 1,
                                      input_shape=(maxlen, vocab_size)))
    # every remaining layer except the last one returns full sequences
    for depth in range(1, layer):
        model.add(layer_types[model_type](unit, return_sequences=depth < layer - 1))

    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

def train_model(para, data, run, run_index):
    """Use hyper-parameters (para) to create and train a model

    para: ordered hyper-parameters; its values are formatted positionally
          into the model name, so the key order must be
          type, batch_size, val_split, maxlen, layer, unit
    data: dataset dictionary; reads 'X_train' and 'y_train'
    run: run configuration; reads 'verbose' and 'work_dir'
    run_index: index of this candidate, embedded in the save filename
    returns: ordered dict with, in insertion order, val_loss, epoch,
             start_train, dur_train, filepath, history
             (run_train slices its values by position)
    """
    verbose = run['verbose']
    backend.clear_session()
    # 'train' dict holds information to be written to spreadsheet
    train = odict([
        ('val_loss', None),
        ('epoch', None),
        ('start_train', datetime.now())
    ])
    # naming convention used to organise directory structure
    # and also uniquely identify a model's save file
    model_name = '%s_bs-%d_vs-%.2f_ml-%d_layer-%d_unit-%d'%(tuple(para.values()))
    # '.r<run_index>' marks a candidate; best_train strips it from the winner
    filepath = run['work_dir'] + model_name + '.r' + str(run_index) + '.hdf5'
    
    model = rnn_model(para, data)
    # callbacks
    checkpointer = ModelCheckpoint(filepath, verbose=verbose, period=1, save_best_only=True)
    early_stopper = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, verbose=verbose)
    # tensorboarder = TensorBoard('logs/' + model_name, histogram_freq=0)
    callbacks = [checkpointer, early_stopper]

    history = model.fit(data['X_train'], data['y_train'],
                        batch_size=para['batch_size'],
                        validation_split=para['val_split'],
                        shuffle=True,
                        epochs=100,
                        callbacks=callbacks,
                        verbose=verbose)

    # reports the lowest validation loss and the (1-based) epoch it occurred at
    val_loss_list = history.history['val_loss']
    min_index = val_loss_list.index(min(val_loss_list))
    # note: val_loss is stored as a formatted string, not a float
    train['val_loss'] = form(val_loss_list[min_index], 5)
    train['epoch'] = min_index+1
    train['dur_train'] = str_time(datetime.now() - train['start_train'])
    train['start_train'] = str_time(train['start_train'])
    train['filepath'] = filepath
    train['history'] = history.history
    return train    

def best_train(train_list):
    """Select the best model from candidates based on validation loss.

    The best candidate's save file is renamed to its final name (the
    '.r<index>' candidate marker is removed); the save files of all other
    candidates are deleted. Candidates that reached their best epoch within
    the first 5 epochs are reported and logged via write_row.

    train_list: candidate results as returned by train_model
    returns: the entry of `train_list` with the lowest validation loss,
             with its 'filepath' updated to the final name
    """
    # 'val_loss' is stored as a formatted string; compare numerically,
    # otherwise '10.00000' would sort before '9.50000'
    train_list_vloss = [float(train['val_loss']) for train in train_list]
    min_index = train_list_vloss.index(min(train_list_vloss))
    for index, train in enumerate(train_list):
        if train['epoch'] <= 5:
            print("Low epoch detected:", train['epoch'], train['filepath'])
            write_row([json.dumps(train, sort_keys=True)])
        if min_index == index:
            # best candidate will be renamed as a finalisation step
            old_filepath = train['filepath']
            # split on the LAST '.r' so that a '.r' occurring earlier in the
            # path (e.g. in a directory name) is left alone
            stem, marker, _ = old_filepath.rpartition('.r')
            if not marker:
                stem = old_filepath
            train['filepath'] = stem + '.hdf5'
            os.rename(old_filepath, train['filepath'])
        else:
            # other candidates' save file will be deleted
            os.remove(train['filepath'])
    return train_list[min_index]

def str_para_range(para_range):
    """Generate the directory name based on the hyper-parameter ranges.

    Each hyper-parameter contributes '<abbrev>-<v1>-<v2>...'; 'type' has an
    empty abbreviation and contributes its values only. The parts are
    joined with '_' and followed by the version tag, e.g.
    'CuDNNGRU_bs-128_vs-0.05_ml-5_layer-2_unit-221_v1.0/'.

    para_range: ordered mapping of hyper-parameter name -> list of values
    returns: directory name, ending with '/'
    """
    abbrev = {'type': '',
              'batch_size': 'bs',
              'val_split': 'vs',
              'maxlen': 'ml',
              'layer': 'layer',
              'unit': 'unit'}
    parts = []
    for key in para_range:
        values = '-'.join(map(str, para_range[key]))
        prefix = abbrev[key]
        # the separating dash is only emitted with a non-empty abbreviation,
        # so no leading character has to be chopped off afterwards
        parts.append(prefix + '-' + values if prefix else values)
    parts.append('v' + str(VER) + '/')
    return '_'.join(parts)

def pre_model_exist(para, run):
    """Tell whether a finalised save file already exists for `para`.

    para: ordered hyper-parameters, formatted positionally into the name
    run: run configuration; reads 'work_dir'
    returns: True when the file exists (model training can be skipped)
    """
    name_format = '%s_bs-%d_vs-%.2f_ml-%d_layer-%d_unit-%d'
    model_name = name_format%(tuple(para.values()))
    filepath = run['work_dir'] + '/' + model_name + '.hdf5'
    already_trained = os.path.exists(filepath)
    if already_trained:
        print('Skipping pre-existing model: ' + os.path.basename(filepath))
    return already_trained

def run_train(para_range, run=None, data=data):
    """Iterates through all combination of the hyper-parameter ranges

    For every combination, `run['best_of']` candidate models are trained;
    each candidate is logged to 'train_candidate.csv' and the best one
    (see best_train) to 'train.csv', both inside the working directory.
    Combinations that already have a finalised save file are skipped.

    para_range: ordered mapping of hyper-parameter name -> list of values
    run: run configuration with 'verbose', 'work_dir' and 'best_of';
         a default configuration is used when None
    data: dataset dictionary; the default is the module-level `data`
          object as bound when this function was defined
    returns: path of the 'train.csv' report of this run
    """
    default_train_run = odict([
        ('verbose', 0),
        ('work_dir', 'saved_models/' + str_para_range(para_range)),
        ('best_of', 10)])
    if run is None:
        run = default_train_run
    # all possible combination of parameters
    para_combo = list(itertools.product(*[para_range[key] for key in para_range]))
    # logs training run in "saved_models/run.csv"
    write_row([str_time(datetime.now()), run['best_of'], len(para_combo), 
               run['work_dir'], json.dumps({'para_range': para_range}), VER],
          path_join(['saved_models/', 'run.csv']))
    os.makedirs(run['work_dir'], exist_ok=True)
    for model_index, par in enumerate(para_combo, 1):
        # rebuilds a named, ordered view of this combination
        para = odict()
        for index, key in enumerate(para_range):
            para[key] = par[index]
        if pre_model_exist(para, run):
            continue
        else:
            print('Start Training :', model_index, '/', len(para_combo), 
                  list(para.values()), '(', run['best_of'], 'Runs )')
        train_list = []
        for run_index in range(run['best_of']):
            # returns immediately when the split configuration is unchanged
            data_vector_split(data, maxlen=para['maxlen'], test_size=0.001, random_state=42)
            train = train_model(para, data, run, run_index)
            print('Run', run_index+1, ':', train['dur_train'], '> ' , end="")
            # train values by position: [:2] = val_loss, epoch;
            # [2:5] = start_train, dur_train, filepath
            write_row(list(train.values())[:2] + [run['best_of']] + list(para.values()) 
                      + list(train.values())[2:5] + [json.dumps(train['history'], sort_keys=True)], 
                      path_join([run['work_dir'], 'train_candidate.csv']))
            train_list.append(train)
        train = best_train(train_list)
        write_row(list(train.values())[:2] + [run['best_of']] + list(para.values()) 
                  + list(train.values())[2:5] + [json.dumps(train['history'], sort_keys=True)], 
                  path_join([run['work_dir'], 'train.csv']))
        print('Done')
    return run['work_dir'] + 'train.csv'

In [None]:
""" 4. Evaluation """
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

MAX_GEN_LEN = 500

def create_dictset(data, filename='SCOWL_words_50.txt'):
    """Create the set of valid words used by the valid-words metric.

    The set is the union of the corpus vocabulary (`data['wordset']`), the
    words of the wordlist file longer than 2 characters (lower-cased) and a
    manually screened list of 1-2 character words. It is stored in
    `data['dictset']`.

    data: dataset dictionary; reads 'wordset', writes 'dictset'
    filename: path of a wordlist file with one word per line
    """
    # words of len 1-2 were manually screened to
    # exclude words that don't appear in lyrics
    add_words = set(['a', 'i', 'ah', 'am', 'an', 'as', 'at',
                 'be', 'by', 'do', 'ex', 'go', 'he', 'hi',
                 'if', 'in', 'is', 'it', 'ma', 'me', 'my',
                 'no', 'of', 'oh', 'on', 'or', 'pa', 'so',
                 'to', 'uh', 'um', 'up', 'us', 'we'])
    with open(filename) as wordlist:
        scowl_words_in = wordlist.read().splitlines()
    scowl_words = {word.lower() for word in scowl_words_in if len(word) > 2}
    # set.union returns a new set, so the short words must be part of the
    # union that is stored (calling .union() alone discards them)
    data['dictset'] = set.union(data['wordset'], scowl_words, add_words)

def sample(preds, temperature=1.0):
    """Draw one index from the distribution `preds`.

    The probabilities are re-weighted in log space by `temperature` and
    re-normalised before a single multinomial draw is made.
    """
    probabilities = np.asarray(preds).astype('float64')
    scaled_logs = np.log(probabilities) / temperature
    weights = np.exp(scaled_logs)
    rescaled = weights / np.sum(weights)
    draw = np.random.multinomial(1, rescaled, 1)
    return np.argmax(draw)

def evaluate_generated(generated, dictset, wordset, result_div):
    """
    Score one generated text and accumulate the outcome in `result_div`
    - 1st metric: valid word % (word is in `dictset`; valid words missing
      from the corpus `wordset` are also tallied as new words)
    - 3rd metric: sentence BLEU score, using the module-level
      `data['lines_token']` as references
    """
    tokens = word_token(generated, [])
    score = sentence_bleu(data['lines_token'], tokens,
                          smoothing_function=SmoothingFunction().method3)
    result_div['bleu'].append(score)
    for token in tokens:
        if token in dictset:
            result_div['valid_words_counter'][token] += 1
            result_div['total_valid_words'] += 1
            if token not in wordset:
                result_div['new_words_counter'][token] += 1
                result_div['total_new_words'] += 1
        else:
            result_div['invalid_words_counter'][token] += 1
            result_div['total_invalid_words'] += 1
    result_div['total_words'] += len(tokens)

def print_verbose_2(div, index, x_pred_sen, generated):
    """Print one generated phrase; the priming text is wrapped in '*'."""
    separator = '.'*42
    phrase_number = index+1
    print(separator)
    print('Phrase:', phrase_number, '|', 'Diversity:', div)
    print('*' + x_pred_sen + '*' + generated)
    
def print_verbose_1(div, X_test, mean_of, delim, result):
    """Print the evaluation summary of one diversity level.

    Shows the run settings, the invalid and new words (or 'None' when
    there are none), the valid word ratio and the mean BLEU score.
    """
    print('-'*42)
    print('Diversity:', div, '| mean_of:', mean_of,
          '| test_num:', len(X_test), '| delim:', delim)
    stats = result[div]

    invalid_total = stats['total_invalid_words']
    invalid_shown = stats['invalid_words_counter'] if invalid_total != 0 else 'None'
    print(invalid_total, 'invalid words:', invalid_shown)

    new_total = stats['total_new_words']
    new_shown = stats['new_words_counter'] if new_total != 0 else 'None'
    print(new_total, 'new words:', new_shown)

    print('Total valid words:', stats['total_valid_words'], '/', stats['total_words'],
          '(' + form(stats['total_valid_words_per']) + '%)')
    bleu_scores = stats['bleu']
    print('BLEU score:', sum(bleu_scores) / len(bleu_scores) )
    print('='*42)
#     print('average word length:', result[div]['total_chars_evaluated']/result[div]['total_words'])

def evaluate_model(model, para, data, run):
    """
    Evaluate a model iteratively based on:
    - diversity levels
      - each test data point
        - repeats (mean_of)
          - sentence generation (character-by-character)
          - evaluation of sentence
    - return results

    model: trained model exposing predict()
    para: hyper-parameters; reads 'maxlen'
    data: dataset dictionary; reads 'X_test', 'indices_char',
          'char_indices', 'charset', 'dictset' and 'wordset'
    run: run configuration; reads 'diversity', 'mean_of', 'delim', 'verbose'
    returns: ordered dict with 'start_eval', one entry per diversity level
             (counters, totals, 'sen_len' and 'bleu' lists) and 'dur_eval'
    """
    maxlen = para['maxlen']
    vocab_size = len(data['charset'])
    result = odict([
        ('start_eval', datetime.now())
    ])
    for div in run['diversity']:
        result[div] = odict([('new_words_counter', Counter()),
                             ('valid_words_counter', Counter()),
                             ('invalid_words_counter', Counter()),
                             ('total_valid_words_per', 0),
                             ('total_words', 0),
                             ('total_new_words', 0),
                             ('total_valid_words', 0),
                             ('total_invalid_words', 0),
                             ('total_chars_evaluated', 0),
                             ('sen_len', []),
#                              ('sen_len_ext', []),
                             ('bleu', [])])

        for test_index in range(len(data['X_test'])):
            x_pred_prime = data['X_test'][test_index:test_index+1]
            # decodes the one-hot test sequence back into text
            x_pred_sen = ''.join([data['indices_char'][np.argmax(x_pred_prime[0, i])] for i in range(maxlen)])

            for run_index in range(run['mean_of']):
                x_pred = x_pred_prime
                generated = ''
                start_delim = -1
                end_delim = -1
                for gen_index in range(MAX_GEN_LEN):
                    preds = model.predict(x_pred)[0]
                    next_index = sample(preds, div)
                    next_char = data['indices_char'][next_index]
                    generated += next_char
                    # count num of delimiters in generated sentence so far
                    delim_num = sum(map(generated.count, ['\n',' ']))
                    # returns len of full sentences for 2nd metric
                    valid_sen_len = valid_dist_between_chars(''.join(x_pred_sen.rpartition('\n')[1:])
                                                             + generated)
                    # tracks the start and end of sentences, used by 1st and 3rd metrics
                    if start_delim==-1 and delim_num == 1:
                        start_delim = gen_index
                    if end_delim==-1 and delim_num == run['delim'] + 1:
                        end_delim = gen_index
                    # if the generated sentence is sufficient to calculate all 3 metrics, break
                    if end_delim != -1 and len(valid_sen_len)>=1:
                        result[div]['sen_len'] += valid_sen_len[:1]
#                         result[div]['sen_len_ext'] += valid_sen_len[:1]
                        break
                    # converts the char generated into its vectorised form
                    # (builtin `bool` as dtype: the `np.bool` alias is
                    # missing from NumPy 1.24-1.26)
                    x_pred_next = np.zeros((1, 1, vocab_size), dtype=bool)
                    x_pred_next[0, 0, data['char_indices'][next_char]] = True
                    # remove the first in sequence and append new char to end
                    x_pred = np.concatenate((x_pred[:,1:maxlen], x_pred_next), axis=1)
                    if gen_index == MAX_GEN_LEN - 1:
#                         result[div]['sen_len_ext'] += [len(''.join(x_pred_sen.rpartition('\n')[1:])) + MAX_GEN_LEN]
                        print("pred len exceeded:", MAX_GEN_LEN, "| div:", div)
                # gets the generated sentence demarcated by the start and end delimiters
                # this removes any partial words that began in the test data that may
                # inflate the score
                pro_generated = generated[start_delim+1:end_delim]
                result[div]['total_chars_evaluated'] += len(pro_generated)
                evaluate_generated(pro_generated, data['dictset'], data['wordset'], result[div])
                if run['verbose'] >= 2:
                    print_verbose_2(div, test_index, x_pred_sen, generated)
        # guards the percentage against a run that produced no words at all
        total_words = result[div]['total_words']
        result[div]['total_valid_words_per'] = (
            result[div]['total_valid_words']/total_words*100 if total_words else 0.0)
        if run['verbose'] >= 1:
            print_verbose_1(div, data['X_test'], run['mean_of'], run['delim'], result)
    result['dur_eval'] = str_time(datetime.now() - result['start_eval'])
    result['start_eval'] = str_time(result['start_eval'])
    return result

def load_pre_model(filepath):
    """Load the saved model at `filepath`; return None when it is missing."""
    if not os.path.exists(filepath):
        return None
    print('Loading saved model: ' + os.path.basename(filepath))
    return load_model(filepath)

def get_para_from_filepath(filepath):
    """Recover a model's hyper-parameters from its save filename.

    Only the basename is parsed; it is expected to follow the pattern
    '<type>_bs-<n>_vs-<f>_ml-<n>_layer-<n>_unit-<n>.<ext>'.
    """
    name = os.path.basename(filepath)

    def field(tag, terminator='_'):
        # text located between `tag` and the next `terminator`
        return name.partition(tag)[2].partition(terminator)[0]

    return odict([
        ('type', name.partition('_')[0]),
        ('batch_size', int(field('bs-'))),
        ('val_split', float(field('vs-'))),
        ('maxlen', int(field('ml-'))),
        ('layer', int(field('layer-'))),
        ('unit', int(field('unit-', '.'))),
    ])

def format_result(result, diversity):
    """Flatten evaluation results into a list of formatted strings.

    For every diversity level, in order: each 'total*' entry, the
    describe() statistics of the sentence lengths, then the describe()
    statistics of the BLEU scores (3 decimals).
    """
    row = []
    for div in diversity:
        div_result = result[div]
        row.extend(form(div_result[key]) for key in div_result if 'total' in key)
        sen_len_stats = pd.Series(div_result['sen_len']).describe()
        row.extend(form(value) for _, value in sen_len_stats.items())
        bleu_stats = pd.Series(div_result['bleu']).describe()
        row.extend(form(value, 3) for _, value in bleu_stats.items())
    return row

def run_eval(train_csv, start_index=None, end_index=None, run=None, data=data):
    """
    Evaluates every model listed in a train.csv report and appends one row
    per model to 'eval_d<delim>.csv' next to it.

    train_csv: filepath to the train.csv output by run_train method
    start_index, end_index: optionally, only eval a subset of models
    run: run configuration with 'verbose', 'diversity', 'mean_of' and
         'delim'; a default configuration is used when None
    data: dataset dictionary; the default is the module-level `data`
          object as bound when this function was defined
    returns: train_csv, unchanged
    """
    default_eval_run = odict([
        ('verbose', 0),
        ('diversity', [1.0]),
        ('mean_of', 5),
        ('delim', 10)])
    if run is None:
        run = default_eval_run
    # the valid-words dictionary is built once and cached in `data`
    if 'dictset' not in data:
        create_dictset(data)
    input_table = pd.read_csv(train_csv)
    filepath_list = input_table['filepath'].tolist()[start_index:end_index]
    # evaluate each model listed in train.csv
    for file_index, filepath in enumerate(filepath_list):
        backend.clear_session()
        para = get_para_from_filepath(filepath)
        # re-vectorises the data only when the split configuration changed
        data_vector_split(data, maxlen=para['maxlen'], test_size=0.001, random_state=42)
        model = load_pre_model(filepath)
        result = evaluate_model(model, para, data, run)
        for_result = format_result(result, run['diversity'])
        print('Done Evaluating :', file_index+1, '/', len(filepath_list), 
              [form(result[div]['total_valid_words_per']) for div in run['diversity']], 
              result['dur_eval'])
        write_row(for_result + [run['delim'], run['mean_of'], len(data['X_test']), result['start_eval'], 
                  result['dur_eval'], filepath], #, json.dumps(result)],
                  path_join([train_csv.replace('train.csv', 'eval_d' + str(run['delim']) + '.csv')]))
    return train_csv

In [None]:
""" 5. Synthesis """
"""Generates a sentence using a prime (seed)"""

def prime_model(prime, maxlen, data):
    """Vectorise the priming string and install it as the test set.

    The last `maxlen` characters of `prime` are kept; a shorter prime is
    left-padded with spaces. The one-hot encoded result replaces
    `data['X_test']`.

    prime: priming text; every character must be in data['char_indices']
    maxlen: sequence length expected by the model
    data: dataset dictionary; reads 'charset' and 'char_indices', writes
          'X_test' and 'test_size'
    """
    charset = data['charset']
    char_indices = data['char_indices']
    prime = prime[-maxlen:]
    prime = (maxlen-len(prime))*' ' + prime
    print('Prime Vectorisation...')
    # builtin `bool` as dtype: the `np.bool` alias is missing from
    # NumPy 1.24-1.26
    X = np.zeros((1, maxlen, len(charset)), dtype=bool)
    for t, char in enumerate(prime):
        X[0, t, char_indices[char]] = 1
    # replaces the test dataset with the priming sentence
    data['X_test'] = X
    # modifies the test_size to ensure data configuration info has changed
    # to prevent issues with subsequent normal training / evaluation
    data['test_size'] = 1.0
    
def run_prime(prime, train_csv, start_index=None, end_index=None, run=None, data=data):
    """
    Generates text from a priming string with every model listed in a
    train.csv report. The generated text is printed by evaluate_model when
    run['verbose'] >= 2.

    prime: string to seed synthesis
    train_csv: filepath to the train.csv output by run_train method
    start_index, end_index: optionally, only prime a subset of models
    run: run configuration with 'verbose', 'diversity', 'mean_of' and
         'delim'; a default configuration is used when None
    data: dataset dictionary; the default is the module-level `data`
          object as bound when this function was defined
    returns: train_csv, or None when `prime` contains characters outside
             the models' vocabulary
    """
    prime = prime.lower()
    # the prime can only be vectorised with characters known to the models
    if not all(char in data['charset'] for char in prime):
        print('These chars are not in models\' vocab: ', 
              [char for char in prime if char not in data['charset']])
        return
    default_prime_run = odict([
        ('verbose', 2),
        ('diversity', [1.0, 0.5]),
        ('mean_of', 1),
        ('delim', 30)])
    if run is None:
        run = default_prime_run
    if 'dictset' not in data:
        create_dictset(data)
    input_table = pd.read_csv(train_csv)
    filepath_list = input_table['filepath'].tolist()[start_index:end_index]
    for file_index, filepath in enumerate(filepath_list):
        backend.clear_session()
        para = get_para_from_filepath(filepath)
        # replaces data['X_test'] with the vectorised prime
        prime_model(prime, para['maxlen'], data)
        model = load_pre_model(filepath)
        result = evaluate_model(model, para, data, run)
        # note: for_result is computed but not written anywhere here
        for_result = format_result(result, run['diversity'])
        print('Done Priming :', file_index+1, '/', len(filepath_list), 
              [form(result[div]['total_valid_words_per']) for div in run['diversity']], 
              result['dur_eval'])
    return train_csv

In [None]:
%%time
# Smoke Test
# Trains, evaluates and primes a single small model end-to-end.
# Each list holds the values to try for that hyper-parameter; the key order
# must be kept, as model filenames are formatted from the values by position.
para_range = odict([
    ('type', ['CuDNNGRU']),
    ('batch_size', [128]),
    ('val_split', [0.05]),
    ('maxlen', [5]),
    ('layer', [2]),
    ('unit', [221])
])

# best_of: number of candidate models trained per combination
train_run = odict([
    ('verbose', 0),
    ('work_dir', 'saved_models/' + str_para_range(para_range)),
    ('best_of', 1)])

# mean_of: number of generations per test data point
# delim: number of delimiters (spaces / newlines) to generate past the first
eval_run = odict([
    ('verbose', 0),
    ('diversity', [1.0]),
    ('mean_of', 1),
    ('delim', 10)])

prime_run = odict([
    ('verbose', 2),
    ('diversity', [1.0, 0.5]),
    ('mean_of', 1),
    ('delim', 30)])

# run_train returns the path of its train.csv report; run_eval and run_prime
# hand the same path back, which is what the final assert checks
expected = "saved_models/CuDNNGRU_bs-128_vs-0.05_ml-5_layer-2_unit-221_v1.0/train.csv"
train_csv = run_train(para_range, train_run)
train_csv = run_eval(train_csv, run=eval_run)
assert expected == run_prime("dream", train_csv, run=prime_run)

# Extended Test
# para_range = odict([
#     ('type', ['CuDNNGRU']),
#     ('batch_size', [128]),
#     ('val_split', [0.05]),
#     ('maxlen', [5]),
#     ('layer', [1, 2, 3]),
#     ('unit', [221])
# ])

# train_run = odict([
#     ('verbose', 0),
#     ('work_dir', 'saved_models/' + str_para_range(para_range)),
#     ('best_of', 3)])

# eval_run = odict([
#     ('verbose', 0),
#     ('diversity', [1.0]),
#     ('mean_of', 3),
#     ('delim', 10)])

# prime_run = odict([
#     ('verbose', 2),
#     ('diversity', [1.0, 0.5]),
#     ('mean_of', 1),
#     ('delim', 30)])

# expected = "saved_models/CuDNNGRU_bs-128_vs-0.05_ml-5_layer-1-2-3_unit-221_v1.0/train.csv"
# train_csv = run_train(para_range, train_run)
# train_csv = run_eval(train_csv, run=eval_run)
# assert expected == run_prime("dream", train_csv, run=prime_run)

In [None]:
# Benchmark Model
# para_range = odict([
#     ('type', ['CuDNNLSTM']), # ('type', ['CuDNNGRU, CuDNNLSTM, GRU, LSTM']),
#     ('batch_size', [128]),
#     ('val_split', [0.05]), # validation / train split
#     ('maxlen', [50]), # network sequence length
#     ('layer', [2]),
#     ('unit', [512])
# ])
# run_eval(run_train(para_range))

In [None]:
# Hyper-parameter Tuning
# para_range = odict([
#     ('type', ['CuDNNGRU']), 
#     ('batch_size', [128]),
#     ('val_split', [0.05]),
#     ('maxlen', [50]),
#     ('layer', [2, 3, 4, 5]),
#     ('unit', [128, 192, 256, 320])
# ])
# train_csv = run_eval(run_train(para_range))

In [None]:
# Selected Model Synthesis 
# prime = "The answer to life, the universe and everything is"
# run_prime(prime, train_csv, 9, 10)