__Import dependencies and decleare global variables__

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import json
import re
import sys
import os
import time

SEQUENCES_LENGTH = 30
EPOCHS = 1
BATCH_SIZE = 32
EMBEDDING_DIM = 128
RNN_DIM = 1024 

__Import fables data__

In [4]:
def clean(text):
    '''
    '''
    text = text.strip()
    text = text.replace("ain't", "am not")
    text = text.replace("aren't", "are not")
    text = text.replace("can't", "cannot")
    text = text.replace("can't've", "cannot have")
    text = text.replace("'cause", "because")
    text = text.replace("could've", "could have")
    text = text.replace("couldn't", "could not")
    text = text.replace("couldn't've", "could not have")
    text = text.replace("should've", "should have")
    text = text.replace("should't", "should not")
    text = text.replace("should't've", "should not have")
    text = text.replace("would've", "would have")
    text = text.replace("would't", "would not")
    text = text.replace("would't've", "would not have")
    text = text.replace("didn't", "did not")
    text = text.replace("doesn't", "does not")
    text = text.replace("don't", "do not")
    text = text.replace("hadn't", "had not")
    text = text.replace("hadn't've", "had not have")
    text = text.replace("hasn't", "has not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("he'd", "he would")
    text = text.replace("haven't", "have not")
    text = text.replace("he'd've", "he would have")
    text = text.replace("'s", "")
    text = text.replace("'t", "")
    text = text.replace("'ve", "")
    text = text.replace(".", " . ")
    text = text.replace("!", " ! ")
    text = text.replace("?", " ? ")
    text = text.replace(";", " ; ")
    text = text.replace(":", " : ")
    text = text.replace("\'", "")
    text = text.replace("\"", "")
    text = text.replace(",", "")
    text = text.replace("[", "")
    text = text.replace("]","")
    text = text.replace("{","")
    text = text.replace("}", "")
    text = text.replace("/", "")
    text = text.replace("|", "")
    text = text.replace("-", "")
    text = text.replace("(", "")
    text = text.replace(")", "")
    text = text.replace("$", "")
    text = text.replace("+", "")
    text = text.replace("*", "")
    text = text.replace("%", "")
    text = text.replace("#", "")
    text = text.lower()
    text = ''.join([i for i in text if not i.isdigit()])

    return text

try:
    
    fables = []
    dirname = os.path.abspath('')
    filepath = os.path.join(dirname, 'input_data/aesopFables.json')

    with open(filepath) as json_file:  
        data = json.load(json_file)
        for p in data['stories']:
            fables.append(' '.join(p['story']))
            
    print('{} fables imported.'.format(len(fables)))
    
    fablesText = ''
    for idx, f in enumerate(fables):
        fablesText = fablesText + ' ' + clean(f) + '\n'
    
except IOError:
    sys.exit('Cannot find data!')


147 fables imported.


__Extract Vocabulary__

In [10]:
# CREATE VOCABULARY OF WORDS
idx2word = []
wordSequence = fablesText.split(' ')

b=True
while b:
    if('' in wordSequence): 
        wordSequence.remove('')
    else: b = False

for word in wordSequence:
    if word not in idx2word:
        idx2word.append(word)

word2idx = {}
for word in idx2word:
    word2idx[word] = len(word2idx)

vocab_size = len(idx2word)
print('Vocabulary size: {}'.format(vocab_size))

textAsInt = np.array([word2idx[w] for w in wordSequence])

Vocabulary size: 3061


__Preprocess__

In [None]:
def split_input_target(chunk):
    '''
    '''
    inputText = chunk[:-1]
    targetText = chunk[1:]
    return inputText, targetText

wordDataset = tf.data.Dataset.from_tensor_slices(textAsInt)
sequences = wordDataset.batch(SEQUENCES_LENGTH+1, drop_remainder=True) #The batch method lets us easily convert these individual characters to sequences of the desired size.
dataset = sequences.map(split_input_target)

examplesPerEpoch = len(fablesText) // SEQUENCES_LENGTH
stepsPerEpoch = examplesPerEpoch // BATCH_SIZE
dataset = dataset.shuffle(10000).batch(BATCH_SIZE, drop_remainder=True)

__Train the model__

In [None]:
def loss(labels, logits):
  '''
  '''
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits)

rnn = tf.keras.layers.CuDNNLSTM

trainModel = tf.keras.Sequential(
    [tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM, 
    batch_input_shape=[BATCH_SIZE, None]),
    rnn(RNN_DIM,return_sequences=True, recurrent_initializer='glorot_uniform',stateful=True),
    tf.keras.layers.Dense(vocab_size)]
)

trainModel.summary()

trainModel.compile(
      optimizer = tf.train.AdamOptimizer(),
      loss = loss)

trainModel.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=stepsPerEpoch)

dirname = os.path.abspath('')
weightsPath = os.path.join(dirname, 'models/rnn_word_fables.h5')
trainModel.save_weights(weightsPath)

__Define generation model__

In [None]:
rnn = tf.keras.layers.CuDNNLSTM

genModel = tf.keras.Sequential(
    [tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM, 
    batch_input_shape=[1, None]),
    rnn(RNN_DIM,return_sequences=True, recurrent_initializer='glorot_uniform',stateful=True),
    tf.keras.layers.Dense(vocab_size)]
)

genModel.load_weights(weightsPath)
genModel.build(tf.TensorShape([1, None]))
genModel.summary()

__Generate text__

In [None]:
def generate_text(model, start_string, char_2_idx, idx_2_char):
    '''
    '''
    # Evaluation step (generating text using the learned weights)
    # Number of characters to generate
    numGenerate = SEQUENCES_LENGTH
    # Converting our start string to numbers (vectorizing)
    start_string = clean(start_string) 
    inputEval = [char_2_idx[s] for s in start_string]
    inputEval = tf.expand_dims(inputEval, 0)
    # Empty string to store our results
    textGenerated = []
    # Low temperatures results in more predictable text.
    # Higher temperatures results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0
    # Here batch size == 1
    model.reset_states()

    for i in range(numGenerate):
        predictions = model(inputEval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
        # using a multinomial distribution to predict the word returned by the trainModel
        predictions = predictions / temperature
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()
        # We pass the predicted word as the next input to the trainModel
        # along with the previous hidden state
        inputEval = tf.expand_dims([predicted_id], 0)
        textGenerated.append(idx_2_char[predicted_id])

    return (start_string + ''.join(textGenerated))

generated = generate_text(
        model=genModel, 
        start_string="There was once a little", 
        word_2_idx=word2idx, 
        idx_2_word=idx2word
    )

generated