__Import dependencies and declare global variables__

In [None]:
import json
import os
import random
import re
import sys
from copy import deepcopy

import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from keras.models import load_model
from sklearn.utils import shuffle
from tensorflow.keras.layers import Dense, LSTM, CuDNNLSTM, Input, Embedding, TimeDistributed, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Hyper-parameters shared by the cells below; they are also embedded in the
# names of the dataset and model files written to disk.
EPOCHS = 1  # passed as `epochs` to model.fit
MAX_LENGTH = 100  # fables are cut into chunks of at most this many words
BATCH_SIZE = 128  # passed as `batch_size` to model.fit
EMBEDDING_DIM = 128  # number of columns of the embedding matrix
HIDDEN_DIM = 1024  # default number of units of the encoder/decoder LSTM layers

__Import data__

In [None]:
# Contractions expanded by `clean`, applied in order. A longer form must come
# before any shorter form it contains ("can't've" before "can't"); otherwise
# the shorter replacement fires first and the longer one can never match.
_CONTRACTIONS = (
    ("ain't", "am not"),
    ("aren't", "are not"),
    ("can't've", "cannot have"),
    ("can't", "cannot"),
    ("'cause", "because"),
    ("could've", "could have"),
    ("couldn't've", "could not have"),
    ("couldn't", "could not"),
    ("should've", "should have"),
    ("shouldn't've", "should not have"),
    ("shouldn't", "should not"),
    ("would've", "would have"),
    ("wouldn't've", "would not have"),
    ("wouldn't", "would not"),
    ("didn't", "did not"),
    ("doesn't", "does not"),
    ("don't", "do not"),
    ("hadn't've", "had not have"),
    ("hadn't", "had not"),
    ("hasn't", "has not"),
    ("haven't", "have not"),
    ("he'd've", "he would have"),
    ("he'd", "he would"),
)

# Apostrophe suffixes that are simply dropped once the contractions above
# have been expanded.
_DROPPED_SUFFIXES = ("'s", "'t", "'ve")

# Single-pass character table: sentence punctuation is surrounded by spaces so
# that it becomes a token of its own; quotes, dashes, brackets and symbols are
# deleted.
_CHAR_TABLE = str.maketrans({mark: ' ' + mark + ' ' for mark in '.!?;:,'})
_CHAR_TABLE.update(str.maketrans('', '', '´‘’“”\'"-–—[]{}/|()$+*%#'))


def clean(text):
    '''
    Normalise raw text before it is split on spaces.

    The text is lower-cased, English contractions are expanded, the leftover
    "'s", "'t" and "'ve" suffixes are dropped, the punctuation marks
    . ! ? ; : , are padded with spaces, quotes/dashes/brackets/symbols are
    removed and finally every digit character is removed.

    Padding punctuation can leave consecutive spaces in the result; they are
    not collapsed here.

    :param text: the string to normalise.
    :return: the normalised string.
    '''
    text = text.lower()

    for contraction, expansion in _CONTRACTIONS:
        text = text.replace(contraction, expansion)

    for suffix in _DROPPED_SUFFIXES:
        text = text.replace(suffix, "")

    text = text.translate(_CHAR_TABLE)

    return ''.join(char for char in text if not char.isdigit())

# Load the fables from the JSON corpus and normalise them with `clean`.
dirname = os.path.abspath('')
filepath = os.path.join(dirname, 'input_data/aesopFables.json')

# Only the file access is guarded, so that unrelated errors are not reported
# as missing data.
try:
    with open(filepath) as json_file:
        data = json.load(json_file)
except IOError:
    sys.exit('Cannot find data!')

# Each entry of data['stories'] has a 'story' field whose items are joined
# with spaces (presumably a list of sentences; verify against the data file).
fables = [' '.join(p['story']) for p in data['stories']]
print('{} fables imported.'.format(len(fables)))

cleanedFables = [clean(f) for f in fables]
print('{} fables cleaned.'.format(len(cleanedFables)))

In [None]:
# Average number of space-separated tokens per cleaned fable.
sumLen = sum(len(fable.split(' ')) for fable in cleanedFables)
avgLen = sumLen / len(cleanedFables)
avgLen

__Extract Vocabulary__

In [None]:
# CREATE VOCABULARY OF WORDS
# Indices 0-2 are reserved for the special tokens; every other token receives
# the next free index the first time it is seen. Empty-string tokens produced
# by consecutive spaces are kept in the vocabulary.
word2idx = {'<PAD>': 0, '<START>': 1, '<END>': 2}
wordSequence = []
for fable in cleanedFables:
    for word in fable.split(' '):
        wordSequence.append(word)
        word2idx.setdefault(word, len(word2idx))

# Dicts keep insertion order (Python 3.7+), so position i of idx2word holds
# the word whose index is i.
idx2word = list(word2idx)
textAsInt = np.array([word2idx[w] for w in wordSequence])
vocab_size = len(idx2word)
print('Vocabulary Size: {}'.format(vocab_size))


__Preprocess__

In [None]:
# Build the (encoder input, decoder input, decoder output) sentence triples
# and write them, shuffled, to a CSV file.
inputSentences = []
targetSentences = []
outputSentences = []

for fable in cleanedFables:
    # Drop the empty tokens that consecutive spaces produce.
    words = [w for w in fable.split(' ') if w]

    # Cut the fable into consecutive chunks of at most MAX_LENGTH words.
    for start in range(0, len(words), MAX_LENGTH):
        s = words[start:start + MAX_LENGTH]
        # Every split point of a chunk yields one sample: the words before it
        # feed the encoder, the words from it onwards are what the decoder
        # must produce.
        for i in range(1, len(s)):
            encode_tokens, decode_tokens = s[:i], s[i:]
            inputSentences.append(' '.join(['<START>'] + encode_tokens + ['<END>']))
            targetSentences.append(' '.join(['<START>'] + decode_tokens + ['<END>']))
            # Same as the decoder input but without the leading <START>.
            outputSentences.append(' '.join(decode_tokens + ['<END>']))

numSamples = len(inputSentences)
print('Num samples: {}'.format(numSamples))

print("Creating dataset to feed Model . . . ")
dirname = os.path.abspath('')
filePath = os.path.join(dirname, 'preprocessed/dataset_ed_fables_{}_{}.csv'.format(
    MAX_LENGTH,
    BATCH_SIZE))

# The target directory may not exist on a fresh checkout. An existing file
# needs no explicit removal: to_csv overwrites it.
os.makedirs(os.path.dirname(filePath), exist_ok=True)

d = {'input_encoder': inputSentences, 'input_decoder': targetSentences, 'output_decoder': outputSentences}
df = pd.DataFrame(data=d)
df = shuffle(df)
df.to_csv(filePath, index=False)

print("Dataset printed on CSV.")

In [None]:
def generate_data(word_2_idx, num_samples, max_length, vocab_length, batch_size=BATCH_SIZE):
    '''
    Read the pre-processed CSV dataset and turn it into index arrays.

    The file read is
    'preprocessed/dataset_ed_fables_<max_length>_<batch_size>.csv' under the
    current working directory; its columns 'input_encoder', 'input_decoder'
    and 'output_decoder' hold space-separated tokens.

    :param word_2_idx: mapping from token to integer index.
    :param num_samples: number of rows to read from the start of the CSV.
    :param max_length: maximum number of words per sentence; the arrays have
        max_length + 2 time steps.
    :param vocab_length: not used; kept for interface compatibility.
    :param batch_size: only used to select the dataset file name.
    :return: (encoderInputData, decoderInputData, decoderTargetData) with
        shapes (num_samples, max_length + 2), (num_samples, max_length + 2)
        and (num_samples, max_length + 2, 1). Unfilled positions stay 0.
    :raises KeyError: if a token of the dataset is missing from word_2_idx.
    '''
    dirname = os.path.abspath('')
    filePath = os.path.join(dirname, 'preprocessed/dataset_ed_fables_{}_{}.csv'.format(
        max_length,
        batch_size))
    df = pd.read_csv(filePath)

    encoderInputData = np.zeros((num_samples, max_length + 2), dtype='int')
    decoderInputData = np.zeros((num_samples, max_length + 2), dtype='int')
    decoderTargetData = np.zeros((num_samples, max_length + 2, 1), dtype='int')

    # Pull each column out once instead of slicing the DataFrame per row.
    encoderColumn = df['input_encoder'].values
    decoderColumn = df['input_decoder'].values
    outputColumn = df['output_decoder'].values

    for i in range(num_samples):
        if i % 10000 == 0:
            print('Generating feeding data... {}/{}'.format(i, num_samples))

        for t, word in enumerate(encoderColumn[i].split(' ')):
            encoderInputData[i, t] = word_2_idx[word]
        for t, word in enumerate(decoderColumn[i].split(' ')):
            decoderInputData[i, t] = word_2_idx[word]
        for t, word in enumerate(outputColumn[i].split(' ')):
            # decoderTargetData is ahead of decoderInputData by one timestep
            decoderTargetData[i, t, 0] = word_2_idx[word]

    return encoderInputData, decoderInputData, decoderTargetData

__Extract embeddings matrix__

In [None]:
# Look up a pre-trained word2vec vector for every word of the vocabulary.
word2vecModel = gensim.models.Word2Vec.load('embeddings/text8_word2vec_skipgram_128.bin')
# NOTE(review): `wv.vocab` only exists in gensim < 4.0 - confirm the pinned version.
word2vec_vocabulary = word2vecModel.wv.vocab

# Vectors are read through `.wv`; indexing the model object itself is the
# deprecated access path.
embeddingIndex = {
    word: word2vecModel.wv[word]
    for word in idx2word
    if word in word2vec_vocabulary
}

# idx2word holds each word once, so the misses are simply the remainder.
counter = len(idx2word) - len(embeddingIndex)
print("{} words without pre-trained embedding!".format(counter))

# Prepare embeddings matrix: start from random rows and overwrite the rows of
# the words that have a pre-trained vector.
embeddingMatrix = np.random.random((len(word2idx), EMBEDDING_DIM))
for i, word in enumerate(idx2word):
    embeddingVector = embeddingIndex.get(word)
    if embeddingVector is not None:
        embeddingMatrix[i] = embeddingVector

__Or use random weights__

In [None]:
# Alternative to the pre-trained embeddings: running this cell replaces
# embeddingMatrix with random values, one row per vocabulary word.
embeddingMatrix = np.random.random((len(word2idx), EMBEDDING_DIM))

__Define function to build the model__

In [None]:
def build_encoder(vocab_length, embedding_weigths=embeddingMatrix, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM):
    '''
    Build the encoder part of the graph: input -> embedding -> LSTM.

    :param vocab_length: size of the vocabulary (embedding input dimension).
    :param embedding_weigths: initial weights of the embedding layer.
    :param embedding_dim: embedding output dimension.
    :param hidden_dim: number of units of the LSTM layer.
    :return: (encoderInput, encoderStates) where encoderStates is the list
        [state_h, state_c] returned by the LSTM layer.
    '''
    # Variable-length sequences of word indices.
    encoderInput = Input(shape=(None,))

    embeddingLayer = Embedding(
        input_dim=vocab_length,
        output_dim=embedding_dim,
        weights=[embedding_weigths]
    )
    embedded = embeddingLayer(encoderInput)

    # With return_state=True the layer returns (output, h, c); the output is
    # not needed, only the two states are handed over to the decoder.
    recurrentLayer = CuDNNLSTM(hidden_dim, return_state=True)
    _, hiddenState, cellState = recurrentLayer(embedded)

    return encoderInput, [hiddenState, cellState]


def build_encoder_gen(encoder_input, encoder_states):
    '''
    Wrap the encoder input and states into a stand-alone model.

    :param encoder_input: input tensor of the encoder.
    :param encoder_states: output tensors of the model (the encoder states).
    :return: a Model mapping encoder_input to encoder_states.
    '''
    return Model(encoder_input, encoder_states)


def build_decoder(vocab_length, encoderStates, embedding_weigths=embeddingMatrix, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM):
    '''
    Build the decoder part of the graph: input -> embedding -> LSTM -> softmax.

    The LSTM starts from `encoderStates`. The layers are returned alongside
    the tensors so that they can be applied again to other inputs.

    :param vocab_length: size of the vocabulary (embedding input dimension
        and number of softmax outputs).
    :param encoderStates: initial state of the decoder LSTM.
    :param embedding_weigths: initial weights of the embedding layer.
    :param embedding_dim: embedding output dimension.
    :param hidden_dim: number of units of the LSTM layer.
    :return: (decoderInput, decoderOutput, decoderEmbeddingLayer,
        decoderLSTMLayer, decoderDenseLayer).
    '''
    # Create the three layers first, then wire them together.
    decoderEmbeddingLayer = Embedding(
        input_dim=vocab_length,
        output_dim=embedding_dim,
        weights=[embedding_weigths]
    )
    decoderLSTMLayer = CuDNNLSTM(hidden_dim, return_sequences=True, return_state=True)
    decoderDenseLayer = Dense(vocab_length, activation='softmax')

    # Variable-length sequences of word indices.
    decoderInput = Input(shape=(None,))
    embedded = decoderEmbeddingLayer(decoderInput)

    # Only the per-timestep outputs are used here; the returned states are ignored.
    sequenceOutput, _, _ = decoderLSTMLayer(embedded, initial_state=encoderStates)
    decoderOutput = decoderDenseLayer(sequenceOutput)

    return decoderInput, decoderOutput, decoderEmbeddingLayer, decoderLSTMLayer, decoderDenseLayer


def build_decoder_gen(decoder_input, decoder_embedding_layer, decoder_LSTM_layer, decoder_dense, hidden_dim=HIDDEN_DIM):
    '''
    Build the decoder model used for step-by-step generation.

    The given layers are applied again to `decoder_input`, but the LSTM state
    comes from two explicit model inputs instead of the encoder, and the new
    states are exposed as outputs.

    :param decoder_input: input tensor of the decoder.
    :param decoder_embedding_layer: embedding layer to apply.
    :param decoder_LSTM_layer: LSTM layer to apply.
    :param decoder_dense: output layer to apply.
    :param hidden_dim: size of each state input.
    :return: a Model mapping [decoder_input, state_h, state_c] to
        [output, state_h, state_c].
    '''
    # Explicit state inputs: [h, c].
    stateInputs = [Input(shape=(hidden_dim,)), Input(shape=(hidden_dim,))]

    embedded = decoder_embedding_layer(decoder_input)
    sequenceOutput, nextH, nextC = decoder_LSTM_layer(embedded, initial_state=stateInputs)
    prediction = decoder_dense(sequenceOutput)

    return Model(
        [decoder_input] + stateInputs,
        [prediction] + [nextH, nextC]
    )
  
def build_encoder_decoder_model(encoder_input, decoder_input, decoder_output):
    '''
    Assemble the full encoder-decoder model and print its summary.

    :param encoder_input: input tensor of the encoder.
    :param decoder_input: input tensor of the decoder.
    :param decoder_output: output tensor of the decoder.
    :return: a Model mapping [encoder_input, decoder_input] to decoder_output.
    '''
    seq2seqModel = Model([encoder_input, decoder_input], decoder_output)
    # Printed for inspection only.
    seq2seqModel.summary()
    return seq2seqModel

__Train model__

In [None]:
# Train the encoder-decoder model, then save the two generation models.
dirname = os.path.abspath('')

encoderGenPath = os.path.join(dirname, 'models/encoder_fables_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM)
)

decoderGenPath = os.path.join(dirname, 'models/decoder_fables_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM)
)

# The output directory may not exist on a fresh checkout.
os.makedirs(os.path.dirname(encoderGenPath), exist_ok=True)

# Register the session before any layer is created, so that the weights set
# while building the model live in the same session that is used for training.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
tf.keras.backend.set_session(session)

encoderInput, encoderStates = build_encoder(vocab_length=vocab_size)

decoderInput, decoderOutput, decoderEmbeddingLayer,  decoderLSTMLayer, decoderDenseLayer = build_decoder(
    vocab_length=vocab_size, 
    encoderStates=encoderStates
)

model = build_encoder_decoder_model(
    encoder_input=encoderInput, 
    decoder_input=decoderInput, 
    decoder_output=decoderOutput
)

encoderInputData, decoderInputData, decoderTargetData = generate_data(
    word_2_idx=word2idx,
    num_samples=numSamples,
    max_length=MAX_LENGTH, 
    vocab_length=vocab_size
)

# The targets are integer word indices, hence the sparse loss.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
model.fit([encoderInputData, decoderInputData], decoderTargetData, batch_size=BATCH_SIZE, epochs=EPOCHS)

# The generation models are built from the tensors and layers of the trained
# model, so they use the trained weights.
encoderModelGen = build_encoder_gen(
    encoder_input = encoderInput, 
    encoder_states = encoderStates
)

decoderModelGen = build_decoder_gen(
    decoder_input = decoderInput, 
    decoder_embedding_layer = decoderEmbeddingLayer, 
    decoder_LSTM_layer = decoderLSTMLayer, 
    decoder_dense = decoderDenseLayer
)

# Keras models are written with `save`; they have no `save_model` method.
encoderModelGen.save(encoderGenPath)
decoderModelGen.save(decoderGenPath)

session.close()

__Generate text__

In [None]:
def generate_text(sentences, encoder_model, decoder_model, vocab_length, word_2_idx, idx_2_word, max_length):
    '''
    Generate a continuation for each seed sentence by greedy decoding.

    Every sentence is cleaned with `clean` and encoded the same way as the
    training samples: empty tokens are dropped, the words are wrapped in
    <START>/<END> and the index sequence is zero-padded to max_length + 2.
    Words missing from the vocabulary are skipped and at most max_length
    words are kept. The decoder is then fed one token at a time, always
    taking the most probable next word, until <END> is produced or
    max_length words have been generated.

    :param sentences: iterable of seed strings.
    :param encoder_model: model mapping an index sequence to the [h, c] states.
    :param decoder_model: model mapping [token, h, c] to [probabilities, h, c].
    :param vocab_length: not used; kept for interface compatibility.
    :param word_2_idx: mapping from token to integer index.
    :param idx_2_word: sequence giving the token of every index.
    :param max_length: maximum number of seed words and of generated words.
    :return: list with one generated string per seed sentence.
    '''
    generated = []

    for phrase in sentences:

        # Cleaning sentence
        phrase = clean(phrase)
        print('GENERATING FROM: {}'.format(phrase))

        # Encode the seed like the training data.
        tokens = [t for t in phrase.split(' ') if t]
        unknown = [t for t in tokens if t not in word_2_idx]
        if unknown:
            # The vocabulary has no unknown-word token, so these cannot be encoded.
            print('Skipping words missing from the vocabulary: {}'.format(' '.join(unknown)))
        known = [t for t in tokens if t in word_2_idx][:max_length]
        tokens = ['<START>'] + known + ['<END>']

        inputSequence = np.zeros((1, max_length + 2), dtype='int')
        for i, t in enumerate(tokens):
            inputSequence[0, i] = word_2_idx[t]

        # Encode the input as state vectors.
        statesValue = encoder_model.predict(inputSequence)
        # Generate empty target sequence of length 1.
        targetSeq = np.zeros((1, 1))
        targetSeq[0, 0] = word_2_idx['<START>']
        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stopCondition = False
        decodedSentence = ''
        decodedList = []
        while not stopCondition:
            outputTokens, h, c = decoder_model.predict(
                [targetSeq] + statesValue)

            # Greedy choice: most probable word at the last time step.
            sampledTokenIndex = np.argmax(outputTokens[0, -1, :])
            sampledWord = idx_2_word[sampledTokenIndex]
            decodedList.append(sampledWord)
            decodedSentence += ' ' + sampledWord
            print(decodedSentence)

            # Exit condition: either hit max length
            # or find stop character.
            if (sampledWord == '<END>' or len(decodedList) == max_length):
                stopCondition = True

            # Update the target sequence (of length 1).
            targetSeq = np.zeros((1, 1))
            targetSeq[0, 0] = sampledTokenIndex

            # Update states
            statesValue = [h, c]

        print('GENERATED: {}'.format(decodedSentence))
        generated.append(decodedSentence)

    return generated

# Load the saved generation models and generate text from a few seed sentences.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
tf.keras.backend.set_session(session)
        
dirname = os.path.abspath('')

encoderGenPath = os.path.join(dirname, 'models/encoder_fables_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM)
)

# Must match the name used when the decoder was saved ('decoder_fables_...').
decoderGenPath = os.path.join(dirname, 'models/decoder_fables_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM)
)

sentences = [
    'The Cock',
    'A Dog and a Wolf',
    'There was once a little Bear', 
    'An eagle was given permission to fly over the country.',
    'A dog was talking to a bear asking for some food. The bear who was hungry too said no.',
    'There was once a little Mouse who walking in the forest. He found his way into a bear cave. It was alone and afraid. The cave was really dark and the Bear was sleeping.'
]

# The models are built with tensorflow.keras, so they are loaded with the
# tensorflow.keras loader, which also uses the session registered above.
encoderModel = tf.keras.models.load_model(encoderGenPath)
decoderModel = tf.keras.models.load_model(decoderGenPath)

generate_text(
    sentences = sentences,
    encoder_model = encoderModel,
    decoder_model = decoderModel, 
    vocab_length = vocab_size, 
    word_2_idx = word2idx, 
    idx_2_word = idx2word, 
    max_length = MAX_LENGTH
)

session.close()