__1 - Importing needed dependencies__

In [1]:
import tensorflow as tf
import json
import keras
import os
import sys
import re
import numpy as np
import pandas as pd
import random
from sklearn.utils import shuffle
from keras_transformer import get_model, decode
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from keras.models import load_model

Using TensorFlow backend.


__2 - Declaring global variables__

In [2]:
EPOCHS = 50
SENTENCES_MAX_LENGTH = 30
BATCH_SIZE = 16
EMBEDDING_DIM = 128
HIDDEN_DIM = 1024
NUM_TRAIN_PLOTS = 147

__3 - Try read data__

In [3]:
def clean(text):
    '''
    '''
    text = text.strip()
    text = text.replace("ain't", "am not")
    text = text.replace("aren't", "are not")
    text = text.replace("can't", "cannot")
    text = text.replace("can't've", "cannot have")
    text = text.replace("'cause", "because")
    text = text.replace("could've", "could have")
    text = text.replace("couldn't", "could not")
    text = text.replace("couldn't've", "could not have")
    text = text.replace("didn't", "did not")
    text = text.replace("doesn't", "does not")
    text = text.replace("don't", "do not")
    text = text.replace("hadn't", "had not")
    text = text.replace("hadn't've", "had not have")
    text = text.replace("hasn't", "has not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("he'd", "he would")
    text = text.replace("haven't", "have not")
    text = text.replace("he'd've", "he would have")
    text = text.replace("'s", "")
    text = text.replace(".", " . ")
    text = text.replace("!", " ! ")
    text = text.replace("?", " ? ")
    text = text.replace(";", " ; ")
    text = text.replace(":", " : ")
    text = text.replace("\'", "")
    text = text.replace("\"", "")
    text = text.replace(",", "")
    text = text.replace("[", "")
    text = text.replace("]","")
    text = text.replace("{","")
    text = text.replace("}", "")
    text = text.replace("/", "")
    text = text.replace("|", "")
    text = text.replace("-", "")
    text = text.replace("(", "")
    text = text.replace(")", "")
    text = text.replace("$", "")
    text = text.replace("+", "")
    text = text.replace("*", "")
    text = text.replace("%", "")
    text = text.replace("#", "")
    text = text.lower()
    text = ''.join([i for i in text if not i.isdigit()])

    return text

try:
    
    fables = []
    dirname = os.path.abspath('')
    filepath = os.path.join(dirname, 'input_data/aesopFables.json')

    with open(filepath) as json_file:  
        data = json.load(json_file)
        for p in data['stories']:
            fables.append(' '.join(p['story']))
            
    print('{} fables imported.'.format(len(fables)))
    
    plotsList = fables
    for idx, f in enumerate(fables):
        fables[idx] = clean(f)
    trainPlotsList = fables[:NUM_TRAIN_PLOTS]    

except IOError:
    sys.exit('Cannot find data!')

147 fables imported.


__4 - Extract vocabulary__

In [4]:
# CREATE VOCABULARY OF WORDS
idx2word = ['<PAD>','<START>', '<END>']
maxLen = 0
for plot in plotsList:

    words = plot.split(' ')
    
    b=True
    while b:
        if('' in words): 
            words.remove('')
        else: b = False
    
    if len(words) > maxLen : 
        maxLen = len(words)
    
    for word in words:
        if word not in idx2word:
            idx2word.append(word)

word2idx = {}
for word in idx2word:
    word2idx[word] = len(word2idx)

vocabLength = len(idx2word)
print('Vocabulary Size: {}'.format(vocabLength))
print('Fable max length: {}'.format(maxLen))

Vocabulary Size: 3066
Fable max length: 459


__4 - Preprocess__

In [5]:
def createInputTarget(words) :
    
    encoder = []
    decoder = []
    output = []
    
    for i in range(1, len(words)):
        encode_tokens, decode_tokens = words[:i], words[i:]
        encode_tokens = ' '.join(['<START>'] + encode_tokens + ['<END>'])
        output_tokens = ' '.join(decode_tokens + ['<END>'])
        decode_tokens = ' '.join(['<START>'] + decode_tokens + ['<END>'])
        encoder.append(encode_tokens)
        decoder.append(decode_tokens)
        output.append(output_tokens)
        
    return encoder, decoder, output

def getWordTokens(sentence):
    #clean tokens
    words = sentence.split(' ')
    words.append('.')
    b=True
    while b:
        if('' in words): 
            words.remove('')
        else: b = False
    
    return words

def checkMaxLength(words):
    
    seq = []
    
    if len(words) > SENTENCES_MAX_LENGTH :
        seq.append(words[:SENTENCES_MAX_LENGTH])
        seq.append(words[SENTENCES_MAX_LENGTH:])
        while len(seq[-1]) > SENTENCES_MAX_LENGTH:
            tmp = seq[-1]
            seq[-1] = tmp[:SENTENCES_MAX_LENGTH]
            seq.append(tmp[SENTENCES_MAX_LENGTH:])
    else : 
        seq.append(words)

    return seq

# EXTRACT ENCODER & DECODER INPUT SENTENCES
inputSentences = []
targetSentences = []
outputSentences = []

for plot in trainPlotsList :
    sentences = plot.split('.')
    last = None 
    
    for idx, s in enumerate(sentences):
        words = getWordTokens(s)
        if(len(words) > 2):
            
            seq = checkMaxLength(words)
            
            if(last != None):
                encode_tokens, decode_tokens = last, seq[0]
                encode_tokens = ' '.join(['<START>'] + encode_tokens + ['<END>'])
                output_tokens = ' '.join(decode_tokens + ['<END>'])
                decode_tokens = ' '.join(['<START>'] + decode_tokens + ['<END>'])
                inputSentences.append(encode_tokens)
                targetSentences.append(decode_tokens)
                outputSentences.append(output_tokens)
            
            last = seq[-1]
            
            for s1 in seq:
                if(len(s1) > 2):
                    encoder, decoder, output = createInputTarget(s1)
                    inputSentences.extend(encoder)
                    targetSentences.extend(decoder)
                    outputSentences.extend(output)
            


numSamples = len(inputSentences)
print('Num samples: {}'.format(numSamples))
stepsPerEpoch = numSamples//BATCH_SIZE
print('StepsPerEpoch: {}'.format(stepsPerEpoch))

print('EXAMPLE OF PREPROCESSING: ')
for inp, outp in zip(inputSentences[:100],targetSentences[:100]):
    print('INPUT: {}'.format(inp))
    print('OUTPUT: {}'.format(outp))

# WRITE DATASET TO TXT  
train_dataset = []

print("Creating dataset to feed Model . . . ")
dirname = os.path.abspath('')
filePath = os.path.join(dirname, os.path.join(dirname, 'preprocessed/dataset_fables_{}_{}_{}_{}_{}_{}.csv'.format(
EPOCHS, 
SENTENCES_MAX_LENGTH, 
BATCH_SIZE, 
EMBEDDING_DIM,
HIDDEN_DIM,
NUM_TRAIN_PLOTS)))
if os.path.exists(filePath):
    os.remove(filePath) 

d= {'input_encoder' : inputSentences, 'input_decoder' :targetSentences, 'output_decoder':outputSentences }
df = pd.DataFrame(data=d) 
df = shuffle(df)
df.to_csv(filePath, index=False)

print("Dataset printed on CSV.")

Num samples: 25420
StepsPerEpoch: 1588
EXAMPLE OF PREPROCESSING: 
INPUT: <START> there <END>
OUTPUT: <START> was once a little kid whose growing horns made him think he was a grownup billy goat and able to take care of himself . <END>
INPUT: <START> there was <END>
OUTPUT: <START> once a little kid whose growing horns made him think he was a grownup billy goat and able to take care of himself . <END>
INPUT: <START> there was once <END>
OUTPUT: <START> a little kid whose growing horns made him think he was a grownup billy goat and able to take care of himself . <END>
INPUT: <START> there was once a <END>
OUTPUT: <START> little kid whose growing horns made him think he was a grownup billy goat and able to take care of himself . <END>
INPUT: <START> there was once a little <END>
OUTPUT: <START> kid whose growing horns made him think he was a grownup billy goat and able to take care of himself . <END>
INPUT: <START> there was once a little kid <END>
OUTPUT: <START> whose growing horns made

__5 - Data generator__

In [6]:
def data_generator(word_2_idx, num_samples, max_length, vocab_length, batch_size=BATCH_SIZE):
    '''
    '''
    dirname = os.path.abspath('')
    filePath = os.path.join(dirname, 'preprocessed/dataset_fables_{}_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS))
    df = pd.read_csv(filePath)
    
    counter = 0

    while True:
        
        for i in range(0, batch_size):
            
            if counter >= numSamples:
                break

            encoderTokens = df.iloc[[counter]]['input_encoder'].values[0].split(' ')
            decoderTokens = df.iloc[[counter]]['input_decoder'].values[0].split(' ')
            outputTokens = df.iloc[[counter]]['output_decoder'].values[0].split(' ')
            
            encoderInputData = np.zeros((1, max_length + 2), dtype='int')
            decoderInputData = np.zeros((1, max_length + 2), dtype='int')
            decoderTargetData = np.zeros((1, max_length + 2, 1),dtype='int')
            
            for t, word in enumerate(encoderTokens):
                encoderInputData[0, t] = word_2_idx[word]
            for t, word in enumerate(decoderTokens):
                decoderInputData[0, t] = word_2_idx[word]
            for t, word in enumerate(outputTokens):
                # decoderTargetData is ahead of decoderInputData by one timestep
                decoderTargetData[0, t, 0] = word_2_idx[word]
                
            counter = counter + 1
            
            yield([encoderInputData,decoderInputData], decoderTargetData)

__6 - Train the model__

In [None]:
dirname = os.path.abspath('')

transformerModelPath = os.path.join(dirname, 'models/tr_fables_{}_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

# Build the model
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=6,
    decoder_num=6,
    head_num=8,
    hidden_dim=HIDDEN_DIM,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=0.1,
    embed_weights=np.random.random((len(word2idx), EMBEDDING_DIM)),
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes "Incompatible shapes" error, if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to use get validation accuracy.
)

model.summary()

if not os.path.exists(transformerModelPath):

    trainGen = data_generator(
            word_2_idx=word2idx,
            num_samples=numSamples,
            max_length=SENTENCES_MAX_LENGTH, 
            vocab_length=vocabLength
        )

    # Train the model
    model.fit_generator(
            trainGen,
            epochs=EPOCHS,
            steps_per_epoch=numSamples,
            workers=10,
            use_multiprocessing=True
            )

    model.save_weights(transformerModelPath) 

else : 
    print('Model already trained')



Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Token-Embedding (EmbeddingRet)  [(None, None, 128),  392448      Encoder-Input[0][0]              
                                                                 Decoder-Input[0][0]              
_____________________

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/50




__7 - Generate sentences__

In [None]:
dirname = os.path.abspath('')

transformerModelPath = os.path.join(dirname, 'models/tr_fables_{}_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

# Build the model
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=1,
    decoder_num=1,
    head_num=8,
    hidden_dim=HIDDEN_DIM,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=1,
    embed_weights=np.random.random((len(word2idx), EMBEDDING_DIM)),
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes "Incompatible shapes" error, if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to use get validation accuracy.
)

model.load_weights(transformerModelPath)

sentences = [
    'There was once a little', 
]

decoded_sentences = []
    
for s in sentences:

    print('Generating from: {}'.format(s))
    encoderTokens = []
    s = clean(s)
    encoderwords = s.split(' ')
    for w in encoderwords:
        encoderTokens.append(word2idx[w])
    encoderTokens = [word2idx['<START>']] + encoderTokens + [word2idx['<END>']]
    encoderInputData = np.zeros((1, SENTENCES_MAX_LENGTH + 2), dtype='int64')

    decoded = decode(
    model,
    encoderTokens,
    start_token=word2idx['<START>'],
    end_token=word2idx['<END>'],
    pad_token=word2idx['<PAD>'],
    max_len=SENTENCES_MAX_LENGTH,
    )

    decodedPhrase = ''
    for x in decoded:
        decodedPhrase = decodedPhrase + ' ' + idx2word[x]

    decoded_sentences.append(decodedPhrase)
    print('Generated: {}'.format(decodedPhrase))

resultsModelPatht = os.path.join(dirname, 'output_data/out_fables_{}_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

dict ={
    'phrase' : sentences,
    'generated' : decoded_sentences
}
sentiment_df = pd.DataFrame.from_dict(dict)
sentiment_df.to_csv(resultsModelPatht, index=False)