## 1. Import dependencies 
Importing needed dependencies.
In this first step we also define all the global variables, so that every hyperparameter is set in a single place instead of being repeated across cells:
- __*PREPROCESS*__: preprocessing type (continuous, or split on dots).
- __*EPOCHS*__: number of epochs in which the training is divided.
- __*SENTENCES_MAX_LENGTH*__: maximum length (in words) of the variable-length sequences.
- __*BATCH_SIZE*__: number of samples after which the weights are updated.
- __*EMBEDDING_DIM*__: dimensionality of the vectors produced by the Embedding layer.
- __*HIDDEN_DIM*__: hidden dimension passed to the transformer model (`hidden_dim`).
- __*ENCODERS*__: number of encoders in the architecture.
- __*DECODERS*__: number of decoders in the architecture.
- __*DROPOUT_RATE*__: dropout value.
- __*HEADS_ATTENTION*__: number of attention heads used by the self-attention mechanism.
- __*ACTIVATION_FUNCTION*__: activation used by the attention and feed-forward layers in the transformer model.

In [None]:
import tensorflow as tf
import json
import keras
import os
import sys
import re
import copy
import numpy as np
import pandas as pd
import random
import gensim
from copy import deepcopy
from gensim.models import Word2Vec
from sklearn.utils import shuffle
from keras_transformer import get_model, decode
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from keras.models import load_model

# Preprocessing strategy. The preprocessing cell only recognises the exact
# spellings "CONTINOUS" (fixed-length windows) and "DOTS" (split on dots);
# any other value would generate no training samples at all.
PREPROCESS = "CONTINOUS"
EPOCHS = 100                 # training epochs
SENTENCES_MAX_LENGTH = 100   # maximum number of word tokens per sequence
BATCH_SIZE = 16              # samples per weight update
EMBEDDING_DIM = 128          # embedding vector size (embed_dim of the model)
HIDDEN_DIM = 1024            # hidden_dim passed to the transformer
ENCODERS = 1                 # number of encoder blocks
DECODERS = 1                 # number of decoder blocks
DROPOUT_RATE = 0.1           # dropout rate passed to the transformer
HEADS_ATTENTION = 8          # number of attention heads (head_num)
ACTIVATION_FUNCTION = 'relu' # attention and feed-forward activation

## 2. Import Aesop fables data
The chosen dataset is a JSON file containing 147 Aesop fables divided into sentences.
For making it available, I need to thank this fun and interesting project on Aesop fables, which explores the connections between them using machine learning: <a href="https://github.com/itayniv/aesop-fables-stories">GitHub repository</a>

Here is an example of how it is structured:
```json
{
  "stories":[
    {
      "number": "01",
      "title": "THE WOLF AND THE KID",
      "story": [
        "There was once a little Kid whose growing horns made him think he was a grown-up Billy Goat and able to take care of himself.",
        "So one evening when the flock started home from the pasture and his mother called, the Kid paid no heed and kept right on nibbling the tender grass.",
        "A little later when he lifted his head, the flock was gone.",
        "He was all alone.",
        "The sun was sinking.",
        "Long shadows came creeping over the ground.",
        "A chilly little wind came creeping with them making scary noises in the grass.",
        "The Kid shivered as he thought of the terrible Wolf.",
        "Then he started wildly over the field, bleating for his mother.",
        "But not half-way, near a clump of trees, there was the Wolf!",
        "The Kid knew there was little hope for him.",
        "Please, Mr. Wolf, he said trembling, I know you are going to eat me.",
        "But first please pipe me a tune, for I want to dance and be merry as long as I can.",
        "The Wolf liked the idea of a little music before eating, so he struck up a merry tune and the Kid leaped and frisked gaily.",
        "Meanwhile, the flock was moving slowly homeward.",
        "In the still evening air the Wolf's piping carried far.",
        "The Shepherd Dogs pricked up their ears.",
        "They recognized the song the Wolf sings before a feast, and in a moment they were racing back to the pasture.",
        "The Wolf's song ended suddenly, and as he ran, with the Dogs at his heels, he called himself a fool for turning piper to please a Kid, when he should have stuck to his butcher's trade."
      ],
      "moral": "Do not let anything turn you from your purpose.",
      "characters": []
    }, ...
```

In [None]:
# Contractions expanded by clean(), applied in this order. Compound forms such
# as "can't've" are listed before their prefixes ("can't"): otherwise the
# shorter pattern rewrites the text first and the longer one can never match.
_CONTRACTIONS = (
    ("can't've", "cannot have"),
    ("couldn't've", "could not have"),
    ("shouldn't've", "should not have"),
    ("wouldn't've", "would not have"),
    ("hadn't've", "had not have"),
    ("he'd've", "he would have"),
    ("ain't", "am not"),
    ("aren't", "are not"),
    ("can't", "cannot"),
    ("'cause", "because"),
    ("could've", "could have"),
    ("couldn't", "could not"),
    ("should've", "should have"),
    ("shouldn't", "should not"),
    ("would've", "would have"),
    ("wouldn't", "would not"),
    ("didn't", "did not"),
    ("doesn't", "does not"),
    ("don't", "do not"),
    ("hadn't", "had not"),
    ("hasn't", "has not"),
    ("haven't", "have not"),
    ("he'd", "he would"),
)

# Apostrophe suffixes that are simply dropped once the expansions are done.
_DROPPED_SUFFIXES = ("'s", "'t", "'ve")

# Single-character rewrites done in one pass: sentence punctuation is padded
# with spaces so it becomes a token of its own, while quotes, dashes, brackets
# and symbols are deleted.
_CHAR_TABLE = str.maketrans({mark: ' ' + mark + ' ' for mark in '.!?;:,'})
_CHAR_TABLE.update(str.maketrans('', '', '´‘’“”\'"-–—[]{}/|()$+*%#'))


def clean(text):
    '''
    Normalise raw text before tokenisation.

    The text is lower-cased, the known English contractions are expanded,
    remaining "'s" / "'t" / "'ve" suffixes are dropped, the punctuation marks
    . ! ? ; : , are surrounded by spaces, quotes/dashes/brackets/symbols are
    removed and finally every digit character is removed.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string. Padding punctuation can leave consecutive
        spaces, so callers splitting on ' ' must discard empty tokens.
    '''
    text = text.lower()

    for contraction, expansion in _CONTRACTIONS:
        text = text.replace(contraction, expansion)

    # Must run before the apostrophe itself is deleted by the table below.
    for suffix in _DROPPED_SUFFIXES:
        text = text.replace(suffix, "")

    text = text.translate(_CHAR_TABLE)

    return ''.join(ch for ch in text if not ch.isdigit())

# Load every fable from the JSON dataset and normalise its text with clean().
dirname = os.path.abspath('')
filepath = os.path.join(dirname, 'input_data/aesopFables.json')

try:
    # Only the file access is guarded. The encoding is explicit so that
    # decoding does not depend on the platform's default encoding.
    with open(filepath, encoding='utf-8') as json_file:
        data = json.load(json_file)
except IOError:
    sys.exit('Cannot find data!')

# Each story is stored as a list of sentences: join them into a single string.
fables = [' '.join(p['story']) for p in data['stories']]
print('{} fables imported.'.format(len(fables)))

cleanedFables = [clean(f) for f in fables]
print('{} fables cleaned.'.format(len(cleanedFables)))

We need to look at the maximum and average length of the fables to better decide the preprocessing hyperparameters.

In [None]:
# Number of word tokens in each cleaned fable. Splitting on ' ' yields empty
# strings wherever there are consecutive spaces; they are discarded here just
# like the preprocessing step discards them, so the figures are comparable
# with SENTENCES_MAX_LENGTH.
fableLengths = [
    len([token for token in fable.split(' ') if token])
    for fable in cleanedFables
]

sumLen = sum(fableLengths)
maxLen = max(fableLengths, default=0)
# Guard against an empty corpus instead of dividing by zero.
avgLen = sumLen / len(fableLengths) if fableLengths else 0

print('Number of fables: {}'.format(len(cleanedFables)))
print('Max length: {}'.format(maxLen))
print('Avg length: {}'.format(avgLen))

## 3. Extract Vocabulary
The vocabulary is saved as: 
- a __numpy array__ to map each encoding to the right word
- a __dictionary__ to map each word to its encoding number 

In [None]:
# Word -> index mapping. The three special tokens take the first indices;
# <PAD> must be 0 because the feeding arrays are zero-initialised.
word2idx = {'<PAD>' : 0, '<START>' : 1 , '<END>': 2}

# The vocabulary has to be built from the *cleaned* fables: the training
# samples and the generation prompts are made of cleaned tokens, so indexing
# the raw text would make their lookups fail.
for fable in cleanedFables:
    for word in fable.split(' '):
        # Empty strings come from consecutive spaces and are not words.
        if word and word not in word2idx:
            word2idx[word] = len(word2idx)

# Dictionaries keep insertion order, so position i holds the word encoded as i.
idx2word = list(word2idx.keys())
print(idx2word[:3])
vocabLength = len(idx2word)
print('Vocabulary Size: {}'.format(vocabLength))

## 4. Preprocess text

The Transformer model has an Encoder-Decoder architecture, so we can train it to generate variable-length sequences, meaning that the model itself decides how many words to generate for a given input sequence.
However, in order to achieve this result, the text has to be preprocessed in a way that lets the model understand where a sequence starts and where it ends.
In fact, in the previous code cell we added these three tokens to the vocabulary:

```python
word2idx = {'<PAD>' : 0, '<START>' : 1 , '<END>': 2}
```
I will compare two types of preprocessing. The first, identified as __continuous__ (spelled `CONTINOUS` in the code), divides the text into sequences of words, respecting a maximum length decided a priori. Each sequence will generate as many samples as its number of words.

For example, say SENTENCES_MAX_LENGTH is 4 and our text is "Hello my name is Dario and I love to code". 
- Sequences: "Hello my name is ", "Dario and I love", "to code"

Then with the first sequence:
- __EncoderInput__: "START Hello END" <br/>
  __DecoderInput__: "START my name is END" <br/>
  __Target__: "my name is END" <br/>
  
  
- __EncoderInput__: "START Hello my END" <br/>
  __DecoderInput__: "START name is END"<br/>
  __Target__: "name is END"<br/>
  
  
- __EncoderInput__:  "START Hello my name END"<br/>
  __DecoderInput__: "START is END"<br/>
  __Target__: "is END"<br/>
  
  
- __EncoderInput__: "START Hello my name is END" <br/>
  __DecoderInput__: "START END"<br/>
  __Target__: "END"<br/>



The second one, identified as __dots__, instead of dividing the text into continuous sequences of the maximum length, simply splits the fables on dots and then applies the same inputs/target generation to the results. 

In [None]:
def createInputTarget(words):
    """
    Build the training triples for one token sequence.

    The sequence is cut at every position between its first and last token.
    For each cut the tokens before it become the encoder input and the tokens
    after it become the decoder input and the target, each joined with spaces
    and wrapped with the <START>/<END> markers (the target has no <START>).

    Args:
        words: list of word tokens.

    Returns:
        A tuple (encoder, decoder, output) of three equally long lists of
        strings; all three are empty when ``words`` has fewer than two tokens.
    """
    start_marker = ['<START>']
    end_marker = ['<END>']

    encoder, decoder, output = [], [], []

    for cut in range(1, len(words)):
        head = words[:cut]
        tail = words[cut:]

        encoder.append(' '.join(start_marker + head + end_marker))
        decoder.append(' '.join(start_marker + tail + end_marker))
        output.append(' '.join(tail + end_marker))

    return encoder, decoder, output

def getWordTokens(sentence):
    """
    Split a sentence on single spaces and terminate it with a '.' token.

    Empty strings produced by consecutive, leading or trailing spaces are
    discarded; the returned list therefore always ends with '.' and contains
    only that token when the sentence has no words.
    """
    tokens = [piece for piece in sentence.split(' ') if piece != '']
    tokens.append('.')
    return tokens

def checkMaxLength(words):
    """
    Split a token list into chunks of at most SENTENCES_MAX_LENGTH tokens.

    A list that already fits is returned as the only element of the result;
    a longer one is cut into consecutive full-size chunks followed by the
    remainder.

    Args:
        words: list of word tokens.

    Returns:
        A list of token lists, in the original order.
    """
    limit = SENTENCES_MAX_LENGTH

    # Short input: nothing to split.
    if len(words) <= limit:
        return [words]

    return [words[begin:begin + limit] for begin in range(0, len(words), limit)]

# EXTRACT ENCODER & DECODER INPUT SENTENCES
inputSentences = []    # encoder inputs
targetSentences = []   # decoder inputs
outputSentences = []   # decoder targets


def _addSamples(tokens):
    """Append every encoder/decoder/target triple built from ``tokens``."""
    encoder, decoder, output = createInputTarget(tokens)
    inputSentences.extend(encoder)
    targetSentences.extend(decoder)
    outputSentences.extend(output)


if PREPROCESS == 'CONTINOUS':

    for fable in cleanedFables:
        # Drop the empty strings produced by consecutive spaces.
        words = [w for w in fable.split(' ') if w]

        # Consecutive windows of at most SENTENCES_MAX_LENGTH tokens.
        for start in range(0, len(words), SENTENCES_MAX_LENGTH):
            _addSamples(words[start:start + SENTENCES_MAX_LENGTH])

elif PREPROCESS == 'DOTS':

    for fable in cleanedFables:
        # Last chunk of the previous usable sentence of this fable.
        last = None

        for s in fable.split('.'):
            words = getWordTokens(s)

            # getWordTokens always appends '.', so this skips fragments with
            # fewer than two real tokens.
            if len(words) <= 2:
                continue

            seq = checkMaxLength(words)

            for s1 in seq:
                if len(s1) > 2:
                    _addSamples(s1)

            if last is not None:
                # Bridge consecutive sentences: second half of the previous
                # chunk followed by the first half of the current one.
                _addSamples(last[len(last)//2:] + seq[0][:len(seq[0])//2])

            last = seq[-1]

else:
    # Fail loudly instead of silently producing an empty dataset.
    raise ValueError(
        "Unknown PREPROCESS value {!r}: expected 'CONTINOUS' or 'DOTS'".format(PREPROCESS)
    )


numSamples = len(inputSentences)
print('Num samples: {}'.format(numSamples))

print("Creating dataset to feed Model . . . ")
dirname = os.path.abspath('')
# The file name encodes the hyperparameters of the run.
filePath = os.path.join(dirname, 'preprocessed/dataset_fables_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS,
    SENTENCES_MAX_LENGTH,
    BATCH_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM))

# to_csv does not create missing folders.
os.makedirs(os.path.dirname(filePath), exist_ok=True)

d = {
    'input_encoder': inputSentences,
    'input_decoder': targetSentences,
    'output_decoder': outputSentences,
}
df = pd.DataFrame(data=d)
# Rows are written in generation order (no shuffling); an existing file from
# a previous run is overwritten.
df.to_csv(filePath, index=False)

print("Dataset printed on CSV.")

But what is the purpose of the padding token?
```python
'<PAD>' : 0
```

In order to feed the model we need to create inputs of the same length.
This is why I defined a function to generate the final data with padding.

In [None]:
def generate_data(word_2_idx, num_samples, max_length, vocab_length, batch_size=BATCH_SIZE):
    '''
    Load the preprocessed CSV and turn it into padded integer arrays.

    The CSV is the one written by the preprocessing step, so its path is built
    from the same hyperparameters. Every token is replaced by its index and
    written left-aligned into a zero-initialised row, so unused positions keep
    the value 0.

    Args:
        word_2_idx: dictionary mapping each token to its integer index.
        num_samples: number of CSV rows to encode, starting from the first.
        max_length: maximum number of word tokens in a sequence; rows are
            max_length + 2 wide to leave room for the <START>/<END> markers.
        vocab_length: unused, kept for backward compatibility.
        batch_size: unused, kept for backward compatibility.

    Returns:
        A tuple (encoderInputData, decoderInputData, decoderTargetData) of
        integer arrays shaped (num_samples, max_length + 2),
        (num_samples, max_length + 2) and (num_samples, max_length + 2, 1).

    Raises:
        KeyError: if a token is missing from ``word_2_idx``.
        IndexError: if ``num_samples`` exceeds the number of CSV rows or a
            sequence is longer than max_length + 2 tokens.
    '''
    dirname = os.path.abspath('')
    filePath = os.path.join(dirname, 'preprocessed/dataset_fables_{}_{}_{}_{}_{}.csv'.format(
        EPOCHS,
        SENTENCES_MAX_LENGTH,
        BATCH_SIZE,
        EMBEDDING_DIM,
        HIDDEN_DIM))
    df = pd.read_csv(filePath)

    width = max_length + 2
    encoderInputData = np.zeros((num_samples, width), dtype='int')
    decoderInputData = np.zeros((num_samples, width), dtype='int')
    decoderTargetData = np.zeros((num_samples, width, 1), dtype='int')

    # Pull the columns out once instead of slicing the frame for every row.
    encoderColumn = df['input_encoder'].values
    decoderColumn = df['input_decoder'].values
    outputColumn = df['output_decoder'].values

    for i in range(num_samples):
        if i % 10000 == 0:
            print('Generating feeding data... {}/{}'.format(i, num_samples))

        for t, word in enumerate(encoderColumn[i].split(' ')):
            encoderInputData[i, t] = word_2_idx[word]
        for t, word in enumerate(decoderColumn[i].split(' ')):
            decoderInputData[i, t] = word_2_idx[word]
        for t, word in enumerate(outputColumn[i].split(' ')):
            # decoderTargetData is ahead of decoderInputData by one timestep
            decoderTargetData[i, t, 0] = word_2_idx[word]

    return encoderInputData, decoderInputData, decoderTargetData

## 5. Extract embeddings matrix
Loading pre-trained embeddings is a good practice; in this case I computed them with Google's Word2Vec model on the well-known text8 dataset.
- *More details in the __train_embeddings.ipynb__ notebook* (to be executed if the .bin file does not exist)

The embeddings are simply the 128 (or whatever dimensionality was used during training) weights that connect a single neuron in the input layer to the 128 neurons in the hidden layer, trained to capture which words appear in the same context in a given text.

So we simply extract these weights for every single word in our vocabulary and build a matrix with them.

In [None]:
# Recreating embeddings index based on Tokenizer vocabulary
word2vecModel = gensim.models.Word2Vec.load('embeddings/text8_word2vec_skipgram_128.bin')
# Use the KeyedVectors object itself for membership tests and vector lookup:
# `wv.vocab` and `model[word]` no longer exist in gensim 4, while
# `word in wv` and `wv[word]` work in both gensim 3.x and 4.x.
word2vec_vocabulary = word2vecModel.wv
embeddingIndex = dict()
counter = 0
for word in idx2word:
    if word in word2vec_vocabulary:
        embeddingIndex[word] = word2vec_vocabulary[word]
    else:
        counter += 1

print("{} words without pre-trained embedding!".format(counter))

# Prepare embeddings matrix: rows start from random values and are replaced
# by the pre-trained vector whenever one exists for the word at that index.
embeddingMatrix = np.random.random((len(word2idx), EMBEDDING_DIM))
for i, word in enumerate(idx2word):
    embeddingVector = embeddingIndex.get(word)
    if embeddingVector is not None:
        embeddingMatrix[i] = embeddingVector

### _Or it is possible to use random weights_
Do not execute this cell to use pre-trained embeddings.

In [None]:
# Fallback: ignore the pre-trained vectors and start every embedding row
# (one per vocabulary entry) from random values.
embeddingMatrix = np.random.random(size=(len(word2idx), EMBEDDING_DIM))

## 7. Train the model
To build the transformer model I use an external library available on <a href="https://github.com/kpot/keras-transformer">this GitHub repository</a>.
Then the model is trained and its weights are saved in a .h5 file.

In [None]:
dirname = os.path.abspath('')

# The weights file name encodes the hyperparameters of the run, so a change
# to any of them triggers a new training instead of reusing stale weights.
transformerModelPath = os.path.join(dirname, 'models/tr_fables_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS,
    SENTENCES_MAX_LENGTH,
    BATCH_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM)
)

# Build the model
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=ENCODERS,
    decoder_num=DECODERS,
    head_num=HEADS_ATTENTION,
    hidden_dim=HIDDEN_DIM,
    attention_activation=ACTIVATION_FUNCTION,
    feed_forward_activation=ACTIVATION_FUNCTION,
    dropout_rate=DROPOUT_RATE,
    embed_weights=embeddingMatrix,
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes "Incompatible shapes" error, if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to use get validation accuracy.
)

model.summary()

if not os.path.exists(transformerModelPath):

    encoderInputData, decoderInputData, decoderTargetData = generate_data(
            word_2_idx=word2idx,
            num_samples=numSamples,
            max_length=SENTENCES_MAX_LENGTH,
            vocab_length=vocabLength
    )

    # Train the model
    model.fit(
            [encoderInputData, decoderInputData],
            decoderTargetData,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS
            )

    # Make sure the target folder exists so the trained weights are not lost.
    os.makedirs(os.path.dirname(transformerModelPath), exist_ok=True)
    model.save_weights(transformerModelPath)

else :
    print('Model already trained')



## 8. Generate text
To conclude, here is the prediction script, which uses the decode function from the open source library to predict the next word again and again.



In [None]:
dirname = os.path.abspath('')

# Weights file of the run identified by the current hyperparameters.
transformerModelPath = os.path.join(dirname, 'models/tr_fables_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS,
    SENTENCES_MAX_LENGTH,
    BATCH_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM)
)

# Build the model
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=ENCODERS,
    decoder_num=DECODERS,
    head_num=HEADS_ATTENTION,
    hidden_dim=HIDDEN_DIM,
    attention_activation=ACTIVATION_FUNCTION,
    feed_forward_activation=ACTIVATION_FUNCTION,
    dropout_rate=DROPOUT_RATE,
    embed_weights=embeddingMatrix,
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes "Incompatible shapes" error, if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to use get validation accuracy.
)

model.load_weights(transformerModelPath)

sentences = [
    'The Cock',
    'A Dog and a Wolf',
    'There was once a little Bear', 
    'An eagle was given permission to fly over the country.',
    'A dog was talking to a bear asking for some food. The bear who was hungry too said no.',
    'There was once a little Mouse who walking in the forest. He found his way into a bear cave. It was alone and afraid. The cave was really dark and the Bear was sleeping.'
]

decoded_sentences = []

for s in sentences:

    print('Generating from: {}'.format(s))

    # Prompts go through the same cleaning as the training text so that their
    # tokens can be looked up in the vocabulary.
    encoderTokens = [word2idx['<START>']]
    unknownWords = []
    for w in clean(s).split(' '):
        if not w:
            continue  # empty string produced by consecutive spaces
        if w in word2idx:
            encoderTokens.append(word2idx[w])
        else:
            # A word never seen in the fables has no index: skip it instead
            # of aborting the whole generation with a KeyError.
            unknownWords.append(w)
    encoderTokens.append(word2idx['<END>'])

    if unknownWords:
        print('Skipping words missing from the vocabulary: {}'.format(', '.join(unknownWords)))

    decoded = decode(
        model,
        encoderTokens,
        start_token=word2idx['<START>'],
        end_token=word2idx['<END>'],
        pad_token=word2idx['<PAD>'],
        max_len=SENTENCES_MAX_LENGTH,
    )

    # Every word is preceded by a space, including the first one.
    decodedPhrase = ''.join(' ' + idx2word[x] for x in decoded)

    decoded_sentences.append(decodedPhrase)
    print('Generated: {}'.format(decodedPhrase))

resultsModelPath = os.path.join(dirname, 'output_data/out_fables_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS,
    SENTENCES_MAX_LENGTH,
    BATCH_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM)
)

# to_csv does not create missing folders.
os.makedirs(os.path.dirname(resultsModelPath), exist_ok=True)

results = {
    'phrase' : sentences,
    'generated' : decoded_sentences
}
resultsDf = pd.DataFrame.from_dict(results)
resultsDf.to_csv(resultsModelPath, index=False)