# Transformer Architecture with Keras

In [6]:
import numpy as np

import keras.backend as K

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#-------------------------------------------------------------------------------

In [2]:
#-------------------------------------------------------------------------------
def softmax_over_time(x):
    """
    Softmax taken along axis 1 (the time axis), so that the values across
    time steps form a probability distribution (e.g. attention weights).

    Input(s)
    - x (tensor): [batch_size, time_steps, 1]

    Output(s)
    - tensor with the same shape as x whose entries sum to 1 along axis 1
    """
    # Subtracting the per-sequence maximum keeps exp() from overflowing;
    # it cancels out in the ratio below, so the result is unchanged.
    shifted = x - K.max(x, axis=1, keepdims=True)
    exp_scores = K.exp(shifted)

    return exp_scores / K.sum(exp_scores, axis=1, keepdims=True)
#-------------------------------------------------------------------------------

In [3]:
# Configurations
# Everything a reader might want to tune lives in this cell.

# Data limits
NUM_SAMPLES = 10_000         # number of lines read from the raw translation file
MAX_NUM_WORDS = 20_000       # `num_words` cap passed to the Keras tokenizers
MAX_SEQUENCE_LENGTH = 100    # presumably the padding/truncation length -- not used in the cells shown here

# Model sizes (presumably consumed by the model-building cells; confirm there)
EMBEDDING_DIM = 100
LATENT_DIM = 256
LATENT_DIM_DECODER = 256

# Training (presumably passed to the fit call; confirm there)
BATCH_SIZE = 64
EPOCHS = 20

## Load Data

In [4]:
# Parallel lists, filled line by line while reading the raw file:
#   input_texts         -- sentence in the original language
#   target_texts        -- sentence in the translated language
#   target_texts_inputs -- translated sentence with the <SOS> tag
input_texts, target_texts, target_texts_inputs = [], [], []

In [5]:
raw_text_path = 'D:/Data/translations/spa-eng/spa.txt'

# Each usable line looks like "<source>\t<translation>" and may carry extra
# tab-separated columns after the translation, which are ignored.
# NOTE(review): the file is decoded as latin1; if the corpus is actually UTF-8,
# accented characters will be mis-decoded -- confirm the encoding.
with open(raw_text_path, encoding='latin1') as f:
    # Iterate over the file lazily instead of loading every line up front;
    # only the first NUM_SAMPLES lines are needed.
    for i, line in enumerate(f):
        if i == NUM_SAMPLES:
            break

        # Drop the line terminator so it does not end up glued to the last
        # word of the translation (the output tokenizer filters nothing).
        line = line.rstrip('\r\n')

        # If line doesn't contain delimiting char
        if '\t' not in line:
            continue

        # Keep the first two columns; tolerate any additional ones.
        input_text, translation, *_ = line.split('\t')

        input_texts.append(input_text)
        # Decoder target: translation followed by the end-of-sentence marker.
        target_texts.append(translation + ' <eos>')
        # Decoder input: start-of-sentence marker followed by the translation.
        # The space is required so that <sos> is tokenized as its own word.
        target_texts_inputs.append('<sos> ' + translation)

print('Num samples:', len(input_texts))

Num samples: 10000


## Preprocess Data

In [None]:
# Tokenize
# Source-language tokenizer: uses the default filters, keeps at most
# MAX_NUM_WORDS words when converting texts to integer sequences.
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

# Target-language tokenizer: filters='' keeps punctuation, including the
# angle brackets of the <sos>/<eos> markers.
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='') # Don't filter out punctuations
# fit_on_texts() requires the texts to fit on. Fit on both target variants so
# the vocabulary contains <eos> (from target_texts) and <sos> (from
# target_texts_inputs).
tokenizer_outputs.fit_on_texts(target_texts + target_texts_inputs)