<a href="https://colab.research.google.com/github/dhirensk/ai/blob/master/English_to_French_seq2seq_tf_2_0_withAttention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**DISCLAIMER:**
This tutorial demostrates an example of attention model using Tensorflow addons tfa.seq2seq module. The tutorial takes the design concepts and code excerpts from the official tensorflow tutorial as presented in following link 
https://www.tensorflow.org/tutorials/text/nmt_with_attention

### Datasets
For French-English Translation

http://www.manythings.org/anki/fra-eng.zip

For English-Hindi Translation

http://www.manythings.org/anki/hin-eng.zip

In [0]:
!pip install tensorflow==2.0
!pip install tensorflow-addons

In [89]:
import tensorflow as tf
import tensorflow_addons as tfa
print(tf.__version__)
from sklearn.model_selection import train_test_split
import os
import io
import numpy as np
import re
import unicodedata
import urllib3
import shutil
import zipfile
import itertools
from google.colab import drive


2.0.0


### Download File

In [3]:
#zipfile = tf.keras.utils.get_file('hin-eng.zip',origin='http://www.manythings.org/anki/hin-eng.zip', extract=True)
http = urllib3.PoolManager()
url ='http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)
with http.request('GET', url, preload_content=False) as r, open(zipfilename, 'wb') as out_file:       
    shutil.copyfileobj(r, out_file)
print(zipfilename)
with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)
!ls /content/

/content/fra-eng.zip
_about.txt  fra-eng.zip  fra.txt  sample_data


In [0]:
!cat /content/fra.txt | wc -l

170651


### Preprocess File

In [0]:
def read_file(filename):
    path = os.getcwd()
    path = os.path.join(path, filename)
    file = io.open(path,encoding='UTF-8')
    lines = file.read()
    file.close()
    return lines

In [0]:

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

In [0]:
# For non-english characterset translations such as Hindi, Russian, etc. we keep unicode. 
def preprocess_sentence(s):
    s = unicode_to_ascii(s.lower().strip())
    s = s.lower().strip()
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)

    s = s.rstrip().strip()
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    s = '<start> ' + s + ' <end>'
    return s

### Create Dataset

In [0]:
def create_dataset(filename, num_samples):
    path = os.getcwd()
    path = os.path.join(path, filename)
    file = io.open(path,encoding='UTF-8')
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_samples]]

    return zip(*word_pairs)

In [0]:
X_text,Y_text,_  = create_dataset("fra.txt", num_samples=5000)

In [9]:
print(X_text[4000:4001])
print(Y_text[4000:4001])

('<start> is this love ? <end>',)
('<start> est ce de l amour ? <end>',)


In [10]:
#total samples
print("Total Samples : ", len(X_text))

Total Samples :  5000


In [0]:
# create a function to tokenize words into index using inbuild tokenizer vocabulory
# important to override filter otherwise it will filter out all punctuation, plus tabs and line breaks, minus the ' character.
def tokenize(input):
   tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
   tokenizer.fit_on_texts(input)
   sequences = tokenizer.texts_to_sequences(input)
  # print(max_len(sequences))
   sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
   return  sequences, tokenizer

In [0]:
def max_len(tensor):
    #print( np.argmax([len(t) for t in tensor]))
    return max( len(t) for t in tensor)

In [0]:
# Tokenize each word into index and return the tokenized list and tokenizer
X , X_tokenizer = tokenize(X_text)
Y, Y_tokenizer = tokenize(Y_text)
X_train,  X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

Tx = max_len(X)
Ty = max_len(Y)  

In [14]:
print("Max length English sentence : ", Tx)
print("Max length French sentence : ", Ty)

Max length English sentence :  7
Max length French sentence :  15


In [15]:
X_tokenizer.word_index['<start>'] #'<start>': 2   # tokenize by frequency
input_vocab_size = len(X_tokenizer.word_index)+1  # add 1 for reserve index 0 which is not included in dictionary
output_vocab_size = len(Y_tokenizer.word_index)+ 1
print("input vocab size : ", input_vocab_size)
print("output vocab size : " ,output_vocab_size)

input vocab size :  1223
output vocab size :  2374


### Model Parameters

In [0]:
BATCH_SIZE = 64
BUFFER_SIZE = len(X_train)
steps_per_epoch = BUFFER_SIZE//BATCH_SIZE
embedding_dims = 256
rnn_units = 1024
dense_units = 1024
Dtype = tf.float32   #used to initialize DecoderCell Zero state

In [17]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
example_X, example_Y = next(iter(dataset))
print(example_X.shape) 
print(example_Y.shape)  

(64, 7)
(64, 15)


In [18]:
dataset_test = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
example_X, example_Y = next(iter(dataset))
print(example_X.shape) 
print(example_Y.shape)

(64, 7)
(64, 15)


### Tensorflow Addons 2.0

### Encoder

In [0]:
#for LSTM need to initialite Tx hidden and Tx memory state , for GRU only need one of the field
def initialize_initial_state():
        return [tf.zeros((BATCH_SIZE, rnn_units)), tf.zeros((BATCH_SIZE, rnn_units))]

### Optimizer and Custom Loss Function

In [0]:
optimizer = tf.keras.optimizers.Adam()

###Important note:
 It's worth pointing out that we divide the loss by batch_size, so our hyperparameters are "invariant" to batch_size. Some people divide the loss by (batch_size * num_time_steps), which plays down the errors made on short sentences. 

More subtly, our hyperparameters (applied to the former way) can't be used for the latter way. For example, if both approaches use SGD with a learning of 1.0, the latter approach effectively uses a much smaller learning rate of 1 / num_time_steps.

### Here, mask is a zero-one matrix of the same size as decoder_outputs. It masks padding positions outside of the target sequence lengths with values 0.

In [0]:
def loss_function(y_pred, y):
    # we are running decoder for each time step, Expected shape of y and y_pred
    #shape of y [batch_size, ty] --> [64, 29]
    #shape of y_pred [batch_size, output_vocab_size] --> [64,29,4951]
    sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
    #skip loss calculation for padding i.e. y = 0 index is reserved for padding
    # y is a tensor of batch_size,1 . Create a mask when y=0

    mask = tf.logical_not(tf.math.equal(y,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)
    loss = mask* loss
    loss = tf.reduce_mean(loss)
    return loss


### Define Model

In [0]:
#ENCODER
class EncoderNetwork(tf.keras.Model):
    def __init__(self,input_vocab_size,embedding_dims, rnn_units ):
        super().__init__()
        self.encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab_size, output_dim=embedding_dims)
        self.encoder_rnnlayer = tf.keras.layers.LSTM(rnn_units,return_sequences=True, return_state=True )
    
#DECODER
class DecoderNetwork(tf.keras.Model):
    def __init__(self,output_vocab_size, embedding_dims, rnn_units):
        super().__init__()
        self.decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab_size,output_dim=embedding_dims) 
        self.dense_layer = tf.keras.layers.Dense(output_vocab_size)
        self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
                # calling attention with hidden states
        self.attention_mechanism = self.build_attention_mechanism(dense_units,None,BATCH_SIZE*[Tx])
        self.rnn_cell =  self.build_rnn_cell(BATCH_SIZE)
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler= self.sampler,output_layer=self.dense_layer)

    def build_attention_mechanism(self, units,memory, memory_sequence_length):
        return tfa.seq2seq.LuongAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)
        #return tfa.seq2seq.BahdanauAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)

    # wrap decodernn cell  
    def build_rnn_cell(self, batch_size ):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism, attention_layer_size=dense_units)
        return rnn_cell
    
    def build_decoder_initial_state(self, batch_size, encoder_state,Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size, dtype = Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state) 
        return decoder_initial_state



encoderNetwork = EncoderNetwork(input_vocab_size,embedding_dims, rnn_units)
decoderNetwork = DecoderNetwork(output_vocab_size,embedding_dims, rnn_units)


In [23]:
decoderNetwork.attention_mechanism.memory_initialized

False

###Training using teacher forcing
Creates a callable TensorFlow graph from a Python function.

function constructs a callable that executes a TensorFlow graph (tf.Graph) created by tracing the TensorFlow operations in func.

This allows the TensorFlow runtime to apply optimizations and exploit parallelism in the computation defined by func.

### One step of training on a batch

In [0]:

def train_step(input_batch, output_batch,encoder_initial_cell_state):
    #initialize loss = 0
    loss = 0
    with tf.GradientTape() as tape:
        encoder_emb_inp = encoderNetwork.encoder_embedding(input_batch)
        a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp, initial_state =encoder_initial_cell_state)

        #pass [ last step activations , last memory_state ] as input to decoder for LSTM
        s_prev = [a_tx, c_tx]

        # Decoder Embeddings
        decoder_input = output_batch[:,:-1] # ignore <end>
        #compare logits with timestepped +1 version of decoder_input
        decoder_output = output_batch[:,1:] #ignore <start>
        decoder_emb_inp = decoderNetwork.decoder_embedding(decoder_input)
        decoderNetwork.attention_mechanism.setup_memory(a)
        decoder_initial_state = decoderNetwork.build_decoder_initial_state(BATCH_SIZE, encoder_state=s_prev,Dtype=tf.float32)
        
        #BasicDecoderOutput
         
        outputs, _, _ = decoderNetwork.decoder(decoder_emb_inp,initial_state=decoder_initial_state, sequence_length=BATCH_SIZE*[Ty-1])

        logits = outputs.rnn_output
        #Calculate loss
        # remove <start> token from output_Batch..run training exclude <t0>
        loss = loss_function(logits, decoder_output)

    #Returns the list of all layer variables / weights.
    variables = encoderNetwork.trainable_variables + decoderNetwork.trainable_variables  
    # differentiate loss wrt variables
    gradients = tape.gradient(loss, variables)

    #grads_and_vars – List of(gradient, variable) pairs.
    grads_and_vars = zip(gradients,variables)
    optimizer.apply_gradients(grads_and_vars)
    return loss

### Training
get existing checkpoint objects

Object based Checkpointing

In [25]:
# mount gdrive containing trained checkpoint objects
drive.mount('/content/drive', force_remount=True )

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [26]:

checkpointdir = os.path.join('/content/drive/My Drive/DL',"nmt_tfa_logs_eng_to_fra_withAttention")
chkpoint_prefix = os.path.join(checkpointdir, "chkpoint")
if not os.path.exists(checkpointdir):
    os.mkdir(checkpointdir)

checkpoint = tf.train.Checkpoint(optimizer = optimizer, encoderNetwork = encoderNetwork, 
                                 decoderNetwork = decoderNetwork)

try:
    status = checkpoint.restore(tf.train.latest_checkpoint(checkpointdir))
    print("Checkpoint found at {}".format(tf.train.latest_checkpoint(checkpointdir)))
except:
    print("No checkpoint found at {}".format(checkpointdir))

Checkpoint found at /content/drive/My Drive/DL/nmt_tfa_logs_eng_to_fra_withAttention/chkpoint-45


In [0]:
#decoder <start> sequence weight mean
start_index = Y_tokenizer.word_index['<start>']
start_index = tf.constant([start_index], dtype = tf.int32)
print(start_index)
start_index_emb = decoderNetwork.decoder_embedding(start_index)
print(start_index_emb.shape)
start_index_emb_avg = tf.reduce_sum(start_index_emb)
print(start_index_emb_avg.numpy()) 

tf.Tensor([1], shape=(1,), dtype=int32)
(1, 256)
1.2889861


In [0]:
#check that some variable weights exists if loading from checkpoint
print(len(decoderNetwork.variables))  #[output_vocab_size, embedding_dims]
print(len(encoderNetwork.variables))

8
4


In [0]:
epochs = 15
for i in range(1, epochs+1):

    encoder_initial_cell_state = initialize_initial_state()
    total_loss = 0.0


    for ( batch , (input_batch, output_batch)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
        total_loss += batch_loss
        if (batch+1)%20 == 0:
            print("total loss: {} epoch {} batch {} ".format(batch_loss.numpy(), i, batch+1))
            checkpoint.save(file_prefix = chkpoint_prefix)

total loss: 1.9033313989639282 epoch 1 batch 20 
total loss: 1.6031697988510132 epoch 1 batch 40 
total loss: 1.440329909324646 epoch 1 batch 60 
total loss: 1.2812906503677368 epoch 2 batch 20 
total loss: 1.2377431392669678 epoch 2 batch 40 
total loss: 1.3126517534255981 epoch 2 batch 60 
total loss: 1.1322230100631714 epoch 3 batch 20 
total loss: 1.2145603895187378 epoch 3 batch 40 
total loss: 1.2067127227783203 epoch 3 batch 60 
total loss: 1.0027304887771606 epoch 4 batch 20 
total loss: 0.9286714792251587 epoch 4 batch 40 
total loss: 0.9177215695381165 epoch 4 batch 60 
total loss: 0.876984179019928 epoch 5 batch 20 
total loss: 0.8240584135055542 epoch 5 batch 40 
total loss: 0.9926271438598633 epoch 5 batch 60 
total loss: 0.6743034720420837 epoch 6 batch 20 
total loss: 0.7864662408828735 epoch 6 batch 40 
total loss: 0.7916507720947266 epoch 6 batch 60 
total loss: 0.6255617737770081 epoch 7 batch 20 
total loss: 0.6889328360557556 epoch 7 batch 40 
total loss: 0.64866554

### Inference
Create input sequence to pass to encoder.

The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.

Stop predicting when the model predicts the end token.

And store the attention weights for every time step.

In [0]:

#if trained in same session else use checkpoint variable
#decoder_embedding_matrix = tf.train.load_variable(checkpointdir, 'decoderNetwork/decoder_embedding/embeddings/.ATTRIBUTES/VARIABLE_VALUE')
decoder_embedding_matrix = decoderNetwork.decoder_embedding.variables[0] 


print(decoderNetwork.decoder_embedding.variables[0].shape)

(3007, 256)


if restoring from checkpoint, lets print all variables related to decoder_embeddings and then select and load the right variable containing decoder embeddings

In [51]:
[print(var) for var in tf.train.list_variables(checkpointdir) if re.match(r'.*decoder_embedding.*',var[0])  ]

('decoderNetwork/decoder_embedding/embeddings/.ATTRIBUTES/VARIABLE_VALUE', [2374, 256])
('decoderNetwork/decoder_embedding/embeddings/.OPTIMIZER_SLOT/optimizer/m/.ATTRIBUTES/VARIABLE_VALUE', [2374, 256])
('decoderNetwork/decoder_embedding/embeddings/.OPTIMIZER_SLOT/optimizer/v/.ATTRIBUTES/VARIABLE_VALUE', [2374, 256])


[None, None, None]

In [52]:
decoder_embedding_matrix = tf.train.load_variable(checkpointdir, 'decoderNetwork/decoder_embedding/embeddings/.ATTRIBUTES/VARIABLE_VALUE')
print(decoder_embedding_matrix.shape)

(2374, 256)


In [107]:
#use with scope /cpu:0 for inferencing
#restore from latest checkpoint for inferencing
input_raw="Hi  \nHow are you today"
#input_raw="Wow!"  #checking translation on training set record
#def inference(input_raw):
input_lines = input_raw.split("\n")
# We have a transcript file containing English-Hindi pairs
# Preprocess X
input_lines = [preprocess_sentence(line) for line in input_lines]
input_sequences = [[X_tokenizer.word_index[w] for w in line.split(' ')] for line in input_lines]
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=Tx, padding='post')
inp = tf.convert_to_tensor(input_sequences)
#print(inp.shape)
inference_batch_size = input_sequences.shape[0]
encoder_initial_cell_state = [tf.zeros((inference_batch_size, rnn_units)), tf.zeros((inference_batch_size, rnn_units))]
encoder_emb_inp = encoderNetwork.encoder_embedding(inp)
a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp, initial_state =encoder_initial_cell_state)
#pass [ last step activations , encoder memory_state ] as input to decoder for LSTM
s_prev = [a_tx, c_tx]
#output_sequences = []
print('a_tx :',a_tx.shape)
print('c_tx :', c_tx.shape)
print("s_prev = [a_tx, c_tx] :",np.array(s_prev).shape)


start_tokens = tf.fill([inference_batch_size],Y_tokenizer.word_index['<start>'])
#print(start_tokens)
end_token = Y_tokenizer.word_index['<end>']

greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()
#finished, start_inputs = greedy_sampler.initialize(decoder_embedding_matrix,start_tokens, end_token)
#print(finished.shape, start_inputs.shape)

decoder_input = tf.expand_dims([Y_tokenizer.word_index['<start>']]* inference_batch_size,1)
decoder_emb_inp = decoderNetwork.decoder_embedding(decoder_input)

decoder_instance = tfa.seq2seq.BasicDecoder(cell = decoderNetwork.rnn_cell, sampler = greedy_sampler, output_layer=decoderNetwork.dense_layer)
decoderNetwork.attention_mechanism.setup_memory(a)
decoder_initial_state = decoderNetwork.build_decoder_initial_state(inference_batch_size, encoder_state=s_prev,Dtype=tf.float32)
print("\nCompared to simple encoder-decoder without attention, the decoder_initial_state \
 is an AttentionWrapperState object containing s_prev tensors and context and alignment vector \n ")
print("decoder initial state shape :",np.array(decoder_initial_state).shape)
print("decoder_initial_state tensor \n", decoder_initial_state)

# Since we do not know the target sequence lengths in advance, we use maximum_iterations to limit the translation lengths.
# One heuristic is to decode up to two times the source sentence lengths.
maximum_iterations = tf.round(tf.reduce_max(Tx) * 2)

#initialize inference decoder

(first_finished, first_inputs,first_state) = decoder_instance.initialize(decoder_embedding_matrix,
                             start_tokens = start_tokens, end_token=end_token, initial_state = decoder_initial_state)
#print( first_finished.shape)
print("\nfirst_inputs returns the same decoder_input i.e. embedding of  <start> :",first_inputs.shape)




inputs = first_inputs
state = first_state  
predictions = np.empty((inference_batch_size,0), dtype = np.int32)                                                                             
for j in range(maximum_iterations):
    outputs, next_state, next_inputs, finished = decoder_instance.step(j,inputs,state)
    inputs = next_inputs
    state = next_state
    outputs = np.expand_dims(outputs.sample_id,axis = -1)
    predictions = np.append(predictions, outputs, axis = -1)
                                                                               

a_tx : (2, 1024)
c_tx : (2, 1024)
s_prev = [a_tx, c_tx] : (2, 2, 1024)

Compared to simple encoder-decoder without attention, the decoder_initial_state  is an AttentionWrapperState object containing s_prev tensors and context and alignment vector 
 
decoder initial state shape : (6,)
decoder_initial_state tensor 
 AttentionWrapperState(cell_state=[<tf.Tensor: id=432949, shape=(2, 1024), dtype=float32, numpy=
array([[-0.50622696,  0.10476793,  0.15953295, ..., -0.03485877,
         0.38209227,  0.05028822],
       [ 0.05423798, -0.07571906, -0.02596835, ...,  0.03457706,
         0.02335815, -0.01079138]], dtype=float32)>, <tf.Tensor: id=432946, shape=(2, 1024), dtype=float32, numpy=
array([[-0.74728763,  0.19526252,  0.28277415, ..., -0.10372411,
         0.5871746 ,  0.10757777],
       [ 0.22841427, -0.1665909 , -0.06484494, ...,  0.08734686,
         0.04334781, -0.02505926]], dtype=float32)>], attention=<tf.Tensor: id=449400, shape=(2, 1024), dtype=float32, numpy=
array([[0., 0., 0

In [100]:

print("English Sentence:")
print(input_raw)
print("\nFrench Translation:")
for i in range(len(predictions)):
    line = predictions[i,:]
    seq = list(itertools.takewhile( lambda index: index !=2, line))
    print(" ".join( [Y_tokenizer.index_word[w] for w in seq]))

English Sentence:
Hi  
How are you today

French Translation:
salut !
comment allez vous ?


### Evaluate Loss

In [0]:
def eval_step(input_batch, output_batch,encoder_initial_cell_state, BATCH_SIZE):
    #initialize loss = 0
    loss = 0

    # we can do initialization in outer block
    #encoder_initial_cell_state = encoder.initialize_initial_state()
    encoder_emb_inp = encoderNetwork.encoder_embedding(input_batch)
    a, h_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp, initial_state =encoder_initial_cell_state)

    #pass [ last step activations , encoder memory_state ] as input to decoder for LSTM
    s_prev = [h_tx, c_tx]

    decoder_input = output_batch[:,:-1] # ignore <end>
    #compare logits with timestepped +1 version of decoder_input
    decoder_output = output_batch[:,1:] #ignore <start>
    decoder_emb_inp = decoderNetwork.decoder_embedding(decoder_input)
    decoder_instance = tfa.seq2seq.BasicDecoder(decoderNetwork.rnn_cell, greedy_sampler, decoderNetwork.dense_layer)
    #BasicDecoderOutput
    # Create a sequence_length vector of 
    decoderNetwork.attention_mechanism.setup_memory(a)
    decoder_initial_state = decoderNetwork.build_decoder_initial_state(BATCH_SIZE, encoder_state=s_prev,Dtype=tf.float32)
    outputs, _, _ = decoderNetwork.decoder(decoder_emb_inp,initial_state=decoder_initial_state, sequence_length=BATCH_SIZE*[Ty-1])
    logits = outputs.rnn_output
    sample_id = outputs.sample_id
    #Calculate loss
    loss = loss_function(logits, decoder_output)
    return loss, sample_id

### Evaluation Loss on Entire Test Set

In [106]:
dataset_test = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(len(X_test))
for (input_batch, output_batch) in dataset_test.take(-1):
    batch_size = len(input_batch)
    print(input_batch.shape)
    encoder_initial_cell_state = [tf.zeros((batch_size, rnn_units)), tf.zeros((batch_size, rnn_units))]
    loss,_ = eval_step(input_batch, output_batch, encoder_initial_cell_state, batch_size)
    loss = tf.reduce_mean(loss)
    print("Training loss {}".format(loss) )

(1000, 7)
Training loss 0.33843979239463806
