In [2]:
!nvidia-smi

Tue Mar 21 15:11:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| 30%   27C    P5    N/A /  75W |    159MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from matplotlib import pylab
import matplotlib
import matplotlib.gridspec as gridspec
%matplotlib inline
from nltk.translate.bleu_score import corpus_bleu
from sklearn.utils import shuffle
import word2vec
import nltk

2023-03-24 14:17:22.864045: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-24 14:17:23.685876: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-24 14:17:25.089260: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-24 14:17:25.089361: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
# Turn off TensorFlow's deprecation-warning printing flag to keep cell output readable.
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
# The model below is built as a static graph with tf.compat.v1 placeholders,
# so eager execution has to be disabled before any graph construction.
tf.compat.v1.disable_eager_execution()

## Dataset

[Download](https://nlp.stanford.edu/projects/nmt/):

* English vocabulary: [`vocab.50K.en`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/vocab.50K.en)

### Loading the Datasets and Building the Vocabulary

First, we build the vocabulary dictionary. Questions and answers are both in English, so the source and target sides share a single vocabulary.
The vocabulary is read from the file `vocab.50K.en` (English).

In [3]:
# Word string -> ID mapping. IDs are assigned in file order, one word per line.
dictionary = dict()

with open('data/vocab.50K.en', encoding='utf-8') as f:
    for line in f:
        # Strip only the line terminator. Slicing with [:-1] would also chop the
        # last character of the final word if the file has no trailing newline.
        dictionary[line.rstrip('\n')] = len(dictionary)

vocabulary_size = len(dictionary)
# ID -> word string mapping (inverse of `dictionary`).
reverse_dictionary = {word_id: word for word, word_id in dictionary.items()}

print('Dictionary:', list(dictionary.items())[:10], end = '\n')
print('Reverse dictionary:', list(reverse_dictionary.items())[:10], end = '\n')
print('Vocabulary size: ', vocabulary_size, end = '\n')


Dictionary: [('<unk>', 0), ('<s>', 1), ('</s>', 2), ('the', 3), (',', 4), ('.', 5), ('of', 6), ('and', 7), ('to', 8), ('in', 9)]
Reverse dictionary: [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, 'the'), (4, ','), (5, '.'), (6, 'of'), (7, 'and'), (8, 'to'), (9, 'in')]
Vocabulary size:  50000


### Loading data
Here we load the data from the dataset.csv file (generated in the other script)

In [4]:
# Load the question/answer pairs; later cells read the `question` and `answer` columns.
dataset = pd.read_csv('data/dataset.csv')

### Data pre-processing
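Lowercase the text, put a space before commas, periods and question marks so that they become separate tokens, remove parentheses and double quotes, replace newlines with spaces, and split each sentence on whitespace. Finally, shuffle the rows.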

In [5]:
# The tokenizer only splits on whitespace, so punctuation that should become
# its own token is first separated from the preceding word with a space.
wt = nltk.tokenize.WhitespaceTokenizer()

# Fixed seed so the shuffled row order is identical on every
# "Restart Kernel -> Run All" (the original shuffle was unseeded).
SHUFFLE_SEED = 42

for column in dataset.columns:
    # Every replacement is a literal (non-regex) substitution; regex=False is
    # spelled out on each call so '.', '?', '(' and ')' are never read as
    # regular-expression metacharacters, whatever the pandas default is.
    dataset[column] = (
        dataset[column]
        .str.lower()
        .str.replace(',', ' ,', regex=False)
        .str.replace('.', ' .', regex=False)
        .str.replace('?', ' ?', regex=False)
        .str.replace(')', '', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace('"', '', regex=False)
        .str.replace('\n', ' ', regex=False)
        .apply(wt.tokenize)
    )
dataset = shuffle(dataset, random_state=SHUFFLE_SEED)

In [6]:
dataset.head()

Unnamed: 0,question,answer
1118773,"[what, are, we, in, the, golden, age, of, righ...","[superhero, movies]"
1143635,"[what, sequel, was, better, than, the, origina...","[blade, runner, 2049, ., also, ,, no, one, men..."
238146,"[how, can, i, get, a, hawk, out, of, our, wear...","[further, tips, here, ,, perhaps, ., http://ww..."
1013252,"[a, goblin, king, has, cordially, invited, you...","[the, number, of, you, going, on, about, harmi..."
972653,"[what, should, you, not, say, in, a, pitch, bl...","[so, ,, has, anyone, else, here, tried, anal, ..."


### Data analysis
Mean sentence length and standard deviation of sentence length

In [10]:
# Each cell holds a list of tokens, so `.str.len()` gives the token count per row.
question_lengths = dataset['question'].str.len()
answer_lengths = dataset['answer'].str.len()

print('(Questions) Average sentence length: ', question_lengths.mean())
print('(Questions) Standard deviation of sentence length: ', question_lengths.std())

print('(Answers) Average sentence length: ', answer_lengths.mean())
print('(Answers) Standard deviation of sentence length: ', answer_lengths.std())

(Questions) Average sentence length:  17.101486059545056
(Questions) Standard deviation of sentence length:  9.122891352194081
(Answers) Average sentence length:  54.367627238247216
(Answers) Standard deviation of sentence length:  843.0636308326157


### Update the sentences to fixed length
Pad or truncate every sentence to a fixed length so that the sentences can be processed in batches.

In [7]:
# Fixed number of tokens (including <s> and </s>) per sentence, by column.
max_sent_length = {'question' : 30, 'answer': 70}

def padding_sent(source, data=None):
    """Return every sentence of one column as a fixed-length token list.

    Each sentence is prefixed with '<s>'. Sentences that reach the limit are
    truncated so that the last token is '</s>'; shorter sentences are
    right-padded with '</s>' until they have exactly
    ``max_sent_length[source]`` tokens.

    The input token lists are copied, never modified, so calling this function
    again does not add a second '<s>' to the stored sentences.

    Args:
        source: column name, a key of ``max_sent_length``
            ('question' or 'answer').
        data: mapping/DataFrame indexed by ``source`` that yields token lists.
            Defaults to the notebook-level ``dataset``.

    Returns:
        A list of token lists, each of length ``max_sent_length[source]``.
    """
    if data is None:
        data = dataset
    limit = max_sent_length[source]

    padded = []
    for original_tokens in data[source]:
        # Build a new list instead of inserting into the caller's list.
        tokens = ['<s>'] + list(original_tokens)

        if len(tokens) >= limit:
            # Keep limit - 1 tokens and close the sentence with the end marker.
            tokens = tokens[:limit - 1]
            tokens.append('</s>')
        else:
            tokens.extend(['</s>'] * (limit - len(tokens)))

        padded.append(tokens)
    return padded

In [8]:
# Fixed-length token lists: 30 tokens per question and 70 per answer (see max_sent_length).
questions = padding_sent('question')
answers = padding_sent('answer')

### Create the reverse dataset

In [9]:
def create_reverse_dataset(source):
    """Map tokenized sentences to lists of vocabulary IDs.

    Args:
        source: iterable of token lists.

    Returns:
        A list with one list of integer IDs per sentence. Tokens that are not
        keys of ``dictionary`` are replaced by the ID of '<unk>'.
    """
    return [
        [dictionary[token] if token in dictionary else dictionary['<unk>']
         for token in sentence]
        for sentence in source
    ]

# Integer ID arrays, one row per sentence. Rows all have the same length because
# padding_sent produced fixed-length sentences (30 for questions, 70 for answers).
train_inputs =  np.array(create_reverse_dataset(questions), dtype=np.int32)
train_outputs =  np.array(create_reverse_dataset(answers), dtype=np.int32)

### Word Embedding

In [10]:
# One zero-initialised cursor per training sentence.
sentence_cursors = [0 for _ in range(train_inputs.shape[0])]
# Batch size for the word2vec step only; NOTE: `batch_size` is reassigned to 10
# in the model hyperparameter cell further down.
batch_size = 32
# Dimensionality of each word vector.
embedding_size = 64
# Number of word2vec training steps passed to word2vec.run_word2vec.
steps = 80000

In [14]:
# Hand the corpus, vocabulary and sizes to the project-local `word2vec` module.
# Argument order: number of sentences, question length, answer length,
# word->ID map, ID->word map, question IDs, answer IDs, embedding size, vocabulary size.
word2vec.define_data_and_hyperparameters(
        train_inputs.shape[0], 
        max_sent_length['question'], 
        max_sent_length['answer'], 
        dictionary, 
        reverse_dictionary,  
        train_inputs, 
        train_outputs, 
        embedding_size,
        vocabulary_size)

# NOTE(review): presumably run_word2vec writes the `embeddings.npy` file that
# later cells load with np.load — confirm in the word2vec module.
word2vec.print_some_batches()
word2vec.define_word2vec_tensorflow(batch_size)
word2vec.run_word2vec(batch_size, steps)



with window_size = 2:
    batch: [['<s>', 'former', 'of', '<unk>'], ['<s>', '<unk>', 'favourite', 'light'], ['<s>', 'postal', 'of', '<unk>'], ['<s>', 'as', 'kid', ','], ['<s>', 'you', 'throw', '10000'], ['<s>', 'how', 'you', 'feel'], ['<s>', 'where', 'you', 'put'], ['<s>', 'when', 'the', 'absolute']]
    labels: ['smokers', 'your', 'workers', 'a', 'can', 'would', 'do', 'was']
Defining 4 embedding lookups representing each word in the context
Stacked embedding size: [32, 64, 4]
Reduced mean embedding size: [32, 64]


2023-03-20 14:31:47.258978: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-20 14:31:47.306573: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-20 14:31:47.369285: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-20 14:31:47.369851: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

Initialized
Average loss at step 2000: 2.779788
Average loss at step 4000: 1.444825
Average loss at step 6000: 1.281724
Average loss at step 8000: 1.185413
Average loss at step 10000: 1.129574
Nearest to should: did, constituent, would, husbands, Derrida, does, well, greetings,
Nearest to -: ,, women, instead, fellow, cooled, cops, housed, Bible,
Nearest to also: parents, dopo, deceiving, CAM, Elche, S.p.A., walked, leases,
Nearest to The: Alejandro, presses, pioneered, Absolute, Berlaymont, devoid, Excellent, multiple,
Nearest to these: 1821, sparking, satisfying, MacDonald, www.avaaz.org, Jo, collateral, Rhapsody,
Nearest to us: <unk>, worst, for, indemnify, longest, apple, alright, ridiculous,
Nearest to out: sleep, indelible, prayer, fell, live, all, vs, meu,
Nearest to or: Poos, biocidal, parody, Multitude, object, decides, Immanuel, Link,
Nearest to by: Advocates, GfK, Sealed, ERC, 3.7, jour, 6.30, tabling,
Nearest to as: video, movie, person, Procchio, girl, scale, song, weird,


Average loss at step 52000: 1.066826
Average loss at step 54000: 1.067815
Average loss at step 56000: 1.074104
Average loss at step 58000: 1.076034
Average loss at step 60000: 1.071978
Nearest to should: can, would, did, could, will, does, detailled, Vejer,
Nearest to -: ,, :, finances, nightclub, coaches, Maritim, hillside, 80th,
Nearest to also: sirens, oral, involved, arising, commentaries, clic, visited, dopo,
Nearest to The: Alejandro, presses, pioneered, colonized, Absolute, devoid, Berlaymont, Excellent,
Nearest to these: Doctor, F1, Along, 0044, contravention, 50, Ideally, Linguistic,
Nearest to us: usa, insbesondere, handicraft, emotional, jealousy, tumours, Friesland, indemnify,
Nearest to out: Documentation, 8, soir, Exif, Byzantine, indelible, fell, tempted,
Nearest to or: Poos, Valencia, DVI, impatience, Multitude, Payments, Nueva, zero,
Nearest to by: Advocates, on, GfK, dismayed, predefined, data, realistic, daran,
Nearest to as: Orion, Eugen, rebound, Teheran, CHILD, bi

In [11]:
class DataGenerator(object):
    """Produce unrolled batches of word embeddings and one-hot next-word labels.

    For each sentence in a batch the generator keeps a cursor. Every call to
    ``next_batch`` emits the embedding of the word under the cursor as data and
    the one-hot encoding of the following word as label, then advances the
    cursor (wrapping before the last position).

    Relies on notebook-level names: ``max_sent_length``, ``embedding_size``,
    ``vocabulary_size``, ``train_inputs`` / ``train_outputs`` and, when
    ``is_train`` is False, ``test_inputs`` / ``test_outputs``.
    """

    def __init__(self, batch_size, num_unroll, is_input, is_train):
        """Store the configuration and load the embedding matrix.

        Args:
            batch_size: number of sentences handled in parallel.
            num_unroll: number of consecutive time steps returned by
                ``unroll_batches``.
            is_input: True to read questions (encoder side), False to read
                answers (decoder side).
            is_train: True to read the training arrays, False to read the
                test arrays.
        """
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        self._cursor = [0] * self._batch_size
        # Embedding matrix indexed by word ID, read from disk on construction.
        self._word_embeddings = np.load('embeddings.npy')
        self._sent_ids = None
        self._is_input = is_input
        self._is_train = is_train

    def next_batch(self, sent_ids):
        """Return one time step of data and labels for the given sentences.

        Args:
            sent_ids: sequence with at least ``batch_size`` sentence indices.

        Returns:
            (batch_data, batch_labels): float32 arrays of shape
            (batch_size, embedding_size) and (batch_size, vocabulary_size).
        """
        sent_length = max_sent_length['question'] if self._is_input else max_sent_length['answer']

        batch_data = np.zeros((self._batch_size, embedding_size), dtype=np.float32)
        batch_labels = np.zeros((self._batch_size, vocabulary_size), dtype=np.float32)

        for batch in range(self._batch_size):
            sent_id = sent_ids[batch]

            # Choose the split with `is_train`. The previous code tested
            # `is_input` here, so the train/test flag was silently ignored and
            # the training arrays were always used.
            if self._is_input:
                sent_text = train_inputs[sent_id] if self._is_train else test_inputs[sent_id]
            else:
                sent_text = train_outputs[sent_id] if self._is_train else test_outputs[sent_id]

            position = self._cursor[batch]
            batch_data[batch] = self._word_embeddings[sent_text[position], :]
            # The label row is already all zeros; mark the next word only.
            batch_labels[batch, sent_text[position + 1]] = 1.0

            # Wrap at sent_length - 1 so that position + 1 is always a valid index.
            self._cursor[batch] = (position + 1) % (sent_length - 1)

        return batch_data,batch_labels

    def unroll_batches(self,sent_ids):
        """Return ``num_unroll`` consecutive time steps.

        Args:
            sent_ids: sentence indices for a new batch, or None to continue
                with the sentences (and cursors) of the previous call.

        Returns:
            (unroll_data, unroll_labels, sent_ids): two lists with one
            ``next_batch`` result per time step, and the sentence indices used.
        """
        if sent_ids is not None:
            # New sentences: remember them and restart from the first word.
            self._sent_ids = sent_ids
            self._cursor = [0] * self._batch_size
        unroll_data, unroll_labels = [],[]

        for _ in range(self._num_unroll):
            data, labels = self.next_batch(self._sent_ids)
            unroll_data.append(data)
            unroll_labels.append(labels)
        return unroll_data, unroll_labels, self._sent_ids

    def reset_indices(self):
        """Move every cursor back to the first word."""
        self._cursor = [0] * self._batch_size

# Sanity check: decode the label words produced for the first five sentences.
# Encoder side (questions), 20 unrolled steps.
dg = DataGenerator(batch_size=5, num_unroll=20, is_input=True, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('Input data')
for step_labels in u_labels:
    word_ids = np.argmax(step_labels, axis=1).tolist()
    print([reverse_dictionary[word_id] for word_id in word_ids])

# Decoder side (answers), 30 unrolled steps.
dg = DataGenerator(batch_size=5, num_unroll=30, is_input=False, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('\nOutput data batch')
for step_labels in u_labels:
    word_ids = np.argmax(step_labels, axis=1).tolist()
    print([reverse_dictionary[word_id] for word_id in word_ids])

Input data
['what', 'what', 'how', 'a', 'what']
['are', 'sequel', 'can', '<unk>', 'should']
['we', 'was', 'i', 'king', 'you']
['in', 'better', 'get', 'has', 'not']
['the', 'than', 'a', 'cordially', 'say']
['golden', 'the', 'hawk', 'invited', 'in']
['age', 'original', 'out', 'you', 'a']
['of', '?', 'of', 'to', 'pitch']
['right', '</s>', 'our', 'attend', 'black']
['now', '</s>', '<unk>', 'a', 'room']
['?', '</s>', '?', '<unk>', 'with']
['</s>', '</s>', '</s>', 'party', '10']
['</s>', '</s>', '</s>', '.', 'people']
['</s>', '</s>', '</s>', 'what', '?']
['</s>', '</s>', '</s>', 'do', '</s>']
['</s>', '</s>', '</s>', 'you', '</s>']
['</s>', '</s>', '</s>', 'do', '</s>']
['</s>', '</s>', '</s>', 'to', '</s>']
['</s>', '</s>', '</s>', 'prepare', '</s>']
['</s>', '</s>', '</s>', 'for', '</s>']

Output data batch
['superhero', 'blade', 'further', 'the', 'so']
['movies', 'runner', 'tips', 'number', ',']
['</s>', '<unk>', 'here', 'of', 'has']
['</s>', '.', ',', 'you', 'anyone']
['</s>', 'also', '

## Building the Model with TensorFlow

Define the hyperparameters, the input/output placeholders, the LSTM/Output layer parameters, the LSTM/output calculations, and finally the optimization steps.

### Hyperparameters


In [27]:
# Embedding matrix read from disk; its second dimension is the size of every LSTM input vector.
emb_mat = np.load('embeddings.npy')
input_size = emb_mat.shape[1]

# Width of the LSTM gate matrices and of the output/state vectors defined below.
num_nodes = 128
# NOTE: this overrides the batch_size = 32 set in the word-embedding section.
batch_size = 10

# Number of unrolled time steps (one placeholder each) for the encoder and the decoder.
encoder_num_unrollings = 20
decoder_num_unrollings = 30

### Input / Output Placeholders

In [28]:
# Start from an empty default graph before defining any op.
tf.compat.v1.reset_default_graph()

# Embedding matrix as a graph tensor, used for the lookups at test time.
word_embeddings = tf.convert_to_tensor(value=emb_mat, name='embeddings')

print('Defining Encoder Data Placeholders')
# One [batch_size, input_size] placeholder per unrolled encoder step.
encoder_train_inputs = [
    tf.compat.v1.placeholder(tf.float32, shape=[batch_size, input_size], name='train_inputs_%d' % step)
    for step in range(encoder_num_unrollings)
]

print('Defining Decoder Data Placeholders')

decoder_train_inputs, decoder_train_labels, decoder_train_masks = [], [], []

# Per decoder step: input embeddings, one-hot labels and a per-sentence loss mask.
for step in range(decoder_num_unrollings):
    decoder_train_inputs.append(
        tf.compat.v1.placeholder(tf.float32, shape=[batch_size, input_size], name='decoder_train_inputs_%d' % step))
    decoder_train_labels.append(
        tf.compat.v1.placeholder(tf.float32, shape=[batch_size, vocabulary_size], name='decoder_train_labels_%d' % step))
    decoder_train_masks.append(
        tf.compat.v1.placeholder(tf.float32, shape=[batch_size, 1], name='decoder_train_masks_%d' % step))


# Test-time encoder inputs, plus the embedding of '<s>' as the first decoder input.
encoder_test_input = [
    tf.compat.v1.placeholder(tf.float32, shape=[batch_size, input_size], name='test_input_%d' % step)
    for step in range(encoder_num_unrollings)
]
decoder_test_input = tf.nn.embedding_lookup(params=word_embeddings, ids=[dictionary['<s>']])

Defining Encoder Data Placeholders
Defining Decoder Data Placeholders


### Defining the Encoder Model

In [29]:

def _encoder_weight(name, rows):
    """Create a [rows, num_nodes] weight via get_variable with a fresh VarianceScaling initializer."""
    return tf.compat.v1.get_variable(
        name,
        shape=[rows, num_nodes],
        initializer=tf.compat.v1.keras.initializers.VarianceScaling(
            scale=1.0, mode="fan_avg", distribution="uniform"))


def _encoder_bias(name):
    """Create a [1, num_nodes] bias drawn uniformly between -0.05 and 0.05."""
    return tf.Variable(tf.random.uniform([1, num_nodes], -0.05, 0.05), name=name)


def _encoder_saved(name):
    """Create a non-trainable [batch_size, num_nodes] variable initialised to zeros."""
    return tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False, name=name)


with tf.compat.v1.variable_scope('Encoder'):

    # Input gate: weights for the current input (x), the previous output (m), and the bias (b)
    encoder_input_gate_x = _encoder_weight('input_gate_x', input_size)
    encoder_input_gate_m = _encoder_weight('input_gate_m', num_nodes)
    encoder_input_gate_b = _encoder_bias('input_gate_b')

    # Forget gate
    encoder_forget_gate_x = _encoder_weight('forget_gate_x', input_size)
    encoder_forget_gate_m = _encoder_weight('forget_gate_m', num_nodes)
    encoder_forget_gate_b = _encoder_bias('forget_gate_b')

    # Candidate value (c~_t)
    encoder_candidate_value_x = _encoder_weight('candidate_value_x', input_size)
    encoder_candidate_value_m = _encoder_weight('candidate_value_m', num_nodes)
    encoder_candidate_value_b = _encoder_bias('candidate_value_b')

    # Output gate
    encoder_output_gate_x = _encoder_weight('output_gate_x', input_size)
    encoder_output_gate_m = _encoder_weight('output_gate_m', num_nodes)
    encoder_output_gate_b = _encoder_bias('output_gate_b')

    # Variables that hold the last output and cell state (training)
    saved_output = _encoder_saved('train_output')
    saved_state = _encoder_saved('train_cell')

    # Variables that hold the last output and cell state (testing)
    saved_test_output = _encoder_saved('test_output')
    saved_test_state = _encoder_saved('test_cell')

print('Encoder Model defined')

Encoder Model defined


### Defining the Decoder Model

In [30]:
def _decoder_weight(name, shape):
    """Create a weight of the given shape via get_variable with a fresh VarianceScaling initializer."""
    return tf.compat.v1.get_variable(
        name,
        shape=shape,
        initializer=tf.compat.v1.keras.initializers.VarianceScaling(
            scale=1.0, mode="fan_avg", distribution="uniform"))


def _decoder_bias(name, shape):
    """Create a bias of the given shape drawn uniformly between -0.05 and 0.05."""
    return tf.Variable(tf.random.uniform(shape, -0.05, 0.05), name=name)


with tf.compat.v1.variable_scope('Decoder'):

    # Input gate: weights for the current input (x), the previous output (m), and the bias (b)
    decoder_input_gate_x = _decoder_weight('input_gate_x', [input_size, num_nodes])
    decoder_input_gate_m = _decoder_weight('input_gate_m', [num_nodes, num_nodes])
    decoder_input_gate_b = _decoder_bias('input_gate_b', [1, num_nodes])

    # Forget gate
    decoder_forget_gate_x = _decoder_weight('forget_gate_x', [input_size, num_nodes])
    decoder_forget_gate_m = _decoder_weight('forget_gate_m', [num_nodes, num_nodes])
    decoder_forget_gate_b = _decoder_bias('forget_gate_b', [1, num_nodes])

    # Candidate value (c~_t)
    decoder_candidate_value_x = _decoder_weight('candidate_value_x', [input_size, num_nodes])
    decoder_candidate_value_m = _decoder_weight('candidate_value_m', [num_nodes, num_nodes])
    decoder_candidate_value_b = _decoder_bias('candidate_value_b', [1, num_nodes])

    # Output gate
    decoder_output_gate_x = _decoder_weight('output_gate_x', [input_size, num_nodes])
    decoder_output_gate_m = _decoder_weight('output_gate_m', [num_nodes, num_nodes])
    decoder_output_gate_b = _decoder_bias('output_gate_b', [1, num_nodes])

    # Softmax classifier: projects the decoder output onto the vocabulary.
    w = _decoder_weight('softmax_weights', [num_nodes, vocabulary_size])
    # The bias range was (-0.05, -0.05), i.e. minval == maxval, which made every
    # entry the constant -0.05. It now uses the same (-0.05, 0.05) range as the
    # gate biases.
    b = _decoder_bias('softmax_bias', [vocabulary_size])

print('Decoder Model defined')

Decoder Model defined


### Defining LSTM cell


In [31]:
# Encoder LSTM cell
def encoder_lstm_cell(_input, _output, _state):
    """Run one LSTM time step with the encoder parameters.

    Args:
        _input: input tensor for this step.
        _output: output (hidden) tensor of the previous step.
        _state: cell-state tensor of the previous step.

    Returns:
        (new_output, new_state) tensors for this step.
    """
    def affine(weights_x, weights_m, bias):
        # Shared pre-activation: input projection + recurrent projection + bias.
        return tf.matmul(_input, weights_x) + tf.matmul(_output, weights_m) + bias

    i_gate = tf.sigmoid(affine(encoder_input_gate_x, encoder_input_gate_m, encoder_input_gate_b))
    f_gate = tf.sigmoid(affine(encoder_forget_gate_x, encoder_forget_gate_m, encoder_forget_gate_b))
    candidate = affine(encoder_candidate_value_x, encoder_candidate_value_m, encoder_candidate_value_b)
    # Keep part of the old state and add the gated candidate value.
    new_state = f_gate * _state + i_gate * tf.tanh(candidate)
    o_gate = tf.sigmoid(affine(encoder_output_gate_x, encoder_output_gate_m, encoder_output_gate_b))
    new_output = o_gate * tf.tanh(new_state)
    return new_output, new_state

# Decoder LSTM cell
def decoder_lstm_cell(_input, _output, _state):
    """Run one LSTM time step with the decoder parameters.

    Args:
        _input: input tensor for this step.
        _output: output (hidden) tensor of the previous step.
        _state: cell-state tensor of the previous step.

    Returns:
        (new_output, new_state) tensors for this step.
    """
    def affine(weights_x, weights_m, bias):
        # Shared pre-activation: input projection + recurrent projection + bias.
        return tf.matmul(_input, weights_x) + tf.matmul(_output, weights_m) + bias

    i_gate = tf.sigmoid(affine(decoder_input_gate_x, decoder_input_gate_m, decoder_input_gate_b))
    f_gate = tf.sigmoid(affine(decoder_forget_gate_x, decoder_forget_gate_m, decoder_forget_gate_b))
    candidate = affine(decoder_candidate_value_x, decoder_candidate_value_m, decoder_candidate_value_b)
    # Keep part of the old state and add the gated candidate value.
    new_state = f_gate * _state + i_gate * tf.tanh(candidate)
    o_gate = tf.sigmoid(affine(decoder_output_gate_x, decoder_output_gate_m, decoder_output_gate_b))
    new_output = o_gate * tf.tanh(new_state)
    return new_output, new_state

In [33]:

#=========================== TRAIN =================================

# Per-step decoder outputs, collected in time order.
outputs = list()
# The encoder starts from the saved (non-trainable) output/state variables.
output = saved_output
state = saved_state

# Calculate the output and state of the encoder
for _input in encoder_train_inputs:
    output, state = encoder_lstm_cell(_input, output, state)

# Calculate the output and state of the decoder
# The decoder continues from the encoder's final output/state. The control
# dependency makes the decoder ops run only after the encoder's final values
# have been written to saved_output / saved_state.
with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
    for _input in decoder_train_inputs:
        output, state = decoder_lstm_cell(_input, output, state)
        outputs.append(output)

# Calculate the decoder logits for all unrolled steps
# Concatenating on axis 0 stacks the steps one after another, so rows are
# ordered step-major: all batch rows of step 0, then all rows of step 1, ...
logits = tf.matmul(tf.concat(axis=0, values=outputs), w) + b

# Decoder predictions
train_prediction = tf.nn.softmax(logits)


#=========================== TEST =================================

test_output  = saved_test_output
test_state = saved_test_state
# One tensor of predicted word IDs per decoder step.
test_predictions = []

# Encode the test question with the same encoder cell used for training.
for _input in encoder_test_input:
    test_output, test_state = encoder_lstm_cell(_input, test_output,test_state)

# Calculate the decoder output
# Greedy decoding: the most likely word of each step (argmax) is looked up in
# the embedding matrix and fed back as the next decoder input.
# NOTE: `decoder_test_input` is reassigned inside the loop, so re-running this
# cell without re-running the placeholder cell starts from the last step's
# tensor instead of the '<s>' embedding.
with tf.control_dependencies([saved_test_output.assign(test_output), saved_test_state.assign(test_state)]):
    for i in range(decoder_num_unrollings):

        test_output, test_state = decoder_lstm_cell(decoder_test_input, test_output, test_state)
        test_prediction = tf.nn.softmax(tf.compat.v1.nn.xw_plus_b(test_output, w, b))
        decoder_test_input = tf.nn.embedding_lookup(params=word_embeddings,ids=tf.argmax(input=test_prediction,axis=1))
        test_predictions.append(tf.argmax(input=test_prediction,axis=1))


### Calculating the Loss

The loss is the masked cross-entropy, averaged over all decoder time steps and over the batch axis.

In [37]:
# Flatten the masks to one value per (step, sentence) position. The mask
# placeholders have shape [batch_size, 1], so their concatenation is [N, 1],
# while the cross-entropy below has shape [N]. Multiplying [N, 1] by [N]
# broadcasts to an [N, N] matrix, in which every mask entry scales every
# position's loss instead of only its own. Reshaping to [N] makes the product
# element-wise.
loss_mask = tf.reshape(tf.concat(axis=0, values=decoder_train_masks), [-1])

# Cross-entropy of each decoder position against its one-hot label.
loss_per_position = tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=tf.concat(axis=0, values=decoder_train_labels))

# Masked positions contribute zero; the mean is taken over all positions.
loss_batch = loss_mask * loss_per_position
loss = tf.reduce_mean(input_tensor=loss_batch)

### Optimizer

There are two optimizers used here: Adam and SGD. 
Using Adam just causes the model to exhibit some undesirable behavior in the long run. 
So Adam is used to get a good initial guess for the SGD and use the SGD from that point on.

In [38]:
# These are used to slow down the learning rate over time
global_step = tf.Variable(0, trainable=False)
inc_gstep = tf.compat.v1.assign(global_step,global_step + 1)

# Using two optimizers, when optimizer changes we reset global step
reset_gstep = tf.compat.v1.assign(global_step,0)

# Calculated decaying learning rate: starts at 0.005, is multiplied by 0.95
# for every increment of global_step, and is never allowed below 0.00001.
learning_rate = tf.maximum(
    tf.compat.v1.train.exponential_decay(0.005, global_step, decay_steps=1, decay_rate=0.95, staircase=True), 0.00001)

# The SGD schedule currently uses the same constants as the Adam schedule.
sgd_learning_rate = tf.maximum(
    tf.compat.v1.train.exponential_decay(0.005, global_step, decay_steps=1, decay_rate=0.95, staircase=True), 0.00001)

with tf.compat.v1.variable_scope('Adam'):
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
with tf.compat.v1.variable_scope('SGD'):
    sgd_optimizer = tf.compat.v1.train.GradientDescentOptimizer(sgd_learning_rate)

# Adam update with the gradients clipped to a global norm of 5.0
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimize = optimizer.apply_gradients(zip(gradients, v))

# SGD update with the same clipping. The update must be applied by
# `sgd_optimizer`; the previous code called `optimizer.apply_gradients` here,
# so the "SGD" step was in fact a second Adam step.
sgd_gradients, v = zip(*sgd_optimizer.compute_gradients(loss))
sgd_gradients, _ = tf.clip_by_global_norm(sgd_gradients, 5.0)
sgd_optimize = sgd_optimizer.apply_gradients(zip(sgd_gradients, v))

# Making sure there are fluid gradients from decoder to encoder
for (g_i,v_i) in zip(gradients,v):
    assert g_i is not None, 'Gradient none for %s'%(v_i.name)

### Resetting the Training and Testing States

Define the state reset functions

In [39]:
def _assign_zeros(variable):
    """Return an op that overwrites `variable` with a [batch_size, num_nodes] tensor of zeros."""
    return variable.assign(tf.zeros([batch_size, num_nodes]))

# Reset training state
reset_train_state = tf.group(_assign_zeros(saved_output), _assign_zeros(saved_state))

# Reset testing state
reset_test_state = tf.group(_assign_zeros(saved_test_output), _assign_zeros(saved_test_state))

## Running the Neural Network

With all the TensorFlow operations defined, we now define the helper functions used while running the model, and then run the model itself.

### Evaluate and Print Results

Two functions are defined to print and save the prediction results for the training data and for the test data; a third function builds the candidate and reference lists used to calculate the BLEU score.

In [40]:
def print_and_save_train_predictions(du_labels, tr_pred, rand_idx, train_prediction_text_fname):
    """Print the label words and the predicted words of one training sentence.

    Args:
        du_labels: list of per-step one-hot label arrays; they are concatenated
            on axis 0 before the sentence's rows are selected.
        tr_pred: array of per-position prediction scores, with rows laid out
            like the concatenated labels.
        rand_idx: position of the sentence inside the batch; its rows are
            taken with the slice ``[rand_idx::batch_size]``.
        train_prediction_text_fname: currently unused (file logging is disabled).

    NOTE(review): the first line is printed with the prefix 'Question: ' but
    is decoded from `du_labels`; if those are the decoder labels it is the
    reference answer rather than the question — confirm against the caller.
    """
    def decode_until_eos(score_rows):
        # Turn each row's argmax into a word, stopping after the first '</s>'.
        text = ''
        for word_id in np.argmax(score_rows, axis=1).tolist():
            word = reverse_dictionary[word_id]
            text += word + ' '
            if word == '</s>':
                break
        return text

    sentence_labels = np.concatenate(du_labels, axis=0)[rand_idx::batch_size]
    print('Question: ' + decode_until_eos(sentence_labels))

    print()
    sentence_scores = tr_pred[rand_idx::batch_size]
    print('Answer: ' + decode_until_eos(sentence_scores))


def print_and_save_test_predictions(test_du_labels, test_pred_unrolled, batch_id, test_rand_idx, test_prediction_text_fname):
    """Print the stored source/target text and the predicted words of one test sentence.

    Args:
        test_du_labels: currently unused.
        test_pred_unrolled: iterable of per-step predictions; each element is
            indexed with ``test_rand_idx`` to obtain a word ID.
        batch_id: index of the batch; together with ``batch_size`` and
            ``test_rand_idx`` it selects the sentence in ``test_source_sent``
            and ``test_target_sent``.
        test_rand_idx: position of the sentence inside the batch.
        test_prediction_text_fname: currently unused (file logging is disabled).

    NOTE(review): the source sentence is printed as 'Answer' and the target
    and prediction as 'Question'; for a question -> answer model these
    prefixes look swapped — confirm before changing the printed text.
    """
    sentence_idx = (batch_id*batch_size)+test_rand_idx

    # Stored source sentence and reference target sentence
    print('Answer: ',test_source_sent[sentence_idx])
    reference_line = '\t Question (TRUE):' + test_target_sent[sentence_idx]
    print(reference_line + '\n')

    # Predicted words, one per decoder step, stopping after the first '</s>'
    predicted_line = '\t Question (Predicted): '
    for step_pred in test_pred_unrolled:
        word = reverse_dictionary[step_pred[test_rand_idx]]
        predicted_line += word + ' '
        if word == '</s>':
            break
    print(predicted_line + '\n')

def create_bleu_ref_candidate_lists(all_preds, all_labels):
    """Build candidate and reference strings for the BLEU score, one per batch position.

    Args:
        all_preds: 1-D NumPy array of predicted word IDs in which the
            positions of sentence ``b`` are ``all_preds[b::batch_size]``.
        all_labels: 1-D NumPy array of label word IDs with the same layout.

    Returns:
        (cand_list, ref_list): ``cand_list[b]`` is the predicted sentence as a
        space-joined string; ``ref_list[b]`` is a one-element list holding the
        reference string. Every '</s>' token is removed from both.
    """
    # This notebook has a single vocabulary (`dictionary`); the previous code
    # looked up '</s>' in `tgt_dictionary`, which is not defined here.
    eos_id = dictionary['</s>']

    ref_list, cand_list = [],[]
    for b_i in range(batch_size):
        tmp_lbl = all_labels[b_i::batch_size]
        tmp_lbl = tmp_lbl[tmp_lbl != eos_id]
        ref_str = ' '.join([reverse_dictionary[lbl] for lbl in tmp_lbl])
        ref_list.append([ref_str])

        tmp_pred = all_preds[b_i::batch_size]
        tmp_pred = tmp_pred[tmp_pred != eos_id]
        cand_str = ' '.join([reverse_dictionary[pre] for pre in tmp_pred])
        cand_list.append(cand_str)

    return cand_list, ref_list