In [1]:
!nvidia-smi

Thu Mar 30 15:51:16 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| 30%   30C    P8    N/A /  75W |    266MiB /  4096MiB |     24%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import numpy as np
import tensorflow as tf
import os
import pandas as pd
from matplotlib import pylab
import matplotlib
import matplotlib.gridspec as gridspec
%matplotlib inline
from nltk.translate.bleu_score import corpus_bleu
from sklearn.utils import shuffle
import word2vec
import nltk

2023-03-30 15:51:17.630976: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-30 15:51:18.451929: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-30 15:51:19.805743: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-30 15:51:19.806080: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [3]:
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
tf.compat.v1.disable_eager_execution()

checkpoint_directory = "/tmp/training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")

## Dataset

* English vocabulary: [`vocab.50K.en`] created in the creating-vocabulary.ipynb

### Loading the Datasets and Building the Vocabulary

First, we build the vocabulary dictionaries for the source and target (English) language. 
The vocabularies are found in the file `vocab.50K.en`(English).

In [4]:
# Word string -> ID mapping
dictionary = dict()

vocabulary_size = len(dictionary)
with open('data/vocab.50K.en', encoding='utf-8') as f:
    for line in f:
        # disregard the new line aka `\n`
        dictionary[line[:-1]] = len(dictionary)
        
vocabulary_size = len(dictionary)
reverse_dictionary = dict(zip(dictionary.values(),dictionary.keys()))

print('Dictionary:', list(dictionary.items())[:10], end = '\n')
print('Reverse dictionary:', list(reverse_dictionary.items())[:10], end = '\n')
print('Vocabulary size: ', vocabulary_size, end = '\n')


Dictionary: [('<unk>', 0), ('<s>', 1), ('</s>', 2), ('.', 3), ('the', 4), (',', 5), ('a', 6), ('to', 7), ('and', 8), ('i', 9)]
Reverse dictionary: [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, '.'), (4, 'the'), (5, ','), (6, 'a'), (7, 'to'), (8, 'and'), (9, 'i')]
Vocabulary size:  50000


### Loading data
Here we load the data from the dataset.csv file (generated in the other script)

In [5]:
dataset = pd.read_csv('data/dataset.csv')

### Data pre-processing
Transform to lower, remove the new line and the punctuation

In [6]:
def lowerDataset(data):
    return data.str.lower() 
    
def cleanDataset(data):
    return data.str.replace('/r/','')                  \
                .str.replace(')','', regex=False)      \
                .str.replace('(','', regex=False)      \
                .str.replace(']','', regex=False)      \
                .str.replace('[','', regex=False)      \
                .str.replace('!','')                   \
                .str.replace('"','')                   \
    
def paddDataset(data):
    return data.str.replace(',', ' ,')                 \
                .str.replace('.',' . ', regex=False)    \
                .str.replace('?',' ?', regex=False)    \
                .str.replace('\n',' ')
    

In [7]:
wt = nltk.tokenize.WhitespaceTokenizer()
for column in dataset.columns:    
    dataset[column] = lowerDataset(dataset[column]) 
    dataset[column] = cleanDataset(dataset[column])
    dataset[column] = paddDataset(dataset[column])                                    
    dataset[column] = dataset[column].apply(wt.tokenize)
dataset = shuffle(dataset)

In [8]:
dataset.head()

Unnamed: 0,question,answer
884630,"[is, canine, flatulence, a, problem, ?, what, ...","[i, don't, think, it's, the, kind, of, problem..."
753750,"[the, opening, to, friends, spells, it, as, f,...","[next, time, on, kids, next, door, fiendish, r..."
200415,"[virtual, teachers, of, reddit, due, to, covid...","[i, had, a, kid, who, was, stuck, in, the, mid..."
203697,"[girls, of, reddit, ,, what, is, your, ideal, ...","[i, goooooooooot, this, ., first, of, all, ,, ..."
791426,"[it's, 3:54, a, ., m, ., ,, your, tv, ,, radio...","[yellowstone, just, blew, up, ., ., ., i, woul..."


### Data analysis
Mean sentence length and standard deviation of sentence length

In [9]:
print('(Questions) Average sentence length: ', dataset['question'].str.len().mean())
print('(Questions) Standard deviation of sentence length: ', dataset['question'].str.len().std())

print('(Answers) Average sentence length: ', dataset['answer'].str.len().mean())
print('(Answers) Standard deviation of sentence length: ', dataset['answer'].str.len().std())

(Questions) Average sentence length:  17.116929628176226
(Questions) Standard deviation of sentence length:  9.138660652379492
(Answers) Average sentence length:  54.4734629738071
(Answers) Standard deviation of sentence length:  844.5689123041486


### Update the sentences to fixed length
Update all sentences with a fixed size, to process the sentences as batches.

In [10]:
max_sent_length = {'question' : 30, 'answer': 70}

def padding_sent(source):
    padded = []
    for tokens in dataset[source]: 
        # adding the start token
        tokens.insert(0, '<s>')  

        if len(tokens) >= max_sent_length[source]:
            tokens = tokens[:(max_sent_length[source] - 1)]
            tokens.append('</s>')

        if len(tokens) < max_sent_length[source]:
            tokens.extend(['</s>' for _ in range(max_sent_length[source] - len(tokens))])  

        padded.append(tokens)
    return padded

In [11]:
questions = padding_sent('question')
answers = padding_sent('answer')

### Create the reverse dataset

In [12]:
def create_reverse_dataset(source):
    reverse_tokens = []
    reverse_dataset = []
    for tokens in source: 
        for token in tokens: 
            if token not in dictionary.keys():
                reverse_tokens.append(dictionary['<unk>'])
            else:
                reverse_tokens.append(dictionary[token])
        reverse_dataset.append(reverse_tokens)
        reverse_tokens = []
    return reverse_dataset

train_inputs =  np.array(create_reverse_dataset(questions), dtype=np.int32)
train_outputs =  np.array(create_reverse_dataset(answers), dtype=np.int32)

### Word Embedding

In [13]:
sentence_cursors = [0 for _ in range(train_inputs.shape[0])]
batch_size = 32
embedding_size = 64
steps = 80000

In [14]:
word2vec.define_data_and_hyperparameters(
        train_inputs.shape[0], 
        max_sent_length['question'], 
        max_sent_length['answer'], 
        dictionary, 
        reverse_dictionary,  
        train_inputs, 
        train_outputs, 
        embedding_size,
        vocabulary_size)

word2vec.print_some_batches()
word2vec.define_word2vec_tensorflow(batch_size)
word2vec.run_word2vec(batch_size, steps)



with window_size = 2:
    batch: [['<s>', "what's", 'like', 'being'], ['<s>', 'what', "haven't", 'you'], ['<s>', 'what', 'the', 'dumbest'], ['<s>', 'what', "i'm", 'upper'], ['<s>', 'time', 'brag', ','], ['<s>', "you're", 'into', 'custody'], ['<s>', 'what', 'your', 'favorite'], ['<s>', 'for', 'who', 'got']]
    labels: ['it', 'movies', 'was', 'screams', 'to', 'taken', 'is', 'those']
Defining 4 embedding lookups representing each word in the context
Stacked embedding size: [32, 64, 4]
Reduced mean embedding size: [32, 64]


2023-03-30 15:24:20.344039: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-30 15:24:20.527713: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-30 15:24:20.952423: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-30 15:24:20.953194: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

Initialized
Average loss at step 2000: 3.030542
Average loss at step 4000: 1.641926
Average loss at step 6000: 1.438059
Average loss at step 8000: 1.319586
Average loss at step 10000: 1.260281
Nearest to and: cognitively, literally, teenagers, tries, button, kielbasa, 4th, 2,
Nearest to when: go, get, how, slack-jawed, rather, iodine, own, something,
Nearest to .: essex, whip, dictatorship, loves, surfaces, esl, croatia, defunct,
Nearest to get: when, go, tomorrow, want, <s>, how, uncles, think,
Nearest to why: there, where, who, <s>, how, not, now, gone,
Nearest to one: something, starting, where, an, everyone, <s>, this, barns,
Nearest to most: worst, dumbest, strangest, fastest, scariest, stupidest, best, biggest,
Nearest to back: elixir, roadshow, 2020, giant, now, over, still, ;3;42u1re,
Nearest to day: non-electronic, jew, hobbled, oils, inconsequential, ,when, rekindle, allison,
Nearest to .: essex, whip, dictatorship, loves, surfaces, esl, croatia, defunct,
Nearest to slowly: m

Average loss at step 52000: 1.153049
Average loss at step 54000: 1.155536
Average loss at step 56000: 1.157846
Average loss at step 58000: 1.156418
Average loss at step 60000: 1.169359
Nearest to and: strategies/techniques, doggo, womb, crack, doorstop, at, f****d, jackfruit,
Nearest to when: in/out, slack-jawed, sharpie, blowing, how, while, neurological, recoup,
Nearest to .: v, novella, s, o, unhealthier, predator, single-use, j,
Nearest to get: see, hold, learn, stop, trust, ask, wish, eat,
Nearest to why: imprisoning, ftl:, shockingly, mules, predict, co-vid, pooh, how,
Nearest to one: 1, steve, 1975, grossest, $1000, sick, 'you're, playing,
Nearest to most: biggest, scariest, cheesiest, chang, fastest, laziest, worst, dumbest,
Nearest to back: tablet, transported, teleported, moo, giant, catered, re-visit, obviously,
Nearest to day: jew, non-electronic, insider, action, hoopla, minute, mistake, skinning,
Nearest to .: v, novella, s, o, unhealthier, predator, single-use, j,
Neares

In [14]:
class DataGenerator(object):

    def __init__(self, batch_size, num_unroll, is_input, is_train):
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        self._cursor = [0 for offset in range(self._batch_size)]
        self._word_embeddings = np.load('embeddings.npy')
        self._sent_ids = None
        self._is_input = is_input
        self._is_train = is_train

    def next_batch(self, sent_ids):

        sent_length = max_sent_length['question'] if self._is_input else max_sent_length['answer']

        batch_data = np.zeros((self._batch_size, embedding_size), dtype=np.float32)
        batch_labels = np.zeros((self._batch_size, vocabulary_size), dtype=np.float32)

        for batch in range(self._batch_size):
            sent_id = sent_ids[batch]
            
            if self._is_input:
                sent_text = train_inputs[sent_id] if self._is_input else test_inputs[sent_id]
            else:
                sent_text = train_outputs[sent_id] if self._is_input else train_outputs[sent_id]
            
            batch_data[batch] = self._word_embeddings[sent_text[self._cursor[batch]],:]
            batch_labels[batch] = np.zeros((vocabulary_size), dtype=np.float32)
            batch_labels[batch, sent_text[self._cursor[batch] + 1]] = 1.0

            self._cursor[batch] = (self._cursor[batch] + 1) % (sent_length - 1)

        return batch_data,batch_labels

    def unroll_batches(self,sent_ids):

        if sent_ids is not None:
            self._sent_ids = sent_ids
            self._cursor = [0 for _ in range(self._batch_size)]
        unroll_data, unroll_labels = [],[]

        for unroll_ids in range(self._num_unroll):
            data, labels = self.next_batch(self._sent_ids)
            unroll_data.append(data)
            unroll_labels.append(labels)
        return unroll_data, unroll_labels, self._sent_ids

    def reset_indices(self):
        self._cursor = [0 for offset in range(self._batch_size)]

dg = DataGenerator(batch_size=5, num_unroll=20, is_input=True, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('Input data')
for _, lbl in zip(u_data,u_labels):
    print([reverse_dictionary[w] for w in np.argmax(lbl,axis=1).tolist()])

dg = DataGenerator(batch_size=5, num_unroll=30, is_input=False, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('\nOutput data batch')
for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    print([reverse_dictionary[w] for w in np.argmax(lbl,axis=1).tolist()])

Input data
['is', 'the', 'virtual', 'girls', "it's"]
['canine', 'opening', 'teachers', 'of', '3:54']
['flatulence', 'to', 'of', '<unk>', 'a']
['a', 'friends', '<unk>', ',', '.']
['problem', 'spells', 'due', 'what', 'm']
['?', 'it', 'to', 'is', '.']
['what', 'as', 'covid-19', 'your', ',']
['can', 'f', ',', 'ideal', 'your']
['governments', '.', 'was', 'first', 'tv']
['do', 'r', 'it', 'date', ',']
['about', '.', 'shocking', '?', 'radio']
['it', 'i', 'to', '</s>', ',']
['?', '.', 'see', '</s>', 'cell']
['</s>', 'e', 'how', '</s>', 'phone']
['</s>', '.', 'some', '</s>', 'begins']
['</s>', 'n', 'of', '</s>', 'transmitting']
['</s>', '.', 'your', '</s>', 'an']
['</s>', 'd', 'kids', '</s>', 'emergency']
['</s>', '.', 'actually', '</s>', 'alert']
['</s>', 's', 'live', '</s>', '.']

Output data batch
['i', 'next', 'i', 'i', 'yellowstone']
["don't", 'time', 'had', '<unk>', 'just']
['think', 'on', 'a', 'this', 'blew']
["it's", 'kids', 'kid', '.', 'up']
['the', 'next', 'who', 'first', '.']
['kind',

## Building the Model with TensorFlow

Define the hyperparameters, the input/output placeholders, the LSTM/Output layer parameters, the LSTM/output calculations, and finally the optimization steps.

### Hyperparameters


In [15]:
emb_mat = np.load('embeddings.npy')
input_size = emb_mat.shape[1]

num_nodes = 128
batch_size = 10

encoder_num_unrollings = 20
decoder_num_unrollings = 30

### Input / Output Placeholders

In [16]:
tf.compat.v1.reset_default_graph()

word_embeddings = tf.convert_to_tensor(value=emb_mat,name='embeddings')

print('Defining Encoder Data Placeholders')
encoder_train_inputs = []

for ui in range(encoder_num_unrollings):
    encoder_train_inputs.append(tf.compat.v1.placeholder(tf.float32, shape=[batch_size,input_size],name='train_inputs_%d'%ui))

print('Defining Decoder Data Placeholders')

decoder_train_inputs, decoder_train_labels, decoder_train_masks = [],[],[]

for ui in range(decoder_num_unrollings):
    decoder_train_inputs.append(tf.compat.v1.placeholder(tf.float32, shape=[batch_size,input_size],name='decoder_train_inputs_%d'%ui))
    decoder_train_labels.append(tf.compat.v1.placeholder(tf.float32, shape=[batch_size,vocabulary_size], name = 'decoder_train_labels_%d'%ui))
    decoder_train_masks.append(tf.compat.v1.placeholder(tf.float32, shape=[batch_size,1],name='decoder_train_masks_%d'%ui))


encoder_test_input = [tf.compat.v1.placeholder(tf.float32, shape=[batch_size,input_size], name='test_input_%d'%ui) for ui in range(encoder_num_unrollings)]
decoder_test_input = tf.nn.embedding_lookup(params=word_embeddings,ids=[dictionary['<s>']])

Defining Encoder Data Placeholders
Defining Decoder Data Placeholders


### Defining the Encoder Model

In [17]:
with tf.compat.v1.variable_scope('Encoder'):

    # Input gate
    encoder_input_gate_x = tf.compat.v1.get_variable('input_gate_x', shape=[input_size, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    encoder_input_gate_m = tf.compat.v1.get_variable('input_gate_m', shape=[num_nodes, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    encoder_input_gate_b = tf.Variable(tf.random.uniform([1, num_nodes],-0.05, 0.05), name='input_gate_b')

    # Forget gate
    encoder_forget_gate_x = tf.compat.v1.get_variable('forget_gate_x', shape=[input_size, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    encoder_forget_gate_m = tf.compat.v1.get_variable('forget_gate_m', shape=[num_nodes, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    encoder_forget_gate_b = tf.Variable(tf.random.uniform([1, num_nodes],-0.05, 0.05), name='forget_gate_b')

    # Candidate value (c~_t)
    encoder_candidate_value_x = tf.compat.v1.get_variable('candidate_value_x', shape=[input_size, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    encoder_candidate_value_m = tf.compat.v1.get_variable('candidate_value_m', shape=[num_nodes, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    encoder_candidate_value_b = tf.Variable(tf.random.uniform([1, num_nodes],-0.05,0.05), name='candidate_value_b')

    # Output gate
    encoder_output_gate_x = tf.compat.v1.get_variable('output_gate_x', shape=[input_size, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    encoder_output_gate_m = tf.compat.v1.get_variable('output_gate_m', shape=[num_nodes, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    encoder_output_gate_b = tf.Variable(tf.random.uniform([1, num_nodes],-0.05,0.05), name='output_gate_b')

    # Variáveis para salvar o resultado
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False, name='train_output')
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False, name = 'train_cell')

    saved_test_output = tf.Variable(tf.zeros([batch_size, num_nodes]),trainable=False, name='test_output')
    saved_test_state = tf.Variable(tf.zeros([batch_size, num_nodes]),trainable=False, name='test_cell')

print('Encoder Model defined')

Encoder Model defined


### Defining the Decoder Model

In [18]:
with tf.compat.v1.variable_scope('Decoder'):

    # Input gate
    decoder_input_gate_x = tf.compat.v1.get_variable('input_gate_x',shape=[input_size, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    decoder_input_gate_m = tf.compat.v1.get_variable('input_gate_m',shape=[num_nodes, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    decoder_input_gate_b = tf.Variable(tf.random.uniform([1, num_nodes],-0.05, 0.05), name='input_gate_b')

    # Forget gate
    decoder_forget_gate_x = tf.compat.v1.get_variable('forget_gate_x', shape=[input_size, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    decoder_forget_gate_m = tf.compat.v1.get_variable('forget_gate_m', shape=[num_nodes, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    decoder_forget_gate_b = tf.Variable(tf.random.uniform([1, num_nodes],-0.05, 0.05), name='forget_gate_b')

    # Candidate value (c~_t)
    decoder_candidate_value_x = tf.compat.v1.get_variable('candidate_value_x', shape=[input_size, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    decoder_candidate_value_m = tf.compat.v1.get_variable('candidate_value_m', shape=[num_nodes, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    decoder_candidate_value_b = tf.Variable(tf.random.uniform([1, num_nodes],-0.05,0.05), name='candidate_value_b')

    # Output gate
    decoder_output_gate_x = tf.compat.v1.get_variable('output_gate_x',shape=[input_size, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    decoder_output_gate_m = tf.compat.v1.get_variable('output_gate_m',shape=[num_nodes, num_nodes], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    decoder_output_gate_b = tf.Variable(tf.random.uniform([1, num_nodes],-0.05,0.05),name='output_gate_b')

    # Softmax Classifier
    w = tf.compat.v1.get_variable('softmax_weights',shape=[num_nodes, vocabulary_size], initializer = tf.compat.v1.keras.initializers.VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"))
    b = tf.Variable(tf.random.uniform([vocabulary_size],-0.05,-0.05),name='softmax_bias')
    
print('Decoder Model defined')

Decoder Model defined


### Defining LSTM cell


In [19]:
# Encoder LSTM cell
def encoder_lstm_cell(_input, _output, _state):
    input_gate = tf.sigmoid(tf.matmul(_input, encoder_input_gate_x) + tf.matmul(_output, encoder_input_gate_m) + encoder_input_gate_b)
    forget_gate = tf.sigmoid(tf.matmul(_input, encoder_forget_gate_x) + tf.matmul(_output, encoder_forget_gate_m) + encoder_forget_gate_b)
    update = tf.matmul(_input, encoder_candidate_value_x) + tf.matmul(_output, encoder_candidate_value_m) + encoder_candidate_value_b
    _state = forget_gate * _state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(_input, encoder_output_gate_x) + tf.matmul(_output, encoder_output_gate_m) + encoder_output_gate_b)
    return output_gate * tf.tanh(_state), _state

# Decoder LSTM cell
def decoder_lstm_cell(_input, _output, _state):
    input_gate = tf.sigmoid(tf.matmul(_input, decoder_input_gate_x) + tf.matmul(_output, decoder_input_gate_m) + decoder_input_gate_b)
    forget_gate = tf.sigmoid(tf.matmul(_input, decoder_forget_gate_x) + tf.matmul(_output, decoder_forget_gate_m) + decoder_forget_gate_b)
    update = tf.matmul(_input, decoder_candidate_value_x) + tf.matmul(_output, decoder_candidate_value_m) + decoder_candidate_value_b
    _state = forget_gate * _state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(_input, decoder_output_gate_x) + tf.matmul(_output, decoder_output_gate_m) + decoder_output_gate_b)
    return output_gate * tf.tanh(_state), _state

In [20]:

#=========================== TRAIN =================================

outputs = list()
output = saved_output
state = saved_state

# Calculate the output and state of the encoder
for _input in encoder_train_inputs:
    output, state = encoder_lstm_cell(_input, output, state)

# Calculate the output and state of the decoder
with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
    for _input in decoder_train_inputs:
        output, state = decoder_lstm_cell(_input, output, state)
        outputs.append(output)

# Calculate the decoder logits for all unrolled steps
logits = tf.matmul(tf.concat(axis=0, values=outputs), w) + b

# Decoder predictions
train_prediction = tf.nn.softmax(logits)


#=========================== TEST =================================

test_output  = saved_test_output
test_state = saved_test_state
test_predictions = []

for _input in encoder_test_input:
    test_output, test_state = encoder_lstm_cell(_input, test_output,test_state)

# Calculate the decoder output
with tf.control_dependencies([saved_test_output.assign(test_output), saved_test_state.assign(test_state)]):
    for i in range(decoder_num_unrollings):

        test_output, test_state = decoder_lstm_cell(decoder_test_input, test_output, test_state)
        test_prediction = tf.nn.softmax(tf.compat.v1.nn.xw_plus_b(test_output, w, b))
        decoder_test_input = tf.nn.embedding_lookup(params=word_embeddings,ids=tf.argmax(input=test_prediction,axis=1))
        test_predictions.append(tf.argmax(input=test_prediction,axis=1))


### Calculating the Loss

The loss is calculated by summing all losses obtained along the time axis and the average of the lot axis.

In [21]:
loss_batch = tf.concat(axis=0,values=decoder_train_masks) * tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf.concat(axis=0, values=decoder_train_labels))
loss = tf.reduce_mean(input_tensor=loss_batch)

### Optimizer

There are two optimizers used here: Adam and SGD. 
Using Adam just causes the model to exhibit some undesirable behavior in the long run. 
So Adam is used to get a good initial guess for the SGD and use the SGD from that point on.

In [22]:
# These are used to slow down the learning rate over time
global_step = tf.Variable(0, trainable=False)
inc_gstep = tf.compat.v1.assign(global_step,global_step + 1)

# Using two optimizers, when optimizer changes we reset global step
reset_gstep = tf.compat.v1.assign(global_step,0)

# Calculated decaying learning rate
learning_rate = tf.maximum(
    tf.compat.v1.train.exponential_decay(0.005, global_step, decay_steps=1, decay_rate=0.95, staircase=True), 0.00001)

sgd_learning_rate = tf.maximum(
    tf.compat.v1.train.exponential_decay(0.005, global_step, decay_steps=1, decay_rate=0.95, staircase=True), 0.00001)

with tf.compat.v1.variable_scope('Adam'):
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
with tf.compat.v1.variable_scope('SGD'):
    sgd_optimizer = tf.compat.v1.train.GradientDescentOptimizer(sgd_learning_rate)

gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimize = optimizer.apply_gradients(zip(gradients, v))

sgd_gradients, v = zip(*sgd_optimizer.compute_gradients(loss))
sgd_gradients, _ = tf.clip_by_global_norm(sgd_gradients, 5.0)
sgd_optimize = optimizer.apply_gradients(zip(sgd_gradients, v))

# Making sure there are fluid gradients from decoder to encoder
for (g_i,v_i) in zip(gradients,v):
    assert g_i is not None, 'Gradient none for %s'%(v_i.name)

### Resetting the Training and Testing States

Define the state reset functions

In [23]:
# Reset training state
reset_train_state = tf.group(tf.compat.v1.assign(saved_output, tf.zeros([batch_size, num_nodes])),
                             tf.compat.v1.assign(saved_state, tf.zeros([batch_size, num_nodes])))

reset_test_state = tf.group(
    saved_test_output.assign(tf.zeros([batch_size, num_nodes])),
    saved_test_state.assign(tf.zeros([batch_size, num_nodes])))

## Running the Neural Network

With all the TensorFlow operations defined, now to define various functions related to running the model, as well as running the model.

### Evaluate and Print Results

it is defined two functions to print and save the prediction results for training data as well as test data, and finally, define a function to get candidate and reference data to calculate the BLEU Score.

In [24]:
def print_and_save_train_predictions(decoder_unrolled_labels, pred_unrolled, random_index, train_prediction_text_fname):

    print_str = 'Actual: ' 
    for w in np.argmax(np.concatenate(decoder_unrolled_labels,axis=0)[random_index::batch_size],axis=1).tolist():
        print_str += reverse_dictionary[w] + ' '
        if reverse_dictionary[w] == '</s>':
            break
    print(print_str)
    with open(os.path.join(log_dir, train_prediction_text_fname),'a',encoding='utf-8') as fa:                
        fa.write(print_str)  

    print()
    print_str = 'Predicted: '
    for w in np.argmax(pred_unrolled[random_index::batch_size],axis=1).tolist():
        print_str += reverse_dictionary[w] + ' '
        if reverse_dictionary[w] == '</s>':
            break
    print(print_str)
    with open(os.path.join(log_dir, train_prediction_text_fname),'a',encoding='utf-8') as fa:                
        fa.write(print_str+'\n')


def create_bleu_ref_candidate_lists(all_preds, all_labels):

    bleu_labels, bleu_preds = [],[]
    ref_list, cand_list = [],[]
    for b_i in range(batch_size):
        tmp_lbl = all_labels[b_i::batch_size]
        tmp_lbl = tmp_lbl[np.where(tmp_lbl != dictionary['</s>'])]
        ref_str = ' '.join([reverse_dictionary[lbl] for lbl in tmp_lbl])
        ref_list.append([ref_str])
        tmp_pred = all_preds[b_i::batch_size]
        tmp_pred = tmp_pred[np.where(tmp_pred != dictionary['</s>'])]
        cand_str = ' '.join([reverse_dictionary[pre] for pre in tmp_pred])
        cand_list.append(cand_str)

    return cand_list, ref_list

In [25]:
def train_single_step(unrolled_encoder_data, unrolled_decoder_data, unrolled_decoder_labels):

    feed_dict = {}
    for ui, dat in enumerate(unrolled_encoder_data):
        feed_dict[encoder_train_inputs[ui]] = dat

    for ui,(dat,lbl) in enumerate(zip(unrolled_decoder_data,unrolled_decoder_labels)):
        feed_dict[decoder_train_inputs[ui]] = dat
        feed_dict[decoder_train_labels[ui]] = lbl
        d_msk = (np.logical_not(np.argmax(lbl,axis=1)==dictionary['</s>'])).astype(np.int32).reshape(-1,1)
        feed_dict[decoder_train_masks[ui]] = d_msk

    # ======================= OTIMIZAÇÃO ==========================
    if (step+1) < 20000:
        _,l,tr_pred = sess.run([optimize,loss,train_prediction], feed_dict=feed_dict)
    else:
        _,l,tr_pred = sess.run([sgd_optimize,loss,train_prediction], feed_dict=feed_dict)

    # ======================= SAVING ==========================
    if (step+1)%1000==0:
        checkpoint = tf.train.Checkpoint(optimizer=sgd_optimize, loss=loss, train_prediction=train_prediction)
        
    return l, tr_pred

In [26]:
log_dir = 'logs'
if not os.path.exists(log_dir):
    os.mkdir(log_dir)

train_prediction_text_fname = 'train_predictions.txt'

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement=True
sess = tf.compat.v1.InteractiveSession(config=config)

tf.compat.v1.global_variables_initializer().run()
word_embeddings = np.load('embeddings.npy')

def define_data_generators(batch_size, encoder_num_unrollings, decoder_num_unrollings):

    encoder_data_generator = DataGenerator(batch_size=batch_size,num_unroll=encoder_num_unrollings,is_input=True, is_train=True)
    decoder_data_generator = DataGenerator(batch_size=batch_size,num_unroll=decoder_num_unrollings,is_input=False, is_train=True)

    test_encoder_data_generator = DataGenerator(batch_size=batch_size,num_unroll=encoder_num_unrollings,is_input=True, is_train=False)
    test_decoder_data_generator = DataGenerator(batch_size=batch_size,num_unroll=decoder_num_unrollings,is_input=False, is_train=False)

    return encoder_data_generator,decoder_data_generator,test_encoder_data_generator,test_decoder_data_generator

2023-03-30 15:54:25.434896: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-30 15:54:25.498165: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-30 15:54:25.794318: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-30 15:54:25.795045: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

### Restore checkpoint

In [None]:
status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))

### Running Training

In [27]:
train_bleu_scores_over_time,test_bleu_scores_over_time = [],[]
loss_over_time = []

train_bleu_refs, train_bleu_cands = [],[]
test_bleu_refs, test_bleu_cands = [],[]

num_steps = 100001
avg_loss = 0

encoder_data_generator, decoder_data_generator, \
test_encoder_data_generator, test_decoder_data_generator = \
define_data_generators(batch_size, encoder_num_unrollings, decoder_num_unrollings)

print('Starting training')

for step in range(num_steps):

    if (step+1)%100==0:
        print('.',end='')

    sent_ids = np.random.randint(low=0,high=train_inputs.shape[0],size=(batch_size))

    # Getting an unrolled set of data batches for the encoder
    unrolled_encoder_data, _, _ = encoder_data_generator.unroll_batches(sent_ids=sent_ids)
    
    # Getting an unrolled set of data batches for the decoder
    unrolled_decoder_data, unrolled_decoder_labels, _ = decoder_data_generator.unroll_batches(sent_ids=sent_ids)

    # Train for single step
    l, tr_pred = train_single_step(unrolled_encoder_data, unrolled_decoder_data, unrolled_decoder_labels)

    # Calculate BLEU scores
    if np.random.random() < 0.1:

        all_labels = np.argmax(np.concatenate(unrolled_decoder_labels,axis=0),axis=1)
        all_preds = np.argmax(tr_pred,axis=1)

        batch_cands, batch_refs = create_bleu_ref_candidate_lists(all_preds, all_labels)

        train_bleu_refs.extend(batch_refs)
        train_bleu_cands.extend(batch_cands)

    if (step+1)%500==0:

        print('Step ',step+1)
        with open(os.path.join(log_dir, train_prediction_text_fname),'a') as fa:
            fa.write('============= Step ' +  str(step+1) + ' =============\n')

        random_index = np.random.randint(low=1,high=batch_size)
        print_and_save_train_predictions(unrolled_decoder_labels, tr_pred, random_index, train_prediction_text_fname)

        # Calculating the BLEU score for the accumulated candidates
        bscore = 0.0
        bscore = corpus_bleu(train_bleu_refs,train_bleu_cands,smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method4)
        train_bleu_scores_over_time.append(bscore)
        print('(Train) BLEU (%d elements): '%(len(train_bleu_refs)),bscore)

        train_bleu_refs, train_bleu_cands = [],[]
        with open(log_dir + os.sep +'blue_scores.txt','a') as fa_bleu:
            fa_bleu.write(str(step+1) +','+str(bscore)+'\n')

        with open(os.path.join(log_dir, train_prediction_text_fname),'a') as fa:
            fa.write('(Train) BLEU: %.5f\n'%bscore)
    
    # Update average loss
    avg_loss += l

    # Resetting hidden state for each batch
    sess.run(reset_train_state)



Starting training
.....Step  500
Actual: the secret ingredient is crime . </s> 

Predicted: i <unk> . . a . </s> 
(Train) BLEU (440 elements):  0.024440227855029428
.....Step  1000
Actual: please do not mutilate your child's genitals . that is all . </s> 

Predicted: i i a a . . . . </s> 
(Train) BLEU (580 elements):  0.09712813972500225
.....Step  1500
Actual: i used to deposit checks from my roommate for rent and he always said something stupid in the the memo line . he started writing things in the memo line 

Predicted: i was to be a and a mom . a in i was was to . . the time time . . i was a to . the time . 
(Train) BLEU (400 elements):  0.12480847946887484
.....Step  2000
Actual: i was once at work and i work i . a deli so i was cutting the plastic off this roast beef and instead of cutting the plastic wrapping i 

Predicted: i have a a a in i was in was i girl of i was a . first of a . . . i of the . <unk> of . 
(Train) BLEU (550 elements):  0.12866980304824177
.....Step  2500
A

.....Step  19500
Actual: batman & robin had a really solid soundtrack for being as cheesy as it was . and opening it with smashing pumpkins , and finishing with an alternate take on 

Predicted: the . <unk> . a good good game . a a a . a was a </s> 
(Train) BLEU (390 elements):  0.18625420427999664
.....Step  20000
Actual: $180 mongoose mountain bike . a lot of money for a 14 year old , but it was worth it . . . still use it , too . </s> 

Predicted: i . . . . </s> 
(Train) BLEU (450 elements):  0.18620236387294706
.....Step  20500
Actual: eat bananas , drink plenty of water and stretch every morning . reason of spasms could be low potassium , dehydration , lack of exercise or poorly executed . </s> 

Predicted: i a to and , of the . then . day . </s> 
(Train) BLEU (520 elements):  0.18640694976549757
.....Step  21000
Actual: it spices things up and it makes the food less boring imo </s> 

Predicted: i is . . . i is me most . . . . </s> 
(Train) BLEU (630 elements):  0.1884307196915248

.....Step  36500
Actual: my brother and i are very close , but i didn't know his wife very well before the wedding . i was happy to be asked to be a bridesmaid 

Predicted: i dad is i were a close to and i was know how mom was well . he first . he was a to the a to be a little 
(Train) BLEU (580 elements):  0.20022646429219562
.....Step  37000
Actual: not red lobster but seafood counter at a grocery store , the manager would regularly take dead lobsters that were being eaten by other lobsters , cook them , and 

Predicted: i a a , i . . a bar store . but first was be a a . . i a a . a people . and , to and 
(Train) BLEU (470 elements):  0.198799128871467
.....Step  37500
Actual: am i the only one who just hates everything about this video ? it's like a giant brooklyn yuppie hipster stereotype . such a narcissistic asshole hoping these people will 

Predicted: i i was only one who was got to . the . game </s> 
(Train) BLEU (400 elements):  0.19914697744926377
.....Step  38000
Actual: wa

.....Step  53000
Actual: call it death with dignity , because that's what it is . </s> 

Predicted: i a to . a . and they why you is . </s> 
(Train) BLEU (480 elements):  0.1957275868118364
.....Step  53500
Actual: this one is good when you're feeling low . </s> 

Predicted: i is is a . i in . . </s> 
(Train) BLEU (540 elements):  0.20611113537715012
.....Step  54000
Actual: not a teacher , but i was an it technician at my high school about 6 years after i graduated . the staff at the school all share stories about 

Predicted: i a teacher , but i was a older teacher teacher a school school . a years ago a was in i first were a time was day the . 
(Train) BLEU (510 elements):  0.18943730700790004
.....Step  54500
Actual: <unk> , when rickrolling was hot , that shit was great . i also always enjoyed <unk> isn't a meme </s> 

Predicted: <unk> . <unk> i <unk> a , i was was a . </s> 
(Train) BLEU (490 elements):  0.20583684883798167
.....Step  55000
Actual: i work in it and almost every pe

.....Step  70500
Actual: if they only kidnap one human to study all of humanity , they've already failed . </s> 

Predicted: i you are do them , , the , the the , then will . to </s> 
(Train) BLEU (480 elements):  0.20290593012976782
.....Step  71000
Actual: our engineers and scientists are too smart to get tied up in our political system . </s> 

Predicted: i <unk> is <unk> are not lazy . be a to . the relationship . . </s> 
(Train) BLEU (420 elements):  0.20366204647273134
.....Step  71500
Actual: walmart i've seen that shit fuck the economies of small towns bad man . </s> 

Predicted: <unk> . been a i . . fuck . the . . . . </s> 
(Train) BLEU (480 elements):  0.20517034218930685
.....Step  72000
Actual: i am in foster care now i'm 17 and i entered in temporary care at the age of 14 . i've always said the most important thing for temporary foster 

Predicted: i have a a of . . not . i am a a . . the end of the . i been been that best recent thing i've me and 
(Train) BLEU (490 elemen

.....Step  88500
Actual: the two <unk> what can men do against such reckless hate ? ride out with me . clip </s> 

Predicted: the <unk> of . the you </s> 
(Train) BLEU (630 elements):  0.20029313636265278
.....Step  89000
Actual: a headache that woke me up at 4am one day . never felt pain like it - dental <unk> i've had . broken bones i've had . i've been mildly 

Predicted: i <unk> . is up up . the . day . </s> 
(Train) BLEU (460 elements):  0.20886468320735352
.....Step  89500
Actual: my family runs a concessions trailer at fairs . lots of great stories but this one always sticks out . we have a giant <unk> steam table directly in front 

Predicted: i friend was in lot and on a . i of people things . i is of makes to . </s> 
(Train) BLEU (480 elements):  0.21038580313332506
.....Step  90000
Actual: roller coaster tycoon </s> 

Predicted: the coaster . . </s> 
(Train) BLEU (450 elements):  0.20288541909718583
.....Step  90500
Actual: beer steak grill sun cats </s> 

Predicted: i . . .