# Basic experiments with seq2seq architecture and attention

- based on tutorials:
  - [official on tf 2.x api](https://www.tensorflow.org/tutorials/text/nmt_with_attention#write_the_encoder_and_decoder_model)
  - [official on tf 1.x api](https://github.com/tensorflow/nmt)
  - [thesis by Thang Luong](https://github.com/lmthang/thesis)

- code for all the models is in [model.py](./model.py)


# Pure encoder-decoder architecture
- 1024 GRU units both for encoder and decoder
- Adam optimizer with basic settings
- teacher forcing during training, greedy decoding during evaluation

In [1]:
from main import *
from sklearn.model_selection import train_test_split

In [2]:
path = "spa.txt"
num_examples = 30000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path, num_examples)

max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
                                                                                                target_tensor,
                                                                                                test_size=0.2)
print(f"training examples: {len(input_tensor_train)}")
print(f"testing examples: {len(input_tensor_val)}")

training examples: 24000
testing examples: 6000


- preparing dataset

In [3]:
# create dataset
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_dim = 256
units = 1024
n_layers_m1 = 1
n_layers_m2 = 2
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

- using decoder without attention `DecoderWA`
- encoder creates a representation of the input sentence in its hidden state, the representation is then fed to the hidden state of the decoder

In [4]:
# create encoder and decoderWA
encoder_m1 = Encoder(vocab_inp_size, embedding_dim, n_layers_m1, units, BATCH_SIZE)
decoder_m1 = DecoderWA(vocab_tar_size, embedding_dim, n_layers_m1, units, BATCH_SIZE)

# create deeper encoder and decoderWA
encoder_m2 = Encoder(vocab_inp_size, embedding_dim, n_layers_m2, units, BATCH_SIZE)
decoder_m2 = DecoderWA(vocab_tar_size, embedding_dim, n_layers_m2, units, BATCH_SIZE)

# create dummy variables just for the sake of showing actual
# dimensions of tensors flowing through the network
sample_hidden = encoder_m1.initialize_hidden_state()
example_input_batch, example_target_batch = next(iter(dataset))
sample_output, sample_hidden = encoder_m1(example_input_batch, sample_hidden)
print(f'Encoder output shape: (batch size, sequence length, units) {sample_output.shape}')
print(f'Encoder Hidden state shape: (batch size, units) {sample_hidden[0].shape}')

sample_decoder_output, _, _ = decoder_m1(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print(f'Decoder output shape: (batch_size, vocab size) {sample_decoder_output.shape}')

Encoder output shape: (batch size, sequence length, units) (64, 19, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)
Decoder output shape: (batch_size, vocab size) (64, 4807)


- Adam optimizer without any finetuning
- SparseCategoricalCrossentropy makes life easier, removes the need for creating one-hot representation from targets
- checkpoint directory for saving partially trained models over time

In [5]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
              from_logits=True, reduction='none')

checkpoint_dir = './training_checkpoints'
checkpoint_prefix_m1 = os.path.join(checkpoint_dir, 'ckpt_m1')
checkpoint_prefix_m2 = os.path.join(checkpoint_dir, 'ckpt_m2')
checkpoint_m1 = tf.train.Checkpoint( optimizer=optimizer
                                   , encoder=encoder_m1
                                   , decoder=decoder_m1)
checkpoint_m2 = tf.train.Checkpoint( optimizer=optimizer
                                   , encoder=encoder_m2
                                   , decoder=decoder_m2)

- training with teacher forcing

In [None]:
EPOCHS = 10
train( dataset
     , encoder_m1
     , decoder_m1
     , loss_object
     , optimizer
     , EPOCHS
     , BATCH_SIZE
     , steps_per_epoch
     , targ_lang
     , checkpoint_m1
     , checkpoint_prefix_m1)

In [None]:
EPOCHS = 40
train( dataset
     , encoder_m2
     , decoder_m2
     , loss_object
     , optimizer
     , EPOCHS
     , BATCH_SIZE
     , steps_per_epoch
     , targ_lang
     , checkpoint_m2
     , checkpoint_prefix_m2)

In [10]:
evaluate( input_tensor_val
        , target_tensor_val
        , encoder_m1
        , decoder_m1
        , inp_lang
        , targ_lang
        , max_length_inp
        , max_length_targ
        , units
        , n_layers_m1)

Average f1 score over validation dataset : 0.49910286989245517


In [17]:
evaluate( input_tensor_val
        , target_tensor_val
        , encoder_m2
        , decoder_m2
        , inp_lang
        , targ_lang
        , max_length_inp
        , max_length_targ
        , units
        , n_layers_m2)

Average f1 score over validation dataset : 0.463503092278094
