# Assignment 10: Machine Translation with Seq2Seq Models

## 1. Configuration & Constants

- Định nghĩa các biến cấu hình và hằng số cần thiết cho quá trình huấn luyện mô hình dịch máy.

In [None]:
!pip install -U tensorflow keras -q

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting keras
  Downloading keras-3.11.3-py3-none-any.whl.metadata (5.9 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (620.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras-3.11.3-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m118.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard, keras, tensorflow
  Attempting uninstall: tensor

In [25]:
!pip show tensorflow
!pip show keras

Name: tensorflow
Version: 2.20.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /usr/local/lib/python3.12/dist-packages
Requires: absl-py, astunparse, flatbuffers, gast, google_pasta, grpcio, h5py, keras, libclang, ml_dtypes, numpy, opt_einsum, packaging, protobuf, requests, setuptools, six, tensorboard, termcolor, typing_extensions, wrapt
Required-by: dopamine_rl, tensorflow-text, tensorflow_decision_forests, tf_keras
Name: keras
Version: 3.11.3
Summary: Multi-backend Keras
Home-page: 
Author: 
Author-email: Keras team <keras-users@googlegroups.com>
License: Apache License 2.0
Location: /usr/local/lib/python3.12/dist-packages
Requires: absl-py, h5py, ml-dtypes, namex, numpy, optree, packaging, rich
Required-by: keras-hub, tensorflow


In [26]:
import os
import pathlib
import time
from typing import Dict, Tuple, List

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import tensorflow as tf
from tensorflow import keras
import keras
from sklearn.model_selection import train_test_split

In [27]:
# Data configuration
__root__ = os.getcwd()
DATA_DIR = "data_iwslt15"
SITE_PREFIX = "https://nlp.stanford.edu/projects/nmt/data"
DATA_FILES = {
    "train": ("train.en", "train.vi"),
    "dev": ("tst2012.en", "tst2012.vi"),
    "test": ("tst2013.en", "tst2013.vi"),
}
NUM_EXAMPLES = 50000 # Number of training examples to use
MAX_SENTENCE_LENGTH = 50 # Max number of tokens per sentence

# Model Hyperparameters
BUFFER_SIZE = 32000 # Buffer size for shuffling the dataset
BATCH_SIZE = 64 # Batch size for training
EMBEDDING_DIM = 512 # Dimension of the embedding vector
HIDDEN_UNITS = 512 # Number of hidden units in the LSTM
EPOCHS = 10 # Number of epochs to train the model

# Training configuration
CHECKPOINT_DIR = os.path.join(__root__, "lab_10", "model_checkpoints")

## 2. Data preparation

- Download IWSLT15 dataset từ [Stanford NMT](https://nlp.stanford.edu/projects/nmt/data/).
- Giải nén và lưu vào thư mục `/data_iwslt15`.

In [21]:
def preprocess_sentence(sentence: str) -> str:
    """
    Add <start> and <end> tokens to the sentence.
    """
    return f"<start> {sentence.strip()} <end>"

def tokenize_sentences(
        sentences: List[str]
) -> Tuple[tf.Tensor, tf.keras.preprocessing.text.Tokenizer]: # type: ignore
    """
    Tokenize and pad a list of sentences.
    Args:
        sentences (List[str]): List of sentences to tokenize.
    Returns:
        Tuple[tf.Tensor, tf.keras.preprocessing.text.Tokenizer]: A tuple containing the padded tensor and fitted tokenizer
    """
    # Create a tokenizer and fit on the sentences
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='') # type: ignore

    # Fit the tokenizer on the sentences
    tokenizer.fit_on_texts(sentences)

    # Convert sentences to sequences and pad them
    tensor = tokenizer.texts_to_sequences(sentences)

    # Pad the sequences to ensure uniform length
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post') # type: ignore

    return tensor, tokenizer

def load_data(
        source_path: str,
        target_path: str,
        num_examples: int = None,
) -> Tuple[List[str], List[str]]:
    """
    Load and preprocess sentence pairs from file paths.
    Args:
        source_path (str): Path to the source language file.
        target_path (str): Path to the target language file.
        num_examples (int, optional): Number of examples to load. If None, load all.
    Returns:
        Tuple[List[str], List[str]]: Lists of preprocessed source and target sentences.
    """
    # Read source sentences
    with open(source_path, 'r', encoding='utf-8') as f:
        source_sentences = f.readlines()

    # Read target sentences
    with open(target_path, 'r', encoding='utf-8') as f:
        target_sentences = f.readlines()

    # Assuming both files have the same number of lines
    assert len(source_sentences) == len(target_sentences)

    # Get the number of examples to use
    if num_examples:
        source_sentences = source_sentences[:num_examples]
        target_sentences = target_sentences[:num_examples]

    # Preprocess sentences
    source_data, target_data = [], []
    for src, tgt in zip(source_sentences, target_sentences):
        if len(src.split()) <= MAX_SENTENCE_LENGTH and len(tgt.split()) <= MAX_SENTENCE_LENGTH:
            source_data.append(preprocess_sentence(src))
            target_data.append(preprocess_sentence(tgt))

    return source_data, target_data

In [22]:
# Load and preprocess the data
DATA_PARENT_DIR = os.path.join(__root__, DATA_DIR)

TRAIN_SOURCE_PATH = os.path.join(DATA_PARENT_DIR, DATA_FILES['train'][0])
TRAIN_TARGET_PATH = os.path.join(DATA_PARENT_DIR, DATA_FILES['train'][1])
source_sentences, target_sentences = load_data(
    TRAIN_SOURCE_PATH,
    TRAIN_TARGET_PATH,
    num_examples=NUM_EXAMPLES
)

# Tokenize the sentences
source_tensor, source_tokenizer = tokenize_sentences(source_sentences)
target_tensor, target_tokenizer = tokenize_sentences(target_sentences)

# Create training and validation sets
vocab_src_size = len(source_tokenizer.word_index) + 1 # Why +1? Because of padding token
vocab_tgt_size = len(target_tokenizer.word_index) + 1

# Create a tf.data dataset from the tensors and batch it
train_dataset = tf.data.Dataset.from_tensor_slices(tensors=(source_tensor, target_tensor))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [28]:
example_input_batch, example_target_batch = next(iter(train_dataset)) # type: ignore
print(f"Input batch shape: {example_input_batch.shape}, Target batch shape: {example_target_batch.shape}")

Input batch shape: (64, 52), Target batch shape: (64, 52)


## 3. Model Architecture (Seq2Seq with Attention)

### 3.1. Encoder

In [47]:
class Encoder(tf.keras.Model): # type: ignore
    def __init__(self, vocab_size: int, embedding_dim: int, hidden_units: int, batch_size: int):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.hidden_units = hidden_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) # type: ignore
        self.gru = tf.keras.layers.GRU(
            self.hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )

    def call(self, x: tf.Tensor, hidden: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        x = self.embedding(x)
        output_tuple = self.gru(x, initial_state=hidden)
        output = output_tuple[0]
        state = tf.stack(output_tuple[1:], axis=0)
        return output, state

    def initialize_hidden_state(self) -> tf.Tensor:
        return tf.zeros((self.batch_size, self.hidden_units))

encoder = Encoder(vocab_size=vocab_src_size, embedding_dim=EMBEDDING_DIM, hidden_units=HIDDEN_UNITS, batch_size=BATCH_SIZE)
sample_output, sample_hidden = encoder(example_input_batch, encoder.initialize_hidden_state())
print(f"Encoder output shape: {sample_output.shape}") # (batch_size, max_length, hidden_units)
print(f"Encoder hidden state shape: {sample_hidden.shape}") # (batch_size, hidden_units)

Encoder output shape: (64, 52, 512)
Encoder hidden state shape: (64, 512)


### 3.2. Bahdanau Attention

In [48]:
class BahdanauAttention(tf.keras.Model): # type: ignore
    def __init__(self, units: int):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units) # type: ignore
        self.W2 = tf.keras.layers.Dense(units) # type: ignore
        self.V = tf.keras.layers.Dense(1) # type: ignore

    def call(self, query: tf.Tensor, values: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        # query shape == (batch_size, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1) # (batch_size, 1, hidden size)

        # score shape == (batch_size, max_length, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values # type: ignore
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights # type: ignore

attention_layer = BahdanauAttention(units=HIDDEN_UNITS)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print(f"Attention result shape: {attention_result.shape}") # (batch_size, hidden_units)
print(f"Attention weights shape: {attention_weights.shape}") # (batch_size, max_length, 1)

Attention result shape: (64, 512)
Attention weights shape: (64, 52, 1)


### 3.3. Decoder

In [50]:
class Decoder(tf.keras.Model): # type: ignore
    def __init__(self, vocab_size: int, embedding_dim: int, hidden_units: int, batch_size: int):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.hidden_units = hidden_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) # type: ignore
        self.gru = tf.keras.layers.GRU( # type: ignore
            self.hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.fc = tf.keras.layers.Dense(vocab_size) # type: ignore

        # Used for attention
        self.attention = BahdanauAttention(self.hidden_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concat == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output_tuple = self.gru(x, initial_state=hidden)
        output = output_tuple[0]
        state = tf.stack(output_tuple[1:], axis=0)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab_size)
        x = self.fc(output)

        return x, state, attention_weights

decoder = Decoder(vocab_size=vocab_tgt_size, embedding_dim=EMBEDDING_DIM, hidden_units=HIDDEN_UNITS, batch_size=BATCH_SIZE)
sample_decoder_output, _, _ = decoder(
    tf.random.uniform((BATCH_SIZE, 1)),
    sample_hidden,
    sample_output
)
print(f"Decoder output shape: {sample_decoder_output.shape}") # (batch_size, vocab)

Decoder output shape: (64, 11252)


### 3.4. Training Setup

In [51]:
optimizer = tf.keras.optimizers.Adam() # type: ignore
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none') # type: ignore

def loss_function(real: tf.Tensor, pred: tf.Tensor) -> tf.Tensor:
    """
    Compute the loss between the real and predicted values.
    Args:
        real (tf.Tensor): The ground truth tensor.
        pred (tf.Tensor): The predicted tensor from the model.
    Returns:
        tf.Tensor: The computed loss value.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

@tf.function
def train_step(
    source: tf.Tensor,
    target: tf.Tensor,
    enc_hidden: tf.Tensor
) -> tf.Tensor:
    """
    Perform a single training step.
    Args:
        source (tf.Tensor): Source language input tensor.
        target (tf.Tensor): Target language input tensor.
        enc_hidden (tf.Tensor): Initial hidden state for the encoder.
        encoder (Encoder): The encoder model.
        decoder (Decoder): The decoder model.
        target_tokenizer (tf.keras.preprocessing.text.Tokenizer): Tokenizer for the target language.
    Returns:
        tf.Tensor: The loss value for the training step.
    """
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(source, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims(
            input=[target_tokenizer.word_index['<start>']] * BATCH_SIZE, # type: ignore
            axis=1
        )

        # Teacher forcing
        for t in range(1, target.shape[1]): # type: ignore
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(target[:, t], predictions) # type: ignore
            dec_input = tf.expand_dims(target[:, t], 1) # type: ignore

    batch_loss = loss / int(target.shape[1]) # type: ignore
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables)) # type: ignore

    return batch_loss # type: ignore


def train():
    """
    Train the Seq2Seq model with attention.
    """
    encoder = Encoder(vocab_size=vocab_src_size, embedding_dim=EMBEDDING_DIM, hidden_units=HIDDEN_UNITS, batch_size=BATCH_SIZE)
    decoder = Decoder(vocab_size=vocab_tgt_size, embedding_dim=EMBEDDING_DIM, hidden_units=HIDDEN_UNITS, batch_size=BATCH_SIZE)

    checkpoint_prefix = os.path.join(CHECKPOINT_DIR, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

    STEP_PER_EPOCH = len(source_tensor) // BATCH_SIZE

    # Training loop
    print("Starting training...")
    for epoch in range(EPOCHS):
        start = time.time()
        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for batch, (source, target) in enumerate(train_dataset.take(STEP_PER_EPOCH)):
            batch_loss = train_step(source, target, enc_hidden)
            total_loss += batch_loss # type: ignore

        if batch % 100 == 0: # type: ignore
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}') # type: ignore

        # Save checkpoint every 1 epochs
        checkpoint.save(file_prefix=checkpoint_prefix)

        print(f'Epoch {epoch+1} Loss {total_loss / STEP_PER_EPOCH:.4f}') # type: ignore
        print(f'Time taken for 1 epoch {time.time() - start:.2f} sec\n')

    print("Training complete.")

# Start training the model
train()

Starting training...


OperatorNotAllowedInGraphError: in user code:

    File "/tmp/ipython-input-559456271.py", line 41, in train_step  *
        enc_output, enc_hidden = encoder(source, enc_hidden)
    File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/ipython-input-1804359599.py", line 16, in call
        output_tuple = self.gru(x, initial_state=hidden)

    OperatorNotAllowedInGraphError: Exception encountered when calling GRU.call().
    
    [1mIterating over a symbolic `tf.Tensor` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/autograph/g3doc/reference/limitations.md#access-to-source-code for more information.[0m
    
    Arguments received by GRU.call():
      • sequences=tf.Tensor(shape=(64, 52, 512), dtype=float32)
      • initial_state=tf.Tensor(shape=(64, 512), dtype=float32)
      • mask=None
      • training=False
