In [None]:
from datasets import load_dataset

# Load the 'SEACrowd/indo_general_mt_en_id' parallel corpus from the Hugging Face Hub.
# trust_remote_code=True lets the dataset's own loading script run locally.
dset = load_dataset("SEACrowd/indo_general_mt_en_id", trust_remote_code=True)


In [None]:
# Pull the three predefined splits out of the loaded dataset in one step.
train_data, val_data, test_data = (
    dset[split_name] for split_name in ('train', 'validation', 'test')
)

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, SimpleRNN, Embedding, Dense, Attention
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers
import random
# Draw a reproducible random subset of the training split.
# A dedicated, seeded generator makes "Restart & Run All" pick the same rows
# every time without touching the global `random` state.
TRAIN_SUBSET_SIZE = 10000
SAMPLING_SEED = 42

# Never request more rows than the split holds (random.sample would raise).
subset_size = min(TRAIN_SUBSET_SIZE, len(train_data))
random_indices = random.Random(SAMPLING_SEED).sample(range(len(train_data)), subset_size)

# Keep only the sampled rows
train_data = train_data.select(random_indices)

# Extract source ('src') and target ('tgt') texts from each split.
# NOTE(review): which language is 'src' and which is 'tgt' is not visible in
# this notebook -- confirm against the dataset card.
train_source_texts = [example['src'] for example in train_data]
train_target_texts = [example['tgt'] for example in train_data]

val_source_texts = [example['src'] for example in val_data]
val_target_texts = [example['tgt'] for example in val_data]

test_source_texts = [example['src'] for example in test_data]
test_target_texts = [example['tgt'] for example in test_data]

In [None]:
# Wrap every target sentence in start/end markers so the decoder can learn
# where a translation begins and ends. Applied identically to all three splits.
train_target_texts = [
    "<SOS> " + sentence + " <EOS>"
    for sentence in train_target_texts
]
val_target_texts = [
    "<SOS> " + sentence + " <EOS>"
    for sentence in val_target_texts
]
test_target_texts = [
    "<SOS> " + sentence + " <EOS>"
    for sentence in test_target_texts
]


In [None]:
# Preview a few wrapped target sentences instead of dumping the whole list.
train_target_texts[:5]

In [None]:
# Tokenization (Source and Target)
# Each vocabulary is built from the training split only.
source_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts(train_source_texts)

target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(train_target_texts)

# One extra slot on top of the learned words: index 0 is what padding uses.
source_vocab_size = 1 + len(source_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

# Convert texts to integer sequences, split by split.
train_source_sequences, val_source_sequences, test_source_sequences = (
    source_tokenizer.texts_to_sequences(texts)
    for texts in (train_source_texts, val_source_texts, test_source_texts)
)
train_target_sequences, val_target_sequences, test_target_sequences = (
    target_tokenizer.texts_to_sequences(texts)
    for texts in (train_target_texts, val_target_texts, test_target_texts)
)

In [None]:
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

# Define maximum sequence lengths (modify as needed)
# max_source_length = max(len(seq) for seq in train_source_sequences)
# max_target_length = max(len(seq) for seq in train_target_sequences)
max_source_length = 60
max_target_length = 60

# Pad sequences to ensure uniform length.
# pad_sequences truncates from the FRONT by default, which would cut the
# leading start-of-sequence marker off every over-long target. Targets are
# therefore truncated from the back so each one still begins with the token
# that decoding is seeded with. Source sequences keep the default truncation.
train_source_sequences = pad_sequences(train_source_sequences, maxlen=max_source_length, padding='post')
train_target_sequences = pad_sequences(
    train_target_sequences, maxlen=max_target_length, padding='post', truncating='post'
)

val_source_sequences = pad_sequences(val_source_sequences, maxlen=max_source_length, padding='post')
val_target_sequences = pad_sequences(
    val_target_sequences, maxlen=max_target_length, padding='post', truncating='post'
)

test_source_sequences = pad_sequences(test_source_sequences, maxlen=max_source_length, padding='post')
test_target_sequences = pad_sequences(
    test_target_sequences, maxlen=max_target_length, padding='post', truncating='post'
)

# Create TensorFlow Datasets of (source, target) pairs
train_dataset = tf.data.Dataset.from_tensor_slices((train_source_sequences, train_target_sequences))
val_dataset = tf.data.Dataset.from_tensor_slices((val_source_sequences, val_target_sequences))
test_dataset = tf.data.Dataset.from_tensor_slices((test_source_sequences, test_target_sequences))

# Batch all splits; only the training split is shuffled.
# drop_remainder=True discards a final batch smaller than batch_size.
batch_size = 32
train_dataset = train_dataset.shuffle(buffer_size=1000).batch(batch_size, drop_remainder=True)
val_dataset = val_dataset.batch(batch_size, drop_remainder=True)
test_dataset = test_dataset.batch(batch_size, drop_remainder=True)

# Print one batch's shapes to verify the pipeline
for source, target in train_dataset.take(1):
    print('Source batch shape:', source.shape)
    print('Target batch shape:', target.shape)


In [None]:
#
# Encoder
#
# Encoder with LSTM
class Encoder(tf.keras.Model):
    """Embeds a batch of source token ids and summarizes it with one LSTM.

    Only the LSTM's final hidden and cell states are returned; they are meant
    to initialize the decoder.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        enc_units: Dimensionality of the LSTM state.
        batch_sz: Stored on the instance; not used by the layers themselves.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        # NOTE(review): the embedding is created without mask_zero, so trailing
        # padding ids are fed through the LSTM like any other token -- confirm
        # this is intended.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(
            self.enc_units,
            return_sequences=False,  # Set to True if you need the full sequence
            return_state=True,       # Also return the last hidden and cell states
            recurrent_initializer="glorot_uniform",
            unroll=True  # Forces TensorFlow to use non-cuDNN kernels
        )

    def call(self, x):
        """Encode `x` and return the LSTM's final `(hidden, cell)` states."""
        x = self.embedding(x)
        # The first return value (last output) duplicates the hidden state here.
        _, state_hidden, state_cell = self.lstm(x)
        return state_hidden, state_cell  # Return both hidden and cell states


In [None]:
#
# Decoder
#
# Decoder with LSTM
class Decoder(tf.keras.Model):
    """LSTM decoder that turns target token ids into per-step word probabilities.

    Args:
        vocab_size: Size of the embedding table and of the output distribution.
        embedding_dim: Size of each embedding vector.
        dec_units: Dimensionality of the LSTM state.
        batch_sz: Stored on the instance; not used by the layers themselves.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        lstm_options = dict(
            return_sequences=True,   # one output per time step
            return_state=True,       # also hand back the final hidden/cell states
            recurrent_initializer="glorot_uniform",
            unroll=True,             # Forces TensorFlow to use non-cuDNN kernels
        )
        self.lstm = tf.keras.layers.LSTM(self.dec_units, **lstm_options)
        # Despite the name this is a Dense layer with a softmax activation.
        self.softmax = tf.keras.layers.Dense(vocab_size, activation="softmax")

    def call(self, x, hidden, cell):
        """Run the decoder starting from the given LSTM states.

        Returns:
            A tuple `(probabilities, hidden_state, cell_state)`.
        """
        embedded = self.embedding(x)
        # Both states are needed to resume the LSTM where the caller left off.
        sequence_output, hidden_state, cell_state = self.lstm(
            inputs=embedded, initial_state=[hidden, cell]
        )
        return self.softmax(sequence_output), hidden_state, cell_state


In [None]:
# ========================================
# Create model
# ========================================

# Hyperparameters shared by the encoder and the decoder.
embedding_dim = 256
units = 1024  # LSTM dimensionality of the output space.

# batch_size is passed through but only stored on the model instances.
encoder = Encoder(source_vocab_size, embedding_dim, units, batch_size)
decoder = Decoder(target_vocab_size, embedding_dim, units, batch_size)

optimizer = tf.compat.v1.train.AdamOptimizer()
# reduction='none' keeps one loss value per token so that loss_function can
# zero out the padded positions before reducing.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')
# Running accuracy metrics; the training cell resets them at every epoch.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="val_accuracy")

def loss_function(real, pred):
    """Cross-entropy averaged over the non-padding target tokens only.

    Positions where `real` is 0 (the padding id) contribute nothing to the
    numerator or to the denominator, so the value is a true per-token average
    and does not shrink as more padding is added to a batch.

    Example:

    real = tf.Tensor(
    [[21  1 44  0  0]    (jump !    <eos> <pad> <pad>)
     [17  9 24  2 44]    (i    go   there .     <eos>)
     [27  1 44  0  0]    (no   !    <eos> <pad> <pad>)
     [21 22 32  2 44]],  (i    know you   .     <eos>)
    shape=(4, 5), dtype=int64)

    where <pad> = 0.

    mask = tf.Tensor(
    [[True  True  True  False False]
     [True  True  True  True  True ]
     [True  True  True  False False]
     [True  True  True  True  True ]],
    shape=(4, 5), dtype=bool)

    Args:
        real: Integer tensor of expected token ids, 0 meaning padding.
        pred: Predicted probabilities, as expected by `loss_object`.

    Returns:
        Scalar tensor; 0 when every position in `real` is padding.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # this masks '<pad>'

    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    # Divide by the number of real tokens, not by batch * time;
    # divide_no_nan yields 0 for an all-padding batch instead of NaN.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

In [None]:
import tensorflow as tf
# Only a cell's last expression is displayed, so show the TensorFlow version
# and the visible GPUs together as one tuple.
tf.__version__, tf.config.list_physical_devices('GPU')

In [None]:
# When True, the training cell restores the latest checkpoint (if any) before
# training and saves a new one after every epoch.
CHECKPOINT = False

In [None]:
# ========================================
# Training
# ========================================
import os 
import csv
from nltk.translate.bleu_score import sentence_bleu
# Checkpoint directory; its name encodes the run configuration.
checkpoint_path = "./checkpointsLLSTM/seq2seq-sample-{}-embedding-{}-hidden-{}".format(
    500, embedding_dim, units
)

# Checkpointing is opt-in via the CHECKPOINT flag.
if CHECKPOINT == True:
    ckpt = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)
    latest = ckpt_manager.latest_checkpoint
    # Resume from the most recent checkpoint when one exists.
    if latest:
        ckpt.restore(latest)
        print("Latest checkpoint restored!!")
# Function to calculate BLEU score
# NOTE(review): a later cell defines another `calculate_bleu` with a different
# signature; once that cell runs, this definition is shadowed.
def calculate_bleu(references, hypotheses):
    """
    Compute the average sentence-level BLEU score for a batch.

    `references`: iterable of tokenized reference sentences, one per hypothesis.
    `hypotheses`: iterable of tokenized hypothesis sentences.

    Returns the mean score over all (reference, hypothesis) pairs, or 0.0 when
    there are no pairs (instead of dividing by zero).
    """
    # sentence_bleu expects a list of references per hypothesis, hence [ref].
    scores = [sentence_bleu([ref], hyp) for ref, hyp in zip(references, hypotheses)]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)  # Average BLEU score for the batch

def append_loss_to_csv(epoch, train_loss, val_loss, time_taken, csv_path="training_lstm_results.csv"):
    """Append one epoch's losses to a CSV log and print a confirmation line.

    Args:
        epoch: Epoch number, written and printed exactly as given.
        train_loss: Training loss; a tensor-like object exposing `.numpy()`
            or a plain number.
        val_loss: Validation loss; same accepted types as `train_loss`.
        time_taken: Duration of the epoch, written as given.
        csv_path: Log file to append to. A header row is written first when
            the file does not exist yet or is empty.
    """
    def _to_scalar(value):
        # Tensors are unwrapped; plain numbers pass through untouched.
        return value.numpy() if hasattr(value, "numpy") else value

    train_value = _to_scalar(train_loss)
    val_value = _to_scalar(val_loss)

    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, mode="a", newline="") as file:
        writer = csv.writer(file)
        if needs_header:
            # Write header if the file is being created for the first time
            writer.writerow(["Epoch", "Train Loss", "Val Loss", "Time Taken"])
        # Append the new epoch's losses
        writer.writerow([epoch, train_value, val_value, time_taken])
    # Report the same epoch number that was written to the file.
    print(f"Epoch {epoch}:Val Loss: {val_value:.4f} - Saved to {csv_path}.")

@tf.function
def train(encoder, decoder, source_sentences, target_sentences, target_lang_tokenizer):
    """Run one teacher-forced optimization step on a batch.

    The decoder receives every target token except the last and is scored
    against the same sequence shifted one step to the left.

    Args:
        encoder: Model returning the `(hidden, cell)` states for the sources.
        decoder: Model called as `decoder(inputs, hidden, cell)`.
        source_sentences: Batch of padded source token ids.
        target_sentences: Batch of padded target token ids.
        target_lang_tokenizer: Unused; kept so existing callers keep working.

    Returns:
        The batch loss from `loss_function`. It is returned as-is, on the same
        scale as the value `validate` returns, so the two can be compared.
    """
    # Teacher forcing: inputs drop the last token, expected outputs the first.
    dec_input = target_sentences[:, :-1]
    expected_dec_output = target_sentences[:, 1:]

    with tf.GradientTape() as tape:
        h, c = encoder(source_sentences)
        predictions, _, _ = decoder(dec_input, h, c)
        loss = loss_function(expected_dec_output, predictions)

    # Metric bookkeeping needs no gradients, so it lives outside the tape.
    train_accuracy(expected_dec_output, predictions)

    # Only trainable weights receive gradients.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss


@tf.function
def validate(encoder, decoder, source_sentences, target_sentences, target_lang_tokenizer):
    """Score one batch without updating any weights.

    Uses the same teacher-forced setup as training, updates the `val_accuracy`
    metric and returns the batch loss from `loss_function`.
    `target_lang_tokenizer` is accepted but not used.
    """
    # Shift the targets by one step: inputs drop the last token, labels the first.
    decoder_inputs = target_sentences[:, :-1]
    decoder_targets = target_sentences[:, 1:]

    state_h, state_c = encoder(source_sentences)
    predictions = decoder(decoder_inputs, state_h, state_c)[0]

    batch_loss = loss_function(decoder_targets, predictions)
    val_accuracy(decoder_targets, predictions)
    return batch_loss


# Set n_epochs at least 20 when you do training.
n_epochs = 40

# Per-epoch histories; the plotting cell reads all four lists.
loss_history = []
accuracy_history = []
val_loss_history = []
val_accuracy_history = []

import time
for epoch in range(1, n_epochs + 1):
    start = time.time()

    total_loss = 0
    total_val_loss = 0
    # The accuracy metrics accumulate across calls, so clear them every epoch.
    train_accuracy.reset_state()
    val_accuracy.reset_state()

    # Training loop
    for (batch, (source_sentences, target_sentences)) in enumerate(train_dataset):
        batch_loss = train(encoder, decoder, source_sentences, target_sentences, target_tokenizer)
        total_loss += batch_loss

        if batch % 100 == 0:
            print("Epoch {} Batch {} Loss {:.4f} Accuracy: {:.4f}".format(epoch, batch, batch_loss.numpy(), train_accuracy.result()))

    # Store the average loss and accuracy for this epoch
    avg_loss = total_loss / (batch + 1)
    loss_history.append(avg_loss)
    accuracy_history.append(train_accuracy.result().numpy())

    # Validation loop
    for (val_source_sentences, val_target_sentences) in val_dataset:
        val_loss = validate(encoder, decoder, val_source_sentences, val_target_sentences, target_tokenizer)
        total_val_loss += val_loss

    # Store the average validation loss and accuracy for this epoch.
    # Without the accuracy append the validation-accuracy curve stays empty.
    avg_val_loss = total_val_loss / len(val_dataset)
    val_loss_history.append(avg_val_loss)
    val_accuracy_history.append(val_accuracy.result().numpy())

    # Optionally persist a checkpoint for this epoch
    if CHECKPOINT == True:
        ckpt_save_path = ckpt_manager.save()
        print("Saving checkpoint for epoch {} at {}".format(epoch, ckpt_save_path))
    time_taken = time.time() - start
    append_loss_to_csv(epoch, avg_loss, avg_val_loss, time_taken)
    print("Epoch {}/{} Loss {:.4f} Val Loss {:.4f}".format(epoch, n_epochs, avg_loss, avg_val_loss))
    print("Time taken for 1 epoch {:.4f} sec\n".format(time_taken))


In [None]:
import matplotlib.pyplot as plt
# Plotting Loss and Accuracy on two stacked panels
fig, (loss_ax, accuracy_ax) = plt.subplots(2, 1, figsize=(14, 10))

# Loss panel (top)
loss_ax.plot(loss_history, label='Training Loss', color='blue')
loss_ax.plot(val_loss_history, label='Validation Loss', color='orange')
loss_ax.set_title('Training and Validation Loss Over Epochs')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Accuracy panel (bottom)
accuracy_ax.plot(accuracy_history, label='Training Accuracy', color='green')
accuracy_ax.plot(val_accuracy_history, label='Validation Accuracy', color='red')
accuracy_ax.set_title('Training and Validation Accuracy Over Epochs')
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

fig.tight_layout()
plt.show()



In [None]:
# Preview the first entries of the target vocabulary instead of dumping the
# whole word -> index mapping into the cell output.
list(target_tokenizer.word_index.items())[:20]

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction, sentence_bleu

In [None]:
# ========================================
# Translation
# ========================================
import csv
def evaluate(sentence, encoder, decoder, source_lang_tokenizer, target_lang_tokenizer):
    """Greedy-decode a translation of `sentence`.

    The sentence is tokenized and padded to `max_source_length`, encoded, and
    then decoded one token at a time starting from the "sos" token, always
    picking the most probable next word.

    The markers are looked up as "sos"/"eos" rather than "<SOS>"/"<EOS>" --
    presumably because the tokenizer lower-cases text and strips the angle
    brackets; confirm against the tokenizer configuration.

    Args:
        sentence: Source-language text.
        encoder: Model returning the `(hidden, cell)` states for the input.
        decoder: Model called as `decoder(inputs, hidden, cell)`.
        source_lang_tokenizer: Tokenizer used to encode `sentence`.
        target_lang_tokenizer: Tokenizer providing `word_index`/`index_word`.

    Returns:
        The predicted words, each followed by a single space. Decoding stops
        after "eos" (which is included), after `max_target_length` steps, or
        when the model predicts an id with no word attached (such as the
        padding id 0), which previously raised KeyError.
    """
    token_ids = source_lang_tokenizer.texts_to_sequences([sentence])[0]
    inputs = pad_sequences([token_ids], maxlen=max_source_length, padding="post")
    inputs = tf.convert_to_tensor(inputs)

    h, c = encoder(inputs)
    dec_input = tf.expand_dims([target_lang_tokenizer.word_index["sos"]], 0)

    words = []
    for _ in range(max_target_length):
        #
        # Greedy Search
        #
        predictions, h, c = decoder(dec_input, h, c)
        predicted_id = int(tf.argmax(predictions[0][0]).numpy())

        word = target_lang_tokenizer.index_word.get(predicted_id)
        if word is None:
            # No vocabulary entry for this id: treat it as end of output.
            break

        words.append(word)
        if word == "eos":
            break

        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return "".join(word + " " for word in words)


def translate(sentence, encoder, decoder, source_lang_tokenizer, target_lang_tokenizer):
    """Translate `sentence` via `evaluate` and capitalize the decoded text.

    `str.capitalize` upper-cases the first character and lower-cases the rest.
    """
    decoded_text = evaluate(sentence, encoder, decoder, source_lang_tokenizer, target_lang_tokenizer)
    capitalized = decoded_text.capitalize()
    return capitalized

# NOTE(review): the triple-quoted block below is an inert string literal, not
# executed code. It holds an old debugging snippet that refers to names
# (`source_lang_tokenizer`, `sys`) not defined in the cells visible here.
"""
# for debug:
#sentence = "Su voz suena muy bello."
#sentence = "No nos gusta la lluvia."
sentence = "Nos gusta la lluvia."
result = translate(sentence, encoder, decoder, source_lang_tokenizer, target_lang_tokenizer)
print("Input    : {}".format(sentence))
print("Predicted: {}".format(result))

sys.exit()
"""
#
#
# Helper: turn a sequence of token ids back into space-separated text
def detokenize(sequence, tokenizer):
    """Map token ids to words via `tokenizer.index_word` and join with spaces.

    Ids equal to 0 are skipped; ids missing from the vocabulary contribute an
    empty string.
    """
    vocabulary = tokenizer.index_word
    words = []
    for token_id in sequence:
        if token_id == 0:
            continue
        words.append(vocabulary.get(token_id, ''))
    return ' '.join(words)

def calculate_bleu(reference, hypothesis):
    """Sentence-level BLEU (smoothing method 1) of `hypothesis` against `reference`.

    `sentence_bleu` expects a list of tokenized references and a tokenized
    hypothesis. Passing raw strings makes it score individual characters, so
    string inputs are lower-cased and split on whitespace first. Inputs that
    are not strings are assumed to be in the form `sentence_bleu` expects and
    are passed through unchanged.

    Args:
        reference: Reference sentence as a string, or a ready-made list of
            tokenized references.
        hypothesis: Hypothesis sentence as a string, or a list of tokens.

    Returns:
        The BLEU score as a float.
    """
    if isinstance(reference, str):
        # A single reference, wrapped in the list-of-references form.
        references = [reference.lower().split()]
    else:
        references = reference

    if isinstance(hypothesis, str):
        hypothesis_tokens = hypothesis.lower().split()
    else:
        hypothesis_tokens = hypothesis

    bleu_score = sentence_bleu(
        references, hypothesis_tokens, smoothing_function=SmoothingFunction().method1
    )
    return bleu_score
keys = np.arange(len(test_source_sequences))
bleu_scores = []

# Sequence markers as they appear after tokenization. They are bookkeeping,
# not translation content, so they are removed before scoring. A genuine word
# spelled like a marker would be removed as well.
SEQUENCE_MARKERS = {"sos", "eos"}

# Open the predictions file once for the whole run instead of once per sentence.
with open('predictions_lstm_test.csv', mode='a', newline='') as file:
    writer = csv.writer(file)
    for i in range(len(keys)):
        print("===== [{}] ======".format(i + 1))
        sentence = detokenize(test_source_sequences[i], source_tokenizer)
        result = translate(sentence, encoder, decoder, source_tokenizer, target_tokenizer)
        print("Input    : {}".format(sentence))
        print("Predicted: {}".format(result))
        correct_sentence = detokenize(test_target_sequences[i], target_tokenizer)
        print("Correct  : {}".format(correct_sentence))

        # Score the sentences without their start/end markers
        reference_text = " ".join(
            word for word in correct_sentence.split() if word.lower() not in SEQUENCE_MARKERS
        )
        hypothesis_text = " ".join(
            word for word in result.split() if word.lower() not in SEQUENCE_MARKERS
        )

        # Calculate and store the BLEU score
        bleu_score = calculate_bleu(reference_text, hypothesis_text)
        bleu_scores.append(bleu_score)
        print("BLEU Score: {:.4f}".format(bleu_score))

        # Save the raw sentences and the score to the CSV file
        writer.writerow([i + 1, sentence, result, correct_sentence, bleu_score])

# Calculate the average BLEU score for the sample
average_bleu_score = np.mean(bleu_scores)
print("Average BLEU Score for the sample: {:.4f}".format(average_bleu_score))
# encoder.summary()
# decoder.summary()
# decoder.get_config()