In [None]:
import tensorflow as tf
from pathlib import Path
import numpy as np
from tensorflow.keras.layers import Concatenate
from positional_encoding_layer import PositionalEncodingLayer
from transformer_model import TransformerBuilder
from tensorflow.keras.callbacks import ModelCheckpoint
import csv
import pandas as pd

In [None]:
def load_sentences(file_path):
    """Read a UTF-8 text file and return its lines, stripped of surrounding whitespace.

    Args:
        file_path: Path of the text file to read (one sentence per line).

    Returns:
        A list with one string per line of the file, in file order.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]


# Parallel corpus #1 (CSV): https://huggingface.co/datasets/ncduy/mt-en-vi
# dtype=str + keep_default_na=False keep every cell as text. With the pandas defaults,
# cells such as "null", "NA" or "N/A" (and empty cells) are parsed as float NaN, which
# would put non-string values into the sentence pairs handed to TensorFlow later on.
df = pd.read_csv("/home/giang/data/eng_vi/train.csv", dtype=str, keep_default_na=False)

# Extract English and Vietnamese columns
english_sentences_csv = df['en'].tolist()
vietnamese_sentences_cvs = df['vi'].tolist()

# Build the (english, vietnamese) pairs of the CSV corpus
pairs_csv = list(zip(english_sentences_csv, vietnamese_sentences_cvs))

# Parallel corpus #2 (plain text, one sentence per line):
# https://www.kaggle.com/datasets/tuannguyenvananh/iwslt15-englishvietnamese
english_file_path = "/home/giang/data/eng_vi/train.en.txt"
english_sentences = load_sentences(english_file_path)

vietnamese_file_path = "/home/giang/data/eng_vi/train.vi.txt"
vietnamese_sentences = load_sentences(vietnamese_file_path)

# zip() stops at the shorter input, so files of different length would be paired
# up silently even though they cannot be line-aligned. Fail loudly instead.
if len(english_sentences) != len(vietnamese_sentences):
    raise ValueError(
        "Parallel text files are not aligned: "
        f"{len(english_sentences)} English lines vs "
        f"{len(vietnamese_sentences)} Vietnamese lines"
    )

pairs_txt = list(zip(english_sentences, vietnamese_sentences))

# Full training corpus: CSV pairs followed by text-file pairs.
pairs = pairs_csv + pairs_txt

In [None]:
# Sanity check: show the first ten pairs as string tensors, two pairs per batch.
dataset_test = tf.data.Dataset.from_tensor_slices(pairs[:10])
dataset_test = dataset_test.batch(2)
for i in dataset_test:
    print(i, "/////////////////", sep="\n")

In [None]:
# Shuffle in place with a fixed seed so the ordering -- and therefore the
# train/validation split that is sliced from `pairs` later -- is the same on every
# fresh run of the notebook.
SHUFFLE_SEED = 42
np.random.default_rng(SHUFFLE_SEED).shuffle(pairs)

# Unzip the shuffled pairs into two parallel tuples of sentences.
sentences_en, sentences_vn = zip(*pairs)
print(len(pairs))


In [None]:
# Spot-check the first three shuffled sentence pairs.
for i in range(3):
    print(f"{sentences_en[i]} => {sentences_vn[i]}")

In [None]:
def generate_dataset_output(sentences_en, sentences_vn):
    """Build batched tf.data datasets from the two sentence collections.

    Both datasets are batched in groups of 100000 sentences. Every Vietnamese
    sentence is additionally wrapped as "startofseq <sentence> endofseq".

    Args:
        sentences_en: Iterable of English sentences.
        sentences_vn: Iterable of Vietnamese sentences.

    Returns:
        A tuple (english_dataset, vietnamese_dataset).
    """
    def _wrap_with_markers(batch):
        # Element-wise string concatenation over the whole batch tensor.
        return "startofseq " + batch + " endofseq"

    english_ds = tf.data.Dataset.from_tensor_slices(list(sentences_en)).batch(100000)
    vietnamese_ds = (
        tf.data.Dataset.from_tensor_slices(list(sentences_vn))
        .batch(100000)
        .map(_wrap_with_markers)
    )
    return (english_ds, vietnamese_ds)


In [None]:
# Batched English / marker-wrapped Vietnamese datasets built from the shuffled corpus.
sentence_eng_dataset, sentence_vn_dataset = generate_dataset_output(
    sentences_en,
    sentences_vn,
)

In [None]:
# Vocabulary cap and fixed output sequence length shared by both vectorizers.
vocab_size = 10000
max_length = 500

# The English and Vietnamese vectorizers are configured identically.
vectorizer_kwargs = dict(max_tokens=vocab_size, output_sequence_length=max_length)
text_vec_layer_en = tf.keras.layers.TextVectorization(**vectorizer_kwargs)
text_vec_layer_vn = tf.keras.layers.TextVectorization(**vectorizer_kwargs)

# Fit each vocabulary on the corpus. The Vietnamese side is adapted on sentences
# wrapped in the start/end markers so both markers become vocabulary entries.
text_vec_layer_en.adapt(sentences_en)
vn_with_markers = [f"startofseq {s} endofseq" for s in sentences_vn]
text_vec_layer_vn.adapt(vn_with_markers)

In [None]:
# Dump the Vietnamese vocabulary, one token per line, in the order returned by
# get_vocabulary(). The encoding is given explicitly: the tokens contain non-ASCII
# Vietnamese characters, and open() would otherwise use the platform's locale
# encoding (the corpus itself is read as UTF-8 by load_sentences).
vocab = text_vec_layer_vn.get_vocabulary()
with open("vectorizer_vocab.txt", "w", encoding="utf-8") as f:
    for word in vocab:
        f.write(word + "\n")


In [None]:
# Wrap the fitted Vietnamese vectorizer in a one-layer model so it can be stored
# as a .keras file.
vectorizer_model = tf.keras.Sequential()
vectorizer_model.add(text_vec_layer_vn)
vectorizer_model.save("text_vectorizer.keras")

In [None]:
# Peek at the first 20 entries of each vocabulary (English first, then Vietnamese).
for vec_layer in (text_vec_layer_en, text_vec_layer_vn):
    print(vec_layer.get_vocabulary()[:20])

In [None]:
def transform_function(pair):
    """Turn one (english, vietnamese) string pair into a training example.

    Args:
        pair: Indexable pair whose item 0 is the English sentence and item 1 the
            Vietnamese sentence.

    Returns:
        ((english, "startofseq " + vietnamese), target_ids), where target_ids is
        text_vec_layer_vn applied to vietnamese + " endofseq".
    """
    source_text, target_text = pair[0], pair[1]
    target_ids = text_vec_layer_vn(target_text + " endofseq")
    decoder_text = "startofseq " + target_text
    return ((source_text, decoder_text), target_ids)
def generate_valid_train_dataset(train_fraction=0.9, batch_size=70):
    """Split the global `pairs` list into batched training and validation datasets.

    The first `train_fraction` of `pairs` becomes the training set and the rest the
    validation set. Each pair is converted with `transform_function`.

    Args:
        train_fraction: Share of `pairs` used for training; must lie strictly
            between 0 and 1. Defaults to 0.9.
        batch_size: Number of examples per batch. Defaults to 70.

    Returns:
        A tuple (train_dataset, valid_dataset) of tf.data.Dataset objects.

    Raises:
        ValueError: If `train_fraction` is not strictly between 0 and 1.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError(f"train_fraction must be in (0, 1), got {train_fraction!r}")

    train_data_size = int(len(pairs) * train_fraction)

    def _build(examples):
        # Vectorize examples in parallel and prefetch batches so input preparation
        # overlaps with training; element order is unchanged.
        return (
            tf.data.Dataset.from_tensor_slices(examples)
            .map(transform_function, num_parallel_calls=tf.data.AUTOTUNE)
            .batch(batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

    train_data_dataset = _build(pairs[:train_data_size])
    valid_data_dataset = _build(pairs[train_data_size:])
    return (train_data_dataset, valid_data_dataset)


In [None]:
# Materialize the training and validation input pipelines.
train_data_dataset, valid_data_dataset = generate_valid_train_dataset()

In [None]:
# Inspect the first training batch only.
for batch in train_data_dataset.take(1):
    print(batch)

In [None]:
# Scalar string inputs: one raw sentence per example for the encoder and the decoder.
encoder_inputs = tf.keras.Input(shape=(), dtype=tf.string)
decoder_inputs = tf.keras.Input(shape=(), dtype=tf.string)

In [None]:
# Width of the token embeddings on both the encoder and the decoder side.
embed_size = 128

# Raw strings -> integer token ids, using the vectorizers fitted earlier.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_vn(decoder_inputs)

# Separate embedding tables per language; mask_zero=True marks id 0 as masked.
encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)

# Token ids -> embedding vectors.
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [None]:
# A single PositionalEncodingLayer instance, constructed with (max_length, embed_size),
# is applied to both the encoder and the decoder embeddings.
# NOTE(review): the layer is defined in positional_encoding_layer.py, which is not
# shown here -- what it computes, and whether it holds weights that end up shared
# between encoder and decoder, should be confirmed there.
pos_embed_layer = PositionalEncodingLayer(max_length, embed_size)
encoder_in = pos_embed_layer(encoder_embeddings)
decoder_in = pos_embed_layer(decoder_embeddings)

In [None]:
# Build the transformer body from the position-encoded encoder and decoder inputs.
# NOTE(review): TransformerBuilder comes from transformer_model.py (not shown); its
# architecture, default hyper-parameters and mask handling must be checked there.
transformer_builder = TransformerBuilder()
transformer_output = transformer_builder.build(encoder_in=encoder_in, decoder_in=decoder_in)
# Output head: a Dense layer with vocab_size units and softmax activation applied to
# the transformer output.
Y_proba = tf.keras.layers.Dense(vocab_size, activation="softmax")(transformer_output)

In [None]:
# End-to-end model: two raw-string inputs in, the Dense softmax output (Y_proba) out.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)

# Targets are integer token ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Save a weights-only checkpoint for every epoch (save_best_only is off); the file
# name embeds the epoch number and the validation loss.
checkpoint_callback = ModelCheckpoint(
    "checkpoints/march_19_test2/model_{epoch:02d}_{val_loss:.2f}.weights.h5",  # Use .h5
    mode="min",
    save_best_only=False,
    save_weights_only=True,
    verbose=1,
)

In [None]:
# Train for two epochs, validating after each one and checkpointing through the callback.
history = model.fit(
    x=train_data_dataset,
    validation_data=valid_data_dataset,
    epochs=2,
    callbacks=[checkpoint_callback],
)

In [None]:
# Nothing else in this notebook writes 'saved_model.keras' (only per-epoch weight
# checkpoints and the vectorizer are saved), so on a fresh run the load below would
# fail. Save the in-memory model first when the file is missing; an existing file
# is left untouched and loaded as before.
saved_model_path = Path('saved_model.keras')
if not saved_model_path.exists():
    model.save(str(saved_model_path))

# PositionalEncodingLayer is a project-local layer, so it is handed to the loader
# explicitly.
# NOTE(review): any custom layers created inside TransformerBuilder may need to be
# added to custom_objects as well -- confirm against transformer_model.py.
loaded_model = tf.keras.models.load_model(
    str(saved_model_path),
    custom_objects={"PositionalEncodingLayer": PositionalEncodingLayer},
)

In [None]:
def translate(sentence_en, max_steps=50):
    """Greedily translate an English sentence with the global `loaded_model`.

    At each step the model is run on the sentence plus the translation produced so
    far, and the most probable next word is appended. Decoding stops when the model
    predicts "endofseq" or after `max_steps` words.

    Args:
        sentence_en: English sentence to translate.
        max_steps: Maximum number of words to generate. Defaults to 50.

    Returns:
        The generated translation as a single space-separated string.
    """
    # Loop-invariant work: the vocabulary list and the encoder input do not change
    # between decoding steps, so build them once instead of on every iteration.
    vocabulary = text_vec_layer_vn.get_vocabulary()
    X = np.array([sentence_en])  # encoder input

    translation = ""
    for word_idx in range(max_steps):
        X_dec = np.array(["startofseq " + translation])  # decoder input
        # Probabilities for the position that follows the words generated so far.
        y_proba = loaded_model((X, X_dec))[0, word_idx]
        predicted_word = vocabulary[int(np.argmax(y_proba))]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [None]:

def translate_with_beam_search(sentence_en, beam_width=3, max_steps=20):
    """Translate an English sentence with beam search over the global `model`.

    Hypotheses are scored by the sum of the log-probabilities of their words, i.e.
    the log of the sequence probability. Summing raw probabilities instead would let
    every unfinished hypothesis keep gaining score and overtake finished ones.

    Args:
        sentence_en: English sentence to translate.
        beam_width: Number of hypotheses kept after each step. Defaults to 3.
        max_steps: Maximum number of words to generate. Defaults to 20.

    Returns:
        The best hypothesis with the "startofseq"/"endofseq" markers removed.

    Raises:
        ValueError: If `beam_width` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError(f"beam_width must be >= 1, got {beam_width!r}")

    # Loop-invariant work, hoisted out of the decoding loops.
    vocabulary = text_vec_layer_vn.get_vocabulary()
    X = np.array([sentence_en])  # encoder input

    # Each entry is (decoder text so far, accumulated log-probability).
    beam = [("startofseq", 0.0)]
    for word_idx in range(max_steps):
        # Nothing left to expand once every hypothesis has produced "endofseq".
        if all(text.endswith("endofseq") for text, _ in beam):
            break

        new_beam = []
        for translation, score in beam:
            if translation.endswith("endofseq"):
                # Finished hypotheses are carried over unchanged.
                new_beam.append((translation, score))
                continue

            X_dec = np.array([translation])  # decoder input
            # Unfinished hypotheses all hold word_idx generated words, so the next
            # word is read from output position word_idx.
            y_proba = np.asarray(model((X, X_dec))[0, word_idx])

            # Indices of the beam_width most probable words; the stable sort puts
            # the lower index first on ties.
            top_k_indices = np.argsort(-y_proba, kind="stable")[:beam_width]
            for word_id in top_k_indices:
                # Clamp to avoid log(0) for words with zero probability.
                log_proba = float(np.log(max(float(y_proba[word_id]), 1e-12)))
                new_translation = f"{translation} {vocabulary[word_id]}"
                new_beam.append((new_translation, score + log_proba))

        beam = sorted(new_beam, key=lambda item: item[1], reverse=True)[:beam_width]

    best_translation = beam[0][0]
    return best_translation.replace("startofseq", "").replace("endofseq", "").strip()

In [None]:
# Smoke-test the greedy decoder on a short sentence.
demo_translation = translate("I need a fork")
print(demo_translation)