In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Tạo thư mục làm việc
import os
base_path = '/content/drive/MyDrive/NMT_Project'
if not os.path.exists(base_path):
    os.makedirs(base_path)
%cd {base_path}


Mounted at /content/drive
/content/drive/MyDrive/NMT_Project


In [2]:
!pip install pyvi
!pip install subword-nmt
!pip install tensorflow-text
!pip install gradio

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.11 pyvi-0.1.1 sklearn-crfsuite-0.5.0
Collectin

In [3]:
from pyvi import ViTokenizer

def preprocess_data(en_path, vi_path, limit=None):
    input_texts = []
    target_texts = []

    with open(en_path, 'r', encoding='utf-8') as f_en, \
         open(vi_path, 'r', encoding='utf-8') as f_vi:

        lines_en = f_en.readlines()
        lines_vi = f_vi.readlines()
        if limit:
            lines_en = lines_en[:limit]
            lines_vi = lines_vi[:limit]

        for en, vi in zip(lines_en, lines_vi):
            en_clean = en.strip().lower()

            vi_tokenized = ViTokenizer.tokenize(vi.strip())
            vi_clean = f"<start> {vi_tokenized} <end>"

            input_texts.append(en_clean)
            target_texts.append(vi_clean)

    return input_texts, target_texts
input_docs, target_docs = preprocess_data('train.en.txt', 'train.vi.txt')


print(f"Số lượng câu huấn luyện: {len(input_docs)}")
print(f"Ví dụ Input (EN): {input_docs[0]}")
print(f"Ví dụ Target (VI): {target_docs[0]}")

Số lượng câu huấn luyện: 133317
Ví dụ Input (EN): rachel pike : the science behind a climate headline
Ví dụ Target (VI): <start> Khoa_học đằng sau một tiêu_đề về khí_hậu <end>


In [4]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import numpy as np

try:
    tf.compat.v1.enable_eager_execution()
except:
    pass

BATCH_SIZE = 64
VOCAB_SIZE = 10000
SEQUENCE_LENGTH = 40
EMBEDDING_DIM = 128
LATENT_DIM = 256

print("Đang chuyển đổi dữ liệu sang NumPy...")
input_arr = np.array(input_docs, dtype=object) # dtype=object để chứa chuỗi
target_arr = np.array(target_docs, dtype=object)

#vector hóa cho Tiếng Anh
source_vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)
print("Đang học từ vựng tiếng Anh (Adapt)...")
source_vectorization.adapt(input_arr)

#vector hóa cho Tiếng Việt
target_vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH + 1,
)
print("Đang học từ vựng tiếng Việt (Adapt)...")
target_vectorization.adapt(target_arr)

print("Đã tạo xong bộ từ điển thành công!")

#Chuẩn bị dữ liệu train format TF (Pipeline)
def format_dataset(eng, vi):
    eng = source_vectorization(eng)
    vi = target_vectorization(vi)
    return ({
        "encoder_inputs": eng,
        "decoder_inputs": vi[:, :-1],
    }, vi[:, 1:])

def make_dataset(input_texts, target_texts):
    dataset = tf.data.Dataset.from_tensor_slices((input_texts, target_texts))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16).cache()

print("Đang tạo luồng dữ liệu training...")
train_ds = make_dataset(input_docs, target_docs)
print("Sẵn sàng để Train!")

Đang chuyển đổi dữ liệu sang NumPy...
Đang học từ vựng tiếng Anh (Adapt)...
Đang học từ vựng tiếng Việt (Adapt)...
Đã tạo xong bộ từ điển thành công!
Đang tạo luồng dữ liệu training...
Sẵn sàng để Train!


In [11]:
from tensorflow.keras.layers import Dense

class Decoder(Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, enc_units):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(dec_units, return_sequences=True, return_state=True)
        self.attention = Attention()
        # Projection để enc_output -> dec_units
        self.enc_proj = Dense(dec_units)
        self.fc = Dense(vocab_size)

    def call(self, x, enc_output, state_h, state_c):
        x = self.embedding(x)
        enc_proj = self.enc_proj(enc_output)  # shape = dec_units
        # Attention
        context_vector = self.attention([x, enc_proj])
        x = tf.concat([context_vector, x], axis=-1)
        output, h, c = self.lstm(x, initial_state=[state_h, state_c])
        x = self.fc(output)
        return x, h, c


In [13]:
encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, LATENT_DIM)
decoder = Decoder(VOCAB_SIZE, EMBEDDING_DIM, LATENT_DIM, LATENT_DIM)


# Loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


In [14]:
@tf.function
def train_step(batch_input, batch_target):
    with tf.GradientTape() as tape:
        enc_output, enc_h, enc_c = encoder(batch_input['encoder_inputs'])
        dec_input = batch_input['decoder_inputs']
        dec_target = batch_target

        predictions, _, _ = decoder(dec_input, enc_output, enc_h, enc_c)
        loss = loss_function(dec_target, predictions)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss


In [15]:
EPOCHS = 5  # thử trước để test
for epoch in range(EPOCHS):
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(train_ds):
        batch_loss = train_step(inp, targ)
        total_loss += batch_loss
        if batch % 50 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    print(f'Epoch {epoch+1} Loss {total_loss/ (batch+1):.4f}')


ValueError: in user code:

    File "/tmp/ipython-input-2633100946.py", line 8, in train_step  *
        predictions, _, _ = decoder(dec_input, enc_output, enc_h, enc_c)
    File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/ipython-input-1885996051.py", line 17, in call
        context_vector = self.attention([x, enc_proj])

    ValueError: Exception encountered when calling Attention.call().
    
    [1mDimensions must be equal, but are 128 and 256 for '{{node decoder_1_1/attention_1_1/MatMul}} = BatchMatMulV2[T=DT_FLOAT, adj_x=false, adj_y=false, grad_x=false, grad_y=false](decoder_1_1/embedding_8_1/GatherV2, decoder_1_1/attention_1_1/transpose)' with input shapes: [64,40,128], [64,256,40].[0m
    
    Arguments received by Attention.call():
      • inputs=['tf.Tensor(shape=(64, 40, 128), dtype=float32)', 'tf.Tensor(shape=(64, 40, 256), dtype=float32)']
      • mask=['None', 'None']
      • training=False
      • return_attention_scores=False
      • use_causal_mask=False


In [18]:

from tensorflow.keras import layers, Model, Input

VOCAB_SIZE = 10000
SEQUENCE_LENGTH = 40
EMBEDDING_DIM = 128
LATENT_DIM = 256

#  Encoder
encoder_inputs = Input(shape=(SEQUENCE_LENGTH,), dtype="int64", name="encoder_inputs")


x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = layers.LSTM(
    LATENT_DIM,
    return_state=True,
    recurrent_dropout=0.1
)(x)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(SEQUENCE_LENGTH,), dtype="int64", name="decoder_inputs")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(decoder_inputs)

decoder_lstm = layers.LSTM(
    LATENT_DIM,
    return_sequences=True,
    return_state=True,
    recurrent_dropout=0.1
)
decoder_outputs, _, _ = decoder_lstm(x, initial_state=encoder_states)

# Output Layer
decoder_dense = layers.Dense(VOCAB_SIZE, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Compile Model
model_baseline = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Dùng Adam optimizer
model_baseline.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

model_baseline.summary()

print(" Bắt đầu Training")
# Training
history = model_baseline.fit(train_ds, epochs=10)

model_baseline.save('model_baseline.keras')
print("Đã lưu model thành công!")

 Bắt đầu Training
Epoch 1/10
[1m2084/2084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m593s[0m 282ms/step - accuracy: 0.0996 - loss: 5.9469
Epoch 2/10
[1m2084/2084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m586s[0m 281ms/step - accuracy: 0.1123 - loss: 4.8322
Epoch 3/10
[1m2084/2084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m580s[0m 278ms/step - accuracy: 0.1332 - loss: 4.3823
Epoch 4/10
[1m2084/2084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m578s[0m 278ms/step - accuracy: 0.1452 - loss: 4.0762
Epoch 5/10
[1m2084/2084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m578s[0m 277ms/step - accuracy: 0.1533 - loss: 3.8579
Epoch 6/10
[1m2084/2084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m616s[0m 274ms/step - accuracy: 0.1602 - loss: 3.6874
Epoch 7/10
[1m2084/2084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m566s[0m 271ms/step - accuracy: 0.1657 - loss: 3.5489
Epoch 8/10
[1m2084/2084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 269ms/step - accura

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/IWSLT15/model_baseline.keras'

In [19]:
model_baseline.save('model_baseline.keras')

In [21]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
import random

print("đang tải data test")
test_input_docs, test_target_docs = preprocess_data('tst2013.en.txt',  'tst2013.vi.txt')
print(f"Số lượng câu test: {len(test_input_docs)}")

test_ds = make_dataset(test_input_docs, test_target_docs)

#Đánh giá sơ bộ (Loss & Accuracy)
print("Đánh giá toàn tập test")
results = model_baseline.evaluate(test_ds)
print(f"Test Loss: {results[0]:.4f}")
print(f"Test Accuracy: {results[1]:.4f}")

vi_vocab = target_vectorization.get_vocabulary()
vi_index_lookup = dict(zip(range(len(vi_vocab)), vi_vocab))
max_decoded_sentence_length = SEQUENCE_LENGTH

def decode_sequence(input_sentence):
    tokenized_input = source_vectorization([input_sentence])
    decoded_sentence = "<start>"

    for i in range(max_decoded_sentence_length):
        tokenized_target = target_vectorization([decoded_sentence])[:, :-1]
        predictions = model_baseline.predict([tokenized_input, tokenized_target], verbose=0)
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = vi_index_lookup[sampled_token_index]

        if sampled_token == "<end>":
            break
        decoded_sentence += " " + sampled_token

    return decoded_sentence.replace("<start>", "").strip()

#  Tính điểm BLEU
print("\n Tính điểm BLEU")
num_samples =  50
total_bleu = 0

random_indices = random.sample(range(len(test_input_docs)), num_samples)

for idx in random_indices:
    input_sen = test_input_docs[idx]
    target_sen = test_target_docs[idx].replace("<start>", "").replace("<end>", "").strip()

    predicted_sen = decode_sequence(input_sen)

    ref = [target_sen.split()]
    cand = predicted_sen.split()
    score = sentence_bleu(ref, cand, smoothing_function=SmoothingFunction().method1)
    total_bleu += score

    print(f"\n Input (En):  {input_sen}")
    print(f" Target (Vi): {target_sen}")
    print(f" Model Dịch:  {predicted_sen}")
    print(f" BLEU Score:  {score:.4f}")

avg_bleu = total_bleu / num_samples
print(f"\n------------------------------------------------")
print(f"ĐIỂM BLEU TRUNG BÌNH ({num_samples} mẫu): {avg_bleu:.4f}")

đang tải data test
Số lượng câu test: 1268
Đánh giá toàn tập test
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 90ms/step - accuracy: 0.1693 - loss: 3.8362
Test Loss: 3.7608
Test Accuracy: 0.1715

 Tính điểm BLEU

 Input (En):  these north koreans were not so lucky .
 Target (Vi): Nhưng những người Bắc Triều_Tiên này thì không được may_mắn như_vậy .
 Model Dịch:  những con này không có [UNK] [UNK] end  end end end end end end end end end end end end end tấtcả end end end end end end end end end end end end end end end end end
 BLEU Score:  0.0062

 Input (En):  who will he become because someone took a stand and made a difference in his life ?
 Target (Vi): Em sẽ trở_thành người thế_nào nếu có ai đó đứng lên và thay_đổi cuộc_đời em ?
 Model Dịch:  ai đó sẽ trởthành một người đànông một người đànông và một người khác và đó là một cuộcsống end  end end end end end end end end end end end end end end end end end end end
 BLEU Score:  0.0126

 Input (En):  so i got an old