In [1]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from datasets import load_dataset
# from evaluate import load
import json

# output_model_dir = 'Vi-Mong-mBART'
# model_checkpoint = "AI_models/model_hmong_vietnamese_mt_mbart"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pwd

/home/vietnq2/Projects/Personal/LearnMong/quizz/scripts


In [3]:
# Load model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("../AI_models/model_hmong_vietnamese_mt_mbart")
tokenizer = MBart50TokenizerFast.from_pretrained("../AI_models/tokenizer_hmong_vietnamese_mt_mbart")

tokenizer_json = json.loads(tokenizer._tokenizer.to_str())


In [7]:
tokenizer.add_special_tokens({'additional_special_tokens': ['hmn_VN']})

1

In [13]:
tokenizer.lang_code_to_id['hmn_VN']

KeyError: 'hmn_VN'

In [8]:
tokenizer.get_added_vocab()['hmn_VN']

250246

In [9]:
tokenizer.src_lang = 'hmn_VN'

In [12]:
tokenizer.decode(tokenizer('hello')['input_ids'])

'hmn_VN hello</s>'

## Prepare data

In [64]:
from datasets import load_dataset
import os


class DataLoader:
    """
    A simple DataLoader for text data stored in csv files.
    """

    def __init__(self, data_path, columns=None, batch_size=32):
        """
        Initialize the DataLoader with the path to the training data and batch size.

        Args:
            data_path (str): Path to the folder containing csv files.
            columns (list): List of columns to load from the csv files.
        """
        self.batch_size = batch_size
        self.columns = columns
        self._load_data(data_path)

    def _load_data(self, data_path: str):
        """
        Load multiple csv files from a given path and create a Hugging Face Dataset.
        """

        try:
            assert os.path.exists(data_path)
        except AssertionError:
            print(f"Data path does not exist: {data_path}")
            raise FileNotFoundError(f"Data path does not exist: {data_path}")

        data_files = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith(".csv")]
        self.dataset = load_dataset(
            "csv",
            data_files=data_files,
            split="train",
        )
        if self.columns:
            self.dataset = self.dataset.select_columns(self.columns)
        print(f"Loaded {len(self.dataset)} samples from {self.dataset}.")

    def __iter__(self):
        """
        Iterate over the dataset and yield batches of text data.
        """
        for batch in self.dataset.iter(batch_size=self.batch_size):
            if self.columns and len(self.columns) == 1:
                yield batch[self.columns[0]]
            else:
                yield batch
                # yield {col: batch[col] for col in self.columns}


In [75]:
dataloader = DataLoader(data_path='../data', batch_size=2)

Loaded 778 samples from Dataset({
    features: ['mong', 'vietnamese'],
    num_rows: 778
}).


In [76]:
for x in dataloader:
    print(x)
    break

{'mong': ['kêl', 'saz'], 'vietnamese': ['thấp', 'cao']}


## Train LLM

In [77]:
tokenizer.src_lang = "hm"
tokenizer.tgt_lang = "vi"

In [78]:
max_input_length = 128
max_target_length = 128

def preprocess_function(examples, from_lang="mong", to_lang="vietnamese"):
    print(examples)
    print(examples[from_lang])
    print(examples[to_lang])
    model_inputs = tokenizer(
        text=examples[from_lang],
        text_target=examples[to_lang],
        max_length=max_input_length, truncation=True)
    print(model_inputs)
    return model_inputs

In [79]:
dataloader.dataset = dataloader.dataset.map(lambda x: preprocess_function(x), batched=True, remove_columns=dataloader.dataset.column_names)
dataloader.dataset

Map:   0%|          | 0/778 [00:00<?, ? examples/s]

{'mong': ['kêl', 'saz', 'lux', 'yaz', 'jông', 'đuz', 'cơưv', 'môngl', 'naox', 'uô', 'nôngs', 'shôngz', 'ntông', 'trăngz hlâu', 'trăngz cơư', 'trăngz tras', 'luz tsêr', 'luz cơưv', 'luz trôngx', 'lênhx', 'lênhx nav', 'lênhx txir', 'lênhx txir zơưv', 'đeiv qaox', 'đeiv ntơưr', 'đeiv blôngx', 'txux plâuz hâu', 'txux xur', 'chaor', 'puôk', 'cxaz', 'yiv', 'đangx', 'lôngx têl', 'phưz', 'langv phưx', 'đar', 'khaoz', 'jas', 'pluôv', 'hnuz', 'hmao', 'hlik (hli)', 'shông', 'ntiv', 'cheix', 'tav', 'tik', 'sênhv', 'txôngl', 'jaol', 'xar', 'xinhr', 'zangv', 'txeir', 'tuz', 'nxeik', 'nhuôs', 'tix', 'cưr', 'vêr', 'ndâus', 'nhangz', 'vâur', 'zơưv', 'pus', 'tsêr', 'têz', 'lax', 'đêr', 'keiz', 'ntơưr', 'đêx', 'shuv xinhz', 'cưk ntông', 'tsêr khu maoz', 'đha', 'nhaoz', 'zang zus', 'uô si', 'nưl đha', 'tâu naox', 'tâu hâuk', 'tâu cơưv', 'tâu uô si', 'tsuv tuô', 'tsuv tuv', 'trâus maoz', 'trâus  phuôv', 'tsuv ntâuk', 'az', 'kăngz', 'saz', 'greiv', 'yaz', 'tsêr nor jông hưngr', 'tsêr nor jông jông', 'tsêr n

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 778
})

In [15]:
tokenizer.decode(
    tokenized_dataset['train']['input_ids'][2],
    use_source_tokenizer=True
), tokenizer.decode(
    tokenized_dataset['train']['labels'][2],
    use_source_tokenizer=False
)

('hmn_VN lub zog</s>', 'vi_VN sức mạnh</s>')

In [16]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
import numpy as np

metric = load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [18]:
from transformers import Seq2SeqTrainingArguments

source_lang = "hmn_VN"
target_lang = "vi_VN"

batch_size = 32
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy="epoch",
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    predict_with_generate=True,
    fp16=True,
    # push_to_hub=True,
)



In [19]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [20]:
# turn of warning
from transformers import logging
logging.set_verbosity_error()

trainer.train()

  0%|          | 0/390 [00:00<?, ?it/s]

{'loss': 5.6562, 'grad_norm': 22.556230545043945, 'learning_rate': 1.953846153846154e-05, 'epoch': 1.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.6418263912200928, 'eval_bleu': 2.9175, 'eval_gen_len': 5.102, 'eval_runtime': 1.1532, 'eval_samples_per_second': 84.984, 'eval_steps_per_second': 3.469, 'epoch': 1.0}
{'loss': 2.3898, 'grad_norm': 13.926077842712402, 'learning_rate': 1.8871794871794873e-05, 'epoch': 2.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.439190149307251, 'eval_bleu': 1.193, 'eval_gen_len': 5.7755, 'eval_runtime': 1.0248, 'eval_samples_per_second': 95.629, 'eval_steps_per_second': 3.903, 'epoch': 2.0}
{'loss': 1.9112, 'grad_norm': 21.751375198364258, 'learning_rate': 1.8205128205128208e-05, 'epoch': 3.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.426964044570923, 'eval_bleu': 1.7169, 'eval_gen_len': 6.3878, 'eval_runtime': 1.1939, 'eval_samples_per_second': 82.086, 'eval_steps_per_second': 3.35, 'epoch': 3.0}
{'loss': 1.5774, 'grad_norm': 20.95884895324707, 'learning_rate': 1.753846153846154e-05, 'epoch': 4.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.4470629692077637, 'eval_bleu': 1.4129, 'eval_gen_len': 6.0612, 'eval_runtime': 1.2443, 'eval_samples_per_second': 78.76, 'eval_steps_per_second': 3.215, 'epoch': 4.0}
{'loss': 1.2982, 'grad_norm': 25.556779861450195, 'learning_rate': 1.687179487179487e-05, 'epoch': 5.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.500206232070923, 'eval_bleu': 1.1367, 'eval_gen_len': 6.5408, 'eval_runtime': 1.2932, 'eval_samples_per_second': 75.78, 'eval_steps_per_second': 3.093, 'epoch': 5.0}
{'loss': 1.048, 'grad_norm': 29.09297752380371, 'learning_rate': 1.6205128205128207e-05, 'epoch': 6.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.602057456970215, 'eval_bleu': 1.933, 'eval_gen_len': 6.1531, 'eval_runtime': 1.3139, 'eval_samples_per_second': 74.584, 'eval_steps_per_second': 3.044, 'epoch': 6.0}
{'loss': 0.8621, 'grad_norm': 18.321640014648438, 'learning_rate': 1.553846153846154e-05, 'epoch': 7.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.536494255065918, 'eval_bleu': 1.8456, 'eval_gen_len': 6.2449, 'eval_runtime': 1.2659, 'eval_samples_per_second': 77.417, 'eval_steps_per_second': 3.16, 'epoch': 7.0}
{'loss': 0.6785, 'grad_norm': 20.66543960571289, 'learning_rate': 1.4871794871794874e-05, 'epoch': 8.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.6780757904052734, 'eval_bleu': 2.3296, 'eval_gen_len': 5.7959, 'eval_runtime': 1.1955, 'eval_samples_per_second': 81.972, 'eval_steps_per_second': 3.346, 'epoch': 8.0}
{'loss': 0.5475, 'grad_norm': 25.32037925720215, 'learning_rate': 1.4205128205128207e-05, 'epoch': 9.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.66363787651062, 'eval_bleu': 2.1021, 'eval_gen_len': 6.0714, 'eval_runtime': 1.3007, 'eval_samples_per_second': 75.344, 'eval_steps_per_second': 3.075, 'epoch': 9.0}
{'loss': 0.4468, 'grad_norm': 15.9557466506958, 'learning_rate': 1.353846153846154e-05, 'epoch': 10.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.7311465740203857, 'eval_bleu': 2.4879, 'eval_gen_len': 5.8673, 'eval_runtime': 1.2559, 'eval_samples_per_second': 78.029, 'eval_steps_per_second': 3.185, 'epoch': 10.0}
{'loss': 0.3468, 'grad_norm': 16.939138412475586, 'learning_rate': 1.2871794871794874e-05, 'epoch': 11.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.8056421279907227, 'eval_bleu': 2.6178, 'eval_gen_len': 5.8776, 'eval_runtime': 1.1534, 'eval_samples_per_second': 84.965, 'eval_steps_per_second': 3.468, 'epoch': 11.0}
{'loss': 0.2937, 'grad_norm': 20.56529426574707, 'learning_rate': 1.2205128205128208e-05, 'epoch': 12.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.856243371963501, 'eval_bleu': 2.4849, 'eval_gen_len': 5.9184, 'eval_runtime': 1.231, 'eval_samples_per_second': 79.607, 'eval_steps_per_second': 3.249, 'epoch': 12.0}
{'loss': 0.2246, 'grad_norm': 23.23770523071289, 'learning_rate': 1.1538461538461538e-05, 'epoch': 13.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.8133704662323, 'eval_bleu': 2.8135, 'eval_gen_len': 5.8163, 'eval_runtime': 1.2055, 'eval_samples_per_second': 81.295, 'eval_steps_per_second': 3.318, 'epoch': 13.0}
{'loss': 0.1883, 'grad_norm': 20.704294204711914, 'learning_rate': 1.0871794871794871e-05, 'epoch': 14.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.919157028198242, 'eval_bleu': 2.6053, 'eval_gen_len': 5.8571, 'eval_runtime': 1.228, 'eval_samples_per_second': 79.805, 'eval_steps_per_second': 3.257, 'epoch': 14.0}
{'loss': 0.1899, 'grad_norm': 27.97784423828125, 'learning_rate': 1.0205128205128205e-05, 'epoch': 15.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.928148031234741, 'eval_bleu': 2.8654, 'eval_gen_len': 5.7755, 'eval_runtime': 1.2335, 'eval_samples_per_second': 79.448, 'eval_steps_per_second': 3.243, 'epoch': 15.0}
{'loss': 0.1437, 'grad_norm': 4.720885276794434, 'learning_rate': 9.53846153846154e-06, 'epoch': 16.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.9969587326049805, 'eval_bleu': 2.7301, 'eval_gen_len': 5.7551, 'eval_runtime': 1.2113, 'eval_samples_per_second': 80.907, 'eval_steps_per_second': 3.302, 'epoch': 16.0}
{'loss': 0.1316, 'grad_norm': 10.93262004852295, 'learning_rate': 8.871794871794872e-06, 'epoch': 17.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.9931578636169434, 'eval_bleu': 3.0183, 'eval_gen_len': 5.7143, 'eval_runtime': 1.2414, 'eval_samples_per_second': 78.94, 'eval_steps_per_second': 3.222, 'epoch': 17.0}
{'loss': 0.1064, 'grad_norm': 12.195067405700684, 'learning_rate': 8.205128205128205e-06, 'epoch': 18.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.001044988632202, 'eval_bleu': 2.8663, 'eval_gen_len': 5.9082, 'eval_runtime': 1.2776, 'eval_samples_per_second': 76.705, 'eval_steps_per_second': 3.131, 'epoch': 18.0}
{'loss': 0.1027, 'grad_norm': 20.848590850830078, 'learning_rate': 7.538461538461539e-06, 'epoch': 19.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.1131162643432617, 'eval_bleu': 2.9311, 'eval_gen_len': 5.6735, 'eval_runtime': 1.2493, 'eval_samples_per_second': 78.442, 'eval_steps_per_second': 3.202, 'epoch': 19.0}
{'loss': 0.0947, 'grad_norm': 4.678923606872559, 'learning_rate': 6.871794871794872e-06, 'epoch': 20.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.036221742630005, 'eval_bleu': 2.8551, 'eval_gen_len': 5.7551, 'eval_runtime': 1.2629, 'eval_samples_per_second': 77.597, 'eval_steps_per_second': 3.167, 'epoch': 20.0}
{'loss': 0.0839, 'grad_norm': 6.849755764007568, 'learning_rate': 6.205128205128206e-06, 'epoch': 21.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.080073356628418, 'eval_bleu': 3.1948, 'eval_gen_len': 5.602, 'eval_runtime': 1.2364, 'eval_samples_per_second': 79.264, 'eval_steps_per_second': 3.235, 'epoch': 21.0}
{'loss': 0.082, 'grad_norm': 10.446834564208984, 'learning_rate': 5.538461538461539e-06, 'epoch': 22.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.089337110519409, 'eval_bleu': 2.8678, 'eval_gen_len': 5.7143, 'eval_runtime': 1.2721, 'eval_samples_per_second': 77.041, 'eval_steps_per_second': 3.145, 'epoch': 22.0}
{'loss': 0.0671, 'grad_norm': 10.18714714050293, 'learning_rate': 4.871794871794872e-06, 'epoch': 23.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.094733238220215, 'eval_bleu': 2.8367, 'eval_gen_len': 5.7245, 'eval_runtime': 1.274, 'eval_samples_per_second': 76.924, 'eval_steps_per_second': 3.14, 'epoch': 23.0}
{'loss': 0.0821, 'grad_norm': 5.516695976257324, 'learning_rate': 4.2051282051282055e-06, 'epoch': 24.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.0875072479248047, 'eval_bleu': 2.8953, 'eval_gen_len': 5.6735, 'eval_runtime': 1.269, 'eval_samples_per_second': 77.224, 'eval_steps_per_second': 3.152, 'epoch': 24.0}
{'loss': 0.0645, 'grad_norm': 7.612156391143799, 'learning_rate': 3.538461538461539e-06, 'epoch': 25.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.132955312728882, 'eval_bleu': 3.2697, 'eval_gen_len': 5.602, 'eval_runtime': 1.2061, 'eval_samples_per_second': 81.252, 'eval_steps_per_second': 3.316, 'epoch': 25.0}
{'loss': 0.0623, 'grad_norm': 3.3212013244628906, 'learning_rate': 2.8717948717948717e-06, 'epoch': 26.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.1104681491851807, 'eval_bleu': 3.0985, 'eval_gen_len': 5.7143, 'eval_runtime': 1.2541, 'eval_samples_per_second': 78.143, 'eval_steps_per_second': 3.19, 'epoch': 26.0}
{'loss': 0.0589, 'grad_norm': 5.647846698760986, 'learning_rate': 2.2051282051282052e-06, 'epoch': 27.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.130934953689575, 'eval_bleu': 2.9763, 'eval_gen_len': 5.7959, 'eval_runtime': 1.2614, 'eval_samples_per_second': 77.689, 'eval_steps_per_second': 3.171, 'epoch': 27.0}
{'loss': 0.053, 'grad_norm': 15.663979530334473, 'learning_rate': 1.5384615384615387e-06, 'epoch': 28.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.1595370769500732, 'eval_bleu': 2.959, 'eval_gen_len': 5.7551, 'eval_runtime': 1.2175, 'eval_samples_per_second': 80.493, 'eval_steps_per_second': 3.285, 'epoch': 28.0}
{'loss': 0.0454, 'grad_norm': 4.792672157287598, 'learning_rate': 8.717948717948718e-07, 'epoch': 29.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.1546945571899414, 'eval_bleu': 3.052, 'eval_gen_len': 5.7347, 'eval_runtime': 1.2295, 'eval_samples_per_second': 79.708, 'eval_steps_per_second': 3.253, 'epoch': 29.0}




{'loss': 0.0456, 'grad_norm': 2.8344736099243164, 'learning_rate': 2.0512820512820514e-07, 'epoch': 30.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 3.1520814895629883, 'eval_bleu': 2.9279, 'eval_gen_len': 5.7653, 'eval_runtime': 1.2522, 'eval_samples_per_second': 78.265, 'eval_steps_per_second': 3.194, 'epoch': 30.0}
{'train_runtime': 184.4892, 'train_samples_per_second': 63.418, 'train_steps_per_second': 2.114, 'train_loss': 0.6294279514214931, 'epoch': 30.0}


TrainOutput(global_step=390, training_loss=0.6294279514214931, metrics={'train_runtime': 184.4892, 'train_samples_per_second': 63.418, 'train_steps_per_second': 2.114, 'total_flos': 358431412125696.0, 'train_loss': 0.6294279514214931, 'epoch': 30.0})

In [25]:
dataset
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 390
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 98
    })
})

In [34]:
tokenized_dataset['test'][0]['input_ids'], tokenizer.decode(tokenized_dataset['test'][0]['input_ids'])

([250175, 74185, 250109, 876, 298, 170, 4981, 2],
 'hmn_VN tsi ntsi b koj dua</s>')

In [49]:
import torch

idx = 45
text = dataset['test']['from_lang_text'][idx]
print('Expect:', dataset['test']['to_lang_text'][idx])

input_tokenized = tokenizer(text, add_special_tokens=True)
print(f"{input_tokenized['input_ids']}  --  {tokenizer.decode(input_tokenized['input_ids'])}")
input_ids = torch.tensor(input_tokenized['input_ids']).unsqueeze(0).to('cuda')
attention_mask = torch.tensor(input_tokenized['attention_mask']).unsqueeze(0).to('cuda')

# Generate outputs
output_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=256
)

outputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
outputs

Expect: cam kết hàng ngày
[250175, 311, 334, 552, 177, 6, 5782, 250117, 75, 11, 250067, 6, 5829, 2]  --  hmn_VN kev cog lus txh ua hn ub</s>


['cam kết dài hạn']