### Preprocess training data
- strip leading and trailing whitespace from every line
- for Vietnamese sentences:
    - add prefix `"translate Vietnamese to English: "` at the beginning
    - add `" </s>"` at the end
- save everything in JSON Lines (`.jsonl`) format

In [1]:
# import os
# import jsonlines

# subsets = ["train", "dev", "test"]
# langs = ["en", "vi"]

# for subset in subsets:
#     temp = {}
#     data = []
#     for lang in langs:
#         path = os.path.join("data/PhoMT/tokenization",subset,f"{subset}.{lang}")
#         with open(path, "r", encoding='utf-8') as f:    
#             contents = f.readlines()
#         print(f"{path}:", len(contents), "lines")
#         for i in range(len(contents)):
#             line = contents[i].strip()
#             if line[-1:] == "\n":
#                 line = line[:-1]
#             if lang == "vi":
#                 contents[i] = "translate Vietnamese to English: " + line + " </s>"
#             else:
#                 contents[i] = line
#         temp[lang] = contents
#     for en, vi in zip(temp["en"], temp["vi"]):
#         data.append({
#             "en": en,
#             "vi": vi
#         })
#     with jsonlines.open(f'data/PhoMT/tokenization/{subset}.jsonl', mode='w') as writer:        
#         writer.write_all(data)
# del temp
# del data


### Init wandb for logging

In [1]:
# import wandb

# wandb.init(project="vietai-machine-translation", entity="fantastic-four", name="vit5_chkp-30000")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhuyhuy[0m ([33mfantastic-four[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Load the dataset from jsonlines files

Use streaming to reduce RAM usage; the streamed dataset then needs to be converted to `torch` format.

In [2]:
from datasets import load_dataset

# One JSON Lines file per split; streaming=True is requested so the files are
# not loaded into memory up front, and the result is exposed in "torch" format.
dataset = load_dataset(
    "json",
    data_files={
        split: f"data/PhoMT/tokenization/{split}.jsonl"
        for split in ("train", "dev", "test")
    },
    streaming=True,
).with_format("torch")

Using custom data configuration default-961e8ec10ee50d7d


### Init model and Tokenizer

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Truncation limit for tokenized inputs and targets; also reused as the
# generation length cap in the inference cell.
max_input_length = max_target_length = 128

# Alternative starting point: the public base model instead of a local checkpoint.
# tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")
# model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base").to("cuda:1")

# Tokenizer and weights are both read from the local fine-tuned checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("checkpoint-40000")
model = AutoModelForSeq2SeqLM.from_pretrained("checkpoint-40000")
model = model.to("cuda:1")

# Optional memory-saving switches, currently disabled.
# model.gradient_checkpointing_enable()
# model.use_cache = False

### Function to tokenize dataset

In [4]:
def preprocess_function(examples):
    """Tokenize a batch of Vietnamese/English sentence pairs for seq2seq training.

    Args:
        examples: Mapping with a ``"vi"`` entry (source sentences) and an
            ``"en"`` entry (target sentences), each holding a batch of strings.

    Returns:
        The tokenizer output for the source sentences, extended with a
        ``"labels"`` entry holding the token ids of the target sentences.
        Sources are truncated to ``max_input_length`` tokens and targets to
        ``max_target_length`` tokens.
    """
    model_inputs = tokenizer(
        examples["vi"], max_length=max_input_length, truncation=True
    )

    # `text_target=` tokenizes the batch as targets; it replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["en"], max_length=max_target_length, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches; the raw "en"/"vi" text columns are dropped
# so only the tokenizer outputs and the labels remain.
tokenized_datasets = dataset.map(
    function=preprocess_function,
    remove_columns=["en", "vi"],
    batched=True,
)

# The cells below only use the tokenized view, so release the raw handle.
del dataset

#### Check RAM usage

In [5]:
import psutil

# memory_info().rss is reported in bytes; dividing by 1024**2 shows it in megabytes.
print(f"RAM used: {psutil.Process().memory_info().rss / 1024 ** 2:.2f} MB")

RAM used: 2607.81 MB


### Init DataCollator

In [5]:
from transformers import DataCollatorForSeq2Seq

# Batches are assembled as PyTorch tensors; the collator is given both the
# tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="pt",
)

### Function to compute sacrebleu

In [6]:
import numpy as np
import evaluate

# Load the sacreBLEU metric once; compute_metrics below calls metric.compute on it.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Score generated translations against the references with sacreBLEU.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple, in which case only its first
            element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        the sacreBLEU result.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Labels carry it, and generated predictions can carry it too when
    # batches of different lengths are padded together, so mask both.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: the metric is given one list of references
    # per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

### Init Training Arguments

In [7]:
from transformers import Seq2SeqTrainingArguments

# Neither of these two values is passed to the arguments below.
batch_train = 64
num_epochs = 2

args = Seq2SeqTrainingArguments(
    output_dir="vit5-phoMT",
    # Schedule: everything is step-based; max_steps bounds the run.
    max_steps=40000,
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=5000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=3,
    # Batching: 16 samples per device x 8 accumulation steps per optimizer update.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=128,
    # NOTE(review): the Trainer cell passes its own Adafactor optimizer with
    # lr=None, so this value is presumably not what drives the updates. Confirm.
    learning_rate=1e-3,
    # Mixed precision.
    fp16=True,
    half_precision_backend="auto",
    # Evaluate on generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
    # report_to="wandb",  # enable logging to W&B
)



# 2977999 

### Init optimizer and scheduler

In [8]:
from transformers.optimization import Adafactor, AdafactorSchedule

# Adafactor is configured without an external learning rate (lr=None) and with
# relative steps, parameter scaling and warm-up initialisation all enabled.
optim = Adafactor(
    model.parameters(),
    lr=None,
    relative_step=True,
    scale_parameter=True,
    warmup_init=True,
)

# Scheduler wrapper built from the optimizer, passed to the Trainer alongside it.
lr_scheduler = AdafactorSchedule(optim)

### Init Trainer

In [9]:
from transformers import Seq2SeqTrainer

# Wire everything together: train on the "train" split, evaluate on "dev", and
# use the custom optimizer/scheduler pair.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optim, lr_scheduler),
)

max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [12]:
# trainer.evaluate(max_length=max_target_length)

In [None]:
# trainer.predict(tokenized_datasets["test"])

In [None]:
# Continue training from the saved checkpoint directory.
# NOTE(review): max_steps is 40000 in the training arguments; if this checkpoint
# is already at step 40000, resuming may finish immediately. Confirm.
trainer.train(resume_from_checkpoint="checkpoint-40000")
# trainer.train()  # from scratch

### Inference 1 sentence

In [11]:
sentence = "Các trưởng lão địa phương và giám thị xung quanh đang giúp đỡ và cung cấp về vật chất và tinh thần cho các anh chị bị ảnh hưởng trong thảm hoạ này ."

# Same format as the training data: task prefix + sentence + explicit " </s>" suffix.
text = "translate Vietnamese to English: " + sentence + " </s>"
encoding = tokenizer(text, padding=True, max_length=max_input_length, truncation=True, return_tensors="pt")

# Move the tensors to the device the model is on, instead of repeating a
# hard-coded device string that has to be kept in sync with the model-loading cell.
input_ids = encoding["input_ids"].to(model.device)
attention_masks = encoding["attention_mask"].to(model.device)

outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=max_target_length,
    # Optional decoding settings, currently disabled:
    # early_stopping=True,
    # do_sample=True,
    # num_beams=5,
    # num_return_sequences=1,
    # no_repeat_ngram_size=1,
    # remove_invalid_values=True,
)

# Decode each generated sequence back to text, dropping special tokens.
for output in outputs:
    line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(line)

Local elders and the neighborhood supervisors are helping and providing physical and mentality to the brothers that are affected by this disaster.


In [None]:
# NOTE(review): presumably the reference English translation of the sentence
# translated in the previous cell, kept for manual comparison. Confirm.
a="Local elders and the circuit overseer are offering practical and spiritual support to those affected by this disaster ."