Log in to Hugging Face to upload the resulting model

In [None]:
# Authenticate with the Hugging Face Hub so the training loop can push checkpoints.
# This cell runs before the notebook's install cell, so on a fresh kernel
# huggingface_hub may be missing; install it on demand instead of failing with
# ModuleNotFoundError.
try:
    from huggingface_hub import notebook_login
except ModuleNotFoundError:
    import importlib
    import subprocess
    import sys

    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--quiet", "huggingface_hub"]
    )
    # Let the import system see the package that was just installed.
    importlib.invalidate_caches()
    from huggingface_hub import notebook_login

notebook_login()

ModuleNotFoundError: ignored

Install all necessary libraries

In [None]:
pip install transformers datasets evaluate sacrebleu accelerate sentencepiece

Prepare KDE4 dataset

In [None]:
import os

from datasets import load_dataset

# "D:/HuggingFaceCache/" only names a drive on Windows. Elsewhere it is resolved as a
# relative directory (on Colab it became /content/D:/HuggingFaceCache/), so keep the
# custom location on Windows and use the library's default cache everywhere else.
dataset_cache_dir = "D:/HuggingFaceCache/" if os.name == "nt" else None

# English-Vietnamese sentence pairs from the KDE4 corpus.
raw_datasets = load_dataset("kde4", lang1="en", lang2="vi", cache_dir=dataset_cache_dir)

Downloading builder script:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading and preparing dataset kde4/en-vi to /content/D:/HuggingFaceCache/kde4/en-vi-lang1=en,lang2=vi/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac...


Downloading data:   0%|          | 0.00/996k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset kde4 downloaded and prepared to /content/D:/HuggingFaceCache/kde4/en-vi-lang1=en,lang2=vi/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show the loaded DatasetDict (available splits, features and row counts).
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 42782
    })
})

In [None]:
# Keep 90% of the "train" split for training and hold out the rest; the fixed seed
# makes the split reproducible.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 38503
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 4279
    })
})

In [None]:
# train_test_split() names the held-out part "test"; store it under "validation".
# The membership check keeps this cell idempotent: re-running it after the rename
# would otherwise raise KeyError because "test" has already been popped.
if "test" in split_datasets:
    split_datasets["validation"] = split_datasets.pop("test")
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 38503
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 4279
    })
})

In [None]:
# Peek at the first three English/Vietnamese pairs of the training split.
split_datasets["train"][:3]["translation"]

[{'en': 'Text', 'vi': 'Văn bản'},
 {'en': 'Document Contents', 'vi': 'Nội dung Tài liệu'},
 {'en': 'Click this button to enter the parent folder. For instance, if the current location is file: / home/ %1 clicking this button will take you to file: / home.',
  'vi': 'Nhắp vào cái nút này để vào thư mục mẹ. Lấy thí dụ, nếu địa điểm hiện có là & lt; file: / home /% 1gt;, nhắp vào nút này sẽ hiển thị & lt; file: / homegt;.'}]

Load Helsinki-NLP en-vi tokenizer

In [None]:
pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os

from transformers import AutoTokenizer

# Pretrained English -> Vietnamese Marian checkpoint; also used to load the model.
model_checkpoint = "Helsinki-NLP/opus-mt-en-vi"

# "D:/HuggingFaceCache/" only names a drive on Windows; on other platforms it would be
# created as a relative directory, so fall back to the library's default cache there.
tokenizer_cache_dir = "D:/HuggingFaceCache/" if os.name == "nt" else None

# `return_tensors` is an argument of the tokenizer *call*, not of from_pretrained(),
# so it is no longer passed here; tensor conversion is handled later by
# set_format("torch") and the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, cache_dir=tokenizer_cache_dir)

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]



Load model for fine-tuning

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained weights of the same checkpoint the tokenizer was loaded from;
# this is the model that gets fine-tuned below.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/289M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Load data collator

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used as `collate_fn` by the DataLoaders built later in the notebook.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


Define preprocess function

In [None]:
# Maximum number of tokens kept per source/target text when preprocess_function
# tokenizes with truncation.
max_length = 128

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: Batch mapping whose "translation" entry is a sequence of
            dicts holding the English ("en") and Vietnamese ("vi") texts.

    Returns:
        The tokenizer output for the English texts, with the Vietnamese texts
        supplied as `text_target`; both are truncated to `max_length`.
    """
    sources = []
    references = []
    for pair in examples["translation"]:
        sources.append(pair["en"])
        references.append(pair["vi"])

    return tokenizer(
        sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
    )

Apply the preprocessing function to the datasets

In [None]:
# Tokenize every split in batches. The original columns are removed so that only the
# fields returned by preprocess_function remain.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map:   0%|          | 0/38503 [00:00<?, ? examples/s]

Map:   0%|          | 0/4279 [00:00<?, ? examples/s]

Use SacreBLEU as the evaluation metric

In [None]:
import evaluate

# SacreBLEU metric object; the training loop feeds it batches and calls compute()
# once per epoch.
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Build the DataLoaders

In [None]:
from torch.utils.data import DataLoader

# Switch the tokenized datasets to the "torch" output format before batching.
tokenized_datasets.set_format("torch")
# Training batches are shuffled; both loaders share the collator and a batch size of 8.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
# No shuffle argument is passed for the validation loader.
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

Instantiating an optimizer

In [None]:
from torch.optim import AdamW

# transformers.AdamW is deprecated and no longer importable from recent transformers
# releases, so use PyTorch's implementation instead. eps and weight_decay are pinned
# to the values transformers.AdamW used by default (1e-6 and 0.0); torch.optim.AdamW
# would otherwise default to 1e-8 and 0.01 and silently change the training recipe.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)



Pass all objects to the accelerator.prepare() method

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
# Rebind the names to the objects returned by prepare(); the rest of the notebook
# trains and evaluates with these prepared versions.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)


In [None]:
from transformers import get_scheduler

num_train_epochs = 10
# Measured on the dataloader returned by accelerator.prepare(): one optimizer step
# is taken per batch in the training loop.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule over the whole run, with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

Create repository object to upload model to hub

In [None]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "kde4-en-vi-test"
# Resolve the full Hub repository id for model_name and display it.
repo_name = get_full_repo_name(model_name)
repo_name

'choidf/kde4-en-vi-test'

Cloning repository

In [None]:
# Local directory the Hub repository is cloned into; the training loop saves the
# model and tokenizer here before pushing.
output_dir = "kde4-en-vi-test"
repo = Repository(output_dir, clone_from=repo_name)

Cloning https://huggingface.co/choidf/kde4-en-vi-test into local empty directory.


Download file pytorch_model.bin:   0%|          | 15.4k/274M [00:00<?, ?B/s]

Download file runs/Apr02_03-56-43_5c5506ccf29c/events.out.tfevents.1680407806.5c5506ccf29c.3017.11: 100%|#####…

Download file runs/Apr02_01-48-05_5c5506ccf29c/events.out.tfevents.1680402983.5c5506ccf29c.3017.5: 100%|######…

Download file runs/Apr02_03-15-13_5c5506ccf29c/events.out.tfevents.1680405324.5c5506ccf29c.3017.8: 100%|######…

Download file runs/Apr02_00-43-38_5c5506ccf29c/events.out.tfevents.1680396236.5c5506ccf29c.3017.2: 100%|######…

Download file runs/Apr02_01-48-05_5c5506ccf29c/1680402983.883358/events.out.tfevents.1680402983.5c5506ccf29c.3…

Download file source.spm:   0%|          | 1.40k/790k [00:00<?, ?B/s]

Clean file runs/Apr02_03-56-43_5c5506ccf29c/events.out.tfevents.1680407806.5c5506ccf29c.3017.11:  16%|#6      …

Clean file runs/Apr02_01-48-05_5c5506ccf29c/events.out.tfevents.1680402983.5c5506ccf29c.3017.5:  14%|#3       …

Clean file runs/Apr02_03-15-13_5c5506ccf29c/events.out.tfevents.1680405324.5c5506ccf29c.3017.8:  11%|#1       …

Clean file runs/Apr02_00-43-38_5c5506ccf29c/events.out.tfevents.1680396236.5c5506ccf29c.3017.2:  11%|#1       …

Clean file runs/Apr02_01-48-05_5c5506ccf29c/1680402983.883358/events.out.tfevents.1680402983.5c5506ccf29c.3017…

Download file target.spm:   2%|2         | 16.5k/738k [00:00<?, ?B/s]

Download file runs/Apr02_03-15-13_5c5506ccf29c/1680405324.7451339/events.out.tfevents.1680405324.5c5506ccf29c.…

Clean file runs/Apr02_03-15-13_5c5506ccf29c/1680405324.7451339/events.out.tfevents.1680405324.5c5506ccf29c.301…

Download file runs/Apr02_03-56-43_5c5506ccf29c/1680407806.9794614/events.out.tfevents.1680407806.5c5506ccf29c.…

Download file runs/Apr02_00-26-06_5c5506ccf29c/1680395732.2912354/events.out.tfevents.1680395732.5c5506ccf29c.…

Download file runs/Apr02_00-26-06_5c5506ccf29c/events.out.tfevents.1680395597.5c5506ccf29c.3017.0: 100%|######…

Clean file runs/Apr02_03-56-43_5c5506ccf29c/1680407806.9794614/events.out.tfevents.1680407806.5c5506ccf29c.301…

Download file runs/Apr02_00-43-38_5c5506ccf29c/1680396236.0116088/events.out.tfevents.1680396236.5c5506ccf29c.…

Clean file runs/Apr02_00-26-06_5c5506ccf29c/events.out.tfevents.1680395597.5c5506ccf29c.3017.0:  18%|#8       …

Clean file runs/Apr02_00-26-06_5c5506ccf29c/1680395732.2912354/events.out.tfevents.1680395732.5c5506ccf29c.301…

Clean file runs/Apr02_00-43-38_5c5506ccf29c/1680396236.0116088/events.out.tfevents.1680396236.5c5506ccf29c.301…

Clean file source.spm:   0%|          | 1.00k/790k [00:00<?, ?B/s]

Download file runs/Apr02_03-15-13_5c5506ccf29c/events.out.tfevents.1680407796.5c5506ccf29c.3017.10: 100%|#####…

Download file training_args.bin: 100%|##########| 3.62k/3.62k [00:00<?, ?B/s]

Download file runs/Apr02_00-43-38_5c5506ccf29c/events.out.tfevents.1680398700.5c5506ccf29c.3017.4: 100%|######…

Download file runs/Apr02_01-48-05_5c5506ccf29c/events.out.tfevents.1680404584.5c5506ccf29c.3017.7: 100%|######…

Clean file runs/Apr02_03-15-13_5c5506ccf29c/events.out.tfevents.1680407796.5c5506ccf29c.3017.10: 100%|########…

Clean file runs/Apr02_00-43-38_5c5506ccf29c/events.out.tfevents.1680398700.5c5506ccf29c.3017.4: 100%|#########…

Clean file training_args.bin:  28%|##7       | 1.00k/3.62k [00:00<?, ?B/s]

Clean file runs/Apr02_01-48-05_5c5506ccf29c/events.out.tfevents.1680404584.5c5506ccf29c.3017.7: 100%|#########…

Clean file target.spm:   0%|          | 1.00k/738k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/274M [00:00<?, ?B/s]

Full training loop

In [None]:
import numpy as np

In [None]:
def postprocess(predictions, labels):
    """Turn generated token ids and label ids into text for the BLEU metric.

    Args:
        predictions: Tensor of generated token ids.
        labels: Tensor of reference token ids, where -100 marks ignored positions.

    Returns:
        A pair `(decoded_preds, decoded_labels)`: a list of stripped prediction
        strings, and a list of single-element lists holding the stripped
        reference strings.
    """
    pred_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # -100 is not a valid token id, so swap it for the pad token before decoding.
    label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Trim surrounding whitespace; each reference is wrapped in its own list.
    return (
        [text.strip() for text in pred_texts],
        [[text.strip()] for text in label_texts],
    )

In [None]:
from tqdm.auto import tqdm
import torch

# One tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                # Reuse the preprocessing limit instead of repeating the literal 128,
                # so generation and tokenization cannot drift apart.
                max_length=max_length,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # compute() scores everything added via add_batch during this epoch.
    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # Intermediate pushes run in the background so training is not stalled.
        # The final push blocks, so the upload has finished before later cells
        # download the model from the Hub.
        is_last_epoch = epoch == num_train_epochs - 1
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}",
            blocking=is_last_epoch,
        )


  0%|          | 0/48130 [00:00<?, ?it/s]

  0%|          | 0/535 [00:00<?, ?it/s]

epoch 0, BLEU score: 50.17


  0%|          | 0/535 [00:00<?, ?it/s]

epoch 1, BLEU score: 50.66


  0%|          | 0/535 [00:00<?, ?it/s]

epoch 2, BLEU score: 50.76


  0%|          | 0/535 [00:00<?, ?it/s]

epoch 3, BLEU score: 49.33


  0%|          | 0/535 [00:00<?, ?it/s]

epoch 4, BLEU score: 51.19


  0%|          | 0/535 [00:00<?, ?it/s]

epoch 5, BLEU score: 49.75


  0%|          | 0/535 [00:00<?, ?it/s]

epoch 6, BLEU score: 47.44


  0%|          | 0/535 [00:00<?, ?it/s]

epoch 7, BLEU score: 49.14


  0%|          | 0/535 [00:00<?, ?it/s]

epoch 8, BLEU score: 48.21


  0%|          | 0/535 [00:00<?, ?it/s]

epoch 9, BLEU score: 48.21


Using the fine-tuned model

In [None]:
from transformers import pipeline

# Build a translation pipeline from the fine-tuned checkpoint on the Hub and try it
# on a sample English sentence.
translator = pipeline("translation", model="choidf/kde4-en-vi-test")
translator("Here is your bill, please look it over.")


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/287M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/303 [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/809k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/756k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]



[{'translation_text': 'Đây là hóa đơn của bạn, xin hãy xem lại.'}]