Install the Transformers, Datasets, and Evaluate libraries to run this notebook

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the followin line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

You will need to setup git, adapt your email and name in the following cell

In [None]:
!git config --global user.email "thanhhuya10lk@gmail.com"
!git config --global user.name "huynguyen314"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [10]:
from datasets import load_dataset

raw_datasets = load_dataset('kde4', lang1='en', lang2='vi')

Using custom data configuration en-vi-lang1=en,lang2=vi
Reusing dataset kde4 (C:\Users\Admin\.cache\huggingface\datasets\kde4\en-vi-lang1=en,lang2=vi\0.0.0\243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac)
100%|██████████| 1/1 [00:00<00:00, 39.95it/s]


In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 42782
    })
})

In [5]:
raw_datasets["train"][1]

{'id': '1',
 'translation': {'en': 'Add Feeds to Akregator',
  'vi': 'Thêm các nguồn tin cho Akregator'}}

In [6]:
print(raw_datasets["train"][:3])

{'id': ['0', '1', '2'], 'translation': [{'en': 'Add Feed to Akregator', 'vi': 'Thêm nguồn tin cho Akregator'}, {'en': 'Add Feeds to Akregator', 'vi': 'Thêm các nguồn tin cho Akregator'}, {'en': 'Add All Found Feeds to Akregator', 'vi': 'Thêm mọi nguồn tin cho Akregator'}]}


# Process data json

In [3]:
from tqdm import tqdm
import json


prefixes = ['test', 'dev']
for prefix in prefixes:
    with open(f'PhoMT-{prefix}.json', mode='w', encoding='utf-8') as f:
        with open(f'PhoMT/tokenization/{prefix}/{prefix}.en', encoding='utf-8') as f_en:
            data_en = f_en.readlines()
        data_en = [s.strip() for s in data_en]
        print(len(data_en))

        with open(f'PhoMT/tokenization/{prefix}/{prefix}.vi', encoding='utf-8') as f_vi:
            data_vi = f_vi.readlines()
        data_vi = [s.strip() for s in data_vi]
        print(len(data_vi))

        for i, (ele_en, ele_vi) in tqdm(enumerate(zip(data_en, data_vi))):
            line = {'id': str(i), 'translation': {'en': ele_en, 'vi': ele_vi}}
            f.write(str(line)+'\n')

dataset_json = load_dataset('csv', data_files={'test': 'PhoMT-test.csv', 'dev': 'PhoMT-dev.csv'})
dataset_json

19151
19151


19151it [00:00, 125504.07it/s]


18719
18719


18719it [00:00, 154623.20it/s]
Using custom data configuration default-2ab0915114ba865c
Reusing dataset csv (C:\Users\Admin\.cache\huggingface\datasets\csv\default-2ab0915114ba865c\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
100%|██████████| 2/2 [00:00<00:00, 180.57it/s]


DatasetDict({
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 19151
    })
    dev: Dataset({
        features: ['id', 'translation'],
        num_rows: 18719
    })
})

## Process data csv

In [77]:
from tqdm import tqdm
import json
import csv
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk


prefixes = ['dev', 'test']
df1 = []
df2 = []

dict_data = {
    'train': [],
    'test': []
}
for prefix in prefixes:
    with open(f'PhoMT/tokenization/{prefix}/{prefix}.en', encoding='utf-8') as f_en:
        data_en = f_en.readlines()
    data_en = [s.strip() for s in data_en]
    print(len(data_en))

    with open(f'PhoMT/tokenization/{prefix}/{prefix}.vi', encoding='utf-8') as f_vi:
        data_vi = f_vi.readlines()
    data_vi = [s.strip() for s in data_vi]
    print(len(data_vi))

    data = {
        'id': [],
        'translation': []
    }
    for i, (ele_en, ele_vi) in tqdm(enumerate(zip(data_en, data_vi))):
        data['id'].append(str(i))
        data['translation'].append({'en': ele_en, 'vi': ele_vi})
    df = pd.DataFrame(data)
    df.to_csv(f'PhoMT-{prefix}.csv', index=False, encoding='utf-8')

    if prefix == 'dev':
        df1 = df
        dict_data['train'] = data
    elif prefix == 'test':
        df2 = df
        dict_data['test'] = data

# dataset_csv = Dataset.from_dict(dict_data)
dataset_csv = DatasetDict.from_csv(data_en)
 
# dataset_csv = load_dataset('csv', data_files={'train': 'PhoMT-test.csv', 'test': 'PhoMT-dev.csv'})

dataset_csv

18719
18719


18719it [00:00, 851950.22it/s]


19151
19151


19151it [00:00, 768110.12it/s]


AttributeError: 'dict' object has no attribute 'decode'

In [7]:
dataset_json

DatasetDict({
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 19151
    })
    dev: Dataset({
        features: ['id', 'translation'],
        num_rows: 18719
    })
})

In [72]:
dataset_csv

{'en': '\ufeffHurricane Dorian , one of the most powerful storms ever recorded in the Atlantic Ocean , made landfall as a Category 5 storm on Great Abaco Island in the northern Bahamas on Sunday morning , September 1 , 2019 .',
 'vi': 'Vào chủ nhật ngày 1-9-2019 , cơn bão Dorian , một trong những cơn bão mạnh nhất được ghi nhận ở Đại Tây Dương , với sức gió 362 km/h đổ bộ vào đảo Great Abaco , miền bắc Bahamas .'}

In [11]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 42782
    })
})

In [12]:
dataset_json['test']['translation'][0]

"{'en': 'Brother Albert Barnett and his wife , Sister Susan Barnett , from the West Congregation in Tuscaloosa , Alabama', 'vi': 'Anh Albert Barnett và chị Susan Barnett , thuộc hội thánh West ở Tuscaloosa , Alabama'}"

In [64]:
dataset_csv['test']['translation'][0]

KeyError: "Column test not in the dataset. Current columns in the dataset: ['id', 'translation']"

In [14]:
raw_datasets['train']['translation'][0]

{'en': 'Add Feed to Akregator', 'vi': 'Thêm nguồn tin cho Akregator'}

In [28]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

Loading cached split indices for dataset at C:\Users\Admin\.cache\huggingface\datasets\kde4\en-vi-lang1=en,lang2=vi\0.0.0\243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac\cache-9b29ee767755543f.arrow and C:\Users\Admin\.cache\huggingface\datasets\kde4\en-vi-lang1=en,lang2=vi\0.0.0\243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac\cache-fccf769c659306e8.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 38503
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 4279
    })
})

In [None]:
# from datasets import concatenate_datasets
# dataset_cc = concatenate_datasets([raw_datasets['train'], another_datasets['train']])
# dataset_cc

In [22]:
split_datasets["validation"] = split_datasets.pop("test")

In [23]:
split_datasets["train"][1]["translation"]

"{'en': 'Severe storms ripped through parts of the southern and midwestern United States on January 11 and 12 , 2020 .', 'vi': 'Ngày 11 và 12-1-2020 , những cơn bão lớn đã quét qua và phá huỷ nhiều vùng ở miền nam và miền trung Hoa Kỳ .'}"

In [24]:
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-en-vi"
translator = pipeline("translation", model=model_checkpoint)
translator("My name is Sarah and I live in London")

Downloading: 100%|██████████| 1.36k/1.36k [00:00<00:00, 470kB/s]
Downloading: 100%|██████████| 275M/275M [00:32<00:00, 8.78MB/s] 
Downloading: 100%|██████████| 44.0/44.0 [00:00<00:00, 20.3kB/s]
Downloading: 100%|██████████| 790k/790k [00:01<00:00, 436kB/s]  
Downloading: 100%|██████████| 738k/738k [00:02<00:00, 331kB/s]  
Downloading: 100%|██████████| 1.14M/1.14M [00:02<00:00, 510kB/s]


[{'translation_text': 'Tên tôi là Sarah và tôi sống ở London'}]

In [25]:
translator("My name is Huy and I live in Ho Chi Minh")

[{'translation_text': 'Tên tôi là Huệ Nguyệt và tôi sống ở Hồ Thiên Chí Minh'}]

## Check on Helsinki-NLP/opus-mt-en-vi model

In [26]:
from transformers import AutoTokenizer
model_checkpoint = "Helsinki-NLP/opus-mt-en-vi"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="tf")

In [39]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
vi_sentence = split_datasets["train"][1]["translation"]['vi']

inputs = tokenizer(en_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(vi_sentence)

In [40]:
print(vi_sentence)
print(en_sentence)

Nội dung Tài liệu
Document Contents


In [41]:
wrong_targets = tokenizer(vi_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))

['▁N', 'ộ', 'i', '▁dung', '▁T', 'à', 'i', '▁li', 'ệ', 'u', '</s>']
['▁Nội', '▁dung', '▁Tài', '▁liệu', '</s>']


In [42]:
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["vi"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [43]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

100%|██████████| 39/39 [00:06<00:00,  5.84ba/s]
100%|██████████| 5/5 [00:00<00:00,  6.72ba/s]


In [44]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [45]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [46]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [47]:
batch["labels"]

tensor([[ 8794,  2076,  6003,   851,     0,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [39545,   124,   132,  3828,    44,    77,   124,   671,   792,   326,
             2,  2936,  2380,  1321,     4,   277,   849,   608,   358,    18,
            14,   942,  1861,    29,   140,  2327,    53,  1078,   654,  1078,
          2038,   168, 14627,   140,     4, 16301,   124,  3828,    44,    35,
          2013,  1009,   942,  1861,    29,   140,  2327,    53,  1078,   654,
         14627,   140,     2,     0]])

In [48]:
batch["decoder_input_ids"]

tensor([[53684,  8794,  2076,  6003,   851,     0, 53684, 53684, 53684, 53684,
         53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684,
         53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684,
         53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684,
         53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684, 53684,
         53684, 53684, 53684, 53684],
        [53684, 39545,   124,   132,  3828,    44,    77,   124,   671,   792,
           326,     2,  2936,  2380,  1321,     4,   277,   849,   608,   358,
            18,    14,   942,  1861,    29,   140,  2327,    53,  1078,   654,
          1078,  2038,   168, 14627,   140,     4, 16301,   124,  3828,    44,
            35,  2013,  1009,   942,  1861,    29,   140,  2327,    53,  1078,
           654, 14627,   140,     2]])

In [49]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[8794, 2076, 6003, 851, 0]
[39545, 124, 132, 3828, 44, 77, 124, 671, 792, 326, 2, 2936, 2380, 1321, 4, 277, 849, 608, 358, 18, 14, 942, 1861, 29, 140, 2327, 53, 1078, 654, 1078, 2038, 168, 14627, 140, 4, 16301, 124, 3828, 44, 35, 2013, 1009, 942, 1861, 29, 140, 2327, 53, 1078, 654, 14627, 140, 2, 0]


In [50]:
!pip install sacrebleu



In [51]:
import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script: 100%|██████████| 8.15k/8.15k [00:00<00:00, 8.17MB/s]


In [52]:
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [53]:
predictions = ["This This This This"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 1.683602693167689,
 'counts': [1, 0, 0, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [25.0, 16.666666666666668, 12.5, 12.5],
 'bp': 0.10539922456186433,
 'sys_len': 4,
 'ref_len': 13}

In [54]:
predictions = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 0.0,
 'counts': [2, 1, 0, 0],
 'totals': [2, 1, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 0.004086771438464067,
 'sys_len': 2,
 'ref_len': 13}

In [55]:
import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [56]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to C:\Users\Admin/.huggingface/token


In [57]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"marian-finetuned-kde4-en-to-vi",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

In [59]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/datnth1709/marian-finetuned-kde4-en-to-vi into local empty directory.
Using cuda_amp half precision backend


In [60]:
trainer.evaluate(max_length=max_target_length)

***** Running Evaluation *****
  Num examples = 4279
  Batch size = 64


RuntimeError: CUDA out of memory. Tried to allocate 54.00 MiB (GPU 0; 4.00 GiB total capacity; 2.20 GiB already allocated; 0 bytes free; 2.84 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.train()

In [None]:
trainer.evaluate(max_length=max_target_length)

In [None]:
trainer.push_to_hub(tags="translation", commit_message="Training complete")

In [None]:
from torch.utils.data import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "marian-finetuned-kde4-en-to-vi-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

In [None]:
output_dir = "marian-finetuned-kde4-en-to-vi-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

In [None]:
def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels

In [None]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huynguyen208/marian-finetuned-kde4-en-to-vi"
translator = pipeline("translation", model=model_checkpoint)
translator("My name is Huy. I come from Vietnam")
translator("My favorite quote is: ")
translator("Try not to become a man of success, but rather try to become a man of value.")

In [None]:
translator("My name is Huy. I come from Ho Chi Minh city")

In [None]:
translator("I haven't been to a public gym before. When I exercise in a private space, I feel more comfortable.")

In [None]:
translator("Why Are People Afraid Of Public Speaking?")

In [None]:
translator("Choose A Topic You’re Passionate About")