In [None]:
import pandas as pd
import numpy as np 

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
import evaluate

import torch

import random
from IPython.display import display, HTML

# Device selection: prefer the Apple-Silicon MPS backend when it is available.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    # Allocate a tiny tensor on the device as a smoke test that MPS really works.
    x = torch.ones(1, device=device)
    print(x)
else:
    print("MPS device not found.")
    # Still bind `device` so any code that reads it works on non-Apple machines
    # instead of failing with a NameError: use CUDA if present, otherwise the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


tensor([1.], device='mps:0')


In [3]:
import transformers

# Print the installed transformers release so the environment that produced
# this notebook's outputs is recorded alongside them.
installed_version = transformers.__version__
print(installed_version)

4.51.3


In [None]:
# Name of the pretrained checkpoint that both the model and the tokenizer are
# loaded from; change this value to fine-tune a different model.
MODEL_CHECKPOINT = "t5-small"

In [8]:
# Load every split of the English-Indonesian translation corpus.
DATASET_NAME = "vhtran/en-id"
dataset = load_dataset(DATASET_NAME)
# A bare trailing expression makes the notebook render the split summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 22021
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1000
    })
})

In [9]:
# Peek at the first training record to see the raw structure of one example.
dataset["train"][0]

{'translation': {'id': 'Philip Johnson, seorang professor di University of California di Barkeley yang juga salah seorang pengkritik Darwinisme terkemuka, menggambarkan pertentangan antara kenyataan paleontologis ini dengan Darwinisme.',
  'en': "Phillip Johnson, a professor at the University of California at Berkeley who is also one of the world's foremost critics of Darwinism, describes the contradiction between this paleontological truth and Darwinism."}}

In [10]:
# Show the declared schema (feature names and their types) of the training split.
dataset['train'].features

{'translation': {'id': Value(dtype='string', id=None),
  'en': Value(dtype='string', id=None)}}

In [11]:
import datasets

def show_random_elements(dataset, num_examples=5):
    """Render a random sample of rows from ``dataset`` as an HTML table.

    Args:
        dataset: An indexable dataset split that supports ``len()``, indexing
            with a list of row positions, and exposes a ``features`` mapping.
        num_examples: How many distinct rows to display (default 5).

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
        ValueError: If ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws distinct indices in a single call, replacing the
    # manual retry-until-unique loop and its repeated list membership checks.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        # Class-label columns are stored as integer ids; show their names instead.
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))


In [12]:
# Render a random sample of training rows (num_examples defaults to 5) as a sanity check.
show_random_elements(dataset["train"])


Unnamed: 0,translation
0,"{'id': 'Bank umum, lanjutnya juga bisa memanfaatkan kantor cabangnya untuk membuat 'office chanelling' bank syariah.', 'en': 'Commercial banks, she added, can also use their branch offices to build a bank syariah channeling office.'}"
1,"{'id': 'nurut UK Trade and Investment UKTI, nilai ekspor Indonesia ke Inggris sepanjang tahun 2005 tercatat senilai 517 juta Pound Sterling , yang kebanyakan didominasi jenis barang-barang tekstil dan alas kaki.', 'en': 'According to UK Investment and Trade UKIT data, Indonesia's exports to Britain in 2005 stood at 912 million pound sterling against Britain's total imports of 912 million pound sterling, most of which were textile products and footwear.'}"
2,"{'id': 'Hubungan Grant dan Hurley yang telah terjalin lama berakhir pada 2000.', 'en': 'Grant's long relationship with Hurley ended in 2000.'}"
3,"{'id': 'Di ibukota negeri itu, Conakry, jumlah tentara dan polisi yang berada di jalan berkurang dibandingkan jumlah mereka Senin, hari paling berdarah dalam pemogokan tersebut, ketika pasukan keamanan melepaskan tembakan ke arah demonstran selama bentrokan.', 'en': 'In the capital Conakry, there were fewer police and troops on the streets than on Monday, the bloodiest day of the strike, when security forces fired at demonstrators during running battles.'}"
4,"{'id': 'Isu ketidakamanan data e-mail kita sempat dipersoalkan karena data kita ternyata bisa dibaca oleh server Google untuk mendistribusikan iklan.', 'en': 'The issue on inconvenience related to the content of our emails was on one occasion discussed, as apparently our data could be read by Google server for distribution of their advertisements'}"


In [None]:
# Load the tokenizer and the seq2seq model from the same checkpoint so that
# the token ids produced by one are the ids expected by the other.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Translation direction: keys of each example's "translation" dict.
source_lang = "id"
target_lang = "en"
# Task prefix prepended to every source sentence before tokenization.
prefix = "translate Indonesia to English: "

def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)`` whose
            "translation" column is a list of dicts keyed by language code.

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        tokenized target sentences stored under "labels". Both sides are
        truncated to at most 128 tokens.
    """
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]

    # `text_target` tokenizes the targets and stores their ids under "labels"
    # in the same call; it replaces the deprecated `as_target_tokenizer()`
    # context manager.
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs


In [15]:
# Apply preprocess_function to every split, passing examples in batches.
# NOTE(review): despite its name, `tokenized_books` holds the tokenized en-id
# translation splits loaded above, not a books corpus.
tokenized_books = dataset.map(preprocess_function, batched=True)


Map: 100%|██████████| 22021/22021 [00:01<00:00, 11800.07 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 9492.79 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 11020.85 examples/s]


In [14]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer to assemble tokenized examples into batches;
# it is given both the tokenizer and the model being fine-tuned.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)


In [None]:
# Hyper-parameters for a single fine-tuning epoch with per-epoch evaluation.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name; the older `evaluation_strategy`
    # spelling is not accepted by recent transformers releases.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # fp16 mixed precision needs a CUDA GPU; enable it only when one is
    # present so the notebook also runs on MPS (Apple Silicon) and CPU.
    fp16=torch.cuda.is_available(),
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    # Evaluate during training on the validation split so the test split
    # stays untouched for the final, unbiased evaluation.
    eval_dataset=tokenized_books["validation"],
    # `processing_class` supersedes the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()