In [1]:
import pandas as pd
import numpy as np 

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
import evaluate

import torch

import random
from IPython.display import display, HTML

# Select the compute device: use Apple's Metal (MPS) backend when it is
# available, otherwise fall back to the CPU so `device` is always defined.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    # Allocate a tiny tensor on the device as a smoke test that MPS works.
    x = torch.ones(1, device=device)
    print(x)
else:
    device = torch.device("cpu")
    print("MPS device not found.")

tensor([1.], device='mps:0')


In [2]:
# Print the installed transformers version so the run environment is recorded
# alongside the notebook output.
import transformers

print(transformers.__version__)

4.51.3


In [None]:
import os

from huggingface_hub import login

# Never paste a token into the notebook: it would be saved with the file and
# leak when the notebook is shared. Create one at
# https://huggingface.co/settings/tokens and export it as HF_TOKEN before
# starting Jupyter. When HF_TOKEN is unset, `token=None` makes `login` ask
# for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Checkpoint to fine-tune. The earlier choice is kept here for reference:
# MODEL_CHECKPOINT = "t5-small"
# NOTE(review): this id has no "<org>/" namespace, and the name suggests a
# Gemma 2 (decoder-only) model, while the notebook loads the checkpoint with
# AutoModelForSeq2SeqLM — confirm the full Hub id and that the checkpoint is
# an encoder-decoder model before running.
MODEL_CHECKPOINT = "gemma2-9b-cpt-sahabatai-v1-instruct"
# Used as the trainer's output_dir and as the target of trainer.save_model.
MODEL_DIR = "Gemma2-9b-Sahabatai-MT"

# Learning rate handed to Seq2SeqTrainingArguments.
LEARNING_RATE = 2e-5
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 6

In [5]:
# Load the "vhtran/en-id" parallel corpus; the bare `dataset` expression
# displays the available splits and their row counts.
dataset = load_dataset("vhtran/en-id")
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 22021
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1000
    })
})

In [6]:
# Inspect one raw training record to see how a translation pair is stored.
dataset["train"][0]

{'translation': {'id': 'Philip Johnson, seorang professor di University of California di Barkeley yang juga salah seorang pengkritik Darwinisme terkemuka, menggambarkan pertentangan antara kenyataan paleontologis ini dengan Darwinisme.',
  'en': "Phillip Johnson, a professor at the University of California at Berkeley who is also one of the world's foremost critics of Darwinism, describes the contradiction between this paleontological truth and Darwinism."}}

In [7]:
# Show the column schema of the training split.
dataset['train'].features

{'translation': {'id': Value(dtype='string', id=None),
  'en': Value(dtype='string', id=None)}}

In [8]:
import datasets

def show_random_elements(dataset, num_examples=5):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset``.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to type.
        num_examples: Number of rows to show; must not exceed ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, replacing the hand-written
    # retry-until-unique loop with a single standard-library call.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        # Class-label columns are indexed into `typ.names` so the table shows
        # readable label names instead of integer ids.
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))


In [9]:
# Preview a random handful of training pairs (the helper's default row count).
show_random_elements(dataset["train"])

Unnamed: 0,translation
0,"{'id': 'Pemnerontak melepaskan tembakan dan granat serta bom ke pos polisi pada serangan Jumat malam di distrik Bokaro, sekitar 140 kilometer dari ibukota negara bagian Jharkhand, Ranchi.', 'en': 'The rebels opened fire and hurled grenades and bombs at a police station in the late night attack Friday in Bokaro district, 140 kilometers (87 miles), from Jharkhand state capital Ranchi.'}"
1,"{'id': 'Sementara dari Paris ditemukan fosil dinosaurus yang dijuluki raja tikus seberat 1 ton.', 'en': 'While in Paris was found dinosaurs fossils which was called The King Rat as weight as 1 ton'}"
2,"{'id': 'Dia melakukan kesalahan dalam pertandingan terakhir mereka lawan Everton, sehingga memungkinkan Andrew Johnson membuka skor ketika Toffees meraih kemenangan.', 'en': 'He made a mistake in their last match against Everton, allowing Andrew Johnson to open the scoring as the Toffees went on to record a victory.'}"
3,"{'id': 'Kami merasa sudah bermain bagus, karena ini merupakan tahun pertama kami maju ke pertandingan grup dunia ini, kata Horna.', 'en': 'We did well considering this was the first time we made it this far, Horna said.'}"
4,"{'id': 'Bush, setelah berbicara dengan Perdana Menteri Irak Nuri Al-Maliki dan komandan tinggi militer AS di Irak, mengatakan di Washington bahwa rencana barunya untuk memulihkan ketenangan di Irak akan memerlukan waktu berbulan-bulan.', 'en': 'Bush, after speaking to Iraqi Prime Minister Nuri al-Maliki and the top US military commander in Iraq, said in Washington that his new plan to pacify war-wracked Iraq would take months.'}"


In [10]:
# Load the pretrained model and its tokenizer from the same checkpoint.
# NOTE(review): AutoModelForSeq2SeqLM is the encoder-decoder auto class —
# confirm that MODEL_CHECKPOINT actually resolves to such a model.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [11]:
source_lang = "id"
target_lang = "en"
# Task prefix prepended to every source sentence before tokenization.
prefix = "translate Indonesia to English: "


def preprocess_function(examples, max_length=128):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch passed in by ``dataset.map(..., batched=True)``; its
            ``"translation"`` entry is a list of dicts keyed by language code.
        max_length: Truncation limit, in tokens, applied to both the
            prefixed source text and the target text.

    Returns:
        The tokenizer output for the source texts, with the target token ids
        stored under ``"labels"``.
    """
    pairs = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the targets are encoded in the same call and their ids are
    # returned under "labels".
    return tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
    )


In [12]:
# Apply the preprocessing to every split, one batch of examples at a time.
tokenized = dataset.map(preprocess_function, batched=True)

In [13]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles tokenized examples into batches for the trainer.
# NOTE(review): padding behavior follows the DataCollatorForSeq2Seq defaults — see its docs.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Hyperparameters come from the configuration constants; anything not listed
# here keeps the Seq2SeqTrainingArguments default.
training_args = Seq2SeqTrainingArguments(
    output_dir=MODEL_DIR,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    # Evaluate on the validation split so the test split stays untouched
    # for a final, unbiased measurement.
    eval_dataset=tokenized["validation"],
    # `processing_class` supersedes the deprecated `tokenizer=` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


In [None]:
# Save the fine-tuned model under MODEL_DIR.
trainer.save_model(MODEL_DIR)

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): presumably relies on the Hub login performed earlier in the notebook — confirm.
trainer.push_to_hub()