In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [1]:
from datasets import load_dataset

# Load the 'google/fleurs' dataset for 'hi_in' (Hindi - India)
dset = load_dataset("SEACrowd/indo_general_mt_en_id", trust_remote_code=True)


In [2]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
# Split the dataset into training, validation, and test sets
train_data = dset['train']
val_data = dset['validation']
test_data = dset['test']

In [4]:
# Randomly select 10k indices
import random
random_indices = random.sample(range(len(train_data)), 10000)

# Select the 10k rows
train_data = train_data.select(random_indices)

In [5]:
# source_lang = "src"
# target_lang = "tgt"
# prefix = f"""
# Translate the following sentences from English to Indonesian:

# English: I am going to the market.
# Indonesian: Saya akan pergi ke pasar.

# English: {input_text}
# Indonesian:
# #answer
# """


# def preprocess_function(examples):
#     inputs = [prefix + example for example in examples[source_lang]]
#     targets = [example for example in examples[target_lang]]
#     model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
#     return model_inputs

In [6]:
source_lang = "src"
target_lang = "tgt"


def preprocess_function(examples):
    inputs = [f"""
Translate the following sentences from English to Indonesian:

English: I am going to the market.
Indonesian: Saya akan pergi ke pasar.

English: {example}
Indonesian:
#answer
""" for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [7]:
# Apply tokenization to the dataset (you can tokenize both the train and test splits)
tokenized_train_datasets = train_data.map(preprocess_function, batched=True)

tokenized_val_datasets = val_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
tokenized_train_datasets

Dataset({
    features: ['id', 'src', 'tgt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [9]:
import evaluate

metric = evaluate.load("sacrebleu")

2024-12-15 18:37:44.429066: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-15 18:37:44.502374: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-15 18:37:44.502401: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-15 18:37:44.508286: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-15 18:37:44.572844: I tensorflow/core/platform/cpu_feature_guar

In [10]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [11]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [13]:
# Define the callback to save evaluation metrics in CSV
from transformers import TrainerCallback
import csv
class SaveMetricsCallback(TrainerCallback):
    def __init__(self, output_dir="evaluation_metrics.csv"):
        self.output_dir = output_dir
        self.first_epoch = True
        self.fieldnames = ["epoch", "eval_loss", "eval_runtime", "eval_samples_per_second", "eval_steps_per_second", "eval_bleu", "eval_gen_len", "should_save"]
        
        # Open CSV file in append mode to preserve previous data
        with open(self.output_dir, mode="w", newline='') as file:
            writer = csv.DictWriter(file, fieldnames=self.fieldnames)
            writer.writeheader()

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        # Append evaluation metrics to the CSV file after each evaluation
        with open(self.output_dir, mode="a", newline='') as file:
            writer = csv.DictWriter(file, fieldnames=self.fieldnames)
            
            # Ensure "epoch" is part of metrics, fallback to -1 if not available
            metrics["epoch"] = state.epoch if state.epoch is not None else -1
            row = {field: metrics.get(field, "N/A") for field in self.fieldnames}
            writer.writerow(row)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

import transformers
from transformers import T5ForConditionalGeneration  # Ensure you load a generation model


# Training arguments (ensure to use Seq2SeqTrainingArguments)
output_dir = "cps"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
per_device_eval_batch_size = 4
eval_accumulation_steps = 4
optim = "adamw_hf"
save_steps = 30
logging_steps = 10
learning_rate = 5e-4
max_grad_norm = 0.3
max_steps = 50
warmup_ratio = 0.03
evaluation_strategy="epoch"
lr_scheduler_type = "constant"

training_args = Seq2SeqTrainingArguments(  # Ensure to use Seq2SeqTrainingArguments
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    evaluation_strategy=evaluation_strategy,
    save_steps=save_steps,
    learning_rate=learning_rate,
    logging_steps=logging_steps,
    max_grad_norm=max_grad_norm,
    num_train_epochs=40,
    # max_steps=max_steps,
    fp16=True,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    ddp_find_unused_parameters=False,
    eval_accumulation_steps=eval_accumulation_steps,
    per_device_eval_batch_size=per_device_eval_batch_size,
)

# Define the trainer and add the custom callback
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[SaveMetricsCallback(output_dir="evaluation_t5_metrics.csv")]  # Add custom callback here
)

# Start training
trainer.train()


  trainer = Seq2SeqTrainer(


  0%|          | 0/25000 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'loss': 4.8084, 'grad_norm': 9.159037590026855, 'learning_rate': 0.0005, 'epoch': 0.02}
{'loss': 4.0748, 'grad_norm': 4.57291316986084, 'learning_rate': 0.0005, 'epoch': 0.03}
{'loss': 3.8818, 'grad_norm': 5.1550493240356445, 'learning_rate': 0.0005, 'epoch': 0.05}
{'loss': 3.9148, 'grad_norm': 4.939235210418701, 'learning_rate': 0.0005, 'epoch': 0.06}
{'loss': 3.8928, 'grad_norm': 12.148293495178223, 'learning_rate': 0.0005, 'epoch': 0.08}
{'loss': 3.6269, 'grad_norm': 5.9248433113098145, 'learning_rate': 0.0005, 'epoch': 0.1}
{'loss': 3.4454, 'grad_norm': 4.483435153961182, 'learning_rate': 0.0005, 'epoch': 0.11}
{'loss': 3.5011, 'grad_norm': 4.966068744659424, 'learning_rate': 0.0005, 'epoch': 0.13}
{'loss': 3.6717, 'grad_norm': 5.079488277435303, 'learning_rate': 0.0005, 'epoch': 0.14}
{'loss': 3.6724, 'grad_norm': 8.171299934387207, 'learning_rate': 0.0005, 'epoch': 0.16}
{'loss': 3.2177, 'grad_norm': 5.419557571411133, 'learning_rate': 0.0005, 'epoch': 0.18}
{'loss': 3.2813, 'gr

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
test_data['src']

['Indira Gandhi imposed emergency rule in India.',
 '"After a brief fight, the North Korean minelayer was sunk with a loss of all crew and no South Korean casualties were reported."',
 '"Promoting culture as distinct to other goods and services, mapping and measuring the sectors adequately."',
 "Jackie Robinson-Burnette was named the Chief of the Corps' Small Business Program in May 2010.",
 '"At around that time, he was also made a member of the College of Pontiffs and of the Sodales Augustales."',
 '"On February 29, Miss A won the Champion Song on MBC MUSIC\'s Show Champion with the song ""Touch""."',
 '"William of Malmesbury also alleged that Alan IV had Constance poisoned to death, but this was unverified However, Orderic Vitalis wrote that as duchess, Constance did all she could to further the welfare of the Bretons, who grieved deeply at her death in 1090."',
 '"More recent research has found that human populations over the past 50,000 years have changed from dark-skinned to ligh

In [None]:
import torch
import os
from transformers import AutoModelForSeq2SeqLM
import csv
import numpy as np
# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained("t5_result/checkpoint-25000")
# Move the model to the correct device (GPU or CPU)
model = model.to(device)

from nltk.translate.bleu_score import sentence_bleu
bleu_scores = []
# Iterate over your test dataset
for key, (input_text, output_text_real) in enumerate(zip(test_data['src'], test_data['tgt'])):
    # Tokenize input text and prepare inputs for the model
    input_text_perfix = f"""{input_text}"""
    inputs = tokenizer(input_text_perfix, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move tensors to the correct device

    # Generate the predicted output
    outputs = model.generate(inputs['input_ids'], max_length=50)
    output_text_pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    

    # Calculate BLEU score for this sentence
    bleu_score = sentence_bleu([output_text_real], output_text_pred)
    with open('predictions_t5_test.csv', mode='a', newline='') as file:
        if not os.path.exists('predictions_t5_test.csv'):
            writer.writerow(['Index', 'Input', 'Predicted Output', 'Correct Output', 'BLEU Score'])
        writer = csv.writer(file)
        writer.writerow([key + 1, input_text, output_text_pred, output_text_real, bleu_score])
    # Print the results
    print(f"Input: {input_text}")
    print(f"Predicted: {output_text_pred}")
    print(f"Reference: {output_text_real}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print("-" * 50)
    bleu_scores.append(bleu_score) 
print(np.mean(bleu_scores))

Input: Indira Gandhi imposed emergency rule in India.
Predicted: Indira Gandhi adik dikti dalam kepada dalam dalam dalam dalam dalam dalam dalam d
Reference: Kemudian Thoha Hanafi mendirikan perusahaan ekspor impor di Rengat.
BLEU Score: 0.0744
--------------------------------------------------
Input: "After a brief fight, the North Korean minelayer was sunk with a loss of all crew and no South Korean casualties were reported."
Predicted: "Setelah saat tahun, minelayer Korean Nord sunk yang berbagai berbagai saat dan kembali kem
Reference: "Setelah pertempuran singkat, kapal ranjau Korea Utara tenggelam dengan hilangnya semua awak dan tidak ada korban yang dilaporkan dari pihak Korea Selatan."
BLEU Score: 0.1791
--------------------------------------------------
Input: "Promoting culture as distinct to other goods and services, mapping and measuring the sectors adequately."
Predicted: "Promoting culture as distinct to other goods and services, mapping dan messuring sectors adequately."

NameError: name 'np' is not defined