In [2]:
import ast
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from sklearn.metrics import precision_score, recall_score, f1_score


In [4]:
DATASET_DIR = "/kaggle/input/dataset-t5"

# Load the train/test splits from the dataset directory.
train_df = pd.read_csv(f"{DATASET_DIR}/train.csv")
test_df  = pd.read_csv(f"{DATASET_DIR}/test.csv")

# Fail early, with a readable message, if a split lacks a column that the
# dataset class and the metric function read from these frames later on.
for _split_name, _split_df in (("train", train_df), ("test", test_df)):
    _missing = {"text", "tag_list"} - set(_split_df.columns)
    assert not _missing, f"{_split_name}.csv is missing columns: {sorted(_missing)}"

# Do not truncate long cell contents when a frame is displayed.
pd.set_option('display.max_colwidth', None)

# Show the first 5 training rows. A bare last expression uses the notebook's
# rich table rendering instead of the plain-text dump produced by print().
train_df.head(5)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [3]:
class T5MultiLabelDataset(Dataset):
    """Map-style dataset that turns DataFrame rows into text-to-text examples.

    Each row's ``text`` becomes the encoder input, prefixed with
    ``"classify: "``; its ``tag_list`` becomes the decoder target, written as
    the tags joined by ``", "``.

    Args:
        df: Frame with a ``text`` column and a ``tag_list`` column. A
            ``tag_list`` cell may be a Python-literal string such as
            ``"['a', 'b']"`` or an already-parsed sequence of tag strings.
        tokenizer: Callable tokenizer exposing ``pad_token_id``. It is called
            with one string plus ``max_length``, ``truncation``, ``padding``
            and ``return_tensors="pt"``, and is expected to return a mapping
            holding ``input_ids`` and ``attention_mask`` tensors with a
            leading batch axis of size 1.
        max_input_len: Length the encoder input is truncated/padded to.
        max_target_len: Length the decoder target is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_input_len=512, max_target_len=128):
        self.texts = df["text"].tolist()
        self.targets = df["tag_list"].tolist()
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        """Return the number of rows taken from the frame."""
        return len(self.texts)

    def _encode(self, text, max_length):
        """Tokenize one string, truncated and padded to exactly ``max_length``."""
        return self.tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Build the example at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` for the encoder and
            ``labels`` for the decoder, each with the batch axis removed. Pad
            positions in ``labels`` are replaced by -100.
        """
        input_text = "classify: " + self.texts[idx]

        # CSV round-trips store the tag list as its repr; frames built in
        # memory may already hold real lists. Accept both.
        raw_tags = self.targets[idx]
        tags = ast.literal_eval(raw_tags) if isinstance(raw_tags, str) else raw_tags
        labels = ", ".join(tags)

        enc = self._encode(input_text, self.max_input_len)
        dec = self._encode(labels, self.max_target_len)

        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # collapse the sequence axis when the configured length is 1.
        label_ids = dec["input_ids"].squeeze(0)
        # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
        # padded target positions are left out of the loss.
        label_ids = label_ids.masked_fill(label_ids == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": label_ids
        }


In [4]:
MODEL_NAME = "t5-small"

# Choose the compute device first so the model can be moved right after loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# Module.to() returns the module itself, so loading and moving can be chained.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

print("Device:", device)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device: cpu


In [5]:
# Wrap each split in the dataset class; both share one tokenizer and the
# class's default input/target lengths.
train_dataset = T5MultiLabelDataset(df=train_df, tokenizer=tokenizer)
eval_dataset = T5MultiLabelDataset(df=test_df, tokenizer=tokenizer)


In [6]:
def compute_metrics(eval_preds):
    """Score generated tag strings against the references.

    Predictions and references are decoded to text, split on commas into tag
    sets, projected onto the tag vocabulary of ``train_df`` as multi-hot rows,
    and compared with micro-averaged precision, recall and F1.

    Args:
        eval_preds: Pair ``(predictions, label_ids)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first item holds the
            generated ids.

    Returns:
        dict with ``micro_precision``, ``micro_recall`` and ``micro_f1``.

    Note:
        Tags that do not occur in ``train_df["tag_list"]`` are ignored on both
        sides, because the multi-hot columns are built from the training tags.
    """
    preds, labels = eval_preds
    # Keep only the generated ids if extra outputs were returned alongside them.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions in the labels, and the Trainer may also pad
    # predictions with -100 when batches of different generated length are
    # concatenated. The tokenizer cannot decode -100, so map it back to the
    # pad id in BOTH arrays before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    preds_text = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels_text = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Collect every unique label from TRAIN; the sorted order fixes the columns.
    all_labels = sorted(
        {tag for cell in train_df["tag_list"] for tag in ast.literal_eval(cell)}
    )

    def to_binary(text):
        """Turn a comma-separated tag string into a multi-hot row over all_labels."""
        tags = {piece.strip() for piece in text.split(",")}
        tags.discard("")  # empty pieces come from stray or trailing commas
        return [1 if label in tags else 0 for label in all_labels]

    y_true = np.array([to_binary(t) for t in labels_text])
    y_pred = np.array([to_binary(p) for p in preds_text])

    return {
        "micro_precision": precision_score(y_true, y_pred, average="micro", zero_division=0),
        "micro_recall":    recall_score(y_true, y_pred, average="micro", zero_division=0),
        "micro_f1":        f1_score(y_true, y_pred, average="micro", zero_division=0),
    }


In [7]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best micro-F1 when training finishes.
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/t5_results",
    eval_strategy="epoch",
    save_strategy="epoch",           # must match eval_strategy for load_best_model_at_end
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,   # effective batch size = 4 * 4 = 16
    num_train_epochs=3,
    # Mixed precision needs a GPU; requesting it unconditionally is rejected
    # on a CPU-only session, so enable it only when CUDA is present.
    fp16=torch.cuda.is_available(),
    predict_with_generate=True,      # metrics are computed on generated text
    generation_max_length=128,       # same value as the dataset's default max_target_len
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    report_to="none",
    logging_steps=200,
)


In [None]:
# Assemble the trainer from the model, arguments, datasets and metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `tokenizer=` is deprecated on the Trainer classes and emits a
    # FutureWarning; `processing_class=` is its replacement.
    # NOTE(review): requires transformers >= 4.46 - confirm the pinned version.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


In [None]:
# Write the trained model and the tokenizer files into the same directory.
BEST_MODEL_DIR = "/kaggle/working/t5_results/best_model"

trainer.save_model(BEST_MODEL_DIR)
tokenizer.save_pretrained(BEST_MODEL_DIR)
