In [1]:
!pip install transformers datasets evaluate accelerate scikit-learn -U

Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.2-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0

In [2]:
import os
import torch
import numpy as np
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset

os.environ["WANDB_DISABLED"] = "true"

# 1. LOAD DATASET
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")


label_list = dataset["train"].features["labels"].feature.names
num_labels = len(label_list)

id2label = {idx: label for idx, label in enumerate(label_list)}
label2id = {label: idx for idx, label in enumerate(label_list)}

print(f"Total Labels: {num_labels}")

# 2. TOKENIZER & PREPROCESSING
MODEL_CKPT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

def preprocess_function(examples):

    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding=True, max_length=128)


    labels_matrix = np.zeros((len(examples["labels"]), num_labels), dtype=np.float32)
    for idx, label_indices in enumerate(examples["labels"]):
        labels_matrix[idx, label_indices] = 1.0

    tokenized_inputs["labels_new"] = labels_matrix.tolist()
    return tokenized_inputs

print("Sedang memproses data...")
tokenized_ds = dataset.map(preprocess_function, batched=True)


tokenized_ds = tokenized_ds.remove_columns(['labels', 'text', 'id'])


tokenized_ds = tokenized_ds.rename_column("labels_new", "labels")

tokenized_ds.set_format("torch")


print("\n[DEBUG] Cek Tipe Data Label:")
print(f"Tipe data: {tokenized_ds['train'][0]['labels'].dtype}")


# 3. METRIC
f1_metric = evaluate.load("f1")
acc_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))


    y_pred = np.zeros(probs.shape)
    y_pred[probs >= 0.5] = 1

    y_true_flat = labels.flatten()
    y_pred_flat = y_pred.flatten()

    f1 = f1_metric.compute(predictions=y_pred_flat, references=y_true_flat, average="micro")
    acc = acc_metric.compute(predictions=y_pred_flat, references=y_true_flat)

    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

# 4. TRAINING
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CKPT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

args = TrainingArguments(
    output_dir="./go_emotions_final",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Mulai Training...")
trainer.train()

# Simpan
trainer.save_model("./final_go_emotions_sukses")
print("Selesai!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Total Labels: 28


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Sedang memproses data...


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]


[DEBUG] Cek Tipe Data Label:
Tipe data: torch.float32


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Mulai Training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1144,0.094946,0.967662,0.967662
2,0.0886,0.08681,0.969532,0.969532
3,0.0804,0.085886,0.969683,0.969683


Selesai!
