<a href="https://colab.research.google.com/github/cindyhps/analisa_sent_3label/blob/main/emo_multikelas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preparation

In [None]:
%%capture
!pip install -U transformers datasets torch accelerate sentencepiece deep_translator evaluate optuna

In [None]:
!pip install googletrans==4.0.0-rc1



In [None]:
!pip uninstall torchvision --y
!pip install torchvision

Found existing installation: torchvision 0.21.0
Uninstalling torchvision-0.21.0:
  Successfully uninstalled torchvision-0.21.0
Collecting torchvision
  Using cached torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Using cached torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl (7.2 MB)
Installing collected packages: torchvision
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.18 requires torch<2.6,>=1.10, but you have torch 2.6.0 which is incompatible.[0m[31m
[0mSuccessfully installed torchvision-0.21.0


In [None]:
import torchvision
print(torchvision.__version__)

0.21.0+cu124


In [None]:
!pip install optuna-integration[pytorch_lightning]



In [None]:
!pip install nlpaug textattack
!pip install torch transformers --upgrade



Library

In [None]:
import torch
import optuna
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, DataCollatorWithPadding, EarlyStoppingCallback, pipeline, PreTrainedModel, AutoConfig
from transformers.integrations import TensorBoardCallback
from datasets import load_dataset
import evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset
from deep_translator import GoogleTranslator
from googletrans import Translator
import pandas as pd
from functools import partial
import json
from textattack.augmentation import EasyDataAugmenter
import os

In [None]:
os.makedirs("./cache_dir", exist_ok=True)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


Inisiasi Model

In [None]:
# Checkpoint shared by the tokenizer and every model built in this notebook.
MODEL_NAME = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): this plain classifier is moved to `device` here, but `model` is
# re-bound to a WeightedBERT instance further down, so this copy looks unused.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=28).to(device)
# Collator that pads the tokenized examples when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dataset

In [None]:
# Muat dataset GoEmotion
dataset = load_dataset("go_emotions")

In [None]:
# Lihat contoh data
print(dataset['train'][0])

{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}


In [None]:
# Pisahkan dataset terlebih dahulu
train_val = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_val["train"]
val_dataset = train_val["test"]
test_dataset = dataset["test"]

In [None]:
# Load the translation models (EN -> ID and ID -> EN) used for back-translation
translator_en_id = pipeline(
    "translation_en_to_id",
    model="Helsinki-NLP/opus-mt-en-id",
    device=0 if torch.cuda.is_available() else -1,
    batch_size=16  # Adjust to the available GPU memory
)

translator_id_en = pipeline(
    "translation_id_to_en",
    model="Helsinki-NLP/opus-mt-id-en",
    device=0 if torch.cuda.is_available() else -1,
    batch_size=16
)

Device set to use cuda:0
Device set to use cuda:0


In [None]:
# Version 1: batched back-translation with the local translation pipelines
def augment_batch(batch):
    """Back-translate a batch of texts EN -> ID -> EN.

    Args:
        batch: mapping with a "text" entry holding a list of strings.

    Returns:
        ``{"text": [...]}`` with the back-translated strings, or the original
        strings when any step of the translation raises.
    """
    originals = batch["text"]

    def _round_trip(sentences):
        # First hop: English -> Indonesian
        indonesian = [
            item['translation_text']
            for item in translator_en_id(sentences, max_length=256)
        ]
        # Second hop: Indonesian -> English
        return [
            item['translation_text']
            for item in translator_id_en(indonesian, max_length=256)
        ]

    try:
        result = _round_trip(originals)
    except Exception as e:
        print(f"Error in batch translation: {e}")
        result = originals  # Fall back to the untouched input on error

    return {"text": result}

In [None]:
# Version 2: back-translation of a single string through GoogleTranslator
def augment_text(text):
    """Back-translate ``text`` EN -> ID -> EN with ``GoogleTranslator``.

    Args:
        text: the English string to paraphrase.

    Returns:
        The back-translated string, or ``text`` unchanged when translation
        fails or yields an empty result.
    """
    try:
        translated = GoogleTranslator(source='en', target='id').translate(text)
        back_translated = GoogleTranslator(source='id', target='en').translate(translated)
    except Exception:
        # Best-effort augmentation: keep the original text on any translation
        # error. Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate so a long run can still be interrupted.
        return text
    return back_translated if back_translated else text

In [None]:
# Initialise the EDA augmenter
eda_augmenter = EasyDataAugmenter(
    pct_words_to_swap=0.2,  # Swap 20% of the words
    transformations_per_example=1
)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Version 3: augmentation with the EasyDataAugmenter instance
def eda_augment_batch(batch):
    """Augment every text in ``batch["text"]`` with ``eda_augmenter``.

    Args:
        batch: mapping with a "text" entry holding a list of strings.

    Returns:
        ``{"text": [...]}`` with one output per input, in order. An input is
        passed through unchanged when the augmenter raises or returns no
        candidates.
    """
    augmented = []
    for text in batch["text"]:
        try:
            candidates = eda_augmenter.augment(text)
        except Exception:
            # Best-effort: a failure on one example must not abort the batch.
            # Exception (not a bare except) keeps Ctrl-C working.
            candidates = []
        # Explicit empty check instead of relying on an IndexError.
        augmented.append(candidates[0] if candidates else text)
    return {"text": augmented}

In [None]:
sample_text = "I really love this movie"
print("Original:", sample_text)
print("Augmented:", augment_text(sample_text))

Original: I really love this movie
Augmented: I really like this movie


In [None]:
# Augment the training split only.
# No `num_proc` here: `augment_batch` calls translation pipelines that may live
# on the GPU, and CUDA cannot be used from forked worker processes. With
# multiprocessing the resulting error would be caught inside `augment_batch`
# and the original text returned, i.e. the augmentation would silently do nothing.
train_dataset = train_dataset.map(
    augment_batch,
    batched=True,
    batch_size=32,
    cache_file_name=os.path.abspath("./cache_dir/augmented_dataset.arrow")  # Absolute path
)

Setting TOKENIZERS_PARALLELISM=false for forked processes.


In [None]:
# Contoh: Cek 5 sampel hasil augmentasi
for i in range(5):
    print(f"Original: {train_dataset[i]['text']}")
    print(f"Augmented: {augment_text(train_dataset[i]['text'])}\n")

Original: PAC man looks too fast
Augmented: PAC man looks too fast

Original: We occasionally awake from our routines and wonder if we are still alive.
Augmented: Sometimes we wake up from our routines and wonder if we are still alive.

Original: Multiply it by hundreds to account for all employees of the store and then yes, it is true Also love how they mock the mentally disabled here. 10/10 /s
Augmented: Multiply that by hundreds to account for all the store employees and it turns out to be true. I also love how they make fun of the mentally disabled here. 10/10 /s

Original: Unfortunately, on that we can agree.
Augmented: Unfortunately, we can agree on that.

Original: [NAME] as Manly's top try scorer is pathetically bad.
Augmented: [NAME] as Manly's top scorer was simply awful.



In [None]:
def tokenize_function(examples):
    """Tokenize ``examples["text"]``, truncating to at most 256 tokens.

    No padding is applied here. With ``padding="longest"`` every example was
    padded to the longest text of its whole ``map`` batch and that padding was
    stored in the dataset; the ``DataCollatorWithPadding`` defined in this
    notebook already pads each training/eval batch to its own longest sequence.

    Args:
        examples: batch mapping with a "text" entry (list of strings).

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        max_length=256,
        add_special_tokens=True
    )

In [None]:
# Tokenisasi dataset setelah pemisahan
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/34728 [00:00<?, ? examples/s]

In [None]:
def process_labels(example, num_labels=28):
    """Turn a list of label ids into a multi-hot float vector.

    Args:
        example: mapping whose "labels" entry is an iterable of integer class ids.
        num_labels: length of the output vector (defaults to 28, the value
            previously hard-coded here).

    Returns:
        ``{"labels": tensor}`` where the tensor is float32 of shape
        ``(num_labels,)`` with 1.0 at every listed class id and 0.0 elsewhere.

    Raises:
        IndexError: if a class id is outside ``[-num_labels, num_labels)``.
    """
    multi_hot = torch.zeros(num_labels, dtype=torch.float32)
    for class_id in example["labels"]:
        multi_hot[class_id] = 1.0
    return {"labels": multi_hot}

In [None]:
# Proses labels untuk semua split
tokenized_train = tokenized_train.map(process_labels)
tokenized_val = tokenized_val.map(process_labels)
tokenized_test = tokenized_test.map(process_labels)

Map:   0%|          | 0/34728 [00:00<?, ? examples/s]

In [None]:
# Set format ke torch
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Konversi ke DataFrame
train_df = pd.DataFrame(tokenized_train)
val_df = pd.DataFrame(tokenized_val)
test_df = pd.DataFrame(tokenized_test)

In [None]:
train_df['text'] = train_dataset['text']
val_df['text'] = val_dataset['text']
test_df['text'] = test_dataset['text']

In [None]:
# Check for duplicates shared by the train and validation splits.
# The "labels" column holds one tensor object per row; tensors hash by object
# identity rather than by value, so merging on that raw column cannot match
# rows with equal label sets. Build a hashable tuple key from the values instead.
def _label_key(values):
    """Return the label vector as a tuple of ints so pandas can compare it by value."""
    return tuple(int(v) for v in values)

train_keys = train_df[["text"]].assign(label_key=train_df["labels"].map(_label_key))
val_keys = val_df[["text"]].assign(label_key=val_df["labels"].map(_label_key))
duplicates_train_val = pd.merge(train_keys, val_keys, how="inner", on=["text", "label_key"])
print(f"Duplikat train-val: {len(duplicates_train_val)}")

Duplikat train-val: 0


In [None]:
# Flatten the label ids of the training split: a row with several labels
# contributes one entry per label.
all_labels = [label for ex in train_dataset for label in ex["labels"]]
# One weight per class id 0..27, computed with scikit-learn's "balanced" mode.
class_weights = compute_class_weight("balanced", classes=np.arange(28), y=all_labels)
# Keep the weights as a float32 tensor on the training device.
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

In [None]:
print(tokenized_train[0])

{'labels': tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]), 'input_ids': tensor([    0, 35765,   313,  1326,   350,  1769,     2,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}


In [None]:
class FocalLoss(torch.nn.Module):
    """Sigmoid (multi-label) focal loss, averaged over all batch/class entries.

    Args:
        alpha: optional 1-D tensor of per-class weights, shape ``(num_classes,)``.
            * If every value lies in ``[0, 1]`` it is used as the usual focal
              balance factor: positives are weighted by ``alpha`` and negatives
              by ``1 - alpha``.
            * If any value lies outside ``[0, 1]`` (for example inverse-frequency
              class weights), ``1 - alpha`` would be negative and flip the sign
              of the loss. In that case ``alpha`` weights the positives only and
              negatives keep weight 1.
        gamma: focusing exponent applied to ``(1 - p_t)``.
    """

    def __init__(self, alpha=None, gamma=2):
        super().__init__()
        self.alpha = alpha  # Per-class weight tensor, or None for no weighting
        self.gamma = gamma

    def forward(self, logits, labels):
        """Compute the mean focal loss.

        Args:
            logits: raw scores, shape ``(batch, num_classes)``.
            labels: float targets in {0, 1} with the same shape as ``logits``.

        Returns:
            A scalar tensor.
        """
        bce_loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels, reduction="none")
        probs = torch.sigmoid(logits)
        # p_t is the probability assigned to the *true* outcome of each entry:
        # p for positives, 1 - p for negatives. Using (1 - p) for negatives as
        # well would give confident false positives a near-zero loss.
        p_t = labels * probs + (1 - labels) * (1 - probs)

        if self.alpha is not None:
            alpha = self.alpha.to(logits.device)[None, :]  # Add the batch dimension
            # Decide once per call, without a host sync, how negatives are weighted.
            alpha_is_probability = ((alpha >= 0) & (alpha <= 1)).all()
            negative_weight = torch.where(
                alpha_is_probability, 1 - alpha, torch.ones_like(alpha)
            )
            alpha_factor = labels * alpha + (1 - labels) * negative_weight
        else:
            alpha_factor = 1.0

        focal_loss = alpha_factor * (1 - p_t) ** self.gamma * bce_loss
        return focal_loss.mean()

In [None]:
# Wrap the classifier so that training uses the class-weighted focal loss
class WeightedBERT(PreTrainedModel):
    """Sequence classifier built from ``MODEL_NAME`` and trained with ``FocalLoss``.

    Args:
        class_weights: per-class weight tensor forwarded to ``FocalLoss`` as ``alpha``.
        dropout_rate: dropout probability applied to the classifier logits.
    """

    def __init__(self, class_weights, dropout_rate=0.2):
        config = AutoConfig.from_pretrained(MODEL_NAME)
        super().__init__(config)

        self.roberta = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=28)

        # Freeze every encoder layer except the last four
        for param in self.roberta.roberta.encoder.layer[:-4].parameters():
            param.requires_grad = False

        # NOTE(review): this dropout acts on the final logits, not on hidden
        # features — confirm that this is intended.
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.loss_fct = FocalLoss(alpha=class_weights)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped classifier.

        Returns:
            dict with ``'logits'`` and ``'loss'`` (``None`` when ``labels`` is not given).
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        logits = self.dropout(outputs.logits)
        loss = self.loss_fct(logits, labels.float()) if labels is not None else None
        return {'loss': loss, 'logits': logits}

    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        """Enable gradient checkpointing on the wrapped model.

        ``Trainer`` calls this hook with a ``gradient_checkpointing_kwargs``
        keyword when ``TrainingArguments(gradient_checkpointing=True)`` is set,
        so the override must accept it; without the parameter training stops
        with a ``TypeError``.
        """
        if gradient_checkpointing_kwargs is None:
            self.roberta.gradient_checkpointing_enable()
        else:
            self.roberta.gradient_checkpointing_enable(
                gradient_checkpointing_kwargs=gradient_checkpointing_kwargs
            )

model = WeightedBERT(class_weights=class_weights, dropout_rate=0.5).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Optimizer dan Scheduler (Parameter)

In [None]:
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# argument; the install cells of this notebook always pull the newest
# transformers release, so the new spelling is used.
training_args = TrainingArguments(
    output_dir="./results",
    gradient_checkpointing=True,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",  # must match the evaluation schedule for load_best_model_at_end
    save_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=1e-5,
    warmup_steps=500,
    weight_decay=0.1,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_micro_f1",  # Must be a metric returned by compute_metrics
    greater_is_better=True
)



In [None]:
optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)

# The scheduler advances once per optimizer update, not once per example, so the
# total step count must account for batch size and gradient accumulation.
# Counting examples instead would stretch the linear decay far beyond the run.
batches_per_epoch = -(-len(tokenized_train) // training_args.train_batch_size)  # ceil division
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
num_training_steps = int(updates_per_epoch * training_args.num_train_epochs)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps
)



In [None]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

In [None]:
# Build the Trainer with the custom optimizer/scheduler pair and early stopping.
# `compute_metrics` is left unset here; the notebook attaches it afterwards via
# `trainer.compute_metrics = partial(...)` once the per-class thresholds exist.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=None,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [None]:
# Replaces the earlier global-threshold search with per-class thresholds.
# NOTE(review): in this notebook `trainer.train()` is only called in a later
# cell, so these predictions come from the model as it is at this point —
# confirm the intended execution order.
val_predictions = trainer.predict(tokenized_val)
val_probs = torch.sigmoid(torch.tensor(val_predictions.predictions)).numpy()
val_labels = val_predictions.label_ids


# Compute one threshold per class (here: the 90th percentile of that class's
# validation probabilities)
class_thresholds = np.percentile(val_probs, 90, axis=0)
best_threshold = class_thresholds  # best_threshold is now a per-class array



In [None]:
def compute_metrics(eval_pred, thresholds):
    """Micro and macro F1 for multi-label predictions with per-class cut-offs.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` are raw logits.
        thresholds: indexable of per-class probability thresholds; class ``i``
            is predicted when its sigmoid probability exceeds ``thresholds[i]``.

    Returns:
        dict with the keys "micro_f1" and "macro_f1".
    """
    logits, references = eval_pred
    probabilities = torch.sigmoid(torch.tensor(logits)).numpy()

    # Binarise one class column at a time against its own threshold.
    decisions = np.zeros_like(probabilities)
    for class_idx, class_probs in enumerate(probabilities.T):
        decisions[:, class_idx] = class_probs > thresholds[class_idx]

    micro = f1_score(references, decisions, average="micro")
    macro = f1_score(references, decisions, average="macro")
    return {"micro_f1": micro, "macro_f1": macro}

In [None]:
trainer.compute_metrics = partial(compute_metrics, thresholds=class_thresholds)

In [None]:
def predict(text, class_thresholds):
    """Predict the label ids for one text.

    Args:
        text: input string.
        class_thresholds: indexable of per-class probability thresholds.

    Returns:
        List of class indices whose sigmoid probability exceeds the matching
        threshold.
    """
    # Switch to inference mode so dropout is disabled; otherwise the result
    # depends on whichever mode the model was last left in.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)
    # Keep only the arguments the model's forward() accepts.
    inputs = {k: v for k, v in inputs.items() if k in ["input_ids", "attention_mask"]}
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.sigmoid(outputs['logits'][0]).cpu().numpy()
    return [i for i, p in enumerate(probs) if p > class_thresholds[i]]

Training

In [None]:
# best_threshold = 0.59999999

In [None]:
# # Gunakan threshold optimal untuk prediksi
# pred_label = predict(text, threshold=best_threshold)

In [None]:
# compute_metrics_with_threshold = partial(compute_metrics, threshold=best_threshold)
# trainer.compute_metrics = compute_metrics_with_threshold

In [None]:
print("Sample predictions:", trainer.predict(tokenized_val).predictions[:5])

Sample predictions: [[ 0.08087158  0.40356445 -0.12756348  0.01369476  0.04040527 -0.34179688
  -0.28979492 -0.07824707  0.03198242 -0.33422852 -0.3527832  -0.25976562
   0.20483398 -0.06933594 -0.50341797 -0.03469849  0.05499268  0.6254883
   0.1640625   0.28686523  0.36865234 -0.34106445 -0.29907227 -0.03540039
  -0.09094238 -0.14416504 -0.38793945 -0.00183392]
 [ 0.04733276  0.3635254  -0.08435059  0.11682129  0.10144043 -0.36132812
  -0.27416992 -0.01637268  0.01428986 -0.4272461  -0.42651367 -0.3083496
   0.18762207 -0.07397461 -0.50097656 -0.04919434  0.00699234  0.6455078
   0.07122803  0.21801758  0.35302734 -0.32299805 -0.25170898 -0.07196045
  -0.09851074 -0.13964844 -0.31030273  0.02416992]
 [-0.06469727  0.30029297 -0.10534668  0.19348145  0.12670898 -0.38793945
  -0.22497559  0.0411377   0.06365967 -0.41357422 -0.42529297 -0.2467041
   0.13305664 -0.121521   -0.39697266  0.0125351  -0.08294678  0.62353516
   0.04425049  0.11212158  0.3293457  -0.328125   -0.22155762 -0.054

In [None]:
# Collect test examples for which `predict` missed at least one true label.
errors = []
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    text = sample["text"]
    true_labels = sample["labels"]
    pred_labels = predict(text, class_thresholds)
    # Count how many true labels were not predicted
    missed = sum(1 for lbl in true_labels if lbl not in pred_labels)
    # Examples whose only mistake is an extra (spurious) label are not recorded.
    if missed > 0:
        errors.append({"text": text, "true": true_labels, "pred": pred_labels})

In [None]:
# Simpan atau tampilkan hasil analisis
print("\nContoh Kesalahan Prediksi:")
for error in errors[:5]:  # Tampilkan 5 kesalahan pertama
    print(f"Text: {error['text']}\nTrue: {error['true']}\nPred: {error['pred']}\n")


Contoh Kesalahan Prediksi:
Text: I’m really sorry about your situation :( Although I love the names Sapphira, Cirilla, and Scarlett!
True: [25]
Pred: [6, 11, 15, 22, 26]

Text: It's wonderful because it's awful. At not with.
True: [0]
Pred: []

Text: Kings fan here, good luck to you guys! Will be an interesting game to watch! 
True: [13]
Pred: [4, 7, 11, 22, 24]

Text: They got bored from haunting earth for thousands of years and ultimately moved on to the afterlife.
True: [27]
Pred: [1, 10, 12, 13]

Text: Thank you for asking questions and recognizing that there may be things that you don’t know or understand about police tactics. Seriously. Thank you.
True: [15]
Pred: [5, 9, 12, 16, 17, 19, 27]



In [None]:
best_threshold_tensor = torch.tensor(best_threshold, dtype=torch.float32).to(device)

In [None]:
# Ambil prediksi dan label
test_predictions = trainer.predict(tokenized_test)
test_probs = torch.sigmoid(torch.tensor(test_predictions.predictions).to(device))
test_preds = (test_probs > best_threshold_tensor).int().cpu().numpy()
test_labels = test_predictions.label_ids

In [None]:
# Load the label mapping from the dataset
label_mapping = dataset["train"].features["labels"].feature

# Print all label names
label_mapping.names


['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [None]:
# Generate report
label_names = dataset["train"].features["labels"].feature.names
print(classification_report(test_labels, test_preds, target_names=label_names, zero_division=1))

                precision    recall  f1-score   support

    admiration       0.11      0.12      0.11       504
     amusement       0.07      0.14      0.09       264
         anger       0.02      0.05      0.03       198
     annoyance       0.07      0.13      0.09       320
      approval       0.06      0.09      0.07       351
        caring       0.02      0.07      0.03       135
     confusion       0.03      0.09      0.04       153
     curiosity       0.10      0.19      0.13       284
        desire       0.01      0.07      0.02        83
disappointment       0.03      0.11      0.05       151
   disapproval       0.06      0.12      0.08       267
       disgust       0.03      0.11      0.04       123
 embarrassment       0.00      0.03      0.00        37
    excitement       0.01      0.03      0.01       103
          fear       0.01      0.08      0.02        78
     gratitude       0.11      0.15      0.13       352
         grief       0.00      0.00      0.00  

In [None]:
# Tally the label names of missed (false-negative) and spurious (false-positive)
# predictions. Only the examples already stored in `errors` are inspected.
error_types = {
    'false_positive': [],
    'false_negative': []
}
for error in errors:
    # True labels the model failed to predict
    for lbl in error['true']:
        if lbl not in error['pred']:
            error_types['false_negative'].append(label_names[lbl])
    # Predicted labels that are not among the true labels
    for lbl in error['pred']:
        if lbl not in error['true']:
            error_types['false_positive'].append(label_names[lbl])

In [None]:
trainer.train()

TypeError: WeightedBERT.gradient_checkpointing_enable() got an unexpected keyword argument 'gradient_checkpointing_kwargs'

Testing dan Evaluasi

In [None]:
# `Trainer.evaluate` takes no `threshold` argument (passing one raises a
# TypeError). The per-class thresholds are already bound into
# `trainer.compute_metrics`, so evaluating the test split needs only the dataset.
results = trainer.evaluate(tokenized_test)
print("Hasil Evaluasi Test:", results)

In [None]:
# Evaluate on an external dataset.
# NOTE(review): `load_dataset("emotion")` is called without a split, and no
# `process_labels` step is applied here as it is for the GoEmotions splits —
# presumably the label column and label set differ from the 28 classes used
# above; confirm the mapping before relying on these metrics.
external_dataset = load_dataset("emotion")
tokenized_external = external_dataset.map(tokenize_function, batched=True)
results = trainer.evaluate(tokenized_external)

Save Model

In [None]:
tokenizer.save_pretrained("./indoBERT-goemotions")
model.save_pretrained("./indoBERT-goemotions")