# **✅ Bagian 1: Import dan Setup**

In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns

# Transformers
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments

# Sklearn
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)

# Mount Google Drive inside the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Google Drive folder used as the working directory for this run
my_drive_folder = "/content/drive/MyDrive/KlikBERT2/KlikBERT_HP/IndoBERT/3 Epoch/"

# Create the folder if it does not exist yet, then switch into it so that
# every relative path used later resolves inside it
os.makedirs(my_drive_folder, exist_ok=True)
os.chdir(my_drive_folder)

# Confirm the working directory
print(f"✅ Current working directory set to: {os.getcwd()}")

# Use CUDA when it is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"✅ Using device: {device}")


Mounted at /content/drive
✅ Current working directory set to: /content/drive/MyDrive/KlikBERT2/KlikBERT_HP/XLM-RoBERTa/3 Epoch
✅ Using device: cuda


# ✅ Bagian 2: Load & Preprocess Dataset

In [None]:
from sklearn.preprocessing import LabelEncoder

# Load the labelled dataset, then drop duplicate rows and rows with missing values
df = (
    pd.read_csv("https://raw.githubusercontent.com/gikirima/KlikBERT2/refs/heads/main/KlikBERT_dataset/labeled_data_balanced_2.csv")
    .drop_duplicates()
    .dropna()
)
print(f"Data shape after cleaning: {df.shape}")

# One encoder per task: each maps the string labels of its column to integer ids
le_clickbait, le_kategori = LabelEncoder(), LabelEncoder()
df["clickbait_label_encoded"] = le_clickbait.fit_transform(df["clickbait_label"])
df["kategori_label_encoded"] = le_kategori.fit_transform(df["kategori_berita"])

print("Clickbait labels:", le_clickbait.classes_)
print("Kategori labels:", le_kategori.classes_)


Data shape after cleaning: (23757, 4)
Clickbait labels: ['exaggeration' 'misleading' 'non clickbait' 'teasing']
Kategori labels: ['bisnis' 'entertainment' 'kesehatan' 'kriminal' 'lifestyle' 'lingkungan'
 'politik' 'sport' 'teknologi']


# ✅ Bagian 3: Split Train/Val/Test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as the test set, stratified on the clickbait label
train_df, test_df = train_test_split(
    df,
    stratify=df["clickbait_label_encoded"],
    test_size=0.2,
    random_state=42,
)
# Split a further 20% of the remaining rows off as the validation set
train_df, val_df = train_test_split(
    train_df,
    stratify=train_df["clickbait_label_encoded"],
    test_size=0.2,
    random_state=42,
)

print(f"Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")


Train: (15204, 6), Val: (3801, 6), Test: (4752, 6)


# ✅ Bagian 4: Tokenization

In [None]:
# Checkpoint name used for the tokenizer here and for the encoder later on
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_len = 256  # passed to the tokenizer as max_length


def tokenize_batch(df):
    """Tokenize the ``judul`` and ``isi`` columns of ``df`` as text pairs.

    Args:
        df: DataFrame with ``judul`` and ``isi`` columns.

    Returns:
        The tokenizer output for all rows, produced with truncation and
        padding enabled and ``max_length=max_len``.
    """
    titles = df["judul"].tolist()
    bodies = df["isi"].tolist()
    return tokenizer(
        titles,
        bodies,
        max_length=max_len,
        padding=True,
        truncation=True,
    )


# Encode each split with the same settings
train_enc, val_enc, test_enc = (
    tokenize_batch(split) for split in (train_df, val_df, test_df)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

# **✅ Bagian 5: Dataset Class**

In [None]:
class MultiTaskDataset(Dataset):
    """Dataset that pairs tokenizer encodings with the labels of both tasks.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            sequence (anything exposing ``.items()``).
        clickbait_labels: Integer clickbait label per sample.
        kategori_labels: Integer category label per sample.
    """

    def __init__(self, encodings, clickbait_labels, kategori_labels):
        self.encodings = encodings
        self.clickbait_labels = clickbait_labels
        self.kategori_labels = kategori_labels

    def __len__(self):
        # The dataset length is defined by the clickbait label list.
        return len(self.clickbait_labels)

    def __getitem__(self, idx):
        """Return the features and both labels of sample ``idx`` as tensors."""
        sample = {}
        for feature, column in self.encodings.items():
            sample[feature] = torch.tensor(column[idx])
        sample["clickbait_labels"] = torch.tensor(
            self.clickbait_labels[idx], dtype=torch.long
        )
        sample["kategori_labels"] = torch.tensor(
            self.kategori_labels[idx], dtype=torch.long
        )
        return sample

# Build one MultiTaskDataset per split from its encodings and encoded label columns
train_dataset, val_dataset, test_dataset = (
    MultiTaskDataset(
        enc,
        frame["clickbait_label_encoded"].tolist(),
        frame["kategori_label_encoded"].tolist(),
    )
    for enc, frame in (
        (train_enc, train_df),
        (val_enc, val_df),
        (test_enc, test_df),
    )
)


# ✅ Bagian 6: Define Model

In [None]:
class IndoBERTMultiTask(nn.Module):
    """Shared pretrained encoder with two linear classification heads.

    The hidden state of the first token is passed through dropout and then
    fed to one head for the clickbait task and one for the category task.

    Args:
        model_name: Checkpoint name or path given to ``AutoModel.from_pretrained``.
        num_clickbait_labels: Number of clickbait classes.
        num_kategori_labels: Number of category classes.
    """

    def __init__(self, model_name, num_clickbait_labels, num_kategori_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        self.clickbait_head = nn.Linear(hidden_size, num_clickbait_labels)
        self.kategori_head = nn.Linear(hidden_size, num_kategori_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None, clickbait_labels=None, kategori_labels=None):
        """Run the encoder and both heads.

        Args:
            input_ids: Token ids of the batch.
            attention_mask: Attention mask of the batch.
            token_type_ids: Optional segment ids. Some tokenizers do not emit
                them; when ``None`` they are not passed to the encoder at all.
            clickbait_labels: Optional clickbait targets.
            kategori_labels: Optional category targets.

        Returns:
            Dict with ``loss``, ``loss_clickbait``, ``loss_kategori`` (all
            ``None`` unless both label tensors are given), plus
            ``clickbait_logits`` and ``kategori_logits``.
        """
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        # Forward segment ids only when the batch actually has them, so a
        # tokenizer that produces none no longer triggers a TypeError here.
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids
        output = self.bert(**encoder_inputs)

        # Use the first token's hidden state as the sequence representation.
        pooled = self.dropout(output.last_hidden_state[:, 0, :])
        clickbait_logits = self.clickbait_head(pooled)
        kategori_logits = self.kategori_head(pooled)

        loss = None
        loss_clickbait = None
        loss_kategori = None
        if clickbait_labels is not None and kategori_labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss_clickbait = loss_fct(clickbait_logits, clickbait_labels)
            loss_kategori = loss_fct(kategori_logits, kategori_labels)
            # Unweighted sum of the two task losses.
            loss = loss_clickbait + loss_kategori

        return {
            "loss": loss,
            "loss_clickbait": loss_clickbait,
            "loss_kategori": loss_kategori,
            "clickbait_logits": clickbait_logits,
            "kategori_logits": kategori_logits,
        }

# Instantiate the two-head model with one output unit per encoded class,
# then move it to the selected device
model = IndoBERTMultiTask(
    model_name=model_name,
    num_kategori_labels=len(le_kategori.classes_),
    num_clickbait_labels=len(le_clickbait.classes_),
)
model = model.to(device)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

# ✅ Bagian 7: Trainer Setup

In [None]:
class MultiTaskTrainer(Trainer):
    """Trainer that passes both label sets to the model and logs per-task losses."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the combined loss for one batch.

        Args:
            model: Model called with the batch features and both label tensors.
            inputs: Batch dict containing ``clickbait_labels`` and
                ``kategori_labels`` alongside the model features.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments supplied by the caller; ignored here.

        Returns:
            The model's ``loss`` entry, or ``(loss, outputs)`` when
            ``return_outputs`` is true.
        """
        # Work on a shallow copy so the caller's batch keeps its label entries.
        features = dict(inputs)
        labels_clickbait = features.pop("clickbait_labels")
        labels_kategori = features.pop("kategori_labels")

        outputs = model(
            **features,
            clickbait_labels=labels_clickbait,
            kategori_labels=labels_kategori,
        )
        loss = outputs["loss"]

        # Record the per-task losses only in training mode.
        # NOTE(review): Trainer appears to call compute_loss for evaluation
        # batches too; logging there would mix evaluation losses into the
        # training log history — confirm against the installed version.
        if model.training:
            self.log({
                "loss_total": loss.detach().item(),
                "loss_clickbait": outputs["loss_clickbait"].detach().item(),
                "loss_kategori": outputs["loss_kategori"].detach().item(),
            })

        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Evaluation and checkpointing
    eval_strategy="steps",
    eval_steps=50,
    save_steps=200,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Output locations and logging
    output_dir="./results",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    report_to="tensorboard",  # enable logging to TensorBoard
)

# Train on the training split and evaluate on the validation split
trainer = MultiTaskTrainer(
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

In [None]:
# Save the training hyperparameters as JSON
with open("training_args.json", mode="w") as args_file:
    args_file.write(training_args.to_json_string())

# ✅ Bagian 8: Train

In [None]:
# Start training with the trainer built from training_args
trainer.train()

TypeError: IndoBERTMultiTask.forward() missing 1 required positional argument: 'token_type_ids'

# **data log training tersimpan pada folder ./logs**
jika ingin menampilkan grafik, run kode di bawah dalam notebook (melalui tensorboard).

direkomendasikan membuat grafik sendiri untuk paper


%load_ext tensorboard

%tensorboard --logdir ./logs

In [None]:
# Export the trainer's log history to a CSV file
import pandas as pd

# Every entry the trainer has logged so far
logs = trainer.state.log_history

# Tabulate the entries and write them out without the index column
df_logs = pd.DataFrame(data=logs)
df_logs.to_csv(path_or_buf="training_logs.csv", index=False)

print("✅ Log history disimpan ke training_logs.csv")

# ✅ Bagian 9: Evaluation

In [None]:
def get_predictions(trainer, dataset):
    """Predict both task labels for every sample in ``dataset``.

    Args:
        trainer: Object whose ``model`` attribute is the network to run. If it
            has no such attribute, the notebook-level ``model`` is used.
        dataset: Dataset yielding dicts with ``input_ids``, ``attention_mask``
            and, optionally, ``token_type_ids``.

    Returns:
        Tuple ``(clickbait_predictions, kategori_predictions)`` of lists of
        predicted class ids, in dataset order.
    """
    net = getattr(trainer, "model", None)
    if net is None:
        # Fall back to the global model for callers that pass no real trainer.
        net = model

    # Run on whatever device the network's parameters live on.
    try:
        target = next(net.parameters()).device
    except StopIteration:
        target = torch.device("cpu")

    all_clickbait = []
    all_kategori = []
    net.eval()
    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=8):
            # Segment ids are optional: not every tokenizer emits them.
            token_type_ids = batch.get("token_type_ids")
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(target)

            outputs = net(
                input_ids=batch["input_ids"].to(target),
                attention_mask=batch["attention_mask"].to(target),
                token_type_ids=token_type_ids,
            )
            all_clickbait.extend(outputs["clickbait_logits"].argmax(dim=1).cpu().tolist())
            all_kategori.extend(outputs["kategori_logits"].argmax(dim=1).cpu().tolist())

    return all_clickbait, all_kategori

# Predictions on the test split, with the matching ground-truth ids
pred_clickbait, pred_kategori = get_predictions(trainer, test_dataset)
true_clickbait = test_df["clickbait_label_encoded"].tolist()
true_kategori = test_df["kategori_label_encoded"].tolist()

# Joint accuracy: a sample counts only when both tasks are predicted correctly
joint_correct = [
    (cb_true == cb_pred) and (kt_true == kt_pred)
    for cb_true, cb_pred, kt_true, kt_pred in zip(
        true_clickbait, pred_clickbait, true_kategori, pred_kategori
    )
]
joint_acc = np.mean(joint_correct)
print(f"✅ Joint Accuracy: {joint_acc:.4f}")


# ✅ Bagian 10: Metrics & Report

In [None]:
def evaluate_task(true, pred, label_names, task_name):
    """Print and save the evaluation metrics of one classification task.

    Prints accuracy, macro/micro/weighted F1 and per-class precision, recall
    and F1. Saves a confusion-matrix heatmap to
    ``<task>_confusion_matrix.png`` and the classification report to
    ``<task>_report.txt``, where ``<task>`` is ``task_name`` lower-cased with
    spaces replaced by underscores.

    Args:
        true: Ground-truth class ids.
        pred: Predicted class ids.
        label_names: Class names; position ``i`` is assumed to name class id
            ``i`` (as with ``LabelEncoder.classes_``).
        task_name: Name used in printed output and output file names.
    """
    # Pin the label set so per-class arrays, matrix rows and report lines
    # stay aligned with label_names even when a class never occurs in
    # true or pred.
    label_ids = list(range(len(label_names)))
    slug = task_name.lower().replace(' ', '_')

    acc = accuracy_score(true, pred)
    print(f"\n✅ {task_name} Accuracy: {acc:.4f}")

    for avg in ('macro', 'micro', 'weighted'):
        f1 = f1_score(true, pred, average=avg)
        print(f" - {avg.capitalize()} F1: {f1:.4f}")

    # Per-class precision, recall and F1
    prec, rec, f1s, _ = precision_recall_fscore_support(
        true, pred, labels=label_ids, zero_division=0
    )
    for label, p, r, f in zip(label_names, prec, rec, f1s):
        print(f"   {label}: Precision={p:.2f}, Recall={r:.2f}, F1={f:.2f}")

    # Confusion matrix
    cm = confusion_matrix(true, pred, labels=label_ids)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=label_names, yticklabels=label_names)
    plt.title(f"{task_name} Confusion Matrix")
    plt.savefig(f"{slug}_confusion_matrix.png")
    plt.close()

    # Classification report
    report = classification_report(
        true, pred, labels=label_ids, target_names=label_names, zero_division=0
    )
    print(report)
    with open(f"{slug}_report.txt", "w") as f:
        f.write(report)

# Evaluate each task against its own label encoder's class names
evaluate_task(
    true=true_clickbait,
    pred=pred_clickbait,
    label_names=le_clickbait.classes_,
    task_name="Clickbait",
)
evaluate_task(
    true=true_kategori,
    pred=pred_kategori,
    label_names=le_kategori.classes_,
    task_name="Kategori",
)


# ✅ Bagian 11: Bias Analysis

In [None]:
def plot_label_distribution(true, pred, labels, title):
    """Plot true and predicted label counts side by side and save the figure.

    Args:
        true: Ground-truth class ids.
        pred: Predicted class ids.
        labels: Class names used as x tick labels, one per class id.
        title: Prefix for the panel titles; also determines the output file
            name ``<title>_distribution.png`` (lower-cased, spaces replaced
            by underscores).
    """
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    # Convert both inputs to arrays before plotting
    panels = (("True", np.array(true)), ("Predicted", np.array(pred)))
    positions = range(len(labels))

    for ax, (caption, values) in zip(axs, panels):
        # Wrap the ids in a DataFrame and pass an explicit order so every
        # class id gets a bar position
        frame = pd.DataFrame({'label': values})
        sns.countplot(x='label', data=frame, order=positions, ax=ax)
        ax.set_title(f"{title} - {caption}")
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, rotation=45, ha='right')

    plt.tight_layout()
    plt.savefig(f"{title.lower().replace(' ','_')}_distribution.png")
    plt.show()
    plt.close()


# Compare the true and predicted label distributions of each task
plot_label_distribution(
    true=true_clickbait,
    pred=pred_clickbait,
    labels=le_clickbait.classes_,
    title="Clickbait",
)
plot_label_distribution(
    true=true_kategori,
    pred=pred_kategori,
    labels=le_kategori.classes_,
    title="Kategori",
)

# ✅ Bagian 12: Simpan Model

In [None]:
# Write the model weights and the tokenizer files into one directory
save_dir = "./saved_model"
os.makedirs(save_dir, exist_ok=True)

weights_path = os.path.join(save_dir, "pytorch_model.bin")
torch.save(model.state_dict(), weights_path)
tokenizer.save_pretrained(save_dir)

print(f"\n✅ Model and tokenizer saved in {save_dir}")

# ✅ Bagian 13: Simpan JSON

In [None]:
import json

# Describe the saved model. Every value is read from the live objects so the
# file cannot drift from what was actually trained; the architecture entry in
# particular now matches the real class name.
config = {
    "architectures": [type(model).__name__],
    "model_type": "indobert-multitask",
    "num_clickbait_labels": len(le_clickbait.classes_),
    "num_kategori_labels": len(le_kategori.classes_),
    "_name_or_path": model_name,
    "hidden_size": model.bert.config.hidden_size,
    # Class names in encoder order, so predicted ids can be decoded without
    # the fitted label encoders.
    "clickbait_labels": [str(name) for name in le_clickbait.classes_],
    "kategori_labels": [str(name) for name in le_kategori.classes_],
}

with open(os.path.join(save_dir, "config.json"), "w") as f:
    json.dump(config, f, indent=2)