<a href="https://colab.research.google.com/github/cycl0ps/Tutorial/blob/main/xna_ai_text_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Shell out to nvidia-smi to show which GPU (if any) is visible to this runtime.
!nvidia-smi

In [None]:
!pip install -U transformers datasets evaluate accelerate huggingface_hub torch

In [None]:
import torch
import transformers
import datasets

# Environment report: library versions, then whether PyTorch can see a CUDA device.
print(
    f"Torch: {torch.__version__}",
    f"Transformers: {transformers.__version__}",
    f"Datasets: {datasets.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

In [None]:
from datasets import load_dataset
from huggingface_hub import hf_hub_download
import os

# Local folder that the raw HC3 file is downloaded into.
DATA_DIR = "/content/data/hc3"
os.makedirs(DATA_DIR, exist_ok=True)

# Fetch the single `all.jsonl` file from the HC3 dataset repo on the Hub;
# the return value is the local path of the downloaded file.
jsonl_path = hf_hub_download(
    repo_id="Hello-SimpleAI/HC3",
    filename="all.jsonl",
    repo_type="dataset",
    local_dir=DATA_DIR,
)

# Read the JSON-lines file with the generic "json" loader; the records are
# accessed through the "train" split below.
ds = load_dataset("json", data_files=jsonl_path)

# Sanity check: dataset summary and the field names of the first record.
print(ds)
print(ds["train"][0].keys())

In [None]:
# Splitting is done on the raw rows here, before they are expanded into
# individual answers later in the notebook, so all answers belonging to one
# row end up in the same split.
# Every step uses a fixed seed (42) so the partition is reproducible.
raw = ds["train"].shuffle(seed=42)

# First cut: 80% train, 20% held out.
split = raw.train_test_split(test_size=0.2, seed=42)
train_raw = split["train"]
temp = split["test"]

# Second cut: halve the held-out 20% into validation and test (10% / 10% overall).
split2 = temp.train_test_split(test_size=0.5, seed=42)
val_raw = split2["train"]
test_raw = split2["test"]

# Row counts per split: train, validation, test.
print(len(train_raw), len(val_raw), len(test_raw))

In [None]:
from datasets import Dataset

def flatten_hc3(dataset):
    """Expand HC3 rows into one labelled example per answer.

    Args:
        dataset: Iterable of rows, each exposing ``"human_answers"`` and
            ``"chatgpt_answers"`` as iterables of answer texts.

    Returns:
        A ``Dataset`` with a ``"text"`` column and a ``"label"`` column,
        where 0 marks a human answer and 1 marks a ChatGPT answer. Within
        each row, the human answers are emitted before the ChatGPT answers.
    """
    # (row field, class id) in the order answers are emitted for each row.
    answer_sources = (("human_answers", 0), ("chatgpt_answers", 1))

    examples = [
        (answer, class_id)
        for row in dataset
        for field, class_id in answer_sources
        for answer in row[field]
    ]

    return Dataset.from_dict(
        {
            "text": [answer for answer, _ in examples],
            "label": [class_id for _, class_id in examples],
        }
    )

# Flatten each split on its own, in train / validation / test order.
train_ds, val_ds, test_ds = (
    flatten_hc3(rows) for rows in (train_raw, val_raw, test_raw)
)

print(train_ds)

In [None]:
from collections import Counter

# Class balance per split: label -> number of examples.
for split_name, split_ds in (("Train:", train_ds), ("Val  :", val_ds), ("Test :", test_ds)):
    print(split_name, Counter(split_ds["label"]))


In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same "bert-base-uncased" checkpoint that the classifier is
# loaded from later in the notebook; the two names must stay in sync.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Encode the ``"text"`` field of a batch with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for those texts, requested with
        truncation enabled and padding to a fixed ``max_length`` of 256.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    return tokenizer(batch["text"], **encode_options)

# Replace the raw "text" column with the tokenizer output in every split.
# NOTE: the results are bound to the same names and "text" is dropped, so this
# cell cannot be re-run without first re-running the flattening cell.
train_ds, val_ds, test_ds = (
    part.map(tokenize, batched=True, remove_columns=["text"])
    for part in (train_ds, val_ds, test_ds)
)

# Switch the output format of each split to "torch".
train_ds.set_format(type="torch")
val_ds.set_format(type="torch")
test_ds.set_format(type="torch")

In [None]:
from transformers import AutoModelForSequenceClassification

# Human-readable class names for the 0 = human / 1 = AI encoding produced by
# flatten_hc3. Storing them in the model config means the saved and published
# checkpoint reports "HUMAN" / "AI" instead of the generic LABEL_0 / LABEL_1.
ID2LABEL = {0: "HUMAN", 1: "AI"}
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

# Load the pretrained checkpoint with a 2-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

In [None]:
import evaluate
import numpy as np

# Load each metric once here; compute_metrics reuses these objects on every
# evaluation pass.
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn raw evaluation output into the metrics reported during training.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the arg-max of its logits along axis 1.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"`` (in that order); the last three are computed with
        ``average="binary"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    scores = {
        "accuracy": accuracy.compute(
            predictions=predicted, references=labels
        )["accuracy"]
    }

    # These three share the same call shape, differing only in the metric object.
    binary_metrics = (("precision", precision), ("recall", recall), ("f1", f1))
    for key, metric in binary_metrics:
        result = metric.compute(
            predictions=predicted, references=labels, average="binary"
        )
        scores[key] = result[key]

    return scores

In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch. NOTE(review): the two strategies presumably need to match for
# load_best_model_at_end to work - confirm against the installed transformers.
training_args = TrainingArguments(
    output_dir="/content/models/xna-ai-text-detector",  # same path is reused as SAVE_DIR further down
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=200,
    load_best_model_at_end=True,  # together with metric_for_best_model: select the best checkpoint by "f1"
    metric_for_best_model="f1",  # one of the keys returned by compute_metrics
    report_to="none"  # do not report to any experiment tracker
)

In [None]:
from transformers import Trainer

# Wire the model, data, and metrics together.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# was deprecated in transformers 4.46 and is scheduled for removal in 5.0, and this
# notebook installs the latest release. Passing it lets the Trainer save the
# tokenizer alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Final evaluation on the test split, which was passed to the Trainer neither
# as training data nor as the per-epoch evaluation set.
trainer.evaluate(test_ds)

In [None]:
# Write the final model and the tokenizer into one folder.
# NOTE: this is the same path that TrainingArguments uses as output_dir.
SAVE_DIR = "/content/models/xna-ai-text-detector"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login; the repo-creation and upload cells further
# down act on behalf of the account authenticated here.
notebook_login()

In [None]:
from huggingface_hub import HfApi
# Confirm the login worked by fetching the authenticated account's details.
api = HfApi()
api.whoami()

In [None]:
from huggingface_hub import HfApi, get_token

# Never print the access token itself: cell output is stored inside the .ipynb
# and is published with the notebook. Only report whether a token is available.
# `get_token` is used in place of the legacy `HfFolder` helper.
print("TOKEN:", "set" if get_token() else "missing")
print("WHOAMI:", HfApi().whoami())

In [None]:
from huggingface_hub import create_repo

# Create the target model repo on the Hub; exist_ok=True keeps the call from
# failing when the repo already exists.
create_repo(
    repo_id="xnajoan/xna-ai-text-detector",
    exist_ok=True,
    private=False   # set to True to make the repo private
)

In [None]:
#====Cell-15====#
## Push to HuggingFace Hub

repo_id = "xnajoan/xna-ai-text-detector"  # replace with your own <username>/<repo>

# Trainer.push_to_hub() has no `repo_id` parameter: extra keyword arguments are
# forwarded to the model-card builder, which rejects unknown names with a
# TypeError. The destination repo is taken from `args.hub_model_id`, so set
# that before pushing.
trainer.args.hub_model_id = repo_id

trainer.push_to_hub(
    commit_message="Initial commit: fine-tuned BERT on HC3 for AI text detection"
)

# Upload the tokenizer files to the same repo.
tokenizer.push_to_hub(repo_id)