In [1]:
!pip -q install "transformers>=4.44" "trl>=0.9.6" "peft>=0.12.0" \
                 "bitsandbytes>=0.43" "accelerate>=0.34" \
                 "datasets>=2.20.0" sentencepiece "protobuf<5"


In [7]:
import os, torch
from pathlib import Path

# Paths
BASE_MODEL  = "google/gemma-3-4b-it"
SRC_JSON    = "/workspace/QAs_Hukumonline_Train.json"
DATA_DIR    = "/workspace/data"
Path(DATA_DIR).mkdir(parents=True, exist_ok=True)
CHAT_JSONL  = f"{DATA_DIR}/train_chat.jsonl"

OUT_DIR     = "/workspace/gemma3_qlora_hukum"
MERGED_DIR  = "/workspace/gemma3_qlora_merged"
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MERGED_DIR, exist_ok=True)

# Training params
MAX_SEQ_LEN    = 4096    # we will pass this to trainer, not SFTConfig
EPOCHS         = 2
BATCH_SIZE     = 1
GRAD_ACC       = 8
LORA_R         = 16
LORA_ALPHA     = 32
LORA_DROPOUT   = 0.05
LR             = 2e-4
WARMUP_RATIO   = 0.03
LOG_STEPS      = 20
SAVE_STEPS     = 500
EVAL_RATIO     = 0.02     # 0 means no eval split


In [3]:
import json, random, re, unicodedata
from pathlib import Path

random.seed(42)

def norm_txt(x: str) -> str:
    x = x.strip()
    x = unicodedata.normalize("NFKC", x)
    x = re.sub(r"[ \t]+", " ", x)       # collapse spaces/tabs
    x = re.sub(r"\n{3,}", "\n\n", x)    # collapse excessive newlines
    return x

# 1) Load
with open(SRC_JSON, "r", encoding="utf-8") as f:
    rows = json.load(f)
assert isinstance(rows, list) and all(isinstance(r, dict) for r in rows), "Expect a list of dicts."

# 2) Clean & filter
clean = []
for r in rows:
    instr = norm_txt(r.get("instruction", ""))
    resp  = norm_txt(r.get("response", ""))
    if not instr or not resp:
        continue
    if len(instr.split()) < 3 or len(resp.split()) < 5:
        continue
    clean.append({"instruction": instr, "response": resp})

# 3) Deduplicate by (instruction, response)
seen = set(); dedup = []
for r in clean:
    k = (r["instruction"], r["response"])
    if k in seen: 
        continue
    seen.add(k); dedup.append(r)

# 4) Export to messages[] JSONL (no split here)
def write_jsonl(path, records):
    with open(path, "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps({
                "messages": [
                    {"role":"system","content":"Anda adalah asisten hukum yang akurat, padat, dan mengutip dasar hukum bila relevan."},
                    {"role":"user","content": r["instruction"]},
                    {"role":"assistant","content": r["response"]}
                ]}, ensure_ascii=False) + "\n")

Path(DATA_DIR).mkdir(parents=True, exist_ok=True)
write_jsonl(CHAT_JSONL, dedup)

print(f"Total asli: {len(rows)} | Bersih+dedup: {len(dedup)}")
print("Saved chat JSONL →", CHAT_JSONL)


Total asli: 384 | Bersih+dedup: 383
Saved chat JSONL → /workspace/data/train_chat.jsonl


In [4]:
import trl
print(trl.__version__)


  from .autonotebook import tqdm as notebook_tqdm


0.24.0


In [13]:
# Cell 4 — QLoRA training (OOM-safe on A40, TRL 0.24.0)
import os, torch
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from datasets import load_dataset
from transformers import AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

# --- dial down sequence length first to avoid OOM ---
MAX_LEN_TRAIN = min(MAX_SEQ_LEN, 3072)  # start safer; try 4096 later

# 4-bit quantization (use fp16 compute to save VRAM)
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,  # <- was bfloat16; fp16 saves memory
)

# Tokenizer (do NOT pass to trainer)
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if getattr(tok, "chat_template", None) is None:
    tok.padding_side = "right"

# Dataset: messages[]
ds = load_dataset("json", data_files=CHAT_JSONL, split="train")
def only_messages(e): return {"messages": e["messages"]}
ds = ds.map(only_messages, remove_columns=[c for c in ds.column_names if c != "messages"])

# Turn off eval to avoid evaluation-time memory spikes
train_ds, eval_ds = ds, None
_eval_strategy = "no"

# LoRA config
peft_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)

# TRL 0.24.0 — put max_length, packing, and model_init_kwargs in SFTConfig
cfg = SFTConfig(
    output_dir=OUT_DIR,
    bf16=True,                           # activations in bf16 (safe on A40)
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="paged_adamw_8bit",
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    logging_steps=LOG_STEPS,
    save_total_limit=2,
    report_to="none",

    # Data/packing controls
    max_length=MAX_LEN_TRAIN,            # <- control sequence length here
    packing=False,

    # Forwarded to AutoModelForCausalLM.from_pretrained(...)
    model_init_kwargs={
        "quantization_config": bnb,
        "torch_dtype": torch.bfloat16,   # model weights bfloat16 on load (ok)
        "device_map": "auto",
        "attn_implementation": "sdpa",   # memory-friendlier attention
        "low_cpu_mem_usage": True,
    },

    # No eval to reduce VRAM
    eval_strategy=_eval_strategy,
)

trainer = SFTTrainer(
    model=BASE_MODEL,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    peft_config=peft_cfg,
    args=cfg,
)

trainer.train()
trainer.save_model(OUT_DIR)
tok.save_pretrained(OUT_DIR)
print("LoRA adapter saved to:", OUT_DIR)


Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.16s/it]
Tokenizing train dataset: 100%|██████████| 383/383 [00:01<00:00, 250.83 examples/s]
Truncating train dataset: 100%|██████████| 383/383 [00:00<00:00, 8878.13 examples/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2, 'pad_token_id': 0}.


Step,Training Loss
20,1.8767
40,1.3996
60,1.2511
80,1.2045


LoRA adapter saved to: /workspace/gemma3_qlora_hukum


In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch, os

BASE_MODEL = "google/gemma-3-4b-it"
OUT_DIR    = "/workspace/gemma3_qlora_hukum"

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base, OUT_DIR)

# Avoid pad/eos mismatch warnings during generation
pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
tok.pad_token = tok.eos_token if tok.pad_token is None else tok.pad_token

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    max_new_tokens=256,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
Device set to use cuda:0


In [15]:
import re, numpy as np

prompts = [
    "User: Ringkas unsur Pasal 1320 KUHPerdata.\nAssistant:",
    "User: Jelaskan perbedaan wanprestasi dan perbuatan melawan hukum beserta dasar hukumnya.\nAssistant:",
    "User: Apa syarat sahnya perjanjian menurut hukum Indonesia? Cantumkan pasal terkait.\nAssistant:",
]

def gen(p):
    return pipe(p, temperature=0.2, do_sample=False, max_new_tokens=256)[0]["generated_text"]

outs = [gen(p) for p in prompts]
for i, o in enumerate(outs, 1):
    text = o.split("Assistant:", 1)[-1]
    print(f"\n--- Q{i} ---\n{text.strip()}\n")

# quick-and-dirty “sanity” metrics
lens = [len(o.split()) for o in outs]
has_legal_terms = [bool(re.search(r"\b(Pasal|UU|KUHP|KUHPerdata|Perma|Permen|Putusan)\b", o, flags=re.I)) for o in outs]
print("\nApprox word lengths:", lens)
print("Contains legal terms:", has_legal_terms, " → score:", np.mean(has_legal_terms))



--- Q1 ---
Pasal 1320 KUHPerdata mengatur tentang kewajiban membayar ganti kerugian kepada orang yang dirugikan akibat perbuatan melawan hukum.

Penjelasan Pasal 1320 KUHPerdata adalah sebagai berikut:

Pasal 1320

Barang siapa dengan sengaja atau karena kelalaiannya menimbulkan kerugian pada orang lain, wajib mengganti kerugian tersebut.

Pasal 1320 ayat (1) KUHPerdata mengatur tentang kewajiban membayar ganti kerugian kepada orang yang dirugikan akibat perbuatan melawan hukum.

Pasal 1320 ayat (2) KUHPerdata mengatur bahwa kerugian yang harus diganti adalah kerugian yang timbul akibat perbuatan yang melawan hukum.

Pasal 1320 ayat (3) KUHPerdata mengatur bahwa kerugian yang harus diganti adalah kerugian yang timbul akibat perbuatan yang melawan hukum.

Pasal 1320 ayat (4) KUHPerdata mengatur bahwa kerugian yang harus diganti adalah kerugian yang timbul akibat perbuatan yang melawan hukum.

Pasal 1320 ayat (5) KUHPerdata mengatur bahwa kerugian yang harus dig


--- Q2 ---
Wanprestasi

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch, os

BASE_MODEL = "google/gemma-3-4b-it"
OUT_DIR    = "/workspace/gemma3_qlora_hukum"
MERGED_DIR = "/workspace/gemma3_qlora_merged"

os.makedirs(MERGED_DIR, exist_ok=True)

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.bfloat16, device_map="cpu"
)
merged = PeftModel.from_pretrained(base, OUT_DIR).merge_and_unload()

merged.save_pretrained(MERGED_DIR)
tok.save_pretrained(MERGED_DIR)
print("Merged full model saved to:", MERGED_DIR)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  8.09it/s]


Merged full model saved to: /workspace/gemma3_qlora_merged
