In [None]:
import os, json, random, re, unicodedata
from pathlib import Path

import torch


In [None]:
# Central configuration for the whole notebook: model id, file locations,
# hyper-parameters and process-wide settings. Later cells read these names.

# ---- Model ----
BASE_MODEL  = "Qwen/Qwen2.5-3B-Instruct"   # Qwen 2.5 3B Instruct

# ---- Dataset paths ----
SRC_JSON    = "/workspace/QAs_Hukumonline_Train.json"  # original Q&A export (JSON)
DATA_DIR    = "/workspace/data"
CHAT_JSONL  = f"{DATA_DIR}/train_chat_qwen.jsonl"      # chat-format JSONL written by the data-prep cell

# ---- Output dirs ----
OUT_DIR     = "/workspace/qwen25_3b_qlora_hukum"    # trainer output / saved LoRA adapter
MERGED_DIR  = "/workspace/qwen25_3b_qlora_merged"   # merged full model export

# Create every directory the notebook writes to, in one consistent way.
for _dir in (DATA_DIR, OUT_DIR, MERGED_DIR):
    os.makedirs(_dir, exist_ok=True)

# ---- Training params ----
MAX_SEQ_LEN    = 4096    # upper bound on sequence length (the training cell may cap it lower)
EPOCHS         = 2
BATCH_SIZE     = 1       # per-device train batch size
GRAD_ACC       = 8       # gradient accumulation steps
LORA_R         = 16
LORA_ALPHA     = 32
LORA_DROPOUT   = 0.05
LR             = 2e-4
WARMUP_RATIO   = 0.03
LOG_STEPS      = 20
SAVE_STEPS     = 500
EVAL_RATIO     = 0.0   # keep 0.0 for OOM-safe (no eval)

# ---- Reproducibility ----
SEED           = 42

# OOM safety on GPU pods
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Seed both Python's and PyTorch's generators: seeding only `random` leaves
# torch-side randomness (e.g. weight initialisation) different on every run.
random.seed(SEED)
torch.manual_seed(SEED)

print("BASE_MODEL:", BASE_MODEL)
print("SRC_JSON  :", SRC_JSON)
print("CHAT_JSONL:", CHAT_JSONL)
print("OUT_DIR   :", OUT_DIR)
print("MERGED_DIR:", MERGED_DIR)


In [None]:
def norm_txt(x: str) -> str:
    """Return a whitespace- and Unicode-normalized copy of ``x``.

    Steps, in order:
      1. NFKC-normalize (folds compatibility forms such as full-width letters).
      2. Convert CRLF / lone CR line endings to ``\\n``.
      3. Collapse runs of spaces/tabs to a single space.
      4. Drop a space left dangling before a newline so that "blank" lines
         containing only spaces count as blank.
      5. Collapse three or more consecutive newlines to exactly two.
      6. Strip leading/trailing whitespace.

    Args:
        x: Text to normalize. ``None`` and any non-string value yield ``""``
           so that callers can treat malformed fields as empty.

    Returns:
        The normalized string (possibly empty).
    """
    if not isinstance(x, str):
        # Covers None as well as numbers/lists that may appear in loose JSON.
        return ""
    # Normalize first: NFKC can itself introduce spaces (e.g. U+00B4 becomes
    # " " + combining acute), so stripping must happen afterwards.
    x = unicodedata.normalize("NFKC", x)
    x = x.replace("\r\n", "\n").replace("\r", "\n")
    x = re.sub(r"[ \t]+", " ", x)       # collapse spaces/tabs
    x = x.replace(" \n", "\n")          # no trailing space before a line break
    x = re.sub(r"\n{3,}", "\n\n", x)    # collapse excessive newlines
    return x.strip()

# Read the raw export in one go; everything below expects a JSON array of objects.
rows = json.loads(Path(SRC_JSON).read_text(encoding="utf-8"))

assert isinstance(rows, list) and all(isinstance(row, dict) for row in rows), "Expect a list of dicts."

# Keep only pairs where both sides are non-empty after normalization and
# reasonably long: at least 3 words of instruction and 5 words of response.
clean = []
for row in rows:
    question = norm_txt(row.get("instruction", ""))
    answer = norm_txt(row.get("response", ""))
    long_enough = len(question.split()) >= 3 and len(answer.split()) >= 5
    if question and answer and long_enough:
        clean.append({"instruction": question, "response": answer})

# Dedup on the exact (instruction, response) pair. A dict keeps insertion
# order and `setdefault` never overwrites, so the first occurrence wins.
first_by_pair = {}
for rec in clean:
    first_by_pair.setdefault((rec["instruction"], rec["response"]), rec)
seen = set(first_by_pair)
dedup = list(first_by_pair.values())

def write_jsonl(
    path,
    records,
    sys_prompt="Anda adalah asisten hukum yang akurat, padat, dan mengutip dasar hukum bila relevan.",
):
    """Write instruction/response pairs as chat-format JSON Lines.

    Each record becomes one line holding a ``{"messages": [...]}`` object with
    three turns: system (``sys_prompt``), user (the instruction) and assistant
    (the response). Non-ASCII characters are written verbatim as UTF-8.

    Args:
        path: Destination file; it is created or overwritten.
        records: Iterable of dicts with ``"instruction"`` and ``"response"`` keys.
        sys_prompt: System message placed at the start of every conversation.
            Defaults to the Indonesian legal-assistant prompt, so existing
            two-argument calls behave exactly as before.

    Raises:
        KeyError: If a record lacks ``"instruction"`` or ``"response"``.
        OSError: If ``path`` cannot be opened for writing.
    """
    with open(path, "w", encoding="utf-8") as f:
        for r in records:
            obj = {
                "messages": [
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": r["instruction"]},
                    {"role": "assistant", "content": r["response"]},
                ]
            }
            # ensure_ascii=False keeps the text human-readable in the file.
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")

# Persist the cleaned, de-duplicated pairs in chat format for the trainer.
write_jsonl(CHAT_JSONL, dedup)

# Report how many rows survived filtering and de-duplication.
print(f"Total asli: {len(rows)} | Bersih+dedup: {len(dedup)}")
# The arrow was previously mis-encoded ("â†’"); emit the intended character.
print("Saved chat JSONL →", CHAT_JSONL)

In [None]:
# Show the installed TRL version next to the run output.
# NOTE(review): SFTTrainer/SFTConfig keyword names appear to differ between TRL
# releases — confirm the training cell's arguments against the version printed here.
import trl
print("trl:", trl.__version__)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

# Sequence length safety dial (raise it once memory use is known to be safe)
MAX_LEN_TRAIN = min(MAX_SEQ_LEN, 3072)  # start safer; try 4096 later

# 4-bit quantization config.
# The compute dtype matches the bf16 training/load dtype used below. fp16 and
# bf16 are both 16-bit, so fp16 saved no memory and only mixed two half
# precisions within the same run.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer: fall back to EOS as the pad token if none is defined, pad on the right.
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, trust_remote_code=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
tok.padding_side = "right"

# Dataset: keep only the `messages` column of the chat JSONL.
ds = load_dataset("json", data_files=CHAT_JSONL, split="train")
ds = ds.map(lambda e: {"messages": e["messages"]},
            remove_columns=[c for c in ds.column_names if c != "messages"])

# Turn off eval to reduce VRAM spikes.
# Evaluation is hard-disabled here; EVAL_RATIO from the config cell is not consulted.
train_ds, eval_ds = ds, None
_eval_strategy = "no"

# LoRA config (Qwen2.5 uses LLaMA-like projection names)
peft_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)

cfg = SFTConfig(
    output_dir=OUT_DIR,
    bf16=True,                           # A40 supports bf16
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="paged_adamw_8bit",
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    logging_steps=LOG_STEPS,
    save_total_limit=2,
    report_to="none",

    # sequence
    max_length=MAX_LEN_TRAIN,
    packing=False,

    # forwarded to AutoModelForCausalLM.from_pretrained(...)
    model_init_kwargs={
        "quantization_config": bnb,
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "attn_implementation": "sdpa",
        "low_cpu_mem_usage": True,
        "trust_remote_code": True,
    },

    eval_strategy=_eval_strategy,
)

trainer = SFTTrainer(
    model=BASE_MODEL,           # TRL will load via model_init_kwargs
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    peft_config=peft_cfg,
    args=cfg,
    # `processing_class` is TRL's current name for the tokenizer argument; the
    # older `tokenizer=` keyword is deprecated and does not belong to the same
    # API generation as `SFTConfig(max_length=...)` used above.
    processing_class=tok,       # ensure chat_template applied properly
)

# Train, then persist the adapter and the tokenizer side by side in OUT_DIR.
trainer.train()
trainer.save_model(OUT_DIR)
tok.save_pretrained(OUT_DIR)
print("LoRA adapter saved to:", OUT_DIR)


In [None]:
from transformers import AutoModelForCausalLM, pipeline
from peft import PeftModel

# Tokenizer for inference; fall back to EOS as the pad token if none is defined.
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, trust_remote_code=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Base model in bf16, with device placement delegated to device_map="auto".
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Attach the LoRA adapter saved in OUT_DIR on top of the base weights.
model = PeftModel.from_pretrained(base, OUT_DIR)

# `torch_dtype` and `device_map` are model-loading options that `pipeline()`
# forwards to `from_pretrained` when `model` is a checkpoint name. The model
# here is already instantiated with both set, so they are omitted.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    max_new_tokens=256,
)

def gen_chat(user_text: str) -> str:
    """Answer a single user question with the fine-tuned model.

    Builds a two-turn conversation (fixed Indonesian legal-assistant system
    prompt + ``user_text``), renders it with the tokenizer's chat template and
    decodes greedily for up to 256 new tokens.

    Args:
        user_text: The user's question.

    Returns:
        Only the newly generated assistant text, stripped of surrounding whitespace.
    """
    messages = [
        {"role": "system", "content": "Anda adalah asisten hukum yang akurat, padat, dan mengutip dasar hukum bila relevan."},
        {"role": "user", "content": user_text},
    ]
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Greedy decoding: `temperature` has no effect when do_sample=False and only
    # triggers a warning, so it is not passed. `return_full_text=False` makes
    # the pipeline return just the continuation, replacing manual prompt slicing.
    result = pipe(prompt, do_sample=False, max_new_tokens=256, return_full_text=False)
    return result[0]["generated_text"].strip()

# Sample Indonesian contract-law prompts used to eyeball the adapter's output.
tests = [
    "Ringkas unsur Pasal 1320 KUHPerdata.",
    "Jelaskan perbedaan wanprestasi dan perbuatan melawan hukum beserta dasar hukumnya.",
    "Apa syarat sahnya perjanjian menurut hukum Indonesia? Cantumkan pasal terkait.",
]

# Print each answer under a numbered header (Q1, Q2, ...).
for idx in range(len(tests)):
    answer = gen_chat(tests[idx])
    print(f"\n--- Q{idx + 1} ---\n{answer}\n")


In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

# Make sure the export directory exists before anything is written to it.
os.makedirs(MERGED_DIR, exist_ok=True)

# Load the base weights on CPU in bf16.
cpu_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="cpu",
    trust_remote_code=True,
)
base_cpu = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **cpu_load_kwargs)

# Attach the adapter from OUT_DIR, then merge it into the base model so the
# result can be saved as a plain standalone checkpoint.
adapter_on_cpu = PeftModel.from_pretrained(base_cpu, OUT_DIR)
merged = adapter_on_cpu.merge_and_unload()

# The tokenizer is shipped alongside the merged weights.
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, trust_remote_code=True)

# Save model first, tokenizer second, both into MERGED_DIR.
for artifact in (merged, tok):
    artifact.save_pretrained(MERGED_DIR)
print("Merged full model saved to:", MERGED_DIR)
