In [1]:
!pip -q install -U "transformers>=4.44" accelerate peft safetensors
!pip -q install -U bitsandbytes


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset


## **Dataset + Data Preparation**
**Load Dataset**

In [None]:
import json

# Location of the instruction-tuning dataset, relative to the notebook's working directory.
file_path = "dataset_pmb_uajy.json"

try:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The rest of the notebook iterates over `data` and indexes it, so it must be a non-empty list.
    if not isinstance(data, list):
        raise ValueError("JSON harus berupa LIST of objects")
    if not data:
        raise ValueError("JSON LIST kosong, tidak ada data")

    # Validate every record (not only a prefix): the category listing below and the
    # stratified split both read d["category"] from all records.
    required = {"category", "instruction", "input", "output"}
    for i, ex in enumerate(data):
        if not isinstance(ex, dict):
            raise ValueError(f"Index {i} harus berupa object, bukan {type(ex).__name__}")
        missing = required - set(ex.keys())
        if missing:
            raise ValueError(f"Index {i} missing keys: {missing}")

    categories = sorted({d["category"] for d in data})

    print("✅ JSON berhasil dibaca")
    print("Jumlah data :", len(data))
    print("Categories  :", categories)
    print("Contoh data:", data[0])

# Each handler prints a friendly hint and then re-raises, so that "Run All" stops here
# instead of continuing with an undefined (or stale, from a previous run) `data`.
except FileNotFoundError:
    print(f"❌ File '{file_path}' tidak ditemukan. Pastikan nama file & lokasi benar.")
    raise
except json.JSONDecodeError:
    print("❌ Format JSON tidak valid. Cek koma, kurung, dll.")
    raise
except Exception as e:
    print("❌ Error lain:", e)
    raise


✅ JSON berhasil dibaca
Jumlah data : 463
Categories  : ['Alur', 'Beasiswa', 'Biaya', 'PMB_Umum', 'Pembayaran', 'Prodi', 'Profesi', 'S1', 'S2', 'S3', 'Umum']
Contoh data: {'category': 'S1', 'instruction': 'Jawablah pertanyaan berikut berdasarkan informasi resmi PMB Universitas Atma Jaya Yogyakarta.', 'input': 'Siapa saja yang dapat mendaftar Program Sarjana (S1) di Universitas Atma Jaya Yogyakarta melalui Program Nilai Ijazah?', 'output': 'Program Nilai Ijazah terbuka bagi siswa SMA/SMK yang telah menyelesaikan studi (Lulusan tahun 2026 dan sebelumnya) yang penerimaannya didasarkan pada nilai ijazah.'}


**Split train/val/test stratified by category**

In [6]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of the records, keeping the category proportions of the full set.
labels = [record["category"] for record in data]
train_data, temp_data = train_test_split(
    data,
    test_size=0.2,
    stratify=labels,
    shuffle=True,
    random_state=42,
)

# Stage 2: cut the held-out part in half (validation / test), again stratified by category.
temp_labels = [record["category"] for record in temp_data]
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    stratify=temp_labels,
    shuffle=True,
    random_state=42,
)

print(len(train_data), len(val_data), len(test_data))


370 46 47


**Templating**

In [7]:
def format_prompt(category: str, instruction: str, inp: str) -> str:
    """Render one example as a prompt that ends with the response marker.

    Each argument is stripped of surrounding whitespace; a falsy value
    (``None`` or ``""``) is treated as an empty string. The ``### Input:``
    section is emitted only when the stripped input is non-empty.

    Returns:
        The prompt text, always terminated by ``"### Response:\\n"``.
    """
    def _clean(value):
        return (value or "").strip()

    parts = [
        f"### Category: {_clean(category)}\n### Instruction:\n{_clean(instruction)}\n\n"
    ]

    user_input = _clean(inp)
    if user_input:
        parts.append(f"### Input:\n{user_input}\n\n")

    parts.append("### Response:\n")
    return "".join(parts)


## **Tokenization & Prompt Formatting**
**Tokenization**

In [None]:


# Checkpoint whose tokenizer is used for the datasets, and the sequence-length cap for them.
model_id = "Sahabat-AI/gemma2-9b-cpt-sahabatai-v1-base"
MAX_LEN = 1024

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Batches are padded in collate_fn, so a pad token must exist; reuse EOS if none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class InstructionDataset(Dataset):
    """Map-style dataset turning instruction records into causal-LM training tensors.

    Each record is a dict read with the keys ``category``, ``instruction``,
    ``input`` and ``output`` (missing keys default to ``""``). The prompt is
    built with ``format_prompt``; the stripped ``output`` is the answer.

    Args:
        data: Indexable collection of record dicts.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``;
            it must accept ``add_special_tokens`` and expose ``eos_token_id``.
        max_len: Maximum number of tokens kept per example (right-truncated).
    """

    def __init__(self, data, tokenizer, max_len=1024):
        self.data = data
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for record ``idx``.

        Prompt positions in ``labels`` are set to -100 so only the answer and
        the trailing EOS token contribute to the loss.
        """
        ex = self.data[idx]

        prompt = format_prompt(
            ex.get("category", ""),
            ex.get("instruction", ""),
            ex.get("input", "")
        )
        answer = (ex.get("output", "") or "").strip()

        # Tokenize prompt and answer separately and concatenate the ids. Tokenizing
        # the joined text and then masking "len(tokenized prompt)" positions is only
        # correct if the prompt tokens are an exact prefix of the joined tokens,
        # which a subword tokenizer does not guarantee at the prompt/answer boundary.
        # It also keeps the prompt tokens identical to what inference feeds the model.
        prompt_ids = list(self.tok(prompt, add_special_tokens=True)["input_ids"])
        answer_ids = list(self.tok(answer, add_special_tokens=False)["input_ids"])

        # Right-truncate to max_len, as the tokenizer-side truncation did before.
        input_ids = (prompt_ids + answer_ids + [self.tok.eos_token_id])[: self.max_len]

        # NOTE: if the prompt alone fills max_len, every label is -100 and the
        # example carries no training signal.
        n_prompt = min(len(prompt_ids), len(input_ids))
        labels = [-100] * n_prompt + input_ids[n_prompt:]

        # No padding happens here (it is done per batch), so every position is real.
        attention_mask = [1] * len(input_ids)

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

# One InstructionDataset per split, all sharing the same tokenizer and length cap.
train_ds, val_ds, test_ds = (
    InstructionDataset(split_records, tokenizer, MAX_LEN)
    for split_records in (train_data, val_data, test_data)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

**Collate function (padding batch)**

In [11]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Right-pad a list of examples to the longest sequence in the batch.

    ``input_ids`` is padded with the tokenizer's pad token id, ``attention_mask``
    with 0, and ``labels`` with -100 (the value the dataset already uses for
    positions excluded from the loss).

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors,
        each stacked batch-first.
    """
    fill_values = {
        "input_ids": tokenizer.pad_token_id,
        "attention_mask": 0,
        "labels": -100,
    }
    return {
        field: pad_sequence(
            [example[field] for example in batch],
            batch_first=True,
            padding_value=fill,
        )
        for field, fill in fill_values.items()
    }


**Test DataLoader + sanity check decode**

In [None]:
from torch.utils.data import DataLoader

# Small shuffled loader; pull one batch to confirm the padded tensor shapes line up.
train_loader = DataLoader(
    train_ds,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)
batch = next(iter(train_loader))
print(dict((name, tensor.shape) for name, tensor in batch.items()))


# Show the rendered prompt plus the first 200 characters of the reference answer.
sample = train_data[0]
print("\n--- Preview ---")
print(
    format_prompt(sample["category"], sample["instruction"], sample.get("input",""))
    + sample["output"][:200]
)


{'input_ids': torch.Size([2, 73]), 'attention_mask': torch.Size([2, 73]), 'labels': torch.Size([2, 73])}

--- Preview ---
### Category: Profesi
### Instruction:
Sebutkan minimal IPK yang disyaratkan untuk mendaftar PPAr UAJY.

### Input:
Minimal IPK PPAr UAJY

### Response:
Minimal IPK yang disyaratkan untuk mendaftar PPAr UAJY adalah 3,00.


## **Fine-Tuning**
**Load model**

In [1]:
import torch
from transformers import AutoModelForCausalLM

model_id = "Sahabat-AI/gemma2-9b-cpt-sahabatai-v1-base"

# Prefer bfloat16 when the GPU supports it: it has the same exponent range as
# float32, whereas float16 can overflow in large models.
# NOTE(review): Gemma 2 is reported to be numerically unstable in float16 — confirm
# against the model documentation. float16 stays as the fallback so the cell still
# runs on hardware without bf16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16 if use_bf16 else torch.float16,
    device_map="auto"
)

print("Model Sahabat-AI loaded")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/853 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Model Sahabat-AI loaded


**LoRA**

In [None]:
!pip -q install peft

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapters on the attention query/value projections only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Rebinds `model` to the PEFT-wrapped model; re-running this cell wraps it again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


**Training Loop**

In [None]:
from torch.optim import AdamW

# Hand the optimizer only the parameters that require gradients (the LoRA adapters);
# this also fails fast with an "empty parameter list" error if the LoRA cell was skipped.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=2e-4)

# Trailing semicolon keeps the notebook from printing the full module repr.
model.train();

**Hitung Loss awal (1 batch)**

In [None]:
# Baseline: loss on a single training batch before any optimizer step.
batch = {
    name: tensor.to(model.device)
    for name, tensor in next(iter(train_loader)).items()
}

with torch.no_grad():
    out = model(**batch)

print("Initial loss:", out.loss.item())


In [None]:
EPOCHS = 1

for epoch in range(EPOCHS):
    total_loss = 0

    for step, batch in enumerate(train_loader):
        # Move every tensor of the batch onto the model's device.
        batch = {name: tensor.to(model.device) for name, tensor in batch.items()}

        # Forward pass; the model computes the loss from the `labels` entry.
        loss = model(**batch).loss

        # Backward pass and parameter update, then clear gradients for the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        step_loss = loss.item()
        total_loss += step_loss

        # Progress line every 50 steps (including step 0).
        if step % 50 == 0:
            print(f"Epoch {epoch} Step {step} Loss {step_loss:.4f}")

    print(f"Epoch {epoch} Avg loss:", total_loss / len(train_loader))


**Save Adapter**

In [None]:
# Must match the `adapter_dir` the inference cell loads from ("gemma9b_lora_adapter");
# the previous name "gemma2b_lora_adapter" pointed at a directory that cell never reads.
save_dir = "gemma9b_lora_adapter"

# Write the PEFT model and the tokenizer files into the same directory.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("Adapter saved to", save_dir)


## **Model Prediction / Demo**

**Load base model + adapter (untuk inference)**

In [None]:
from peft import PeftModel

# Base checkpoint and the directory holding the trained LoRA adapter.
base_model_id = "Sahabat-AI/gemma2-9b-cpt-sahabatai-v1-base"
adapter_dir = "gemma9b_lora_adapter"

# Fresh tokenizer for inference; fall back to EOS when no pad token is defined.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base weights, then attach the adapter on top of them.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)
model = PeftModel.from_pretrained(base_model, adapter_dir)

# Switch to evaluation mode for generation.
model.eval()


**Fungsi generate jawaban**

In [None]:
def generate_answer(sample, max_new_tokens=200):
    """Greedy-decode the model's answer for one dataset record.

    Args:
        sample: Record dict with ``category`` and ``instruction`` keys and an
            optional ``input`` key.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation only (prompt excluded), decoded without
        special tokens and stripped of surrounding whitespace.
    """
    prompt = format_prompt(sample["category"], sample["instruction"], sample.get("input",""))
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding; `temperature` only applies to sampling, so it is not passed.
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens. Slice them off by length
    # rather than splitting the decoded text on "### Response:", which returns the
    # wrong span whenever the instruction or input itself contains that marker.
    generated_ids = out[0][prompt_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


**Generate untuk test set + simpan**

In [None]:
import json

# Copy each test record and attach the model's answer under "model_response".
pred_test = [
    {**record, "model_response": generate_answer(dict(record), max_new_tokens=200)}
    for record in test_data
]

# Persist the predictions; ensure_ascii=False keeps non-ASCII text readable in the file.
out_path = "test_predictions.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(pred_test, f, ensure_ascii=False, indent=2)

print("Saved:", out_path)


## **Model Evaluation (LLM-based Scoring)**
**Setup Judge**

Model juri (judge) yang kami gunakan adalah Qwen karena Qwen mendukung banyak bahasa (multibahasa), salah satunya bahasa Indonesia, yaitu bahasa yang kami gunakan pada LLM Sahabat-AI Gemma.

In [None]:
!pip -q install -U huggingface_hub
import os
from huggingface_hub import InferenceClient

# Read the Hugging Face token from the environment instead of relying on a variable
# left over from an earlier session; the name was previously used without being defined.
# The value is None when HF_TOKEN is not set, and is passed through as-is.
HF_TOKEN = os.environ.get("HF_TOKEN")
judge_model = "Qwen/Qwen2.5-7B-Instruct"

judge = InferenceClient(model=judge_model, token=HF_TOKEN)


In [None]:
import re
import json
import numpy as np

def judge_prompt(sample):
    """Build the grading prompt sent to the judge model for one prediction.

    Args:
        sample: Dict with ``instruction``, ``output`` and ``model_response``
            keys and an optional ``input`` key.

    Returns:
        The prompt text with surrounding whitespace stripped. It asks for a
        0-5 score returned as a JSON object with ``score`` and ``reason``.
    """
    lines = [
        "Nilai jawaban model berikut dari 0 sampai 5.",
        'Balas HANYA JSON valid: {"score":0-5,"reason":"singkat"}',
        "",
        f'Instruction: {sample["instruction"]}',
        f'Input: {sample.get("input","")}',
        f'Expected: {sample["output"]}',
        f'Model Response: {sample["model_response"]}',
    ]
    return "\n".join(lines).strip()

def parse_score(text):
    """Extract a ``(score, reason)`` pair from the judge model's raw reply.

    The reply is scanned for JSON objects, starting at each ``{`` in order.
    The first object that is a dict whose ``score`` converts to an integer in
    the range 0-5 wins. Scanning per object (instead of one greedy
    first-``{``-to-last-``}`` match) keeps parsing working when the judge adds
    extra text or more than one brace group around the answer.

    Args:
        text: Raw text returned by the judge.

    Returns:
        ``(score, reason)`` with ``score`` an int in 0-5 and ``reason`` a
        stripped string, or ``None`` when no usable object is found
        (including when ``text`` is not a string).
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    for brace in re.finditer(r"\{", text):
        try:
            # raw_decode parses one JSON value starting at this offset and
            # ignores whatever follows it.
            obj, _ = decoder.raw_decode(text, brace.start())
        except json.JSONDecodeError:
            continue
        if not isinstance(obj, dict):
            continue

        try:
            score = int(obj.get("score"))
        except (TypeError, ValueError, OverflowError):
            # Missing, non-numeric or non-finite score: try the next candidate.
            continue

        if 0 <= score <= 5:
            return score, str(obj.get("reason","")).strip()

    return None


**Menjalankan scoring + stats**

In [None]:
scored = []
invalid = 0

for s in pred_test:
    prompt = judge_prompt(s)
    # Deterministic decoding via do_sample=False. temperature=0.0 is avoided because
    # text-generation backends that require a strictly positive temperature reject it.
    resp = judge.text_generation(prompt, max_new_tokens=120, do_sample=False)
    parsed = parse_score(resp)

    # Work on a copy so pred_test keeps only the model predictions.
    s2 = dict(s)
    if parsed is None:
        invalid += 1
        s2["judge_score"] = None
        s2["judge_reason"] = "INVALID_JUDGE_OUTPUT"
    else:
        s2["judge_score"], s2["judge_reason"] = parsed

    scored.append(s2)

# Summary statistics are computed over successfully parsed scores only.
valid_scores = [x["judge_score"] for x in scored if x["judge_score"] is not None]
stats = {
    "n_total": len(scored),
    "n_valid": len(valid_scores),
    "n_invalid": invalid,
    "mean": float(np.mean(valid_scores)) if valid_scores else None,
    "median": float(np.median(valid_scores)) if valid_scores else None,
    # Count of each possible score 0..5, keyed by the score as a string.
    "dist": {str(k): valid_scores.count(k) for k in range(6)}
}

print(stats)

out_path2 = "test_with_judge.json"
with open(out_path2, "w", encoding="utf-8") as f:
    json.dump(scored, f, ensure_ascii=False, indent=2)

print("Saved:", out_path2)
