In [None]:
!pip -q install -U "transformers>=4.44" accelerate peft safetensors
!pip -q install -U bitsandbytes


^C
^C


  You can safely remove it manually.


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset


## **Dataset + Data Preparation**
**Load Dataset**

In [None]:
import json

# CHANGE this to match the file name shown in your VSCode explorer.
file_path = "dataset_pmb_uajy.json"

# Keys every record in the dataset must provide.
required = {"category", "instruction", "input", "output"}


def validate_records(records, required_keys=None):
    """Validate the structure of the loaded dataset.

    Every record is checked (not just a leading sample), because the code
    that follows reads ``category`` from all of them.

    Args:
        records: The object returned by ``json.load``.
        required_keys: Iterable of keys each record must contain. Defaults
            to the module-level ``required`` set.

    Raises:
        ValueError: If ``records`` is not a list, is empty, contains a
            non-dict element, or contains a record lacking a required key.
            The message names the offending index where applicable.
    """
    keys = required if required_keys is None else set(required_keys)

    if not isinstance(records, list):
        raise ValueError("JSON harus berupa LIST of objects")
    if not records:
        # An empty list would otherwise fail later on data[0] with an
        # unhelpful "list index out of range".
        raise ValueError("JSON berisi list kosong")

    for i, ex in enumerate(records):
        if not isinstance(ex, dict):
            raise ValueError(f"Index {i} bukan object (dict)")
        missing = keys - set(ex.keys())
        if missing:
            raise ValueError(f"Index {i} missing keys: {missing}")


try:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Validate the container type and the fields of every record.
    validate_records(data)

    categories = sorted({d["category"] for d in data})

    print("✅ JSON berhasil dibaca")
    print("Jumlah data :", len(data))
    print("Categories  :", categories)
    print("Contoh data:", data[0])

except FileNotFoundError:
    print(f"❌ File '{file_path}' tidak ditemukan. Pastikan nama file & lokasi benar.")
except json.JSONDecodeError:
    print("❌ Format JSON tidak valid. Cek koma, kurung, dll.")
except Exception as e:
    # Best-effort reporting: validation errors and anything unexpected are
    # printed rather than raised, as in the original cell.
    print("❌ Error lain:", e)


✅ JSON berhasil dibaca
Jumlah data : 463
Categories  : ['Alur', 'Beasiswa', 'Biaya', 'PMB_Umum', 'Pembayaran', 'Prodi', 'Profesi', 'S1', 'S2', 'S3', 'Umum']
Contoh data: {'category': 'S1', 'instruction': 'Jawablah pertanyaan berikut berdasarkan informasi resmi PMB Universitas Atma Jaya Yogyakarta.', 'input': 'Siapa saja yang dapat mendaftar Program Sarjana (S1) di Universitas Atma Jaya Yogyakarta melalui Program Nilai Ijazah?', 'output': 'Program Nilai Ijazah terbuka bagi siswa SMA/SMK yang telah menyelesaikan studi (Lulusan tahun 2026 dan sebelumnya) yang penerimaannya didasarkan pada nilai ijazah.'}


**Split train/val/test stratified by category**

In [None]:
from sklearn.model_selection import train_test_split

# Stratification key: the category of every record, in dataset order.
labels = [record["category"] for record in data]

# Stage 1: hold out 20% of the records, keeping category proportions.
train_data, temp_data = train_test_split(
    data,
    stratify=labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Stage 2: cut the held-out part in half to get validation and test sets,
# again stratified by category.
temp_labels = [record["category"] for record in temp_data]
val_data, test_data = train_test_split(
    temp_data,
    stratify=temp_labels,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

# Report the size of each split: train, val, test.
print(*map(len, (train_data, val_data, test_data)))


370 46 47


**Templating**

In [5]:
def format_prompt(category: str, instruction: str, inp: str) -> str:
    """Render the text prompt for one example.

    Each argument is treated as an empty string when falsy and is stripped of
    surrounding whitespace. The prompt always contains the category and
    instruction sections and ends with the ``### Response:`` marker; the
    ``### Input:`` section is included only when ``inp`` is non-empty after
    stripping.

    Args:
        category: Category label of the example.
        instruction: Instruction text.
        inp: Optional input/question text.

    Returns:
        The assembled prompt, ending with ``"### Response:\\n"``.
    """
    cat_text, instr_text, input_text = (
        (field or "").strip() for field in (category, instruction, inp)
    )

    sections = [f"### Category: {cat_text}\n### Instruction:\n{instr_text}\n\n"]
    if input_text:
        sections.append(f"### Input:\n{input_text}\n\n")
    sections.append("### Response:\n")
    return "".join(sections)


## **Tokenization & Prompt Formatting**
**Tokenization**

In [None]:


# Maximum number of tokens kept per example when tokenizing.
MAX_LEN = 1024

# Hub identifier of the checkpoint whose tokenizer is loaded here.
model_id = "Sahabat-AI/gemma2-9b-cpt-sahabatai-v1-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Padding needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class InstructionDataset(Dataset):
    """Map-style dataset yielding tokenized instruction examples.

    Each record is rendered as ``format_prompt(...) + output + EOS`` and
    tokenized with truncation to ``max_len``. The label sequence is a copy of
    the input ids in which the positions covered by the prompt are replaced
    with -100, so that the loss focuses on the response part.

    Args:
        data: Sequence of dict records with ``category``, ``instruction``,
            ``input`` and ``output`` entries (missing/None entries are
            treated as empty strings).
        tokenizer: Callable tokenizer exposing ``eos_token`` and returning a
            mapping with ``input_ids`` and ``attention_mask``.
        max_len: Truncation length passed to the tokenizer.
    """

    def __init__(self, data, tokenizer, max_len=1024):
        self.data, self.tok, self.max_len = data, tokenizer, max_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` with truncation at ``self.max_len``."""
        return self.tok(text, truncation=True, max_length=self.max_len)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as long tensors."""
        record = self.data[idx]

        prompt = format_prompt(
            *(record.get(key, "") for key in ("category", "instruction", "input"))
        )
        answer = (record.get("output", "") or "").strip()

        # Full training text first, then the prompt alone; the latter is only
        # used to learn how many leading positions belong to the prompt.
        encoded = self._encode(prompt + answer + self.tok.eos_token)
        prompt_len = len(self._encode(prompt)["input_ids"])

        input_ids = encoded["input_ids"]
        # Never mask past the end of a (possibly truncated) sequence.
        n_masked = min(prompt_len, len(input_ids))
        labels = [-100] * n_masked + input_ids[n_masked:]

        fields = (
            ("input_ids", input_ids),
            ("attention_mask", encoded["attention_mask"]),
            ("labels", labels),
        )
        return {
            name: torch.tensor(values, dtype=torch.long) for name, values in fields
        }

# Wrap each split in an InstructionDataset that shares the tokenizer and length cap.
train_ds, val_ds, test_ds = (
    InstructionDataset(split, tokenizer, MAX_LEN)
    for split in (train_data, val_data, test_data)
)


In [3]:
# Quick generation smoke test: send a raw question (without the instruction
# template) to the model and print the decoded result.
# NOTE(review): `model` is not defined in any cell visible above, and this
# cell's execution count (In [3]) is lower than the templating cell's (In [5]).
# It presumably relies on a model loaded in a cell that is not shown here —
# confirm the notebook survives Restart Kernel -> Run All.
prompt = "Apa itu Program Tanpa Tes di UAJY?"

# Tokenize into PyTorch tensors and move them to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate with a cap of 100 new tokens.
outputs = model.generate(
    **inputs,
    max_new_tokens=100
)

# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Apa itu Program Tanpa Tes di UAJY?

Program Tanpa Tes di UAJY adalah program yang memungkinkan calon mahasiswa untuk masuk ke UAJY tanpa harus mengikuti tes masuk. Program ini sangat cocok bagi mereka yang ingin masuk ke UAJY tanpa harus melalui proses tes yang panjang dan melelahkan.

Bagaimana Cara Mendaftar Program Tanpa Tes di UAJY?

Untuk mendaftar Program Tanpa Tes di UAJY, calon mahasiswa harus memenuhi beberapa persyaratan yang telah ditentukan oleh pihak UAJY.


**Collate function (padding batch)**

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Pad a list of examples into batch-first tensors.

    Args:
        batch: List of dicts, each holding 1-D ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        Dict with the same three keys. Sequences are padded to the longest
        one in the batch: ``input_ids`` with the tokenizer's pad token id,
        ``attention_mask`` with 0 and ``labels`` with -100.
    """

    def _pad(field, fill):
        # Collect one field across the batch and pad it to a common length.
        return pad_sequence(
            [example[field] for example in batch],
            batch_first=True,
            padding_value=fill,
        )

    return {
        "input_ids": _pad("input_ids", tokenizer.pad_token_id),
        "attention_mask": _pad("attention_mask", 0),
        "labels": _pad("labels", -100),
    }


**Test DataLoader + sanity-check prompt preview**

In [None]:
from torch.utils.data import DataLoader

# Build a small shuffled loader and pull a single batch to confirm that
# collation and padding produce tensors of matching shapes.
train_loader = DataLoader(train_ds, collate_fn=collate_fn, batch_size=2, shuffle=True)
batch = next(iter(train_loader))
print({name: tensor.shape for name, tensor in batch.items()})

# Show one example as text (to make sure the template is right): the rendered
# prompt followed by the first 200 characters of the raw output.
sample = train_data[0]
print("\n--- Preview ---")
print(
    "".join(
        (
            format_prompt(sample["category"], sample["instruction"], sample.get("input","")),
            sample["output"][:200],
        )
    )
)
