In [None]:
!pip install -q faiss-cpu pymupdf

In [None]:
import os

import faiss
import fitz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from google.colab import drive
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

drive.mount('/content/drive')



# **Check Dataset**

In [None]:
DATASET_PATH = "/content/drive/MyDrive/RAG_Expirement/Dataset"

# One record per PDF page: extracted text plus the file name and the
# 1-based page number, which are later reported as the answer's source.
documents = []

# sorted(): os.listdir returns entries in arbitrary order, which would make
# chunk ids (and therefore the FAISS index layout) differ between runs.
for file in sorted(os.listdir(DATASET_PATH)):
    # Case-insensitive match so files named "*.PDF" are not silently skipped.
    if not file.lower().endswith(".pdf"):
        continue

    path = os.path.join(DATASET_PATH, file)

    # The context manager closes the PDF handle once its pages are read;
    # previously every opened document stayed open for the whole session.
    with fitz.open(path) as doc:
        for page_num, page in enumerate(doc):
            documents.append({
                "text": page.get_text(),
                "source": file,
                "page": page_num + 1
            })

print("Total halaman:", len(documents))

# Chunking

In [None]:
def chunk_text(text, size=500, overlap=100):
    """Split ``text`` into fixed-size character windows that overlap.

    Args:
        text: String to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < size``.

    Returns:
        List of chunks in document order. An empty ``text`` yields ``[]``.
        The last chunk may be shorter than ``size``.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)``. With ``overlap >= size`` the window would never
            advance, which previously caused an infinite loop.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    step = size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        # Once a window reaches the end of the text, any further window would
        # be a pure suffix of this one (a duplicate chunk), so stop here.
        if start + size >= len(text):
            break
    return chunks

# Cut every page into overlapping 1200-character windows (300 shared
# characters); each piece carries the file name and page number of its page.
chunks = [
    {
        "text": piece,
        "source": page_record["source"],
        "page": page_record["page"],
    }
    for page_record in documents
    for piece in chunk_text(page_record["text"], size=1200, overlap=300)
]

print("Total chunk:", len(chunks))

# Embedding

In [None]:
# Sentence encoder shared by indexing (here) and querying (in retrieve()).
embedder = SentenceTransformer("intfloat/multilingual-e5-base")

# Indexed texts carry the "passage: " prefix; queries use "query: " instead.
texts = [f"passage: {chunk['text']}" for chunk in chunks]
embeddings = np.array(embedder.encode(texts, show_progress_bar=True))

# Vector DB

In [None]:
# Fail early with a readable message: when no chunk was embedded (wrong
# DATASET_PATH, PDFs without a text layer, ...) the array is not 2-D and
# `shape[1]` would otherwise raise an opaque IndexError.
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        "No embeddings to index - check DATASET_PATH and the chunking step."
    )

dim = embeddings.shape[1]

# Exact (brute-force) L2 index; row i of the index corresponds to chunks[i].
index = faiss.IndexFlatL2(dim)

# FAISS only accepts C-contiguous float32 matrices, so make that explicit
# instead of relying on the encoder's default output dtype.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
print("Vector DB siap")

# Retrieval

In [None]:
def retrieve(query, k=3):
    """Return the chunks closest to ``query`` in the vector index.

    Args:
        query: Question text; it is embedded with the "query: " prefix.
        k: Maximum number of chunks to return.

    Returns:
        List of chunk dicts ordered from nearest to farthest. It holds fewer
        than ``k`` items when the index has fewer vectors, and is empty when
        ``k <= 0`` or the index is empty.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # hits with id -1, and chunks[-1] would silently return the LAST chunk.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_emb = embedder.encode(["query: " + query])
    _, indices = index.search(np.asarray(q_emb, dtype="float32"), k)

    # Keep the i >= 0 filter as a second guard against padded results.
    return [chunks[i] for i in indices[0] if i >= 0]

# QA Model

In [None]:
# Extractive reader: returns an answer span taken from the supplied context.
qa = pipeline(task="question-answering", model="deepset/xlm-roberta-large-squad2")

In [None]:
# Generative fallback model. Use the first GPU when CUDA is available and
# fall back to CPU (-1) otherwise; a hard-coded device=0 raises on
# CPU-only runtimes.
generator = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_new_tokens=200,
    device=0 if torch.cuda.is_available() else -1
)

# RAG with source

In [None]:
def _normalize_answer(answer):
    """Repair known OCR misspellings and collapse whitespace runs to one space."""
    ocr_fixes = {
        "ora.ng": "orang",
        "persetqiuan": "persetujuan",
        "kebljakan": "kebijakan"
    }
    for wrong, right in ocr_fixes.items():
        answer = answer.replace(wrong, right)

    # split()/join also turns newlines into single spaces.
    return " ".join(answer.split())


def rag_answer(query):
    """Answer ``query`` from the indexed documents and report its sources.

    The three nearest chunks form the context. An extractive QA model is
    tried first; when its answer is shorter than 25 characters the text
    generation model produces the answer from the same context instead.

    Args:
        query: Question text.

    Returns:
        Tuple ``(answer, sources)``. ``answer`` is a normalized string.
        ``sources`` holds at most two distinct "<file> halaman <page>"
        labels, ordered by retrieval rank (best match first).
    """
    results = retrieve(query, k=3)
    if not results:
        # The QA pipeline rejects an empty context, so answer explicitly.
        return "Tidak ditemukan konteks yang relevan dalam dokumen.", []

    context = " ".join(r["text"][:1500] for r in results)

    # ---- Extractive QA ----
    answer = qa(
        question=query,
        context=context,
        max_answer_len=400
    )["answer"]

    # Very short spans (list markers such as "(1)" or "a.", bare numbers)
    # are treated as failed extractions. The former explicit marker list was
    # redundant: every entry is already shorter than 25 characters.
    if len(answer.strip()) < 25:
        prompt = f"""
Gunakan konteks berikut untuk menjawab pertanyaan secara singkat dan jelas.

Konteks:
{context}

Pertanyaan:
{query}

Jawaban:
"""
        # return_full_text=False yields only the completion, so the prompt
        # does not have to be cut off by searching for "Jawaban:" (which
        # picked the wrong segment when the model emitted that marker again).
        answer = generator(
            prompt,
            max_new_tokens=150,
            do_sample=False,
            return_full_text=False
        )[0]["generated_text"].strip()

    # ---- Sources ----
    # dict.fromkeys de-duplicates while keeping retrieval rank; a set has
    # arbitrary order, so the best-ranked source could be dropped by [:2].
    sources = list(dict.fromkeys(
        f'{r["source"]} halaman {r["page"]}'
        for r in results
    ))[:2]

    return _normalize_answer(answer), sources

In [None]:
# Run one question through the pipeline and show its answer and sources.
ans, src = rag_answer(
    "Pertanyaan: Apa kewajiban baru bagi Penyelenggara Sistem Elektronik (PSE) yang memberikan layanan kepada pengguna anak? Langkah teknis apa yang wajib mereka sediakan?"
)
print(f"Jawaban: {ans}")
print(f"Sumber: {src}")

In [None]:
def batch_rag(questions):
    """Run ``rag_answer`` on each question and collect the results as rows.

    Args:
        questions: Iterable of question strings.

    Returns:
        List of dicts with keys "No" (1-based position), "Pertanyaan",
        "Jawaban" and "Sumber" (the first source, or "-" when there is none).
    """
    rows = []

    for number, question in enumerate(questions, start=1):
        answer, sources = rag_answer(question)
        top_source = sources[0] if sources else "-"
        rows.append(
            dict(No=number, Pertanyaan=question, Jawaban=answer, Sumber=top_source)
        )

    return rows


In [None]:
# Indonesian-language questions that are passed to batch_rag() in the next
# cell; each one is answered independently.
questions = [
    "Kapan Undang-Undang Nomor 1 Tahun 2024 ini diundangkan?",
    "Apa ancaman pidana maksimal bagi orang yang dengan sengaja menyebarkan konten perjudian?",
    "Apakah Informasi Elektronik diakui sebagai alat bukti hukum yang sah?",
    "Apa sanksi administratif bagi Penyelenggara Sistem Elektronik (PSE) yang melanggar kewajiban pelindungan anak?",
    "Apa definisi Data Pribadi menurut undang-undang ini?",
    "Sebutkan dua jenis kategori Data Pribadi!",
    "Berapa lama batas waktu maksimal bagi Pengendali Data Pribadi untuk memberitahukan kegagalan pelindungan data (kebocoran data) secara tertulis?",
    "Apa ancaman pidana bagi orang yang dengan sengaja membuat Data Pribadi palsu untuk menguntungkan diri sendiri?"
]


In [None]:
# Answer every question, then print one labelled block per result.
results = batch_rag(questions)

for row in results:
    print(
        f"No: {row['No']}",
        f"Pertanyaan: {row['Pertanyaan']}",
        f"Jawaban: {row['Jawaban']}",
        f"Sumber: {row['Sumber']}",
        "-" * 50,
        sep="\n",
    )

In [None]:
import gradio as gr

def chat_rag(message, history):
    """Chat callback: answer ``message`` and append the sources it came from.

    Args:
        message: Latest user message, used as the question.
        history: Previous turns supplied by the chat UI; not used, every
            question is answered independently.

    Returns:
        The answer, a blank line, then "Sumber: " followed by the
        comma-separated source labels ("-" when there are none).
    """
    answer, sources = rag_answer(message)

    # Join the labels for display; interpolating the list directly showed the
    # user a Python repr such as "['a.pdf halaman 1', ...]".
    source_text = ", ".join(sources) if sources else "-"
    return f"{answer}\n\nSumber: {source_text}"

# Chat UI whose replies are produced by chat_rag.
demo = gr.ChatInterface(
    chat_rag,
    title="Chatbot RAG Dokumen Hukum",
    description="Tanya jawab berbasis UU No 1 Tahun 2024 & UU No 27 Tahun 2022",
)

# Start the app with link sharing enabled.
demo.launch(share=True)
