In [2]:
!pip install -q faiss-cpu pymupdf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline
import fitz
import os
from transformers import AutoModelForCausalLM, AutoTokenizer



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Check Dataset**

In [4]:
DATASET_PATH = "/content/drive/MyDrive/RAG_Expirement/Dataset"

documents = []

for file in os.listdir(DATASET_PATH):
    if file.endswith(".pdf"):
        path = os.path.join(DATASET_PATH, file)
        doc = fitz.open(path)

        for page_num, page in enumerate(doc):
            text = page.get_text()
            documents.append({
                "text": text,
                "source": file,
                "page": page_num + 1
            })

print("Total halaman:", len(documents))

Total halaman: 89


# Chunking

In [5]:
def chunk_text(text, size=500, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start+size])
        start += size - overlap
    return chunks

chunks = []

for doc in documents:
    pieces = chunk_text(doc["text"], size=1200, overlap=300)
    for p in pieces:
        chunks.append({
            "text": p,
            "source": doc["source"],
            "page": doc["page"]
        })

print("Total chunk:", len(chunks))

Total chunk: 166


# Embedding

In [6]:
embedder = SentenceTransformer(
 "intfloat/multilingual-e5-base"
)

texts = ["passage: " + c["text"] for c in chunks]
embeddings = embedder.encode(texts, show_progress_bar=True)
embeddings = np.array(embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: intfloat/multilingual-e5-base
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

# Vector DB

In [7]:
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
print("Vector DB siap")

Vector DB siap


# Retrieval

In [8]:
def retrieve(query, k=3):
    q_emb = embedder.encode(["query: " + query])
    distances, indices = index.search(np.array(q_emb), k)
    return [chunks[i] for i in indices[0]]

# QA Model

In [9]:
qa = pipeline(
    "question-answering",
    model="deepset/xlm-roberta-large-squad2"
)

config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

XLMRobertaForQuestionAnswering LOAD REPORT from: deepset/xlm-roberta-large-squad2
Key                         | Status     |  | 
----------------------------+------------+--+-
roberta.pooler.dense.bias   | UNEXPECTED |  | 
roberta.pooler.dense.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [10]:
generator = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_new_tokens=200,
    device=0
)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Passing `generation_config` together with generation-related arguments=({'max_new_tokens'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.


# RAG with source

In [11]:
def rag_answer(query):
    results = retrieve(query, k=3)
    context = " ".join([r["text"][:1500] for r in results])

    # ---- Extractive QA ----
    answer = qa(
        question=query,
        context=context,
        max_answer_len=400
    )["answer"]


    if len(answer.strip()) < 25 or answer.strip() in ["(1)", "(2)", "1", "a."]:
        prompt = f"""
Gunakan konteks berikut untuk menjawab pertanyaan secara singkat dan jelas.

Konteks:
{context}

Pertanyaan:
{query}

Jawaban:
"""
        gen = generator(
            prompt,
            max_new_tokens=150,
            do_sample=False
        )[0]["generated_text"]

        answer = gen.split("Jawaban:")[-1].strip()

    # ---- Sumber ----
    sources = list(set([
        f'{r["source"]} halaman {r["page"]}'
        for r in results
    ]))[:2]

    # ---- Normalisasi ----
    replacements = {
        "\n": " ",
        "ora.ng": "orang",
        "persetqiuan": "persetujuan",
        "kebljakan": "kebijakan"
    }

    for k, v in replacements.items():
        answer = answer.replace(k, v)

    answer = " ".join(answer.split())

    return answer, sources

In [12]:
ans, src = rag_answer("Pertanyaan: Apa kewajiban baru bagi Penyelenggara Sistem Elektronik (PSE) yang memberikan layanan kepada pengguna anak? Langkah teknis apa yang wajib mereka sediakan?")
print("Jawaban:", ans)
print("Sumber:", src)

Jawaban: menerapkan teknologi dan langkah teknis operasional untuk memberikan pelindungan
Sumber: ['Salinan UU Nomor 1 Tahun 2024.pdf halaman 5', 'Salinan UU Nomor 1 Tahun 2024.pdf halaman 23']


In [13]:
def batch_rag(questions):
    results = []

    for i, q in enumerate(questions, 1):
        ans, src = rag_answer(q)
        results.append({
            "No": i,
            "Pertanyaan": q,
            "Jawaban": ans,
            "Sumber": src[0] if len(src)>0 else "-"
        })

    return results


In [14]:
questions = [
    "Kapan Undang-Undang Nomor 1 Tahun 2024 ini diundangkan?",
    "Apa ancaman pidana maksimal bagi orang yang dengan sengaja menyebarkan konten perjudian?",
    "Apakah Informasi Elektronik diakui sebagai alat bukti hukum yang sah?",
    "Apa sanksi administratif bagi Penyelenggara Sistem Elektronik (PSE) yang melanggar kewajiban pelindungan anak?",
    "Apa definisi Data Pribadi menurut undang-undang ini?",
    "Sebutkan dua jenis kategori Data Pribadi!",
    "Berapa lama batas waktu maksimal bagi Pengendali Data Pribadi untuk memberitahukan kegagalan pelindungan data (kebocoran data) secara tertulis?",
    "Apa ancaman pidana bagi orang yang dengan sengaja membuat Data Pribadi palsu untuk menguntungkan diri sendiri?"
]


In [15]:
results = batch_rag(questions)

for r in results:
    print(f"No: {r['No']}")
    print("Pertanyaan:", r["Pertanyaan"])
    print("Jawaban:", r["Jawaban"])
    print("Sumber:", r["Sumber"])
    print("-"*50)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Passing `generation_config` together with generation-related arguments=({'max_new_tokens', 'do_sample'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Both `max_new_tokens` (=150) and `max_length`(=2048) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=150) and `max_length`(=2048) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=150) and `max_length`(=2048) seem to have been set. `max_new_tokens` will take

No: 1
Pertanyaan: Kapan Undang-Undang Nomor 1 Tahun 2024 ini diundangkan?
Jawaban: Undang-Undang Nomor 1 Tahun 2024 ini diundangkan di Jakarta pada tanggal 2 Januari 2024.
Sumber: Salinan UU Nomor 1 Tahun 2024.pdf halaman 21
--------------------------------------------------
No: 2
Pertanyaan: Apa ancaman pidana maksimal bagi orang yang dengan sengaja menyebarkan konten perjudian?
Jawaban: 10 (sepuluh) tahun dan/atau denda paling banyak Rp 1 0. 000.000.000,00 (sepuluh miliar rupiah).
Sumber: Salinan UU Nomor 1 Tahun 2024.pdf halaman 19
--------------------------------------------------
No: 3
Pertanyaan: Apakah Informasi Elektronik diakui sebagai alat bukti hukum yang sah?
Jawaban: mengikat dan diakui sebagai alat bukti yang sah
Sumber: Salinan UU Nomor 1 Tahun 2024.pdf halaman 24
--------------------------------------------------
No: 4
Pertanyaan: Apa sanksi administratif bagi Penyelenggara Sistem Elektronik (PSE) yang melanggar kewajiban pelindungan anak?
Jawaban: Penyelenggara Sistem 

In [26]:
import gradio as gr

def chat_rag(message, history):
    answer, sources = rag_answer(message)
    return f"{answer}\n\nSumber: {sources}"

demo = gr.ChatInterface(
    fn=chat_rag,
    title="Chatbot RAG Dokumen Hukum",
    description="Tanya jawab berbasis UU No 1 Tahun 2024 & UU No 27 Tahun 2022"
)

demo.launch(share=True)


  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c8b66f3dc9e6ce90cf.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


