## Install Dependencies

In [1]:
import sys
sys.executable

'C:\\Users\\aldha\\Projects\\mini-rag\\.venv\\Scripts\\python.exe'

In [2]:
!pip install pdfplumber beautifulsoup4 nltk tqdm

Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
Collecting beautifulsoup4
  Using cached beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting Pillow>=9.1 (from pdfplumber)
  Downloading pillow-12.0.0-cp314-cp314-win_amd64.whl.metadata (9.0 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.2.0-py3-none-win_amd64.whl.metadata (67 kB)
Collecting charset-normalizer>=2.0.0 (from pdfminer.six==20251107->pdfplumber)
  Using cached charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl.metadata (38 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20251107->pdfplumber)
  Downloading cryptography-46.0.3-cp311-abi3-win_amd64.whl.metadata (5.7 

In [3]:
import os
import re
import nltk
import pdfplumber
from bs4 import BeautifulSoup
from tqdm import tqdm

nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aldha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aldha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

## Preprocessing Step

In [4]:
def clean_text(text: str) -> str:
    text = re.sub(r"\s+", " ", text)        # remove extra spaces
    text = re.sub(r"\n+", "\n", text)       # normalize newlines
    text = re.sub(r"\t+", " ", text)
    return text.strip()

Fungsi untuk extract text dari file .pdf

In [5]:
def extract_text_from_pdf(pdf_path: str) -> str:
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return clean_text(text)

Fungsi untuk extract text dari file .html

In [6]:
def extract_text_from_html(html_path: str) -> str:
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # remove script & style
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    text = soup.get_text(separator="\n")
    return clean_text(text)

Fungsi untuk extract text dari file .txt

In [7]:
def extract_text_from_txt(txt_path: str) -> str:
    with open(txt_path, "r", encoding="utf-8") as f:
        return clean_text(f.read())

In [8]:
RAW_DIR = "corpus/raw"
OUT_DIR = "corpus/processed/docs"

os.makedirs(OUT_DIR, exist_ok=True)

doc_id = 1

for root, _, files in os.walk(RAW_DIR):
    for file in tqdm(files):
        path = os.path.join(root, file)

        try:
            if file.endswith(".html"):
                text = extract_text_from_html(path)

            else:
                continue

            if len(text) < 500:  # skip dokumen terlalu pendek
                continue

            out_path = os.path.join(OUT_DIR, f"doc_{doc_id:02d}.txt")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(text)

            doc_id += 1

        except Exception as e:
            print(f"❌ Error processing {file}: {e}")

100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21509.25it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:07<00:00,  2.76it/s]


## Chunking

In [9]:
DOC_DIR = "corpus/processed/docs"
docs = sorted(os.listdir(DOC_DIR))
print(f"Loaded {len(docs)} documents")

Loaded 43 documents


In [10]:
def chunk_text(
    text: str,
    chunk_size: int = 150,
    overlap: int = 30
):
    tokens = nltk.word_tokenize(text)
    chunks = []

    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunk_tokens = tokens[start:end]
        chunk_text = " ".join(chunk_tokens)

        chunks.append(chunk_text)

        start = end - overlap  # overlap

    return chunks

In [11]:
all_chunks = []
chunk_id = 0

for doc_id, file in enumerate(tqdm(docs)):
    path = os.path.join(DOC_DIR, file)
    
    if not file.lower().endswith(".txt"):
        continue

    try:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    
        chunks = chunk_text(text)
    
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "chunk_id": chunk_id,
                "doc_id": doc_id,
                "doc_name": file,
                "chunk_index": i,
                "text": chunk
            })
            chunk_id += 1
            
    except UnicodeDecodeError:
        print(f"❌ Error encoding di file: {file}")
    except Exception as e:
        print(f"⚠️ Error lain pada {file}: {e}")

100%|█████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 125.47it/s]


In [12]:
import json

OUT_PATH = "corpus/chunks/chunks.json"
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, indent=2, ensure_ascii=False)

print(f"Saved {len(all_chunks)} chunks")

Saved 284 chunks


In [13]:
for c in all_chunks[:3]:
    print("\n--- Chunk", c["chunk_id"], "---")
    print("Doc:", c["doc_name"])
    print(c["text"][:300], "...")


--- Chunk 0 ---
Doc: doc_01.txt
15.687 Pengungsi Bencana di Sumut Terpapar Penyakit Kulit login register Nasional Peristiwa 15.687 Pengungsi Bencana di Sumut Terpapar Penyakit Kulit CNN Indonesia Selasa , 23 Des 2025 13:52 WIB Bagikan : url telah tercopy Dinas Kesehatan Provinsi Sumatera Utara mencatat ada 15.687 warga terdampak b ...

--- Chunk 1 ---
Doc: doc_01.txt
Kasus terbanyak ditemukan di Kabupaten Langkat , Tapanuli Tengah , Deliserdang , Batubara , Tebingtinggi , dan Mandailing Natal . `` Ini perlu menjadi fokus perhatian , terutama terkait faktor risiko seperti paparan air kotor , sanitasi lingkungan yang belum optimal , keterbatasan air bersih , serta ...

--- Chunk 2 ---
Doc: doc_01.txt
banjir . `` Sekarang Tapteng mulai kering , sehingga keluhan ISPA meningkat , '' ujarnya . Pilihan Redaksi Pengungsi Banjir-Longsor Sumut Mulai Terserang Penyakit Kulit dan ISPA 100 Nakes Sulsel Dikirim ke Daerah Bencana Aceh Tamiang Gelombang II Bantuan Internasional ke Aceh : Obat-obatan 

In [14]:
lengths = [len(nltk.word_tokenize(c["text"])) for c in all_chunks]

print("Chunk stats:")
print(f"  Total chunks      : {len(all_chunks)}")
print(f"  Avg tokens/chunk  : {sum(lengths)//len(lengths)}")
print(f"  Min tokens/chunk  : {min(lengths)}")
print(f"  Max tokens/chunk  : {max(lengths)}")

Chunk stats:
  Total chunks      : 284
  Avg tokens/chunk  : 133
  Min tokens/chunk  : 6
  Max tokens/chunk  : 152


## Embedding

### Load Chunk

In [15]:
import json

with open("corpus/chunks/chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

texts = [c["text"] for c in chunks]

### Define Compared Embedding Models

In [16]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.9.1-cp314-cp314-win_amd64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.8.0-cp314-cp314-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.3-cp314-cp314-win_amd64.whl.metadata (60 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.2.3-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from transformers<6.0.0,>=4.41.0->sentence-transformers)
  Downloading filelock-3.20.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.36.0-py3-none-

In [17]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

embedding_models = {
    "bge_m3": "BAAI/bge-m3",
    "e5_multilingual": "intfloat/multilingual-e5-base"
}

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer

os.makedirs("corpus/chunks", exist_ok=True)

for key, model_name in embedding_models.items():
    print(f"Embedding with {model_name}")

    model = SentenceTransformer(model_name)

    embeddings = model.encode(
        texts,
        show_progress_bar=True,
        normalize_embeddings=True
    )

    out_path = f"corpus/chunks/embeddings_{key}.npy"
    np.save(out_path, embeddings)

    print(f"Saved {out_path} | shape = {embeddings.shape}")

Embedding with BAAI/bge-m3


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


## Vector Retrieval

### Import & Load Data

In [None]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load chunks
with open("corpus/chunks/chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

texts = [c["text"] for c in chunks]

# Load embeddings (pilih salah satu / bandingkan dua)
embeddings_bge = np.load("corpus/chunks/embeddings_bge_m3.npy")
embeddings_e5  = np.load("corpus/chunks/embeddings_e5_multilingual.npy")

print("Chunks:", len(chunks))
print("Embeddings shape:", embeddings_bge.shape)

In [None]:
from sentence_transformers import SentenceTransformer

query_model_bge = SentenceTransformer("BAAI/bge-m3")
query_model_e5  = SentenceTransformer("intfloat/multilingual-e5-base")

In [None]:
def retrieve_top_k(
    query: str,
    model: SentenceTransformer,
    embeddings: np.ndarray,
    chunks: list,
    k: int = 3
):
    # Encode query
    query_emb = model.encode(
        [query],
        normalize_embeddings=True
    )

    # Cosine similarity
    scores = cosine_similarity(query_emb, embeddings)[0]

    # Top-k index
    top_k_idx = np.argsort(scores)[::-1][:k]

    results = []
    for idx in top_k_idx:
        results.append({
            "score": float(scores[idx]),
            "text": chunks[idx]["text"],
            "doc_name": chunks[idx]["doc_name"],
            "chunk_id": chunks[idx]["chunk_id"]
        })

    return results

In [None]:
print("\n=== Compare Embedding Models (k=3) ===")

query = "Apakah fenomena langit merah itu berbahaya?"

results_bge = retrieve_top_k(
    query,
    query_model_bge,
    embeddings_bge,
    chunks,
    k=3
)

results_e5 = retrieve_top_k(
    query,
    query_model_e5,
    embeddings_e5,
    chunks,
    k=3
)

print("\n--- BGE-M3 ---")
for r in results_bge:
    print(f"{r['score']:.4f} | {r['text'][:200]}...")

print("\n--- E5 Multilingual ---")
for r in results_e5:
    print(f"{r['score']:.4f} | {r['text'][:200]}...")

### Build Context

In [None]:
def build_context(retrieved_chunks):
    return "\n\n".join(
        [f"- {c['text']}" for c in retrieved_chunks]
    )

In [None]:
context = build_context(results_bge)
print(context[:1000])

## RAG Generation

Download dependencies google llm and init API KEY

In [None]:
!pip install -U google-genai

In [None]:
import os
os.environ["GOOGLE_API_KEY"] = ""

Init LLM Model Flash

In [None]:
import os
from google import genai

ai_client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

In [None]:
MODEL_FLASH = "gemini-2.5-flash"
MODEL_PRO   = "gemini-2.0-pro"

Add generic func to answer question using context or not

In [None]:
def generate_answer_gemini(
    client,
    question: str,
    context: str = None,
    model=MODEL_FLASH
):
    if context:
        prompt = f"""
You are an AI assistant. Answer the question using ONLY the context below.
If the answer is not in the context, say "Information not found in the provided documents."

Context:
{context}

Question:
{question}

Answer:
"""
    else:
        prompt = f"""
Question:
{question}

Answer concisely:
"""


    response = client.models.generate_content(
        model=model,
        contents=prompt
    )
    return response.text

Fill to query

In [None]:
query = "Apakah ada fenomena langit merah di Indonesia akhir-akhir ini?"

### Answer with NON RAG

In [None]:
answer_non_rag = generate_answer_gemini(ai_client, query, None, MODEL_FLASH)
print("=== NON-RAG ANSWER ===")
print(answer_non_rag)

### Answer with RAG

In [None]:
# Ambil top-3 chunk (contoh pakai BGE-M3)
retrieved_chunks = retrieve_top_k(
    query=query,
    model=query_model_bge,
    embeddings=embeddings_bge,
    chunks=chunks,
    k=3
)

context = build_context(retrieved_chunks)

answer_rag = generate_answer_gemini(
    client=ai_client,
    question=query,
    context=context,
    model=MODEL_FLASH
)

print("=== RAG ANSWER ===")
print(answer_rag)

## Evaluation Using MMR Function

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def mmr(
    query_emb,
    doc_embs,
    docs,
    k=3,
    lambda_param=0.7
):
    sim_to_query = cosine_similarity(query_emb, doc_embs)[0]

    selected = []
    selected_idx = []

    for _ in range(k):
        scores = []

        for i in range(len(docs)):
            if i in selected_idx:
                scores.append(-np.inf)
                continue

            diversity = 0
            if selected_idx:
                diversity = max(
                    cosine_similarity(
                        doc_embs[i].reshape(1, -1),
                        doc_embs[selected_idx]
                    )[0]
                )

            score = (
                lambda_param * sim_to_query[i]
                - (1 - lambda_param) * diversity
            )
            scores.append(score)

        idx = int(np.argmax(scores))
        selected_idx.append(idx)
        selected.append(docs[idx])

    return selected

In [None]:
!pip install gradio

In [None]:
import gradio as gr
import time

def answer_with_k(query, k):
    answers_dict = []

    # Non RAG
    answer_non_rag = generate_answer_gemini(
        ai_client, 
        query, 
        None, 
        MODEL_FLASH
    )
    answers_dict.append({
        'k': -1,
        'embeddings': 'Non-RAG',
        'answers': answer_non_rag,
    })

    
    # embeddings: bge-m3
    retrieved = retrieve_top_k(
        query=query,
        model=query_model_bge,
        embeddings=embeddings_bge,
        chunks=chunks,
        k=k
    )
    context = build_context(retrieved)
    answer = generate_answer_gemini(
        client=ai_client,
        question=query,
        context=context,
        model=MODEL_FLASH
    )
    answers_dict.append({
        'k': k,
        'embeddings': 'BAAI/bge-m3',
        'answers': answer,
    })

    time.sleep(2)
    
    # embeddings: e5
    retrieved = retrieve_top_k(
        query=query,
        model=query_model_e5,
        embeddings=embeddings_e5,
        chunks=chunks,
        k=k
    )
    context = build_context(retrieved)
    answer = generate_answer_gemini(
        client=ai_client,
        question=query,
        context=context,
        model=MODEL_FLASH
    )
    answers_dict.append({
        'k': k,
        'embeddings': 'intfloat/multilingual-e5-base',
        'answers': answer,
    })
    return answers_dict


def rag_chat(question):
    answers = []

    answers_k = answer_with_k(query=question, k=1)
    answers.extend(answers_k)
    
    
    answers_k = answer_with_k(query=question, k=2)
    answers.extend(answers_k)
    
    answers_k = answer_with_k(query=question, k=3)
    answers.extend(answers_k)

    final_answers = ""
    for ans in answers:
        final_answers += "\n"
        answers_text = f"""
            k={ans['k']}
            embeddings={ans['embeddings']}
            answers={ans['answers']}
        """
        final_answers += answers_text
    
    return final_answers

#print(rag_chat("Apakah ada fenomena langit merah di Indonesia akhir-akhir ini?"))


interface = gr.Interface(
    fn=rag_chat,
    inputs=gr.Textbox(label="Ask a question"),
    outputs=gr.Textbox(label="RAG Answer"),
    title="Mini RAG Demo",
    description="Simple Retrieval-Augmented Generation system"
)

interface.launch()