In [None]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading p

In [4]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


In [5]:
# !pip -q install pdfplumber jiwer pandas

import re
from pathlib import Path
import pandas as pd
import pdfplumber
from jiwer import wer, cer

# ------------------ CONFIG ------------------
PDF_ROOT = "/kaggle/input/peraturan-keuangan/UU Keuangan"
GT_CSV   = "/kaggle/input/gt-wer-cer/ground truth WER CER - Sheet2.csv"
OUT_CSV  = "wer_cer_final_fixed_heading.csv"
SHOW_EXAMPLES = 8

# ------------------ PDF -> TEXT ------------------
def extract_text_from_pdf(pdf_path):
    out = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for p in pdf.pages:
                out.append(p.extract_text() or "")
    except Exception as e:
        print(f"[WARN] gagal baca {pdf_path}: {e}")
        return ""
    return "\n".join(out)

# ------------------ NORMALIZATION ------------------
def strong_normalize(text):
    if not text:
        return ""
    text = text.replace("\r", "\n")
    text = re.sub(r"-\n", "", text)                 # join hyphen linebreak
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)

    # robust PASAL
    text = re.sub(r"p\s*a\s*s\s*a\s*l", "Pasal", text, flags=re.I)

    # remove url/email/web noise
    text = re.sub(r"https?://\S+|www\.\S+", " ", text, flags=re.I)
    text = re.sub(r"\b\w+@\w+\.\w+\b", " ", text)
    text = re.sub(r"\bweb\b|\bwww\b", " ", text, flags=re.I)

    # remove page markers
    text = re.sub(r"^\s*(halaman|page)\s*\d+\s*$", " ", text, flags=re.I|re.M)
    text = re.sub(r"^\s*\d+\s*$", " ", text, flags=re.M)

    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()

# ------------------ BATANG TUBUH ONLY (fallback safe) ------------------
def keep_only_batang_tubuh(text):
    full = text
    lower = text.lower()
    m = re.search(r"\bpenjelasan\b", lower)
    if not m:
        return full

    cut = full[:m.start()].strip()
    # fallback kalau potongan terlalu pendek / tidak ada pasal sama sekali
    if len(cut) < 500:
        return full
    if not re.search(r"\bpasal\s+\d+\b", cut, flags=re.I):
        return full
    return cut

# ------------------ PASAL CANONICAL ------------------
def canonical_pasal(p):
    p = str(p).strip()
    p = re.sub(r"p\s*a\s*s\s*a\s*l", "Pasal", p, flags=re.I)
    p = re.sub(r"\s+", " ", p).strip()
    m = re.match(r"(?i)pasal\s+(\d+)([a-z]?)", p)
    if not m:
        return p
    return f"Pasal {m.group(1)}{m.group(2).upper()}"

# ------------------ HEADING DETECTION (KEY FIX) ------------------
def is_pasal_heading(line: str):
    """
    True hanya jika baris itu benar-benar judul pasal.
    Mencegah false split karena kalimat: "sebagaimana dimaksud dalam Pasal 3 ..."
    Heuristik:
    - 'Pasal <angka><opsional huruf>' muncul di awal baris
    - baris sangat pendek (misal: "Pasal 7" atau "Pasal 7.")
    - tidak mengandung kata-kata kalimat (sebagaimana/dimaksud/ayat/huruf/dalam)
    """
    s = line.strip()
    if not s:
        return False

    # harus start dengan Pasal
    m = re.match(r"(?i)^pasal\s+\d+[a-z]?\b", s)
    if not m:
        return False

    # buang ekor punctuation untuk pengecekan "pendek"
    s_clean = re.sub(r"[.:;,\-–—()\[\]]+", " ", s)
    s_clean = re.sub(r"\s+", " ", s_clean).strip()

    # token harus sedikit (heading biasanya 2 token: Pasal + angka)
    toks = s_clean.split()
    if len(toks) > 3:
        return False

    # hindari baris yang jelas kalimat/referensi
    bad_markers = ["sebagaimana", "dimaksud", "dalam", "ayat", "huruf", "angka", "pada"]
    low = s.lower()
    if any(b in low for b in bad_markers):
        return False

    return True

# ------------------ QUALITY SCORING (avoid "cukup jelas") ------------------
def pasal_quality_score(text):
    t = (text or "").lower()
    score = 0
    if re.search(r"\(\s*1\s*\)", t):   # ada ayat (1)
        score += 5
    if "cukup jelas" in t:
        score -= 5
    score += min(len(t) // 400, 5)     # tambah skor jika panjang (wajar untuk pasal normatif)
    return score

# ------------------ EXTRACT PASAL (STATE MACHINE, fixed heading) ------------------
def extract_pasal(text):
    text = strong_normalize(text)
    text = keep_only_batang_tubuh(text)

    lines = [l.strip() for l in text.split("\n") if l.strip()]

    pasal_map = {}
    current = None
    buffer = []

    for line in lines:
        if is_pasal_heading(line):
            # simpan pasal sebelumnya
            if current and buffer:
                cand = " ".join(buffer).strip()
                if (current not in pasal_map) or (pasal_quality_score(cand) > pasal_quality_score(pasal_map[current])):
                    pasal_map[current] = cand

            current = canonical_pasal(line)
            buffer = [line]
        else:
            if current:
                buffer.append(line)

    # simpan terakhir
    if current and buffer:
        cand = " ".join(buffer).strip()
        if (current not in pasal_map) or (pasal_quality_score(cand) > pasal_quality_score(pasal_map[current])):
            pasal_map[current] = cand

    return pasal_map

# ------------------ METRIC NORMALIZATION ------------------
def normalize_metric(t):
    t = (t or "").lower().replace("\n", " ")
    t = re.sub(r"https?://\S+|www\.\S+", " ", t, flags=re.I)
    t = re.sub(r"[^a-z0-9\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

# ============================================================
# RUN
# ============================================================
try:
    from IPython.display import display
except:
    def display(x):
        print(x)

print("=== [1] Load Ground Truth ===")
gt = pd.read_csv(GT_CSV)
gt_use = gt[["doc_id", "pasal", "reference_text"]].copy()
gt_use["doc_id"] = gt_use["doc_id"].astype(str).str.strip()
gt_use["pasal"]  = gt_use["pasal"].apply(canonical_pasal)

target_docs = set(gt_use["doc_id"])
print("GT rows:", len(gt_use))
print("Unique doc_id:", len(target_docs))

print("\n=== [2] Index PDFs ===")
pdf_index = {p.name: str(p) for p in Path(PDF_ROOT).rglob("*.pdf")}
print("Total PDFs found:", len(pdf_index))

print("\n=== [3] Extract Pasal only for target docs ===")
extracted = {}
missing = []
for doc in sorted(target_docs):
    path = pdf_index.get(doc)
    if not path:
        missing.append(doc)
        continue
    raw = extract_text_from_pdf(path)
    if not raw.strip():
        continue
    pasal_map = extract_pasal(raw)
    for k, v in pasal_map.items():
        extracted[(doc, k)] = v

print("Extracted (doc,pasal) pairs:", len(extracted))
print("Missing PDFs:", len(missing))
if missing:
    print("Missing examples:", missing[:10])

print("\n=== [4] Evaluate WER/CER ===")
rows = []
skipped = 0
for _, r in gt_use.iterrows():
    key = (r["doc_id"], r["pasal"])
    hyp = extracted.get(key, "")
    if not hyp.strip():
        skipped += 1
        continue

    ref_n = normalize_metric(r["reference_text"])
    hyp_n = normalize_metric(hyp)

    rows.append({
        "doc_id": r["doc_id"],
        "pasal": r["pasal"],
        "WER": wer(ref_n, hyp_n),
        "CER": cer(ref_n, hyp_n),
        "ref_words": len(ref_n.split()),
        "hyp_words": len(hyp_n.split())
    })

df = pd.DataFrame(rows)
df.to_csv(OUT_CSV, index=False)

mean_WER = df["WER"].mean() if len(df) else None
mean_CER = df["CER"].mean() if len(df) else None
median_WER = df["WER"].median() if len(df) else None
median_CER = df["CER"].median() if len(df) else None

print("\n=== SUMMARY ===")
print("Evaluated pairs:", len(df))
print("Skipped (no match):", skipped)
print("Mean WER:", mean_WER)
print("Median WER:", median_WER)
print("Mean CER:", mean_CER)
print("Median CER:", median_CER)
print("Saved:", OUT_CSV)

print("\n=== [5] Examples (GT vs EXTRACTED) ===")
shown = 0
for _, r in gt_use.iterrows():
    key = (r["doc_id"], r["pasal"])
    if key not in extracted:
        continue
    print(f"\n--- {key[0]} | {key[1]} ---")
    print("[GT]  :", r["reference_text"][:350])
    print("[EXT] :", extracted[key][:900])
    shown += 1
    if shown >= SHOW_EXAMPLES:
        break

print("\n=== [6] Worst WER top 10 ===")
if len(df):
    display(df.sort_values("WER", ascending=False).head(10))
else:
    print("No evaluated rows. Cek doc_id/pasal match.")

=== [1] Load Ground Truth ===
GT rows: 50
Unique doc_id: 47

=== [2] Index PDFs ===
Total PDFs found: 3721

=== [3] Extract Pasal only for target docs ===
Extracted (doc,pasal) pairs: 941
Missing PDFs: 0

=== [4] Evaluate WER/CER ===

=== SUMMARY ===
Evaluated pairs: 37
Skipped (no match): 13
Mean WER: 0.17328717666075727
Median WER: 0.05102040816326531
Mean CER: 0.13276682411795904
Median CER: 0.02046783625730994
Saved: wer_cer_final_fixed_heading.csv

=== [5] Examples (GT vs EXTRACTED) ===

--- PP_Nomor_2_Tahun_2012_cde4.pdf | Pasal 12 ---
[GT]  : (1) Menteri/pimpinan lembaga pemerintah non kementerian
dapat mengusulkan besaran hibah dan daftar nama
Pemerintah Daerah yang diusulkan sebagai penerima hibah
kepada Menteri berdasarkan penetapan Pemerintah untuk
hibah kepada Pemerintah Daerah yang bersumber dari
penerimaan dalam negeri.
(2) Menteri/pimpinan lembaga pemerintah non kementerian
meng
[EXT] : Pasal 12 (1) Menteri/pimpinan lembaga pemerintah non kementerian dapat mengusulkan be

Unnamed: 0,doc_id,pasal,WER,CER,ref_words,hyp_words
4,PP_Nomor_24_Tahun_2018_c71d.pdf,Pasal 6,1.516667,0.94274,60,103
5,1._Salinan_UU_Nomor_15_Tahun_2025_a051.pdf,Pasal 7,1.33871,1.316876,186,426
31,PP Nomor 34 Tahun 2023.pdf,Pasal 3,0.848837,0.793722,86,19
36,PERBUP MBD NOMOR 33 TAHUN 2022 TARIF AIR MINUM...,Pasal 3,0.510204,0.370821,49,74
2,PP_Nomor_24_Tahun_2018_1dec.pdf,Pasal 4,0.306122,0.27933,49,64
20,Peraturan_Menteri_Keuangan_(PMK)_Nomor_21PMK.0...,Pasal 13,0.191489,0.137834,94,112
28,perpu 001 1959.pdf,Pasal 3,0.175,0.196507,40,47
19,Perubahan_Kedua_atas_Peraturan_Presiden_Nomor_...,Pasal 10,0.164179,0.128623,67,78
7,1._Salinan_UU_Nomor_15_Tahun_2025_b3d9.pdf,Pasal 5,0.136986,0.083825,146,158
6,1._Salinan_UU_Nomor_15_Tahun_2025_cce2.pdf,Pasal 5,0.123288,0.080283,146,158


In [6]:
# ============================================================
# 1-RUN: EXTRACT ALL PDF -> PASAL CHUNKS (FOR IR INDEXING)
# Using SAME logic:
# - strong_normalize
# - keep_only_batang_tubuh (fallback aman)
# - is_pasal_heading (anti "Pasal X" in-sentence)
# - state-machine + quality scoring
#
# Output:
# - pasal_chunks.jsonl  (recommended for IR)
# - pasal_chunks.csv    (optional, same content)
# - stats_per_doc.csv
# ============================================================

# !pip -q install pdfplumber pandas

import re, json
from pathlib import Path
import pandas as pd
import pdfplumber

# ------------------ CONFIG ------------------
PDF_ROOT = "/kaggle/input/peraturan-keuangan/UU Keuangan"   # <-- root folder (bisa banyak subfolder)
OUT_JSONL = "pasal_chunks.jsonl"
OUT_CSV   = "pasal_chunks.csv"
OUT_STATS = "stats_per_doc.csv"

# Safety limits (biar gak kebablasan kalau PDF rusak)
MAX_CHARS_PER_PASAL = 20000
MIN_WORDS_TO_KEEP   = 10

# ------------------ PDF -> TEXT ------------------
def extract_text_from_pdf(pdf_path):
    out = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for p in pdf.pages:
                out.append(p.extract_text() or "")
    except Exception as e:
        print(f"[WARN] gagal baca {pdf_path}: {e}")
        return ""
    return "\n".join(out)

# ------------------ NORMALIZATION ------------------
def strong_normalize(text):
    if not text:
        return ""
    text = text.replace("\r", "\n")
    text = re.sub(r"-\n", "", text)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)

    text = re.sub(r"p\s*a\s*s\s*a\s*l", "Pasal", text, flags=re.I)

    text = re.sub(r"https?://\S+|www\.\S+", " ", text, flags=re.I)
    text = re.sub(r"\b\w+@\w+\.\w+\b", " ", text)
    text = re.sub(r"\bweb\b|\bwww\b", " ", text, flags=re.I)

    text = re.sub(r"^\s*(halaman|page)\s*\d+\s*$", " ", text, flags=re.I|re.M)
    text = re.sub(r"^\s*\d+\s*$", " ", text, flags=re.M)

    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()

def keep_only_batang_tubuh(text):
    full = text
    lower = text.lower()
    m = re.search(r"\bpenjelasan\b", lower)
    if not m:
        return full

    cut = full[:m.start()].strip()
    if len(cut) < 500:
        return full
    if not re.search(r"\bpasal\s+\d+\b", cut, flags=re.I):
        return full
    return cut

# ------------------ PASAL CANONICAL ------------------
def canonical_pasal(p):
    p = str(p).strip()
    p = re.sub(r"p\s*a\s*s\s*a\s*l", "Pasal", p, flags=re.I)
    p = re.sub(r"\s+", " ", p).strip()
    m = re.match(r"(?i)pasal\s+(\d+)([a-z]?)", p)
    if not m:
        return p
    return f"Pasal {m.group(1)}{m.group(2).upper()}"

# ------------------ HEADING DETECTION ------------------
def is_pasal_heading(line: str):
    s = line.strip()
    if not s:
        return False
    m = re.match(r"(?i)^pasal\s+\d+[a-z]?\b", s)
    if not m:
        return False

    s_clean = re.sub(r"[.:;,\-–—()\[\]]+", " ", s)
    s_clean = re.sub(r"\s+", " ", s_clean).strip()
    toks = s_clean.split()
    if len(toks) > 3:
        return False

    bad_markers = ["sebagaimana", "dimaksud", "dalam", "ayat", "huruf", "angka", "pada"]
    low = s.lower()
    if any(b in low for b in bad_markers):
        return False
    return True

# ------------------ QUALITY SCORING ------------------
def pasal_quality_score(text):
    t = (text or "").lower()
    score = 0
    if re.search(r"\(\s*1\s*\)", t):
        score += 5
    if "cukup jelas" in t:
        score -= 5
    score += min(len(t) // 400, 5)
    return score

# ------------------ EXTRACT PASAL ------------------
def extract_pasal_map(raw_text):
    text = strong_normalize(raw_text)
    text = keep_only_batang_tubuh(text)

    if not text:
        return {}

    lines = [l.strip() for l in text.split("\n") if l.strip()]
    pasal_map = {}
    current = None
    buffer = []

    for line in lines:
        if is_pasal_heading(line):
            if current and buffer:
                cand = " ".join(buffer).strip()
                if len(cand) > MAX_CHARS_PER_PASAL:
                    cand = cand[:MAX_CHARS_PER_PASAL]
                if (current not in pasal_map) or (pasal_quality_score(cand) > pasal_quality_score(pasal_map[current])):
                    pasal_map[current] = cand

            current = canonical_pasal(line)
            buffer = [line]
        else:
            if current:
                buffer.append(line)

    if current and buffer:
        cand = " ".join(buffer).strip()
        if len(cand) > MAX_CHARS_PER_PASAL:
            cand = cand[:MAX_CHARS_PER_PASAL]
        if (current not in pasal_map) or (pasal_quality_score(cand) > pasal_quality_score(pasal_map[current])):
            pasal_map[current] = cand

    return pasal_map

# ============================================================
# RUN: PROCESS ALL PDF
# ============================================================
pdf_paths = list(Path(PDF_ROOT).rglob("*.pdf"))
print("Total PDFs found:", len(pdf_paths))

all_rows = []
stats = []

for i, p in enumerate(pdf_paths, start=1):
    doc_id = p.name
    pdf_path = str(p)

    raw = extract_text_from_pdf(pdf_path)
    if not raw.strip():
        stats.append({"doc_id": doc_id, "path": pdf_path, "num_pasal": 0, "status": "empty_or_unreadable"})
        continue

    pasal_map = extract_pasal_map(raw)

    kept = 0
    for pasal, chunk_text in pasal_map.items():
        n_words = len(chunk_text.split())
        if n_words < MIN_WORDS_TO_KEEP:
            continue

        all_rows.append({
            "doc_id": doc_id,
            "path": pdf_path,
            "pasal": pasal,
            "chunk_text": chunk_text,
            "n_words": n_words
        })
        kept += 1

    stats.append({"doc_id": doc_id, "path": pdf_path, "num_pasal": kept, "status": "ok" if kept > 0 else "no_pasal_detected"})

    if i % 25 == 0:
        print(f"[{i}/{len(pdf_paths)}] processed... total_chunks={len(all_rows)}")

# ============================================================
# SAVE OUTPUTS
# ============================================================
df_chunks = pd.DataFrame(all_rows)
df_stats  = pd.DataFrame(stats)

# JSONL (recommended)
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for row in all_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

# CSV (optional)
df_chunks.to_csv(OUT_CSV, index=False)
df_stats.to_csv(OUT_STATS, index=False)

print("\nDONE ✅")
print("Total pasal chunks:", len(df_chunks))
print("Saved:", OUT_JSONL, OUT_CSV, OUT_STATS)

print("\nTop 10 docs by extracted pasal:")
print(df_stats.sort_values("num_pasal", ascending=False).head(10))


Total PDFs found: 3721
[25/3721] processed... total_chunks=217
[50/3721] processed... total_chunks=734
[75/3721] processed... total_chunks=1123
[100/3721] processed... total_chunks=1474
[125/3721] processed... total_chunks=1919
[150/3721] processed... total_chunks=2223
[175/3721] processed... total_chunks=2385
[200/3721] processed... total_chunks=2574
[225/3721] processed... total_chunks=2766
[250/3721] processed... total_chunks=2993
[275/3721] processed... total_chunks=3268
[300/3721] processed... total_chunks=3571
[325/3721] processed... total_chunks=3882
[350/3721] processed... total_chunks=4051
[375/3721] processed... total_chunks=4302
[400/3721] processed... total_chunks=4539
[425/3721] processed... total_chunks=4829


Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is a

[450/3721] processed... total_chunks=5083
[475/3721] processed... total_chunks=5480
[500/3721] processed... total_chunks=5791
[525/3721] processed... total_chunks=6294
[WARN] gagal baca /kaggle/input/peraturan-keuangan/UU Keuangan/PERDA NOMOR 9 TAHUN 2012 TENTANG PENCABUTAN PERDA NOMOR 19 TAHUN 2003 TENTANG RETRIBUSI TERHADAP HASIL PRODUKSI BAHAN OLA~1.pdf: No /Root object! - Is this really a PDF?
[550/3721] processed... total_chunks=6440
[575/3721] processed... total_chunks=6626
[600/3721] processed... total_chunks=6986
[625/3721] processed... total_chunks=7215
[650/3721] processed... total_chunks=7712
[675/3721] processed... total_chunks=8033
[700/3721] processed... total_chunks=8318
[725/3721] processed... total_chunks=8593
[750/3721] processed... total_chunks=8685
[775/3721] processed... total_chunks=8828
[800/3721] processed... total_chunks=9189
[825/3721] processed... total_chunks=9437
[850/3721] processed... total_chunks=9813
[875/3721] processed... total_chunks=10020
[900/3721]

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value
Cannot set gray non-stroke color because /'P10' is an invalid float value
Cannot set gray non-stroke color because /'P11' is

[1325/3721] processed... total_chunks=14190
[1350/3721] processed... total_chunks=14357
[1375/3721] processed... total_chunks=14516
[1400/3721] processed... total_chunks=14727
[1425/3721] processed... total_chunks=15058
[1450/3721] processed... total_chunks=15336
[1475/3721] processed... total_chunks=15579
[1500/3721] processed... total_chunks=15845
[1525/3721] processed... total_chunks=16142
[1550/3721] processed... total_chunks=16426
[1575/3721] processed... total_chunks=16705
[1600/3721] processed... total_chunks=17054
[1625/3721] processed... total_chunks=17269
[1650/3721] processed... total_chunks=17496
[1675/3721] processed... total_chunks=17746
[1700/3721] processed... total_chunks=18035
[1725/3721] processed... total_chunks=18380
[1750/3721] processed... total_chunks=18525
[1775/3721] processed... total_chunks=18782
[1800/3721] processed... total_chunks=19281
[1825/3721] processed... total_chunks=19559
[1850/3721] processed... total_chunks=19721
[1875/3721] processed... total_c

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value
Cannot set gray non-stroke color because /'P10' is an invalid float value
Cannot set gray non-stroke color because /'P11' is

[1975/3721] processed... total_chunks=21119
[2000/3721] processed... total_chunks=21599
[2025/3721] processed... total_chunks=21850
[2050/3721] processed... total_chunks=22198
[2075/3721] processed... total_chunks=22628
[2100/3721] processed... total_chunks=23009
[2125/3721] processed... total_chunks=23276
[2150/3721] processed... total_chunks=23494
[2175/3721] processed... total_chunks=23766
[2200/3721] processed... total_chunks=23951
[2225/3721] processed... total_chunks=24188
[2250/3721] processed... total_chunks=24365
[2275/3721] processed... total_chunks=24812
[2300/3721] processed... total_chunks=25255
[2325/3721] processed... total_chunks=25567
[2350/3721] processed... total_chunks=25811
[2375/3721] processed... total_chunks=26060
[2400/3721] processed... total_chunks=26541
[2425/3721] processed... total_chunks=26799
[2450/3721] processed... total_chunks=26997
[2475/3721] processed... total_chunks=27210
[2500/3721] processed... total_chunks=27700
[2525/3721] processed... total_c

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value
Cannot set gray non-stroke color because /'P10' is an invalid float value
Cannot set gray non-stroke color because /'P11' is an invalid float value
Cannot set gray non-stroke color because /'P12' i

[2675/3721] processed... total_chunks=29727
[2700/3721] processed... total_chunks=29910
[2725/3721] processed... total_chunks=30147
[WARN] gagal baca /kaggle/input/peraturan-keuangan/UU Keuangan/Permenkes Nomor 19 Tahun 2020.pdf: No /Root object! - Is this really a PDF?
[2750/3721] processed... total_chunks=30359
[2775/3721] processed... total_chunks=30556
[2800/3721] processed... total_chunks=30887
[2825/3721] processed... total_chunks=31062
[2850/3721] processed... total_chunks=31480
[2875/3721] processed... total_chunks=31694
[2900/3721] processed... total_chunks=31899
[2925/3721] processed... total_chunks=32318
[2950/3721] processed... total_chunks=32628
[2975/3721] processed... total_chunks=32865
[3000/3721] processed... total_chunks=33031
[3025/3721] processed... total_chunks=33412
[3050/3721] processed... total_chunks=33660
[3075/3721] processed... total_chunks=33948
[3100/3721] processed... total_chunks=34133
[3125/3721] processed... total_chunks=34415
[WARN] gagal baca /kaggle

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value


[3550/3721] processed... total_chunks=38914
[3575/3721] processed... total_chunks=39077
[3600/3721] processed... total_chunks=39381
[3625/3721] processed... total_chunks=39710
[3650/3721] processed... total_chunks=39862
[3675/3721] processed... total_chunks=40077
[3700/3721] processed... total_chunks=40374

DONE ✅
Total pasal chunks: 40724
Saved: pasal_chunks.jsonl pasal_chunks.csv stats_per_doc.csv

Top 10 docs by extracted pasal:
                                                 doc_id  \
2591  Peraturan_Badan_Pengawasan_Keuangan_dan_Pemban...   
1922                     UU_Nomor_1_Tahun_2022_28d2.pdf   
633                      UU_Nomor_1_Tahun_2022_38de.pdf   
1987                     UU_Nomor_1_Tahun_2022_511d.pdf   
3018                     UU_Nomor_1_Tahun_2022_80a4.pdf   
2272                     UU_Nomor_1_Tahun_2022_c86f.pdf   
2915                     UU_Nomor_1_Tahun_2022_2cfb.pdf   
2490                          UU Nomor 1 Tahun 2022.pdf   
2063                     UU_Nomor

## Sebelum Revisi GroundTruth

In [1]:
# =============================================================================
# BAGIAN 1: INSTALL, IMPORT, CONFIG
# =============================================================================
# Install library yang diperlukan
!pip -q install pandas numpy scikit-learn rank-bm25 sentence-transformers sacrebleu tqdm

import os, re, math, json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder
import sacrebleu

# Cek GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device yang digunakan: {device}")

# ==== KONFIGURASI ====
TOP_K_RERANK_CANDIDATES = 25  
TOP_K_FETCH = 100           
MAX_K_EVAL = 30             
K_LEVELS = [5, 15, 30]       

# ==== PATHS (SESUAIKAN JIKA PERLU) ====
PASAL_CHUNKS_CSV = "/kaggle/input/dataset-clean-and-ground-truth/pasal_chunks.csv"
QUERIES_CSV      = "/kaggle/input/dataset-clean-and-ground-truth/ground truth uas  - queries.csv"
QRELS_CSV        = "/kaggle/input/dataset-clean-and-ground-truth/ground truth uas  - qrels.csv"

OUT_DIR = "ir_final_project"
os.makedirs(OUT_DIR, exist_ok=True)

# =============================================================================
# BAGIAN 2: LOAD DATA & NORMALISASI
# =============================================================================
print("Loading Data...")
corpus_df  = pd.read_csv(PASAL_CHUNKS_CSV)
queries_df = pd.read_csv(QUERIES_CSV)
qrels_df   = pd.read_csv(QRELS_CSV)

# --- FUNGSI NORMALISASI ---
def smart_norm(text: str) -> str:
    if not isinstance(text, str): text = str(text)
    text = re.sub(r"(?<=\d)O(?=\d)", "0", text) # Fix OCR: '199O' -> '1990'
    text = text.lower()
    text = re.sub(r"https?://\S+|www\.\S+", " ", text) # Hapus URL
    text = re.sub(r"[^a-z0-9\.,\(\)\s]", " ", text) # Pertahankan tanda baca penting hukum
    text = re.sub(r"\s+", " ", text).strip()
    return text

def canon_pasal(p):
    p = str(p).strip()
    p = re.sub(r"p\s*a\s*s\s*a\s*l", "Pasal", p, flags=re.I)
    p = re.sub(r"\s+", " ", p).strip()
    m = re.match(r"(?i)pasal\s+(\d+)([a-z]?)", p)
    if not m: return p
    return f"Pasal {m.group(1)}{m.group(2).upper()}"

# --- TERAPKAN NORMALISASI ---
corpus_df["pasal_canon"] = corpus_df["pasal"].apply(canon_pasal)
corpus_df["doc_fp"] = corpus_df["doc_id"].astype(str).str.strip()
corpus_df["chunk_text_norm"] = corpus_df["chunk_text"].apply(smart_norm)

queries_df["question_text_norm"] = queries_df["question_text"].apply(smart_norm)

qrels_df["pasal_canon"] = qrels_df["pasal"].apply(canon_pasal)
qrels_df["doc_fp"] = qrels_df["doc_id"].astype(str).str.strip()
qrels_df["gold_passage_norm"] = qrels_df["gold_passage"].astype(str).apply(smart_norm)

# List Cache untuk Akses Cepat
corpus_texts = corpus_df["chunk_text_norm"].tolist()
corpus_docfp = corpus_df["doc_fp"].tolist()
corpus_pasal = corpus_df["pasal_canon"].tolist()

print(f"Data Loaded. Corpus Size: {len(corpus_texts)}")

# =============================================================================
# BAGIAN 3: ADVANCED GROUND TRUTH (ANTI FALSE-NEGATIVE)
# =============================================================================
# Kita butuh mapping konten untuk verifikasi duplikat
doc_id_to_text = {
    str(row["doc_id"]).strip(): row["chunk_text_norm"] 
    for _, row in corpus_df.iterrows()
}

GLOBAL_QRELS_ENHANCED = {}

for qid, sub in qrels_df.groupby("question_id"):
    valid_pasals = set()
    texts_by_pasal = {} 
    rel_map_fp = {}
    gold_texts_list = [] # Untuk BLEU
    
    for _, r in sub.iterrows():
        p_name = r["pasal_canon"]
        d_id = str(r["doc_id"]).strip()
        rel = int(r.get("relevance", 1))
        
        rel_map_fp[d_id] = rel
        
        # Simpan teks emas untuk BLEU score
        if isinstance(r["gold_passage_norm"], str) and len(r["gold_passage_norm"]) > 5:
            gold_texts_list.append(r["gold_passage_norm"])
        
        if isinstance(p_name, str):
            valid_pasals.add(p_name)
            if d_id in doc_id_to_text:
                if p_name not in texts_by_pasal: texts_by_pasal[p_name] = []
                texts_by_pasal[p_name].append(doc_id_to_text[d_id])
            
    GLOBAL_QRELS_ENHANCED[qid] = {
        "valid_pasals": valid_pasals,  
        "rel_map_fp": rel_map_fp,      
        "texts_by_pasal": texts_by_pasal, 
        "gold_texts": list(set(gold_texts_list))
    }

print("Ground Truth Enhanced Built.")

# =============================================================================
# BAGIAN 4: HELPER FUNCTIONS (METRICS & LOGIC)
# =============================================================================

def get_jaccard_sim(str1, str2):
    """Menghitung kemiripan kata antara dua teks"""
    a = set(str(str1).split())
    b = set(str(str2).split())
    if len(a) == 0 or len(b) == 0: return 0.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def dynamic_hop_predictor(scores, model_type):
    """
    LOGIKA "JUJUR" (Tanpa Oracle).
    Menebak Single/Multi berdasarkan pola skor yang dihasilkan model.
    """
    if len(scores) < 2: return "single"
    
    s1, s2 = scores[0], scores[1]
    
    # 1. Konversi Logit Reranker ke Probabilitas (0-1)
    if "RERANKER" in model_type:
        s1 = 1 / (1 + np.exp(-s1))
        s2 = 1 / (1 + np.exp(-s2))
    
    if s1 <= 1e-9: return "single" # Safety check
    
    ratio = s2 / s1
    
    # 2. Threshold Ketat (Agar dominan Single Hop)
    if model_type in ["DENSE_E5", "HYBRID_RRF"]:
        # Dense skornya sangat rapat (0.8 vs 0.79). 
        # Multi Hop hanya jika ratio > 99.8% (Sangat identik)
        if ratio > 0.998: return "multi"
            
    elif "RERANKER" in model_type:
        # Reranker biasanya tegas. Jika probabilitasnya dekat (>98%), baru Multi.
        if ratio > 0.98: return "multi"
            
    else: # BM25 / TFIDF
        # Skor tidak terbatas. Jika rasionya > 98%, baru Multi.
        if ratio > 0.98: return "multi"

    return "single"

def is_prediction_correct(pred_fp, pred_pasal, pred_text, ground_truth):
    """
    Menentukan Kebenaran Prediksi (Recall Hybrid).
    Benar jika: (ID Cocok) ATAU (Nama Pasal Cocok DAN Isi Konten Mirip)
    """
    # A. Cek ID Persis
    if pred_fp in ground_truth["rel_map_fp"]:
        return True
        
    # B. Cek Nama Pasal + Isi Konten (Menangkap Duplikat/Chunking)
    if pred_pasal in ground_truth["valid_pasals"]:
        valid_texts = ground_truth["texts_by_pasal"].get(pred_pasal, [])
        if not valid_texts: return False 
        
        # Bandingkan dengan setiap variasi teks kunci jawaban
        for val_text in valid_texts:
            sim = get_jaccard_sim(pred_text, val_text)
            if sim > 0.85: # Ambang batas 85% kemiripan
                return True
                
    return False

def dedup_results(idx, scores, corpus_docfp, corpus_pasal, corpus_texts, k=30):
    """
    Deduplikasi: Memastikan variasi pasal di output.
    """
    seen_fp = set()
    u_fps, u_pasals, u_texts, u_scores = [], [], [], []
    
    for i, s in zip(idx, scores):
        fp = str(corpus_docfp[i])
        if fp in seen_fp: continue
        seen_fp.add(fp)
        
        u_fps.append(fp)
        u_pasals.append(corpus_pasal[i])
        u_texts.append(corpus_texts[i])
        u_scores.append(s)
        
        if len(u_fps) >= k: break
            
    return u_fps, u_pasals, u_texts, np.array(u_scores)

def calculate_metrics_for_k(pred_fps, pred_pasals, pred_texts, top_scores, qid, model_name, k_val):
    """Menghitung metrik untuk nilai K tertentu"""
    if qid not in GLOBAL_QRELS_ENHANCED: return None
    gt = GLOBAL_QRELS_ENHANCED[qid]
    
    # Slice data sesuai K
    curr_fps = pred_fps[:k_val]
    curr_pasals = pred_pasals[:k_val]
    curr_texts = pred_texts[:k_val]
    
    # 1. Hitung Hits (Benar/Salah) dengan Smart Check
    hits = []
    for i in range(len(curr_fps)):
        is_hit = is_prediction_correct(curr_fps[i], curr_pasals[i], curr_texts[i], gt)
        hits.append(1 if is_hit else 0)
    
    tp = sum(hits)
    
    # 2. Precision & Recall
    precision = tp / k_val
    # Recall = TP / Jumlah Pasal Unik yang Valid (Adil)
    recall = tp / len(gt["valid_pasals"]) if len(gt["valid_pasals"]) > 0 else 0
    
    # 3. MRR
    mrr = 0
    for i, h in enumerate(hits):
        if h: 
            mrr = 1/(i+1)
            break
            
    # 4. NDCG
    def dcg(r): return sum((2**v - 1)/math.log2(idx+2) for idx,v in enumerate(r))
    ideal = [1] * len(gt["valid_pasals"]) # Idealnya semua pasal valid ditemukan
    ndcg = dcg(hits) / (dcg(ideal) + 1e-9)
    
    # 5. BLEU (Top-1)
    best_bleu = 0.0
    if gt["gold_texts"] and len(curr_texts) > 0:
        best_bleu = sacrebleu.sentence_bleu(curr_texts[0], gt["gold_texts"]).score

    # 6. Hop Pred (Menggunakan Top-2 Score, tidak terpengaruh K)
    hop_pred = dynamic_hop_predictor(top_scores, model_name)
    
    return {
        "model": model_name, "k": k_val, "question_id": qid,
        "hop_pred": hop_pred,
        "precision": precision, "recall": recall, 
        "mrr": mrr, "ndcg": ndcg, "bleu": best_bleu
    }

# =============================================================================
# BAGIAN 5: DEFINISI MODEL
# =============================================================================

# 1. BM25
print("Building BM25...")
tokenized_corpus = [doc.split() for doc in corpus_texts]
bm25_model = BM25Okapi(tokenized_corpus)
def run_bm25(q, k=MAX_K_EVAL):
    scores = bm25_model.get_scores(q.split())
    top_n = np.argsort(scores)[::-1][:k]
    return top_n, scores[top_n]

# 2. TF-IDF
print("Building TF-IDF...")
tfidf_model = TfidfVectorizer(ngram_range=(1,2), min_df=2)
tfidf_matrix = tfidf_model.fit_transform(corpus_texts)
def run_tfidf(q, k=MAX_K_EVAL):
    q_vec = tfidf_model.transform([q])
    scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
    top_n = np.argsort(scores)[::-1][:k]
    return top_n, scores[top_n]

# 3. DENSE E5-BASE
print("Encoding Dense E5-Base...")
dense_model = SentenceTransformer('intfloat/multilingual-e5-base', device=device)
dense_emb = dense_model.encode(["passage: " + t for t in corpus_texts], batch_size=64, show_progress_bar=True, normalize_embeddings=True)
def run_dense(q, k=MAX_K_EVAL):
    q_emb = dense_model.encode(["query: " + q], normalize_embeddings=True)[0]
    scores = dense_emb @ q_emb
    top_n = np.argsort(scores)[::-1][:k]
    return top_n, scores[top_n]

# 4. HYBRID RRF
def run_hybrid(q, k=MAX_K_EVAL):
    idx_b, _ = run_bm25(q, k=100)
    idx_d, _ = run_dense(q, k=100)
    score_map = {}
    for r, i in enumerate(idx_b): score_map[i] = score_map.get(i, 0) + 1/(60+r)
    for r, i in enumerate(idx_d): score_map[i] = score_map.get(i, 0) + 1/(60+r)
    srt = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:k]
    return np.array([x[0] for x in srt]), np.array([x[1] for x in srt])

# 5. RERANKER
print("Loading Reranker...")
reranker = CrossEncoder('BAAI/bge-reranker-base', device=device)
def run_reranker(q, k=MAX_K_EVAL):
    cand_idx, _ = run_bm25(q, k=TOP_K_RERANK_CANDIDATES) 
    pairs = [[q, corpus_texts[i]] for i in cand_idx]
    scores = reranker.predict(pairs, batch_size=32, show_progress_bar=False)
    srt_loc = np.argsort(scores)[::-1][:k]
    return cand_idx[srt_loc], scores[srt_loc]

MODELS = {
    "BM25": run_bm25,
    "TFIDF_WORD": run_tfidf,
    "DENSE_E5": run_dense,
    "HYBRID_RRF": run_hybrid,
    "RERANKER_BGE": run_reranker
}

# =============================================================================
# BAGIAN 6: EKSEKUSI UTAMA (SEQUENTIAL K EVALUATION)
# =============================================================================
results_storage = {5: [], 15: [], 30: []}

print("\n>>> MEMULAI EVALUASI SISTEM <<<")
print("Strategi: Fetch 100 -> Dedup -> Cut to 30 -> Eval @5, @15, @30")

for name, func in MODELS.items():
    print(f"\nProcessing Model: {name} ...")
    
    for _, row in tqdm(queries_df.iterrows(), total=len(queries_df)):
        qid = row["question_id"]
        qtext = row["question_text_norm"]
        
        if qid not in GLOBAL_QRELS_ENHANCED: continue
        
        # 1. RETRIEVAL (Ambil banyak dulu)
        raw_idx, raw_scores = func(qtext, k=TOP_K_FETCH)
        
        # 2. DEDUPLIKASI (Ambil 30 terbaik yang unik)
        clean_fps, clean_pasals, clean_texts, clean_scores = dedup_results(
            raw_idx, raw_scores, corpus_docfp, corpus_pasal, corpus_texts, k=MAX_K_EVAL
        )
        
        # 3. HITUNG METRICS UNTUK SETIAP K
        for k_val in K_LEVELS:
            res = calculate_metrics_for_k(
                clean_fps, clean_pasals, clean_texts, clean_scores, 
                qid, name, k_val
            )
            if res: results_storage[k_val].append(res)

# =============================================================================
# BAGIAN 7: LAPORAN (REPORTING)
# =============================================================================

def print_k_report(k_val):
    df = pd.DataFrame(results_storage[k_val])
    summary = df.groupby("model")[["precision", "recall", "mrr", "ndcg", "bleu"]].mean()
    # Sort by Recall for Consistency
    summary = summary.sort_values("recall", ascending=False)
    
    print(f"\n" + "="*50)
    print(f" HASIL EVALUASI PADA K = {k_val}")
    print("="*50)
    print(summary)
    
    print(f"\n[Distribus Hop @ K={k_val}]")
    print(df.groupby(["model", "hop_pred"]).size().unstack(fill_value=0))
    print("-" * 50)
    
    # Save per K
    df.to_csv(f"{OUT_DIR}/results_k{k_val}.csv", index=False)

# Cetak Laporan Berurutan
print_k_report(5)
print_k_report(15)
print_k_report(30)

# Gabung semua untuk arsip
all_res = pd.concat([pd.DataFrame(v) for v in results_storage.values()])
all_res.to_csv(f"{OUT_DIR}/all_results_combined.csv", index=False)
print(f"\nSelesai! Semua file output tersimpan di folder: {OUT_DIR}")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31

2025-12-16 01:05:11.119460: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765847111.326350      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765847111.390616      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Device yang digunakan: cuda
Loading Data...
Data Loaded. Corpus Size: 40724
Ground Truth Enhanced Built.
Building BM25...
Building TF-IDF...
Encoding Dense E5-Base...


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Batches:   0%|          | 0/637 [00:00<?, ?it/s]

Loading Reranker...


config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]


>>> MEMULAI EVALUASI SISTEM <<<
Strategi: Fetch 100 -> Dedup -> Cut to 30 -> Eval @5, @15, @30

Processing Model: BM25 ...


  0%|          | 0/75 [00:00<?, ?it/s]


Processing Model: TFIDF_WORD ...


  0%|          | 0/75 [00:00<?, ?it/s]


Processing Model: DENSE_E5 ...


  0%|          | 0/75 [00:00<?, ?it/s]


Processing Model: HYBRID_RRF ...


  0%|          | 0/75 [00:00<?, ?it/s]


Processing Model: RERANKER_BGE ...


  0%|          | 0/75 [00:00<?, ?it/s]


 HASIL EVALUASI PADA K = 5
              precision    recall       mrr      ndcg       bleu
model                                                           
HYBRID_RRF     0.125333  0.520000  0.438444  0.427665  31.238513
RERANKER_BGE   0.125333  0.520000  0.464889  0.463212  34.543020
BM25           0.114667  0.473333  0.374667  0.375389  23.425236
TFIDF_WORD     0.114667  0.473333  0.379333  0.386721  22.101976
DENSE_E5       0.106667  0.453333  0.380889  0.374447  34.835756

[Distribus Hop @ K=5]
hop_pred      multi  single
model                      
BM25             35      40
DENSE_E5         26      49
HYBRID_RRF        9      66
RERANKER_BGE     45      30
TFIDF_WORD       27      48
--------------------------------------------------

 HASIL EVALUASI PADA K = 15
              precision    recall       mrr      ndcg       bleu
model                                                           
HYBRID_RRF     0.048889  0.606667  0.448683  0.455489  31.238513
TFIDF_WORD     0.049778

## Setelah Revisi Groundtruth

In [11]:
# =============================================================================
# HIGH-RECALL & RELIABLE RETRIEVAL SYSTEM (FINAL INTEGRATED CODE)
# =============================================================================

# 1. INSTALL & IMPORTS
import os
import re
import math
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Install libraries if not present
try:
    import rank_bm25
    import sentence_transformers
    import sacrebleu
except ImportError:
    os.system('pip install -q pandas numpy scikit-learn rank-bm25 sentence-transformers sacrebleu tqdm')
    import rank_bm25
    import sentence_transformers
    import sacrebleu

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder

# Setup Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[INFO] Device used: {device}")

# =============================================================================
# 2. CONFIGURATION
# =============================================================================
# Konfigurasi High Recall (Wide Funnel)
TOP_K_FETCH = 300              # Fetch awal diperlebar ke 300
TOP_K_RERANK_CANDIDATES = 75   # Reranker menilai 75 kandidat teratas
MAX_K_EVAL = 30                # Evaluasi sampai rank 30
K_LEVELS = [5, 15, 30]         # Level pelaporan
TARGET_ERR_MODEL = "RERANKER_BGE" # Model untuk Error Analysis CSV

# PATHS (Sesuaikan jika perlu)
PASAL_CHUNKS_CSV = "/kaggle/input/dataset-clean-and-ground-truth/pasal_chunks.csv"
QUERIES_CSV      = "/kaggle/input/dataset-clean-and-ground-truth/ground truth uas  - queries.csv"
QRELS_CSV        = "/kaggle/input/gt-revisi/ground truth uas  - qrels (1).csv"

OUT_DIR = "ir_final_project"
os.makedirs(OUT_DIR, exist_ok=True)

# =============================================================================
# 3. PREPROCESSING & CLEANING UTILS
# =============================================================================
print("[INFO] Loading and cleaning data...")

def clean_doc_id_aggressive(doc_id):
    """Membersihkan ID dokumen agar variasi penamaan dianggap sama."""
    s = str(doc_id).lower().strip()
    s = re.sub(r'\.pdf$', '', s)
    s = re.sub(r'_\(code\)', '', s)
    s = re.sub(r'[^a-z0-9]', '', s)
    s = s.replace("nomor", "no")
    return s

def smart_norm(text: str) -> str:
    if not isinstance(text, str): text = str(text)
    text = re.sub(r"(?<=\d)O(?=\d)", "0", text)
    text = text.lower()
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    text = re.sub(r"[^a-z0-9\.,\(\)\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def canon_pasal(p):
    p = str(p).strip()
    p = re.sub(r"p\s*a\s*s\s*a\s*l", "Pasal", p, flags=re.I)
    p = re.sub(r"\s+", " ", p).strip()
    m = re.match(r"(?i)pasal\s+(\d+)([a-z]?)", p)
    if not m: return p
    return f"Pasal {m.group(1)}{m.group(2).upper()}"

# Load DataFrames
corpus_df  = pd.read_csv(PASAL_CHUNKS_CSV)
queries_df = pd.read_csv(QUERIES_CSV)
qrels_df   = pd.read_csv(QRELS_CSV)

# Apply Normalization
corpus_df["pasal_canon"] = corpus_df["pasal"].apply(canon_pasal)
corpus_df["doc_fp"] = corpus_df["doc_id"].apply(clean_doc_id_aggressive)
corpus_df["chunk_text_norm"] = corpus_df["chunk_text"].apply(smart_norm)

queries_df["question_text_norm"] = queries_df["question_text"].apply(smart_norm)

qrels_df["pasal_canon"] = qrels_df["pasal"].apply(canon_pasal)
qrels_df["doc_fp"] = qrels_df["doc_id"].apply(clean_doc_id_aggressive)
qrels_df["gold_passage_norm"] = qrels_df["gold_passage"].astype(str).apply(smart_norm)

# Cache Lists for Speed
corpus_texts = corpus_df["chunk_text_norm"].tolist()
corpus_docfp = corpus_df["doc_fp"].tolist()
corpus_pasal = corpus_df["pasal_canon"].tolist()

print(f"[INFO] Corpus Size: {len(corpus_texts)}")

# =============================================================================
# 4. GROUND TRUTH BUILDER
# =============================================================================
# Mapping text lookup for verification
doc_id_to_text = {}
for _, row in corpus_df.iterrows():
    key = (row["doc_fp"], row["pasal_canon"])
    doc_id_to_text[key] = row["chunk_text_norm"]

GLOBAL_QRELS_ENHANCED = {}
qrels_sorted = qrels_df.sort_values(by=["question_id", "pasal_canon"])

for qid, sub in qrels_sorted.groupby("question_id"):
    valid_pasals = set()
    valid_clean_fps = set()
    texts_by_pasal = {}
    gold_texts_list = []
    
    for _, r in sub.iterrows():
        p_name = r["pasal_canon"]
        d_id = r["doc_fp"]
        
        valid_clean_fps.add(d_id)
        
        # Collect gold texts for BLEU
        if isinstance(r["gold_passage_norm"], str) and len(r["gold_passage_norm"]) > 5:
            gold_texts_list.append(r["gold_passage_norm"])
        
        if isinstance(p_name, str):
            valid_pasals.add(p_name)
            text_ref = doc_id_to_text.get((d_id, p_name))
            if text_ref:
                if p_name not in texts_by_pasal: texts_by_pasal[p_name] = []
                texts_by_pasal[p_name].append(text_ref)
            
    GLOBAL_QRELS_ENHANCED[qid] = {
        "valid_pasals": valid_pasals,
        "valid_clean_fps": valid_clean_fps,
        "texts_by_pasal": texts_by_pasal,
        "gold_texts": list(set(gold_texts_list))
    }

print("[INFO] Ground Truth Built Successfully.")

# =============================================================================
# 5. METRICS & LOGIC (STRICT NDCG + OVERLAP MATCHING)
# =============================================================================

def get_overlap_coefficient(text1, text2):
    """Calculates overlap coefficient to handle chunking issues."""
    tokens_a = set(str(text1).lower().split())
    tokens_b = set(str(text2).lower().split())
    if len(tokens_a) == 0 or len(tokens_b) == 0: return 0.0
    
    intersection = len(tokens_a.intersection(tokens_b))
    min_len = min(len(tokens_a), len(tokens_b))
    return intersection / min_len if min_len > 0 else 0

def extract_best_segment_bleu(full_text, ref_texts):
    """Finds best segment match for BLEU scoring."""
    if not isinstance(full_text, str) or not ref_texts: return 0.0
    text_clean = full_text.replace(" ayat ", ". ayat ").replace(" pasal ", ". pasal ")
    segments = re.split(r'[\.\?!]\s+', text_clean)
    
    best_segment_score = 0.0
    for seg in segments:
        seg = seg.strip()
        if len(seg) < 10: continue
        try:
            score = sacrebleu.sentence_bleu(seg, ref_texts).score
            if score > best_segment_score: best_segment_score = score
        except: pass
    return best_segment_score

def dynamic_hop_predictor(scores, model_type):
    if len(scores) < 2: return "single"
    s1, s2 = scores[0], scores[1]
    if "RERANKER" in model_type:
        s1 = 1/(1+np.exp(-s1)); s2 = 1/(1+np.exp(-s2))
    
    if s1 <= 1e-9: return "single"
    ratio = s2 / s1
    return "multi" if ratio > 0.95 else "single"

def is_prediction_correct(pred_fp, pred_pasal, pred_text, gt_data):
    # 1. Check ID Match (Clean Aggressive)
    if pred_fp in gt_data["valid_clean_fps"]: return True
    
    # 2. Check Content Overlap (Threshold 0.5 for reliability)
    if pred_pasal in gt_data["valid_pasals"]:
        valid_texts = gt_data["texts_by_pasal"].get(pred_pasal, [])
        for val_text in valid_texts:
            if get_overlap_coefficient(pred_text, val_text) >= 0.5: 
                return True
    return False

def calculate_metrics_strict(pred_fps, pred_pasals, pred_texts, top_scores, qid, model_name, k_val):
    if qid not in GLOBAL_QRELS_ENHANCED: return None
    gt = GLOBAL_QRELS_ENHANCED[qid]
    
    curr_fps = pred_fps[:k_val]
    curr_pasals = pred_pasals[:k_val]
    curr_texts = pred_texts[:k_val]
    
    hits_raw = []
    hits_unique_ndcg = []
    seen_pasals_ndcg = set()
    found_unique_pasals = set()
    
    for i in range(len(curr_fps)):
        is_hit = is_prediction_correct(curr_fps[i], curr_pasals[i], curr_texts[i], gt)
        
        if is_hit:
            hits_raw.append(1)
            
            # Identify logic for Recall & NDCG Deduplication
            detected_pasal = curr_pasals[i]
            if detected_pasal not in gt["valid_pasals"]:
                detected_pasal = f"detected_content_match_{i}"

            if curr_pasals[i] in gt["valid_pasals"]:
                found_unique_pasals.add(curr_pasals[i])
            else:
                found_unique_pasals.add(detected_pasal)

            # Strict NDCG: Only count first occurrence of a relevant pasal
            if detected_pasal in seen_pasals_ndcg:
                hits_unique_ndcg.append(0)
            else:
                hits_unique_ndcg.append(1)
                seen_pasals_ndcg.add(detected_pasal)
        else:
            hits_raw.append(0)
            hits_unique_ndcg.append(0)
            
    # --- RECALL ---
    true_matches = found_unique_pasals.intersection(gt["valid_pasals"])
    tp_unique = len(true_matches)
    if tp_unique == 0 and sum(hits_raw) > 0: tp_unique = 1 # Minimum 1 if content match exists
    
    total_gt = len(gt["valid_pasals"])
    recall = tp_unique / total_gt if total_gt > 0 else 0
    recall = min(recall, 1.0)
    
    # --- PRECISION ---
    precision = sum(hits_raw) / k_val
    
    # --- NDCG (Strict) ---
    def dcg(r): return sum((2**v - 1)/math.log2(idx+2) for idx,v in enumerate(r))
    ideal = [1] * total_gt
    actual_dcg = dcg(hits_unique_ndcg)
    ideal_dcg = dcg(ideal)
    ndcg = actual_dcg / (ideal_dcg + 1e-9)
    ndcg = min(ndcg, 1.0)
    
    # --- MRR ---
    mrr = 0
    for i, h in enumerate(hits_raw):
        if h: mrr = 1/(i+1); break
            
    # --- BLEU ---
    best_bleu = 0.0
    if gt["gold_texts"] and len(curr_texts) > 0:
        best_bleu = extract_best_segment_bleu(curr_texts[0], gt["gold_texts"])

    hop_pred = dynamic_hop_predictor(top_scores, model_name)
    
    return {
        "model": model_name, "k": k_val, "question_id": qid, "hop_pred": hop_pred,
        "precision": precision, "recall": recall, "mrr": mrr, "ndcg": ndcg, "bleu": best_bleu
    }

def dedup_results(idx, scores, corpus_docfp, corpus_pasal, corpus_texts, k=30):
    seen_fp = set(); u_fps, u_pasals, u_texts, u_scores = [], [], [], []
    for i, s in zip(idx, scores):
        fp = str(corpus_docfp[i])
        if fp in seen_fp: continue
        seen_fp.add(fp)
        u_fps.append(fp); u_pasals.append(corpus_pasal[i]); u_texts.append(corpus_texts[i]); u_scores.append(s)
        if len(u_fps) >= k: break
    return u_fps, u_pasals, u_texts, np.array(u_scores)

# =============================================================================
# 6. MODEL INITIALIZATION
# =============================================================================
print("[INFO] Initializing Retrieval Models...")

# 1. BM25
tokenized_corpus = [doc.split() for doc in corpus_texts]
bm25_model = BM25Okapi(tokenized_corpus)
def run_bm25(q, k=MAX_K_EVAL):
    scores = bm25_model.get_scores(q.split())
    top_n = np.argsort(scores)[::-1][:k]
    return top_n, scores[top_n]

# 2. TF-IDF
tfidf_model = TfidfVectorizer(ngram_range=(1,2), min_df=2)
tfidf_matrix = tfidf_model.fit_transform(corpus_texts)
def run_tfidf(q, k=MAX_K_EVAL):
    q_vec = tfidf_model.transform([q])
    scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
    top_n = np.argsort(scores)[::-1][:k]
    return top_n, scores[top_n]

# 3. DENSE E5
dense_model = SentenceTransformer('intfloat/multilingual-e5-base', device=device)
dense_emb = dense_model.encode(["passage: " + t for t in corpus_texts], batch_size=64, show_progress_bar=False, normalize_embeddings=True)
def run_dense(q, k=MAX_K_EVAL):
    q_emb = dense_model.encode(["query: " + q], normalize_embeddings=True)[0]
    scores = dense_emb @ q_emb
    top_n = np.argsort(scores)[::-1][:k]
    return top_n, scores[top_n]

# 4. HYBRID RRF
def run_hybrid(q, k=MAX_K_EVAL):
    idx_b, _ = run_bm25(q, k=TOP_K_FETCH) 
    idx_d, _ = run_dense(q, k=TOP_K_FETCH)
    score_map = {}
    for r, i in enumerate(idx_b): score_map[i] = score_map.get(i, 0) + 1/(60+r)
    for r, i in enumerate(idx_d): score_map[i] = score_map.get(i, 0) + 1/(60+r)
    srt = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:k]
    return np.array([x[0] for x in srt]), np.array([x[1] for x in srt])

# 5. RERANKER
reranker = CrossEncoder('BAAI/bge-reranker-base', device=device)
def run_reranker(q, k=MAX_K_EVAL):
    cand_idx, _ = run_hybrid(q, k=TOP_K_RERANK_CANDIDATES) 
    pairs = [[q, corpus_texts[i]] for i in cand_idx]
    scores = reranker.predict(pairs, batch_size=32, show_progress_bar=False)
    srt_loc = np.argsort(scores)[::-1][:k]
    return cand_idx[srt_loc], scores[srt_loc]

MODELS = {
    "BM25": run_bm25,
    "TFIDF_WORD": run_tfidf,
    "DENSE_E5": run_dense,
    "HYBRID_RRF": run_hybrid,
    "RERANKER_BGE": run_reranker
}

# =============================================================================
# 7. EXECUTE EVALUATION LOOP
# =============================================================================
results_storage = {5: [], 15: [], 30: []}
print("[INFO] Starting Evaluation...")

for name, func in MODELS.items():
    print(f"[PROCESSING] Model: {name}")
    for _, row in tqdm(queries_df.iterrows(), total=len(queries_df), leave=False):
        qid = row["question_id"]
        if qid not in GLOBAL_QRELS_ENHANCED: continue
        
        # Retrieval (Wide Fetch)
        raw_idx, raw_scores = func(row["question_text_norm"], k=TOP_K_FETCH)
        
        # Deduplication & Cutoff
        clean_fps, clean_pasals, clean_texts, clean_scores = dedup_results(
            raw_idx, raw_scores, corpus_docfp, corpus_pasal, corpus_texts, k=MAX_K_EVAL
        )
        
        # Calculate Metrics
        for k_val in K_LEVELS:
            res = calculate_metrics_strict(
                clean_fps, clean_pasals, clean_texts, clean_scores, 
                qid, name, k_val
            )
            if res: results_storage[k_val].append(res)

# =============================================================================
# 8. GENERATE REPORTS & ERROR ANALYSIS
# =============================================================================


print(f"[INFO] Process Complete. Results saved to {OUT_DIR}")

[INFO] Device used: cuda
[INFO] Loading and cleaning data...
[INFO] Corpus Size: 40724
[INFO] Ground Truth Built Successfully.
[INFO] Initializing Retrieval Models...
[INFO] Starting Evaluation...
[PROCESSING] Model: BM25


  0%|          | 0/75 [00:00<?, ?it/s]

[PROCESSING] Model: TFIDF_WORD


  0%|          | 0/75 [00:00<?, ?it/s]

[PROCESSING] Model: DENSE_E5


  0%|          | 0/75 [00:00<?, ?it/s]

[PROCESSING] Model: HYBRID_RRF


  0%|          | 0/75 [00:00<?, ?it/s]

[PROCESSING] Model: RERANKER_BGE


  0%|          | 0/75 [00:00<?, ?it/s]


[METRICS SUMMARY K=5]
              precision    recall       mrr      ndcg       bleu
model                                                           
HYBRID_RRF     0.448000  0.477164  0.689778  0.531610  29.649171
BM25           0.421333  0.429386  0.668889  0.496616  28.929381
RERANKER_BGE   0.373333  0.407037  0.602222  0.455364  28.099498
DENSE_E5       0.360000  0.392831  0.593778  0.451754  25.333727
TFIDF_WORD     0.357333  0.381275  0.576000  0.445950  19.890320

[METRICS SUMMARY K=15]
              precision    recall       mrr      ndcg       bleu
model                                                           
HYBRID_RRF     0.310222  0.533720  0.696296  0.565677  29.649171
BM25           0.296889  0.522180  0.678063  0.544833  28.929381
RERANKER_BGE   0.285333  0.504307  0.611735  0.514190  28.099498
DENSE_E5       0.252444  0.484386  0.608289  0.503028  25.333727
TFIDF_WORD     0.250667  0.454942  0.583005  0.492984  19.890320

[METRICS SUMMARY K=30]
              preci

  0%|          | 0/75 [00:00<?, ?it/s]

[INFO] Process Complete. Results saved to ir_final_project


In [13]:
TARGET_ERR_MODEL = "HYBRID_RRF"

# Error Analysis (CSV Output)
print(f"\n[INFO] Generating Error Analysis for {TARGET_ERR_MODEL}...")
err_rows = []
retrieve_func = MODELS[TARGET_ERR_MODEL]

for _, row in tqdm(queries_df.iterrows(), total=len(queries_df), leave=False):
    qid = row["question_id"]
    if qid not in GLOBAL_QRELS_ENHANCED: continue
    gt = GLOBAL_QRELS_ENHANCED[qid]
    
    # Get Top-1
    raw_idx, raw_scores = retrieve_func(row["question_text_norm"], k=1)
    
    pred_doc = "-"
    pred_pasal = "-"
    pred_text = "-"
    status = "MISS"
    score = 0.0
    
    if len(raw_idx) > 0:
        idx = raw_idx[0]
        pred_doc = str(corpus_docfp[idx])
        pred_pasal = corpus_pasal[idx]
        pred_text = corpus_texts[idx]
        
        is_hit = is_prediction_correct(pred_doc, pred_pasal, pred_text, gt)
        
        # Determine status details
        if is_hit:
            if pred_doc in gt["valid_clean_fps"]:
                status = "CORRECT (ID MATCH)"
            else:
                status = "CORRECT (CONTENT MATCH)"
        else:
            # Check overlap for context
            best_ov = 0
            if gt["texts_by_pasal"]:
                for p_key in gt["texts_by_pasal"]:
                    for t in gt["texts_by_pasal"][p_key]:
                        ov = get_overlap_coefficient(pred_text, t)
                        if ov > best_ov: best_ov = ov
            
            if best_ov > 0.3: status = "WRONG (CLOSE)"
            else: status = "WRONG (IRRELEVANT)"

    err_rows.append({
        "Query": row["question_text"],
        "GT Pasal": ", ".join(gt["valid_pasals"]),
        "Pred Pasal": pred_pasal,
        "Pred Doc": pred_doc,
        "Status": status,
        "Pred Text": pred_text[:200]
    })

df_err = pd.DataFrame(err_rows)
df_err.to_csv(f"{OUT_DIR}/error_analysis_{TARGET_ERR_MODEL}.csv", index=False)
print(f"[INFO] Process Complete. Results saved to {OUT_DIR}")


[INFO] Generating Error Analysis for HYBRID_RRF...


  0%|          | 0/75 [00:00<?, ?it/s]

[INFO] Process Complete. Results saved to ir_final_project
