In [4]:
import os, time, csv, requests
from urllib.parse import urljoin
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [5]:
# Listing page that is scraped, and the local folder that receives the PDFs.
BASE_URL = "https://jdih.setneg.go.id/Produk"
OUT_DIR = "JDIH_Setneg_PDF"
os.makedirs(OUT_DIR, exist_ok=True)

# --- Selenium setup (headless Chrome) ---
opts = Options()
for chrome_flag in (
    "--headless=new",
    "--disable-gpu",
    "--no-sandbox",
    "--window-size=1366,800",
):
    opts.add_argument(chrome_flag)

# The value returned by ChromeDriverManager().install() is handed to Service.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=opts)
# Shared explicit wait (20 second timeout) reused by the later cells.
wait = WebDriverWait(driver, 20)

driver.get(BASE_URL)

In [6]:
# Best-effort: narrow the listing to the "PIDANA" entries before scraping.
# The id/name of the status dropdown is not known in advance, so every
# <select> on the page is inspected and the first one whose text mentions
# PIDANA is used.
filter_applied = False  # True only once the PIDANA option was actually clicked
try:
    select_elems = driver.find_elements(By.TAG_NAME, "select")
    for sel in select_elems:
        if "PIDANA" not in sel.text:
            continue
        sel.click()
        # Pick the option whose label is exactly PIDANA (case-insensitive).
        for opt in sel.find_elements(By.TAG_NAME, "option"):
            if opt.text.strip().upper() == "PIDANA":
                opt.click()
                filter_applied = True
                break
        # Fixed pause to give the table time to reload.
        time.sleep(2)
        break
except Exception as exc:
    # Still best-effort (scraping continues unfiltered), but no longer silent:
    # an unfiltered run must be distinguishable from a filtered one.
    print(f"[WARN] Gagal menerapkan filter PIDANA: {exc}")

if not filter_applied:
    print("[INFO] Filter PIDANA tidak diterapkan; seluruh daftar akan diproses")

# --- Helper untuk download file ---
def safe_filename(name: str, max_len: int = 200) -> str:
    """Convert free text into a string usable as a single file name.

    Every character listed in ``bad`` is replaced by ``_``; each run of
    whitespace collapses into one ``_`` and leading/trailing whitespace is
    dropped; finally the result is cut to at most ``max_len`` characters.

    Args:
        name: Arbitrary text, e.g. a document title.
        max_len: Maximum number of characters kept. The default of 200 leaves
            room for an extension below the 255-byte name limit that common
            file systems impose; an over-long name would otherwise make
            ``open()`` fail.

    Returns:
        The sanitised (possibly empty) name.
    """
    bad = '<>:"/\\|?*'
    cleaned = name.translate(str.maketrans(bad, "_" * len(bad)))
    return "_".join(cleaned.split())[:max_len]

def download_pdf(url: str, title: str):
    """Download ``url`` into ``OUT_DIR`` and return the local path.

    The file name is ``safe_filename(title)``, falling back to the last path
    component of the URL (query string removed) when that is empty; ``.pdf``
    is appended if missing. A file that already exists is not downloaded
    again.

    The response is streamed to a ``.part`` file and moved onto the final name
    only after the whole body has been written, so an interrupted download
    cannot leave a truncated file that a later run would treat as complete.

    Args:
        url: Address of the PDF.
        title: Human-readable title used to build the file name.

    Returns:
        Path of the saved (or pre-existing) file, or ``""`` when the download
        failed; failures are reported with a ``[WARN]`` line, never raised.
    """
    tmp_path = ""
    try:
        fn = safe_filename(title) or os.path.basename(url).split("?")[0]
        if not fn.lower().endswith(".pdf"):
            fn += ".pdf"
        path = os.path.join(OUT_DIR, fn)
        if os.path.exists(path):
            return path
        tmp_path = path + ".part"
        # stream=True avoids holding the whole PDF in memory.
        with requests.get(url, timeout=60, stream=True) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(tmp_path, path)
        return path
    except Exception as e:
        print(f"[WARN] Gagal unduh {url}: {e}")
        # Remove the incomplete temp file; failing to do so is harmless
        # because only the final name counts as "already downloaded".
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return ""


In [7]:
# CSS selector for the detail pop-up (several dialog flavours are accepted).
MODAL_SELECTOR = ".modal, .modal-dialog, .ui-dialog"


def _extract_title(header: str) -> str:
    """Return the part of the modal text that precedes the word "FILE".

    The search is case-insensitive and uses the first occurrence. When the
    word is absent, the first 200 characters of the text are used instead.
    """
    cut = header.upper().find("FILE")
    return header[:cut].strip() if cut != -1 else header[:200]


def _pick_field(text: str, label: str) -> str:
    """Heuristically read the value that follows ``label`` in modal text.

    ``text`` is expected to be the modal text with one text node per line.
    The value is whatever follows the first (case-insensitive) occurrence of
    ``label`` - after skipping whitespace and ':' separators - up to the end
    of that line. Returns "" when the label does not occur.

    NOTE(review): assumes label and value are either neighbouring text nodes
    or share one "Label: value" line - confirm against the live modal markup.
    """
    pos = text.lower().find(label.lower())
    if pos == -1:
        return ""
    rest = text[pos + len(label):].lstrip(" \t\r\n:")
    return rest.split("\n", 1)[0].strip()


all_rows_data = []
page_idx = 1
try:
    while True:
        # Wait until a table is present on the page.
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
        rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
        print(f"[INFO] Halaman {page_idx}: {len(rows)} baris")

        for i in range(len(rows)):
            # Re-locate the rows on every iteration: the DOM changes after a
            # modal is closed, so earlier element references may be stale.
            rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
            if i >= len(rows):
                # The table shrank since it was counted; nothing left here.
                break
            row = rows[i]

            # Click the first link in the row if there is one, otherwise the
            # row itself, to open the detail pop-up.
            links = row.find_elements(By.TAG_NAME, "a")
            click_target = links[0] if links else row

            try:
                driver.execute_script("arguments[0].scrollIntoView({block:'center'});", click_target)
                click_target.click()
            except Exception as exc:
                # One unclickable row must not abort the whole scrape.
                print(f"[WARN] Baris {i + 1} (halaman {page_idx}) gagal diklik: {exc}")
                continue

            # Wait for the pop-up and parse exactly the element that was
            # found visible, not whichever dialog comes first in the page.
            try:
                modal = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, MODAL_SELECTOR)))
                modal_html = modal.get_attribute("outerHTML") or ""
            except Exception as exc:
                print(f"[WARN] Modal tidak muncul untuk baris {i + 1} (halaman {page_idx}): {exc}")
                continue

            modal_soup = BeautifulSoup(modal_html, "html.parser")

            # Title: modal text up to the word "FILE".
            title = _extract_title(modal_soup.get_text(" ", strip=True))

            # First anchor whose path ends in .pdf (a query string is allowed),
            # resolved against the listing URL.
            pdf_url = ""
            for a in modal_soup.select("a[href]"):
                href = a.get("href")
                if href and href.lower().split("?", 1)[0].endswith(".pdf"):
                    pdf_url = urljoin(BASE_URL, href)
                    break

            # Simple metadata; the labels in the modal are Indonesian.
            modal_lines = modal_soup.get_text("\n", strip=True)
            disahkan = _pick_field(modal_lines, "Disahkan")
            diundangkan = _pick_field(modal_lines, "Diundangkan")
            ln = _pick_field(modal_lines, "Nomor LN")
            tln = _pick_field(modal_lines, "Nomor TLN")

            # Download the PDF when a link was found.
            saved_path = download_pdf(pdf_url, title) if pdf_url else ""

            # Close the pop-up: prefer a close button, else click the body.
            try:
                close_btn = driver.find_elements(By.CSS_SELECTOR, ".modal [data-dismiss='modal'], .modal .close, .ui-dialog-titlebar-close")
                if close_btn:
                    close_btn[0].click()
                else:
                    driver.execute_script("document.body.click();")
            except Exception as exc:
                print(f"[WARN] Gagal menutup modal: {exc}")

            all_rows_data.append({
                "title": title,
                "pdf_url": pdf_url,
                "saved_path": saved_path,
                "disahkan": disahkan,
                "diundangkan": diundangkan,
                "LN": ln,
                "TLN": tln
            })

            # Short pause between rows to go easy on the server.
            time.sleep(0.6)

        # Pagination: follow the "next" control unless it is disabled.
        next_btns = driver.find_elements(By.CSS_SELECTOR, ".paginate_button.next, a[aria-label='Next']")
        # get_attribute returns None when the element has no class attribute.
        next_class = (next_btns[0].get_attribute("class") or "") if next_btns else ""
        if next_btns and next_btns[0].is_enabled() and "disabled" not in next_class:
            next_btns[0].click()
            page_idx += 1
            time.sleep(1.5)
        else:
            break
finally:
    # Always release the browser, even when scraping fails half-way. The
    # setup cell has to be re-run before this cell can be executed again.
    driver.quit()

# Save the scraped metadata to CSV.
# Naming the columns explicitly keeps the header row even when nothing was scraped.
CSV_COLUMNS = ["title", "pdf_url", "saved_path", "disahkan", "diundangkan", "LN", "TLN"]
df = pd.DataFrame(all_rows_data, columns=CSV_COLUMNS)
# Drop repeats of the same PDF link only. Records without a link all share
# pdf_url == "" and must not be collapsed into a single row.
repeated_link = df["pdf_url"].ne("") & df["pdf_url"].duplicated(keep="first")
df = df.loc[~repeated_link]
df.to_csv("jdih_setneg_produk.csv", index=False, quoting=csv.QUOTE_MINIMAL)
print(f"[DONE] Tersimpan {len(df)} entri. PDF di folder: {OUT_DIR}  | Metadata: jdih_setneg_produk.csv")

[INFO] Halaman 1: 1 baris
[DONE] Tersimpan 0 entri. PDF di folder: JDIH_Setneg_PDF  | Metadata: jdih_setneg_produk.csv
