In [2]:
# ==========================================================
# BKF e-Flora Thailand Crawler (Vol. 2–16, Full + Resume)
# - Family discovery: a[href^='florafamily.html'] (dedup)
# - Family name: from URL ?factsheet=<FAMILY> (robust) + fallback <p class="intro">Family :</p>
# - Click "List of lower taxa" tab before reading Genus
# - Drilldown: Family -> Genus(link text in List of lower taxa) -> Species
# - Extract:
#     species_scientific_name = <p class="intro"><strong>...</strong></p> (species page)
#     accepted_name / Thailand / Distribution / Ecology from the existing labels
# - Sleep: 1–2s (randomized)
# - Resume: skip species_url values already saved in the CSV
# Updated: 2025-10-17
# ==========================================================

import os, re, time, random, requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, UTC
from urllib.parse import urlparse, parse_qs

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager

# ---------- SETTINGS ----------
# Root URL of the e-Flora site; volume pages and relative links are built on it.
BASE        = "https://botany.dnp.go.th/eflora/"
# Volume numbers to crawl, passed as the default `volumes` of crawl_all().
VOLUMES     = range(16, 17)                 # change the volume range here (for a test run: range(2,3))
# CSV file used both as crawl output and as resume input.
OUT_CSV     = "bkf_eflora_species_vol2_16.csv"
# Base pause in seconds between page loads; nap() scales and jitters it.
SLEEP_BASE  = 1.2
# Timeout handed to WebDriverWait when waiting for page elements.
WAIT_SEC    = 25
# Passed to setup_driver(); when True Chrome is started with --headless=new.
HEADLESS    = True

# ---------- UTILITIES ----------
def nap(mult: float = 1.0):
    """Sleep for ``SLEEP_BASE * mult`` seconds, jittered by a random factor.

    The factor is drawn uniformly from [0.85, 1.15) so that consecutive
    requests are not evenly spaced.
    """
    jitter = 0.85 + random.random()*0.3
    time.sleep(SLEEP_BASE * mult * jitter)

def clean(t: str | None):
    return re.sub(r"\s+", " ", t).strip() if t else None

def to_abs(url: str | None):
    if not url:
        return None
    if url.startswith("http"):
        return url
    return requests.compat.urljoin(BASE, url)

def soup_from_driver(driver):
    """Parse the page currently loaded in *driver* with ``html.parser``."""
    html = driver.page_source
    return BeautifulSoup(html, "html.parser")

def find_label_value(soup: BeautifulSoup, label_regex: str):
    """Return the text that follows a label on the page, or ``None``.

    The first text node matching *label_regex* (case-insensitive) is
    located, then up to 12 following elements are inspected. The first one
    whose cleaned text is non-empty and does not itself match the label is
    returned with leading colons and whitespace removed.

    If that walk yields nothing, the page's full text is searched for the
    label followed by an optional colon, and the rest of that line is
    returned.
    """
    label_node = soup.find(string=re.compile(label_regex, re.I))
    if label_node:
        node = label_node
        steps_left = 12
        while steps_left > 0:
            steps_left -= 1
            node = node.next_element
            if node is None:
                break
            if hasattr(node, "get_text"):
                raw = node.get_text(" ", strip=True)
            else:
                raw = str(node)
            candidate = clean(raw)
            if not candidate:
                continue
            # Skip nodes that merely repeat the label.
            if re.search(label_regex, candidate, re.I):
                continue
            return re.sub(r"^[:\s]+", "", candidate)

    # Fallback: regex over the whole page text, one text node per line.
    page_text = soup.get_text("\n", strip=True)
    hit = re.search(rf"{label_regex}\s*:?\s*(.+)", page_text, re.I)
    if hit is None:
        return None
    return clean(hit.group(1))

def setup_driver(headless: bool = True):
    """Create the Chrome WebDriver used by the crawler.

    Args:
        headless: When true, ``--headless=new`` is added to the Chrome
            arguments.

    Returns:
        A ``webdriver.Chrome`` instance whose chromedriver path comes from
        ``ChromeDriverManager().install()``.
    """
    chrome_args = [
        "--window-size=1280,900",
        "--disable-gpu",
        "--no-sandbox",
        "--user-agent=Mozilla/5.0",
    ]
    if headless:
        chrome_args.append("--headless=new")

    opts = webdriver.ChromeOptions()
    for arg in chrome_args:
        opts.add_argument(arg)

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=opts)

# ---------- FAMILY DISCOVERY (robust) ----------
def get_families_for_volume(driver, vol: int):
    """List the family pages linked from one volume's index page.

    Every ``a[href^='florafamily.html']`` link is made absolute and
    de-duplicated by URL. The family name is read from the link's
    ``factsheet`` query parameter. When that is missing, the family page
    itself is opened and the name is taken from the first ``p.intro``
    element containing "Family".

    Args:
        driver: Selenium WebDriver used for page loads.
        vol: Volume number placed in the ``vol`` query parameter.

    Returns:
        A list of ``(family_name, family_url)`` tuples in page order;
        ``family_name`` is ``None`` when it could not be determined.

    Raises:
        TimeoutException: if ``<body>`` of the volume page does not appear
            within ``WAIT_SEC`` seconds.
    """
    driver.get(f"{BASE}floramainvol.html?vol={vol}")
    body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
    WebDriverWait(driver, WAIT_SEC).until(body_present)
    time.sleep(3)  # fixed extra pause once <body> is present

    index_soup = soup_from_driver(driver)
    visited = set()
    result = []

    for link in index_soup.select("a[href^='florafamily.html']"):
        family_url = to_abs(link.get("href", ""))
        if not family_url or family_url in visited:
            continue
        visited.add(family_url)

        # Preferred source: ?factsheet=<FAMILY> in the link itself.
        try:
            query = parse_qs(urlparse(family_url).query)
            name = query.get("factsheet", [None])[0]
        except Exception:
            name = None

        # Fallback: open the family page and read its intro paragraph.
        if not name:
            driver.get(family_url)
            intro_present = EC.presence_of_element_located((By.CSS_SELECTOR, "p.intro"))
            try:
                WebDriverWait(driver, WAIT_SEC).until(intro_present)
            except TimeoutException:
                pass
            intro = soup_from_driver(driver).select_one("p.intro")
            if intro and "Family" in intro.get_text():
                intro_text = intro.get_text(" ", strip=True)
                name = re.sub(r".*Family\s*:\s*", "", intro_text)

        label = clean(name) if name else None
        result.append((label, family_url))

    return result

# ---------- GENUS & SPECIES ----------
def get_genus_links(driver, family_url: str):
    """Open a family page and collect its genus links.

    The "List of lower taxa" tab is clicked first (several locators are
    tried in turn), then every link pointing at ``floragenus.html`` is
    gathered. The genus name is the text of the link in that tab.

    Returns:
        A list of ``(genus_name, genus_url)`` tuples with absolute URLs,
        in page order, keeping only the first entry for each URL.
    """
    genus_selector = "div#datalower a[href*='floragenus.html'], a[href*='floragenus.html']"

    driver.get(to_abs(family_url))
    try:
        WebDriverWait(driver, WAIT_SEC).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    except TimeoutException:
        time.sleep(2)

    # Click the "List of lower taxa" tab; stop at the first locator that works.
    tab_locators = [
        (By.XPATH, "//a[contains(., 'List of lower taxa')]"),
        (By.XPATH, "//li[contains(., 'List of lower taxa')]//a"),
        (By.CSS_SELECTOR, "a[href*='lower'], a[href*='#lower'], li a[href*='lower']"),
    ]
    for locator in tab_locators:
        try:
            tab = WebDriverWait(driver, 5).until(EC.element_to_be_clickable(locator))
            tab.click()
        except Exception:
            continue
        break
    else:
        # No locator could be clicked; pause briefly before reading the page.
        time.sleep(1)

    # Wait for genus links to show up.
    try:
        WebDriverWait(driver, WAIT_SEC).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='floragenus.html']"))
        )
    except TimeoutException:
        time.sleep(2)

    anchors = soup_from_driver(driver).select(genus_selector)
    pairs = [(clean(a.get_text()), to_abs(a.get("href", ""))) for a in anchors]

    # Fall back to live DOM elements when the parsed source has no links.
    if not pairs:
        pairs = [
            (clean(el.text), to_abs(el.get_attribute("href")))
            for el in driver.find_elements(By.CSS_SELECTOR, genus_selector)
        ]

    # Keep the first name seen for each URL, preserving page order.
    first_name_by_url = {}
    for name, url in pairs:
        if url:
            first_name_by_url.setdefault(url, name)
    return [(name, url) for url, name in first_name_by_url.items()]

def get_species_links(driver, genus_url: str):
    """Open a genus page and return the absolute URLs of its species pages.

    Links are those whose ``href`` contains ``floraspecies.html``. They are
    read from the parsed page source, or from the live DOM when the parsed
    source has none.

    Returns:
        A list of distinct, non-empty URLs in first-seen order.
    """
    selector = "a[href*='floraspecies.html']"

    driver.get(to_abs(genus_url))
    try:
        WebDriverWait(driver, WAIT_SEC).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
        )
    except TimeoutException:
        time.sleep(2)

    anchors = soup_from_driver(driver).select(selector)
    candidates = [to_abs(a.get("href", "")) for a in anchors]
    if not candidates:
        live = driver.find_elements(By.CSS_SELECTOR, selector)
        candidates = [to_abs(el.get_attribute("href")) for el in live]

    # dict keys keep insertion order, so this drops repeats but not ordering.
    return list(dict.fromkeys(u for u in candidates if u))

def parse_species_page(driver, sp_url: str):
    """Load a species page and extract its name and labelled fields.

    ``species_scientific_name`` is the text of ``<p class="intro"><strong>``
    with any leading ordinal such as "2. " removed. ``accepted_name``,
    ``thailand``, ``distribution`` and ``ecology`` are read with
    ``find_label_value`` using the corresponding label pattern.

    Returns:
        A dict with those five keys; a value is ``None`` when nothing was
        found for it.
    """
    driver.get(to_abs(sp_url))
    try:
        WebDriverWait(driver, WAIT_SEC).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    except TimeoutException:
        pass
    page = soup_from_driver(driver)

    # Scientific name from the <strong> heading, minus a leading "N. ".
    heading_tag = page.select_one("p.intro > strong")
    if heading_tag:
        heading = re.sub(r"^\s*\d+\.\s*", "", clean(heading_tag.get_text()) or "")
    else:
        heading = None

    # Remaining fields: output key -> label pattern looked up on the page.
    label_patterns = {
        "accepted_name": r"Accepted\s*Name",
        "thailand": r"Thailand",
        "distribution": r"Distribution",
        "ecology": r"Ecology",
    }

    record = {"species_scientific_name": clean(heading)}
    for field, pattern in label_patterns.items():
        record[field] = clean(find_label_value(page, pattern))
    return record

# ---------- MAIN (with Resume) ----------
def crawl_all(volumes=VOLUMES, out_csv=OUT_CSV):
    """Crawl Volume -> Family -> Genus -> Species and save one row per species.

    Resume: when *out_csv* already exists its rows are loaded first and
    every ``species_url`` found there is skipped.

    Saving: the CSV is rewritten every 50 rows, after each volume, and once
    more at the end after dropping duplicate ``species_url`` rows. If the
    crawl stops early (an exception or Ctrl-C), rows collected since the
    last write are flushed before the browser is closed, so a later run can
    resume from them.

    Args:
        volumes: Iterable of volume numbers to crawl.
        out_csv: Path of the CSV used as both resume input and output.
    """
    rows, done_species = [], set()
    if os.path.exists(out_csv):
        try:
            old = pd.read_csv(out_csv)
            rows = old.to_dict("records")
            if "species_url" in old.columns:
                done_species = set(old["species_url"].dropna().astype(str).tolist())
            print(f"üß© Resume mode: ‡πÇ‡∏´‡∏•‡∏î {len(rows)} ‡πÅ‡∏ñ‡∏ß‡πÄ‡∏î‡∏¥‡∏°, species ‡∏ó‡∏µ‡πà‡∏ó‡∏≥‡πÅ‡∏•‡πâ‡∏ß {len(done_species)}")
        except Exception as e:
            print(f"‚ö†Ô∏è ‡πÇ‡∏´‡∏•‡∏î‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏¥‡∏°‡πÑ‡∏°‡πà‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à: {e}")

    driver = setup_driver(headless=HEADLESS)
    # True while `rows` holds entries that have not been written to out_csv.
    unsaved = False

    try:
        for vol in volumes:
            families = get_families_for_volume(driver, vol)
            print(f"\n=== Volume {vol} ===")
            print(f"  ‚Üí ‡∏û‡∏ö {len(families)} families (‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏£‡∏≠‡∏á‡∏ã‡πâ‡∏≥)")

            for fi, (family_name, family_url) in enumerate(families, 1):
                family_name = clean(family_name) or "Unknown Family"
                nap()
                genus_links = get_genus_links(driver, family_url)
                print(f"    [{fi}/{len(families)}] {family_name}: {len(genus_links)} genera")

                for genus_name, genus_url in genus_links:
                    # genus_name is the link text from "List of lower taxa".
                    genus_name = clean(genus_name)
                    nap()
                    species_links = get_species_links(driver, genus_url)

                    for sp_url in species_links:
                        if not sp_url or sp_url in done_species:
                            continue
                        nap(1.1)
                        try:
                            data = parse_species_page(driver, sp_url)
                        except Exception as exc:
                            # Best effort: report the failure and keep a
                            # blank row. NOTE: the URL is still recorded, so
                            # a resumed run will skip it rather than retry.
                            print(f"      ! parse failed for {sp_url}: {exc}")
                            data = {
                                "species_scientific_name": None,
                                "accepted_name": None,
                                "thailand": None,
                                "distribution": None,
                                "ecology": None,
                            }

                        rows.append({
                            "volume": vol,
                            "family_name": family_name,
                            "genus_name": genus_name,  # from the link in List of lower taxa
                            **data,
                            "family_url": to_abs(family_url),
                            "genus_url": to_abs(genus_url),
                            "species_url": to_abs(sp_url),
                            "scraped_at": datetime.now(UTC).isoformat()
                        })
                        done_species.add(sp_url)
                        unsaved = True

                        # Periodic checkpoint.
                        if len(rows) % 50 == 0:
                            pd.DataFrame(rows).to_csv(out_csv, index=False)
                            unsaved = False

            pd.DataFrame(rows).to_csv(out_csv, index=False)
            unsaved = False

        df = pd.DataFrame(rows)
        # Guard the column: a resumed CSV without species_url and no new rows
        # would otherwise make drop_duplicates raise KeyError.
        if not df.empty and "species_url" in df.columns:
            df = df.drop_duplicates(subset=["species_url"]).reset_index(drop=True)
        df.to_csv(out_csv, index=False)
        print(f"\n‚úÖ ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô: Saved {len(df)} rows ‚Üí {out_csv}")

    finally:
        try:
            # Flush rows gathered since the last checkpoint when the crawl
            # stopped early; otherwise they would be lost on interruption.
            if unsaved:
                pd.DataFrame(rows).to_csv(out_csv, index=False)
        finally:
            driver.quit()

# ---------- RUN ----------
if __name__ == "__main__":
    # Run the crawl with the default arguments (VOLUMES and OUT_CSV).
    crawl_all()

üß© Resume mode: ‡πÇ‡∏´‡∏•‡∏î 5700 ‡πÅ‡∏ñ‡∏ß‡πÄ‡∏î‡∏¥‡∏°, species ‡∏ó‡∏µ‡πà‡∏ó‡∏≥‡πÅ‡∏•‡πâ‡∏ß 5700

=== Volume 16 ===
  ‚Üí ‡∏û‡∏ö 2 families (‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏£‡∏≠‡∏á‡∏ã‡πâ‡∏≥)
    [1/2] Zingiberaceae: 28 genera
    [2/2] Annonaceae: 39 genera

‚úÖ ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô: Saved 6354 rows ‚Üí bkf_eflora_species_vol2_16.csv
