# **E-Flora Thailand Web Crawling**

In [None]:
# !pip install selenium webdriver-manager pandas beautifulsoup4 lxml html5lib

import os, re, time, random, requests, sys
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from urllib.parse import urlparse, parse_qs
from typing import Optional, Tuple, List, Dict

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

# ============ CONFIG ============
BASE        = "https://botany.dnp.go.th/eflora/"

# ‚Üê ‡πÉ‡∏™‡πà‡πÑ‡∏ü‡∏•‡πå ‚Äúspecies ‡πÄ‡∏î‡∏¥‡∏°‚Äù ‡∏ó‡∏µ‡πà‡πÄ‡∏Ñ‡∏¢‡∏™‡∏Å‡∏±‡∏î‡πÑ‡∏ß‡πâ ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÉ‡∏ä‡πâ‡∏Ç‡πâ‡∏≤‡∏°‡∏ã‡πâ‡∏≥ (resume)
RESUME_FROM = [
    "bkf_eflora_species_vol2_test.csv",
    # ‡∏à‡∏∞‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏Å‡πà‡∏≤‡∏´‡∏•‡∏≤‡∏¢‡πÑ‡∏ü‡∏•‡πå‡∏Å‡πá‡πÑ‡∏î‡πâ
]

# ‡πÄ‡∏≠‡∏≤‡∏ï‡πå‡∏û‡∏∏‡∏ï‡∏£‡∏≠‡∏ö‡∏ô‡∏µ‡πâ (‡∏≠‡∏¢‡πà‡∏≤‡πÉ‡∏ä‡πâ‡∏ä‡∏∑‡πà‡∏≠‡∏ä‡∏ô‡∏Å‡∏±‡∏ö‡∏Ç‡∏≠‡∏á‡πÄ‡∏î‡∏¥‡∏°)
OUT_CSV     = "bkf_eflora_species_vol2_16_resume.csv"    # species-level
OUT_AUDIT   = "bkf_genus_audit_vol2_16_resume.csv"       # audit ‡∏ï‡πà‡∏≠ family
VOLUMES     = list(range(2, 17))                         # ‡∏£‡∏±‡∏ô‡∏Ñ‡∏£‡∏ö 2‚Äì16

SLEEP_BASE  = 1.2
WAIT_SEC    = 30
HEADLESS    = True

# ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏î‡∏∑‡πâ‡∏≠‡πÄ‡∏°‡∏∑‡πà‡∏≠ DOM ‡∏ä‡πâ‡∏≤/‡∏ã‡πà‡∏≠‡∏ô
MAX_GENUS_RETRY  = 6
SCROLL_PER_TRY   = 10
SCROLL_PAUSE_SEC = 0.9

# ‡πÄ‡∏Å‡πá‡∏ö HTML snapshot ‡πÄ‡∏ß‡∏•‡∏≤‡πÑ‡∏°‡πà‡πÄ‡∏à‡∏≠ element ‡∏ï‡∏≤‡∏°‡∏Ñ‡∏≤‡∏î
SNAP_DIR = "snapshots_html"
os.makedirs(SNAP_DIR, exist_ok=True)



In [2]:
def nap(mult: float = 1.0):
    time.sleep(SLEEP_BASE * mult * (0.85 + random.random()*0.3))

def clean(t: Optional[str]):
    return re.sub(r"\s+", " ", t).strip() if t else None

def to_abs(url: Optional[str]):
    if not url:
        return None
    if url.startswith(("http://", "https://")):
        return url
    return requests.compat.urljoin(BASE, url)

def soup_from_driver(driver) -> BeautifulSoup:
    return BeautifulSoup(driver.page_source, "html.parser")

def setup_driver(headless: bool = True):
    opts = webdriver.ChromeOptions()
    opts.add_argument("--window-size=1400,1200")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--user-agent=Mozilla/5.0")
    opts.add_argument("--disable-dev-shm-usage")
    if headless:
        opts.add_argument("--headless=new")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    return driver

def find_label_value(soup: BeautifulSoup, label_regex: str):
    # ‡∏´‡∏≤ label ‡πÅ‡∏ö‡∏ö‡∏¢‡∏∑‡∏î‡∏´‡∏¢‡∏∏‡πà‡∏ô (‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö text ‡∏Å‡∏£‡∏∞‡∏à‡∏±‡∏î‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢)
    tag = soup.find(string=re.compile(label_regex, re.I))
    if tag:
        cur = tag
        for _ in range(12):
            cur = cur.next_element
            if cur is None:
                break
            txt = clean(cur.get_text(" ", strip=True) if hasattr(cur, "get_text") else str(cur))
            if txt and not re.search(label_regex, txt, re.I):
                return re.sub(r"^[:\s]+", "", txt)
    full = soup.get_text("\n", strip=True)
    m = re.search(rf"{label_regex}\s*[:Ôºö]?\s*(.+)", full, re.I)
    return clean(m.group(1)) if m else None

In [3]:
def extract_labeled_from_intro(soup: BeautifulSoup, label: str) -> Optional[str]:
    """
    ‡∏î‡∏∂‡∏á‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡∏ï‡∏≤‡∏°‡∏´‡∏•‡∏±‡∏á <strong>LABEL</strong> ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô <p class="intro"> ‡πÄ‡∏ä‡πà‡∏ô
    <p class="intro"><strong>Thailand : </strong> SOUTH-EASTERN: ...</p>
    """
    # 1) ‡πÄ‡∏Ñ‡∏™‡∏õ‡∏Å‡∏ï‡∏¥: strong=LABEL ‡πÅ‡∏•‡πâ‡∏ß‡∏ï‡∏≤‡∏°‡∏î‡πâ‡∏ß‡∏¢‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°
    for p in soup.select("p.intro"):
        strongs = p.find_all("strong")
        for st in strongs:
            t = clean(st.get_text(" ", strip=True))
            if not t:
                continue
            if re.fullmatch(rf"{re.escape(label)}\s*[:Ôºö]?", t, flags=re.I):
                full = clean(p.get_text(" ", strip=True)) or ""
                val = re.sub(rf"^{re.escape(label)}\s*[:Ôºö]?\s*", "", full, flags=re.I)
                if val:
                    return clean(val)

    # 2) fallback: ‡∏´‡∏≤‡πÉ‡∏ô p.intro ‡∏î‡πâ‡∏ß‡∏¢ regex ‡∏ï‡∏£‡∏á ‡πÜ
    for p in soup.select("p.intro"):
        full = clean(p.get_text(" ", strip=True)) or ""
        m = re.search(rf"{re.escape(label)}\s*[:Ôºö]\s*(.+)", full, flags=re.I)
        if m:
            return clean(m.group(1))

    # 3) fallback ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏ô‡πâ‡∏≤
    full = soup.get_text("\n", strip=True)
    m = re.search(rf"{re.escape(label)}\s*[:Ôºö]\s*(.+)", full, flags=re.I)
    return clean(m.group(1)) if m else None

In [4]:
def get_families_for_volume(driver, vol: int) -> List[Tuple[str, str]]:
    vol_url = f"{BASE}floramainvol.html?vol={vol}"
    driver.get(vol_url)
    WebDriverWait(driver, WAIT_SEC).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    nap(0.8)

    soup = soup_from_driver(driver)
    families, seen = [], set()

    anchors = soup.select("a[href^='florafamily.html']")
    if not anchors:
        anchors = soup.select("a[href*='florafamily']")

    for a in anchors:
        href = to_abs(a.get("href", ""))
        if not href or href in seen:
            continue
        seen.add(href)

        fam_name = None
        try:
            qs = parse_qs(urlparse(href).query)
            fam_name = qs.get("factsheet", [None])[0]
        except Exception:
            pass

        if not fam_name:
            driver.get(href)
            try:
                WebDriverWait(driver, WAIT_SEC).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            except TimeoutException:
                pass
            s = soup_from_driver(driver)
            p = s.select_one("p.intro")
            if p:
                txt = p.get_text(" ", strip=True)
                m = re.search(r"Family\s*:\s*(.+)", txt, flags=re.I)
                if m:
                    fam_name = clean(m.group(1))
            if not fam_name:
                h = s.find(["h1","h2","h3"])
                if h:
                    fam_name = clean(h.get_text())

        families.append((clean(fam_name) if fam_name else None, href))

    return families

In [5]:
GENUS_IDX_RE = re.compile(r"^\s*(\d+)\s*[\.\)]?\s*(.*)$")

def parse_genus_text(text: Optional[str]) -> Tuple[Optional[int], Optional[str], Optional[str]]:
    t = clean(text)
    if not t:
        return None, None, None
    m = GENUS_IDX_RE.match(t)
    if not m:
        return None, t, t
    idx = int(m.group(1))
    label = clean(m.group(2) or "")
    return idx, (label or None), t

def scroll_page(driver, times: int = SCROLL_PER_TRY, pause: float = SCROLL_PAUSE_SEC):
    for _ in range(times):
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(0.25)
        except Exception:
            time.sleep(0.4)

def click_lower_tab_variants(driver):
    candidates = [
        (By.XPATH, "//a[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'list of lower taxa')]"),
        (By.XPATH, "//a[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'lower taxa')]"),
        (By.XPATH, "//a[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'list of genera')]"),
        (By.XPATH, "//a[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'genera')]"),
        (By.CSS_SELECTOR, "a[href*='#lower'], a[href*='lower']"),
        (By.CSS_SELECTOR, "li a[href*='#lower']"),
    ]
    for locator in candidates:
        try:
            el = WebDriverWait(driver, 4).until(EC.element_to_be_clickable(locator))
            try:
                el.click()
            except ElementClickInterceptedException:
                driver.execute_script("arguments[0].click();", el)
            nap(0.4)
            return True
        except Exception:
            continue
    try:
        driver.execute_script("if (location.hash !== '#lower') location.hash = '#lower';")
        nap(0.4)
        return True
    except Exception:
        return False

def collect_genus_links_from_dom(driver) -> List[Dict]:
    s = soup_from_driver(driver)
    out, seen_urls = [], set()
    order = 0

    for a in s.find_all("a", href=re.compile(r"floragenus\.html", re.I)):
        href = to_abs(a.get("href", ""))
        text = a.get_text(" ", strip=True)
        if not href or href in seen_urls:
            continue
        idx, label, raw = parse_genus_text(text)
        order += 1
        out.append({
            "genus_text": raw,
            "genus_label": label,
            "genus_index_parsed": idx,
            "genus_url": href,
            "genus_iter_order": order,
        })
        seen_urls.add(href)

    # fallback: WebDriver elements
    if not out:
        elems = driver.find_elements(By.CSS_SELECTOR, "a")
        for e in elems:
            href = e.get_attribute("href") or ""
            if not re.search(r"floragenus\.html", href, flags=re.I):
                continue
            href = to_abs(href)
            text = e.text
            if not href or href in seen_urls:
                continue
            idx, label, raw = parse_genus_text(text)
            order += 1
            out.append({
                "genus_text": raw,
                "genus_label": label,
                "genus_index_parsed": idx,
                "genus_url": href,
                "genus_iter_order": order,
            })
            seen_urls.add(href)
    return out

def collect_genus_links_fallback_direct_get(family_url: str) -> List[Dict]:
    out = []
    try:
        r = requests.get(family_url, timeout=20)
        r.raise_for_status()
        s = BeautifulSoup(r.text, "html.parser")
        seen, order = set(), 0
        for a in s.find_all("a", href=re.compile(r"floragenus\.html", re.I)):
            href = to_abs(a.get("href", ""))
            text = a.get_text(" ", strip=True)
            if not href or href in seen:
                continue
            idx, label, raw = parse_genus_text(text)
            order += 1
            out.append({
                "genus_text": raw,
                "genus_label": label,
                "genus_index_parsed": idx,
                "genus_url": href,
                "genus_iter_order": order,
            })
            seen.add(href)
    except Exception:
        pass
    return out

def get_genus_links_robust(driver, family_url: str) -> Tuple[List[Dict], Optional[int], List[int]]:
    best, best_missing, best_max_exp = [], None, None
    for attempt in range(1, MAX_GENUS_RETRY+1):
        driver.get(to_abs(family_url))
        WebDriverWait(driver, WAIT_SEC).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        nap(0.6)

        click_lower_tab_variants(driver)
        scroll_page(driver, times=SCROLL_PER_TRY)

        gens = collect_genus_links_from_dom(driver)
        if not gens:
            gens = collect_genus_links_fallback_direct_get(to_abs(family_url))

        idxs = [g["genus_index_parsed"] for g in gens if g["genus_index_parsed"] is not None]
        max_expected = max(idxs) if idxs else None
        present = set([i for i in idxs if i is not None])
        expected = set(range(1, (max_expected or 0) + 1)) if max_expected else set()
        missing = sorted(list(expected - present)) if expected else []

        if (not best) or (len(gens) > len(best)) or (best_missing and len(missing) < len(best_missing)):
            best, best_missing, best_max_exp = gens, missing, max_expected

        if gens and (not missing or attempt >= 3):
            return gens, max_expected, missing

        nap(0.5 + attempt*0.2)

    try:
        fname = os.path.join(SNAP_DIR, f"family_missing_genus_{int(time.time())}.html")
        with open(fname, "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print(f"‚ö†Ô∏è  ‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏•‡∏¥‡∏á‡∏Å‡πå genus ‚Äî ‡πÄ‡∏ã‡∏ü HTML ‡πÑ‡∏ß‡πâ‡∏ó‡∏µ‡πà: {fname}")
    except Exception:
        pass

    return best, best_max_exp, (best_missing or [])

In [6]:
def get_species_links(driver, genus_url: str) -> List[str]:
    driver.get(to_abs(genus_url))
    try:
        WebDriverWait(driver, WAIT_SEC).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
    except TimeoutException:
        time.sleep(1.0)

    scroll_page(driver, times=4, pause=0.6)

    # ‡πÉ‡∏ä‡πâ regex case-insensitive ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö‡∏ó‡∏±‡πâ‡∏á floraSpecies.html ‡πÅ‡∏•‡∏∞ floraspecies.html
    s = soup_from_driver(driver)
    sp = [to_abs(a.get("href", "")) for a in s.find_all("a", href=re.compile(r"floraspecies\.html", re.I))]
    if not sp:
        elems = driver.find_elements(By.TAG_NAME, "a")
        sp = []
        for e in elems:
            href = e.get_attribute("href") or ""
            if re.search(r"floraspecies\.html", href, flags=re.I):
                sp.append(to_abs(href))

    out, seen = [], set()
    for u in sp:
        if u and u not in seen:
            seen.add(u)
            out.append(u)
    return out

def parse_species_page(driver, sp_url: str) -> Dict:
    driver.get(to_abs(sp_url))
    try:
        WebDriverWait(driver, WAIT_SEC).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    except TimeoutException:
        pass
    s = soup_from_driver(driver)

    # ‡∏ä‡∏∑‡πà‡∏≠‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå (‡∏ï‡∏±‡∏î‡πÄ‡∏•‡∏Ç‡∏•‡∏≥‡∏î‡∏±‡∏ö‡∏´‡∏ô‡πâ‡∏≤)
    strong = s.select_one("p.intro > strong")
    species_heading = clean(strong.get_text()) if strong else None
    if species_heading:
        species_heading = re.sub(r"^\s*\d+[\.\)]\s*", "", species_heading or "")

    # Thailand ‡πÅ‡∏ö‡∏ö‡πÄ‡∏à‡∏≤‡∏∞ <p class="intro"><strong>Thailand : </strong>...
    thailand     = extract_labeled_from_intro(s, "Thailand") or find_label_value(s, r"Thailand")
    accepted     = find_label_value(s, r"Accepted\s*Name")
    distribution = find_label_value(s, r"Distribution")
    ecology      = find_label_value(s, r"Ecology")

    return {
        "species_scientific_name": clean(species_heading),
        "accepted_name": clean(accepted),
        "thailand": clean(thailand),
        "distribution": clean(distribution),
        "ecology": clean(ecology),
    }

In [7]:
def load_done_species(paths: List[str]) -> set:
    done = set()
    for p in paths:
        if not os.path.exists(p):
            print(f"‚ö†Ô∏è  ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå resume: {p}")
            continue
        try:
            df = pd.read_csv(p, dtype=str)
            if "species_url" in df.columns:
                urls = df["species_url"].dropna().astype(str).str.strip()
                urls = urls[urls != ""]
                done.update(urls.tolist())
                print(f"‚Ä¢ ‡πÇ‡∏´‡∏•‡∏î {len(urls)} species_url ‡∏à‡∏≤‡∏Å {p}")
            else:
                print(f"‚ö†Ô∏è  ‡πÑ‡∏ü‡∏•‡πå {p} ‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå species_url")
        except Exception as e:
            print(f"‚ö†Ô∏è  ‡∏≠‡πà‡∏≤‡∏ô {p} ‡πÑ‡∏°‡πà‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à: {e}")
    print(f"‚úÖ ‡∏£‡∏ß‡∏° species_url ‡∏ó‡∏µ‡πà‡∏ó‡∏≥‡πÑ‡∏õ‡πÅ‡∏•‡πâ‡∏ß‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î: {len(done)}")
    return done

In [8]:
def crawl_notebook_resume(volumes: List[int], out_csv: str, out_audit_csv: str,
                          headless: bool, resume_species_urls: set):
    # ‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏´‡∏°‡πà‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏£‡∏≠‡∏ö‡∏ô‡∏µ‡πâ
    if os.path.exists(out_csv):
        print(f"‚ÑπÔ∏è  ‡∏•‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏≠‡∏≤‡∏ï‡πå‡∏û‡∏∏‡∏ï‡πÄ‡∏î‡∏¥‡∏°: {out_csv}")
        os.remove(out_csv)
    if os.path.exists(out_audit_csv):
        print(f"‚ÑπÔ∏è  ‡∏•‡∏ö‡πÑ‡∏ü‡∏•‡πå audit ‡πÄ‡∏î‡∏¥‡∏°: {out_audit_csv}")
        os.remove(out_audit_csv)

    rows_species = []
    audit_rows   = []
    genus_rows   = []

    # species_url ‡∏ó‡∏µ‡πà‡πÄ‡∏Ñ‡∏¢‡∏ó‡∏≥‡∏°‡∏≤‡πÅ‡∏•‡πâ‡∏ß
    done_species = set(resume_species_urls)

    driver = setup_driver(headless=headless)
    try:
        for vol in volumes:
            families = get_families_for_volume(driver, vol)
            print(f"\n=== Volume {vol} ===")
            print(f"  ‚Üí ‡∏û‡∏ö {len(families)} families (‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏£‡∏≠‡∏á‡∏ã‡πâ‡∏≥)")

            for fi, (family_name, family_url) in enumerate(families, 1):
                family_name = clean(family_name) or "Unknown Family"
                nap(0.6)
                genus_list, max_expected, missing = get_genus_links_robust(driver, family_url)

                print(f"    [{fi}/{len(families)}] {family_name}: {len(genus_list)} genera (max_expected={max_expected})")
                if genus_list:
                    for g in genus_list:
                        print(f"       ‚Ä¢ {g.get('genus_label') or g.get('genus_text')}")
                if missing:
                    print(f"       ‚Üí Missing indices (1..max_expected): {missing}")

                audit_rows.append({
                    "volume": vol,
                    "family_name": family_name,
                    "family_url": to_abs(family_url),
                    "family_max_index_expected": max_expected,
                    "family_missing_indices": ", ".join(map(str, missing)) if missing else "",
                    "genus_count_observed": len(genus_list),
                })

                # GENUS TABLE: ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ó‡∏∏‡∏Å genus ‡∏ó‡∏µ‡πà‡∏û‡∏ö
                for g in genus_list:
                    genus_rows.append({
                        "volume": vol,
                        "family_name": family_name,
                        "genus_name_raw": g.get("genus_text"),
                        "genus_label": g.get("genus_label"),
                        "genus_index_parsed": g.get("genus_index_parsed"),
                        "genus_url": to_abs(g.get("genus_url")),
                        "family_url": to_abs(family_url),
                        "scraped_at": datetime.now(timezone.utc).isoformat(),
                    })

                # ‡∏ï‡πà‡∏≠ genus ‚Üí species
                for gi, g in enumerate(genus_list, 1):
                    genus_name_raw = g.get("genus_text")
                    genus_label    = g.get("genus_label")
                    genus_idx      = g.get("genus_index_parsed")
                    genus_url      = g.get("genus_url")

                    nap(0.6)
                    sp_links = get_species_links(driver, genus_url)

                    if not sp_links:
                        # stub row ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö genus ‡πÑ‡∏°‡πà‡∏°‡∏µ species link
                        rows_species.append({
                            "volume": vol,
                            "family_name": family_name,
                            "genus_name": genus_name_raw,
                            "genus_label": genus_label,
                            "genus_index_parsed": genus_idx,
                            "species_scientific_name": None,
                            "accepted_name": None,
                            "thailand": None,
                            "distribution": None,
                            "ecology": None,
                            "family_url": to_abs(family_url),
                            "genus_url": to_abs(genus_url),
                            "species_url": None,
                            "scraped_at": datetime.now(timezone.utc).isoformat()
                        })
                        continue

                    for si, sp_url in enumerate(sp_links, 1):
                        if not sp_url:
                            continue
                        sp_url = to_abs(sp_url)
                        # ‚úÖ ‡∏Ç‡πâ‡∏≤‡∏° species ‡∏ó‡∏µ‡πà‡πÄ‡∏Ñ‡∏¢‡∏ó‡∏≥‡πÑ‡∏õ‡πÅ‡∏•‡πâ‡∏ß
                        if sp_url in done_species:
                            continue

                        nap(0.9)
                        try:
                            data = parse_species_page(driver, sp_url)
                        except Exception:
                            data = {
                                "species_scientific_name": None,
                                "accepted_name": None,
                                "thailand": None,
                                "distribution": None,
                                "ecology": None,
                            }

                        rows_species.append({
                            "volume": vol,
                            "family_name": family_name,
                            "genus_name": genus_name_raw,
                            "genus_label": genus_label,
                            "genus_index_parsed": genus_idx,
                            **data,
                            "family_url": to_abs(family_url),
                            "genus_url": to_abs(genus_url),
                            "species_url": sp_url,
                            "scraped_at": datetime.now(timezone.utc).isoformat()
                        })
                        done_species.add(sp_url)

                        if len(rows_species) % 50 == 0:
                            pd.DataFrame(rows_species).to_csv(out_csv, index=False)

            # save ‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á‡∏ó‡∏≤‡∏á
            pd.DataFrame(rows_species).to_csv(out_csv, index=False)
            pd.DataFrame(audit_rows).to_csv(out_audit_csv, index=False)

        # final dedupe ‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏†‡∏≤‡∏¢‡πÉ‡∏ô‡πÄ‡∏≠‡∏≤‡∏ï‡πå‡∏û‡∏∏‡∏ï‡πÉ‡∏´‡∏°‡πà
        df_sp = pd.DataFrame(rows_species)
        if not df_sp.empty:
            df_sp = df_sp.drop_duplicates(subset=["species_url","genus_url","family_name","volume"], keep="last").reset_index(drop=True)
        df_sp.to_csv(out_csv, index=False)
        pd.DataFrame(audit_rows).to_csv(out_audit_csv, index=False)

        # GENUS CSV ‡πÅ‡∏¢‡∏Å
        genus_csv = out_csv.replace("species", "genus")
        df_gen = pd.DataFrame(genus_rows).drop_duplicates(subset=["genus_url","family_name","volume"], keep="last").reset_index(drop=True)
        df_gen.to_csv(genus_csv, index=False)

        print(f"\n‚úÖ ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô (resume): species rows (new only) = {len(df_sp)} ‚Üí {out_csv}")
        print(f"üßæ Audit: {out_audit_csv}")
        print(f"üìò Genus table: {len(df_gen)} rows ‚Üí {genus_csv}")

    finally:
        driver.quit()

In [9]:
# 1) ‡πÇ‡∏´‡∏•‡∏î species_url ‡πÄ‡∏î‡∏¥‡∏°‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î
done = load_done_species(RESUME_FROM)

# 2) ‡∏£‡∏±‡∏ô‡πÅ‡∏ö‡∏ö resume (‡∏à‡∏∞‡∏Ç‡πâ‡∏≤‡∏° species_url ‡∏ó‡∏µ‡πà‡πÄ‡∏à‡∏≠‡πÉ‡∏ô‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏¥‡∏°)
crawl_notebook_resume(
    volumes=VOLUMES,
    out_csv=OUT_CSV,
    out_audit_csv=OUT_AUDIT,
    headless=HEADLESS,
    resume_species_urls=done
)

# 3) ‡∏£‡∏ß‡∏°‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏Å‡πà‡∏≤‡∏Å‡∏±‡∏ö‡πÉ‡∏´‡∏°‡πà ‚Üí merged ‡πÄ‡∏î‡∏µ‡∏¢‡∏ß (‡∏Å‡∏±‡∏ô‡∏ã‡πâ‡∏≥)
parts = [pd.read_csv(p, dtype=str) for p in RESUME_FROM if os.path.exists(p)]
if os.path.exists(OUT_CSV):
    parts.append(pd.read_csv(OUT_CSV, dtype=str))

if parts:
    merged = pd.concat(parts, ignore_index=True)
    merged = merged.drop_duplicates(subset=["species_url","genus_url","family_name","volume"], keep="last")
    MERGED_CSV = "bkf_eflora_species_vol2_16_merged_final.csv"
    merged.to_csv(MERGED_CSV, index=False)
    print(f"üì¶ ‡∏£‡∏ß‡∏°‡πÑ‡∏ü‡∏•‡πå‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à ‚Üí {MERGED_CSV} (rows={len(merged)})")
else:
    print("‚ö†Ô∏è ‡πÑ‡∏°‡πà‡∏°‡∏µ‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏´‡πâ‡∏£‡∏ß‡∏°")

‚Ä¢ ‡πÇ‡∏´‡∏•‡∏î 8186 species_url ‡∏à‡∏≤‡∏Å bkf_eflora_species_vol2_test.csv
‚úÖ ‡∏£‡∏ß‡∏° species_url ‡∏ó‡∏µ‡πà‡∏ó‡∏≥‡πÑ‡∏õ‡πÅ‡∏•‡πâ‡∏ß‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î: 8186
‚ÑπÔ∏è  ‡∏•‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏≠‡∏≤‡∏ï‡πå‡∏û‡∏∏‡∏ï‡πÄ‡∏î‡∏¥‡∏°: bkf_eflora_species_vol2_16_resume.csv
‚ÑπÔ∏è  ‡∏•‡∏ö‡πÑ‡∏ü‡∏•‡πå audit ‡πÄ‡∏î‡∏¥‡∏°: bkf_genus_audit_vol2_16_resume.csv

=== Volume 2 ===
  ‚Üí ‡∏û‡∏ö 46 families (‡∏´‡∏•‡∏±‡∏á‡∏Å‡∏£‡∏≠‡∏á‡∏ã‡πâ‡∏≥)
    [1/46] Haloragaceae: 2 genera (max_expected=2)
       ‚Ä¢ Myriophyllum
       ‚Ä¢ Haloragis
    [2/46] Rhizophoraceae: 7 genera (max_expected=7)
       ‚Ä¢ Rhizophora
       ‚Ä¢ Bruguiera
       ‚Ä¢ Ceriops
       ‚Ä¢ Kandelia
       ‚Ä¢ Pellacalyx
       ‚Ä¢ Carallia
       ‚Ä¢ Gynotroches
    [3/46] Oxalidaceae: 3 genera (max_expected=3)
       ‚Ä¢ Averrhoa
       ‚Ä¢ Oxalis
       ‚Ä¢ Biophytum
    [4/46] Ochnaceae: 4 genera (max_expected=4)
       ‚Ä¢ Euthemis
       ‚Ä¢ Ochna
       ‚Ä¢ Brackenridgea
       ‚Ä¢ Gomphia
    [5/46] Rosaceae: 21 genera (max_exp

In [1]:
import pandas as pd 
df_name = pd.read_csv('bkf_eflora_species_vol2_16_merged_final.csv', dtype=str)
#df_name.head()

#### Create a new column for species name check with accept name already

In [2]:
# Create new column 'specific_name'
df_name["specific_name"] = df_name["species_scientific_name"]

# Replace if accepted_name is not "This is currently accepted."
mask = df_name["accepted_name"].notna() & (df_name["accepted_name"] != "This is currently accepted.")
df_name.loc[mask, "specific_name"] = df_name.loc[mask, "accepted_name"]
df_name.head()
# Save result
#df_name.to_csv("bkf_eflora_species_vol2_16_with_specific_name.csv", index=False)

#print("‚úÖ Done. New column 'specific_name' created and saved!")

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,family_url,genus_url,species_url,scraped_at,specific_name
0,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum siamense (Craib) Tardieu,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Songkhla (type).","Known from 3 localities: the type-locality, an...","In small mats on damp sandy ground, edge of ma...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:33.566587+00:00,Myriophyllum siamense (Craib) Tardieu
1,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum tetrandrum Roxb.,This is currently accepted.,SOUTH-WESTERN: Prachuap Khiri Khan (Bang Sapha...,"E India (type), Indochina, Malay Peninsula.","In rather shallow, open water of ditches, cana...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:36.633398+00:00,Myriophyllum tetrandrum Roxb.
2,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum siamense (Craib) Tardieu,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Songkhla (type).","Known from 3 localities: the type-locality, an...","In small mats on damp sandy ground, edge of ma...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-19T18:17:39.543048+00:00,Myriophyllum siamense (Craib) Tardieu
3,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum tetrandrum Roxb.,This is currently accepted.,SOUTH-WESTERN: Prachuap Khiri Khan (Bang Sapha...,"E India (type), Indochina, Malay Peninsula.","In rather shallow, open water of ditches, cana...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-19T18:17:42.567178+00:00,Myriophyllum tetrandrum Roxb.
4,2,Haloragaceae,2 Haloragis,Haloragis,2.0,Haloragis micrantha (Thunb.) R.Br. ex Sieb. & ...,Gonocarpus micranthus,NORTH-EASTERN: Loei (Phu Kradueng).,"India, S & E China, N Vietnam, Hainan, Formosa...","In marshy mountain turf, moist places along mo...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:50.049475+00:00,Gonocarpus micranthus


In [3]:
df_species_url = df_name[['specific_name', 'species_url']]
#df_species_url.to_csv('species_url.csv', index=False)

In [16]:
# -*- coding: utf-8 -*-
# ‡∏£‡∏±‡∏ô‡∏Å‡∏±‡∏ö‡∏ó‡∏∏‡∏Å‡∏•‡∏¥‡∏á‡∏Å‡πå‡πÉ‡∏ô species_url.csv -> ‡∏î‡∏∂‡∏á wfo-xxxxxxxx -> ‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå wfo_id
# ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á: selenium, webdriver-manager, bs4, pandas, lxml
# pip install selenium webdriver-manager pandas beautifulsoup4 lxml

import os
import re
import time
import random
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ====== CONFIG ======
INPUT_CSV   = "species_url.csv"
OUTPUT_CSV  = "species_url_wfo.csv"
WAIT_SEC    = 12                 # explicit wait ‡∏ï‡πà‡∏≠‡πÄ‡∏û‡∏à
PAUSE_RANGE = (0.6, 1.5)         # ‡∏û‡∏±‡∏Å‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á‡πÄ‡∏û‡∏à ‡∏Å‡∏±‡∏ô‡πÇ‡∏î‡∏ô‡∏ö‡∏•‡πá‡∏≠‡∏Å
RETRY_PER_URL = 2                # retry ‡∏ï‡πà‡∏≠‡πÄ‡∏û‡∏à
SAVE_EVERY  = 50                 # ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ó‡∏∏‡∏Å ‡πÜ N ‡πÅ‡∏ñ‡∏ß
WFO_REGEX   = re.compile(r"wfo-\d{7,}")

# ====== utils ======
def detect_url_column(df: pd.DataFrame) -> str:
    for col in df.columns:
        if df[col].astype(str).str.contains(r"^https?://", na=False).any():
            return col
    for c in ["url", "link", "href", "source_url", "page_url"]:
        if c in df.columns:
            return c
    raise ValueError("‡∏´‡∏≤ '‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå URL' ‡πÑ‡∏°‡πà‡πÄ‡∏à‡∏≠‡πÉ‡∏ô‡πÑ‡∏ü‡∏•‡πå CSV")

def build_driver() -> webdriver.Chrome:
    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1366,900")
    opts.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
    )
    opts.add_argument("accept-language=th-TH,th;q=0.9,en-US;q=0.8,en;q=0.7")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    driver.set_page_load_timeout(25)
    return driver

def extract_wfo_from_html(html: str) -> str:
    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")

    # 1) ‡πÇ‡∏ã‡∏ô‡∏ó‡∏µ‡πà‡∏Ñ‡∏≤‡∏î‡∏´‡∏ß‡∏±‡∏á: p.intro
    for p in soup.select("p.intro"):
        for a in p.find_all("a"):
            href = a.get("href", "") or ""
            text = a.get_text(strip=True) or ""
            for candidate in (href, text):
                m = WFO_REGEX.search(candidate)
                if m:
                    return m.group(0)
        m = WFO_REGEX.search(p.get_text(" ", strip=True))
        if m:
            return m.group(0)

    # 2) ‡∏•‡∏¥‡∏á‡∏Å‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏ô‡πâ‡∏≤
    for a in soup.select('a[href*="worldfloraonline.org"]'):
        href = a.get("href", "") or ""
        text = a.get_text(strip=True) or ""
        for candidate in (href, text):
            m = WFO_REGEX.search(candidate)
            if m:
                return m.group(0)

    # 3) ‡∏™‡∏≥‡∏£‡∏≠‡∏á: ‡∏™‡πÅ‡∏Å‡∏ô‡∏ó‡∏±‡πâ‡∏á‡πÄ‡∏û‡∏à
    m = WFO_REGEX.search(soup.get_text(" ", strip=True))
    return m.group(0) if m else ""

def get_wfo_from_url(driver: webdriver.Chrome, url: str) -> str:
    # ‡πÅ‡∏ï‡πà‡∏•‡∏∞ URL ‡∏û‡∏¢‡∏≤‡∏¢‡∏≤‡∏°‡∏´‡∏•‡∏≤‡∏¢‡∏Ñ‡∏£‡∏±‡πâ‡∏á
    for attempt in range(1, RETRY_PER_URL + 1):
        try:
            driver.get(url)
            try:
                WebDriverWait(driver, WAIT_SEC).until(
                    EC.any_of(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "p.intro")),
                        EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href*="worldfloraonline.org"]'))
                    )
                )
            except Exception:
                # ‡∏ñ‡πâ‡∏≤ wait ‡πÑ‡∏°‡πà‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à‡∏¢‡∏±‡∏á‡∏•‡∏≠‡∏á‡∏à‡∏≤‡∏Å page_source ‡∏ï‡πà‡∏≠‡πÑ‡∏î‡πâ
                pass

            html = driver.page_source or ""
            # ‡∏ñ‡πâ‡∏≤‡πÄ‡∏´‡πá‡∏ô template token ‡πÅ‡∏™‡∏î‡∏á‡∏ß‡πà‡∏≤‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡πÄ‡∏£‡∏ô‡πÄ‡∏î‡∏≠‡∏£‡πå‡∏î‡∏µ ‡∏´‡∏ô‡πà‡∏ß‡∏á‡πÅ‡∏•‡πâ‡∏ß‡∏≠‡πà‡∏≤‡∏ô‡πÉ‡∏´‡∏°‡πà
            if "{{" in html and "}}" in html:
                time.sleep(1.2)
                html = driver.page_source or ""

            wfo_id = extract_wfo_from_html(html)
            if wfo_id:
                return wfo_id
        except Exception:
            # ‡∏´‡∏≤‡∏Å‡πÇ‡∏´‡∏•‡∏î‡πÄ‡∏û‡∏à‡∏û‡∏±‡∏á ‡∏•‡∏≠‡∏á‡πÉ‡∏´‡∏°‡πà‡∏£‡∏≠‡∏ö‡∏ñ‡∏±‡∏î‡πÑ‡∏õ
            pass

        if attempt < RETRY_PER_URL:
            time.sleep(1.0 * attempt)

    return ""  # ‡πÑ‡∏°‡πà‡∏û‡∏ö

# ====== main ======
def main():
    df_in = pd.read_csv(INPUT_CSV)
    url_col = detect_url_column(df_in)

    # ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏° DataFrame ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå + resume ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏Å‡πà‡∏≤
    if os.path.exists(OUTPUT_CSV):
        df_out = pd.read_csv(OUTPUT_CSV)
        # ‡∏£‡∏ß‡∏°‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ó‡∏µ‡πà‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô‡∏à‡∏≤‡∏Å input ‡πÄ‡∏ú‡∏∑‡πà‡∏≠‡πÑ‡∏ü‡∏•‡πå input ‡∏≠‡∏±‡∏õ‡πÄ‡∏î‡∏ï
        missing_cols = [c for c in df_in.columns if c not in df_out.columns]
        for c in missing_cols:
            df_out[c] = df_in[c]
        # ‡πÄ‡∏£‡∏µ‡∏¢‡∏á‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå: ‡∏Ç‡∏≠‡∏á‡πÄ‡∏î‡∏¥‡∏° + wfo_id ‡∏ï‡πà‡∏≠‡∏ó‡πâ‡∏≤‡∏¢
        if "wfo_id" not in df_out.columns:
            df_out["wfo_id"] = ""
    else:
        df_out = df_in.copy()
        if "wfo_id" not in df_out.columns:
            df_out["wfo_id"] = ""

    driver = build_driver()

    try:
        total = len(df_out)
        for idx in range(total):
            url = str(df_out.at[idx, url_col]) if pd.notna(df_out.at[idx, url_col]) else ""

            # ‡∏Ç‡πâ‡∏≤‡∏°‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà URL
            if not re.match(r"^https?://", url):
                continue

            # ‡∏Ç‡πâ‡∏≤‡∏°‡∏ñ‡πâ‡∏≤‡∏°‡∏µ‡∏Ñ‡πà‡∏≤ wfo_id ‡πÅ‡∏•‡πâ‡∏ß (resume)
            current = str(df_out.at[idx, "wfo_id"]) if pd.notna(df_out.at[idx, "wfo_id"]) else ""
            if current.strip():
                continue

            print(f"[{idx+1}/{total}] ‡∏î‡∏∂‡∏á: {url}")
            wfo_id = get_wfo_from_url(driver, url)
            print("   ‚Üí", wfo_id if wfo_id else "(‡πÑ‡∏°‡πà‡∏û‡∏ö)")
            df_out.at[idx, "wfo_id"] = wfo_id

            # ‡πÄ‡∏ã‡∏ü‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏∞‡∏¢‡∏∞
            if (idx + 1) % SAVE_EVERY == 0:
                df_out.to_csv(OUTPUT_CSV, index=False)
                print(f"   ‚úì ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ä‡∏±‡πà‡∏ß‡∏Ñ‡∏£‡∏≤‡∏ß: {OUTPUT_CSV}")

            time.sleep(random.uniform(*PAUSE_RANGE))

    finally:
        driver.quit()

    # ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢
    df_out.to_csv(OUTPUT_CSV, index=False)
    print(f"\n‚úÖ ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏¥‡πâ‡∏ô ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ú‡∏•‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡πÑ‡∏ß‡πâ‡∏ó‡∏µ‡πà: {OUTPUT_CSV}")

if __name__ == "__main__":
    main()

[1/13719] ‡∏î‡∏∂‡∏á: https://botany.dnp.go.th/eflora/floraspecies.html?tdcode=00004
   ‚Üí (‡πÑ‡∏°‡πà‡∏û‡∏ö)
[2/13719] ‡∏î‡∏∂‡∏á: https://botany.dnp.go.th/eflora/floraspecies.html?tdcode=00003
   ‚Üí wfo-0001261381
[3/13719] ‡∏î‡∏∂‡∏á: https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=00004
   ‚Üí (‡πÑ‡∏°‡πà‡∏û‡∏ö)
[4/13719] ‡∏î‡∏∂‡∏á: https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=00003
   ‚Üí wfo-0001261381
[5/13719] ‡∏î‡∏∂‡∏á: https://botany.dnp.go.th/eflora/floraspecies.html?tdcode=00006
   ‚Üí wfo-0000715064
[6/13719] ‡∏î‡∏∂‡∏á: https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=00006
   ‚Üí wfo-0000715064
[7/13719] ‡∏î‡∏∂‡∏á: https://botany.dnp.go.th/eflora/floraspecies.html?tdcode=00009
   ‚Üí wfo-0001131596
[8/13719] ‡∏î‡∏∂‡∏á: https://botany.dnp.go.th/eflora/floraspecies.html?tdcode=00010
   ‚Üí (‡πÑ‡∏°‡πà‡∏û‡∏ö)
[9/13719] ‡∏î‡∏∂‡∏á: https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=00009
   ‚Üí wfo-0001131596
[10/13719] ‡∏î‡∏∂‡∏á: https://botany.

KeyboardInterrupt: 

In [17]:
# -*- coding: utf-8 -*-
# CELL: Resume ‡∏ï‡πà‡∏≠‡∏à‡∏≤‡∏Å‡πÑ‡∏ü‡∏•‡πå species_url_wfo.csv ‡πÇ‡∏î‡∏¢‡∏≠‡∏±‡∏õ‡πÄ‡∏î‡∏ï‡πÄ‡∏â‡∏û‡∏≤‡∏∞ wfo_id ‡∏ó‡∏µ‡πà‡∏¢‡∏±‡∏á‡∏ß‡πà‡∏≤‡∏á
# ‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á: selenium, webdriver-manager, bs4, pandas, lxml

import os
import re
import time
import random
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ====== CONFIG (‡∏õ‡∏£‡∏±‡∏ö‡πÑ‡∏î‡πâ) ======
INPUT_BASE   = "species_url.csv"         # ‡πÉ‡∏ä‡πâ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÄ‡∏î‡∏≤‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå URL ‡πÄ‡∏ú‡∏∑‡πà‡∏≠‡πÑ‡∏ü‡∏•‡πå‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏´‡∏≤‡∏¢‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå
RESUME_FILE  = "species_url_wfo.csv"     # ‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏¥‡∏°‡∏ó‡∏µ‡πà‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ä‡∏±‡πà‡∏ß‡∏Ñ‡∏£‡∏≤‡∏ß‡πÑ‡∏ß‡πâ
START_AT     = 10000                     # ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ó‡∏≥‡∏ï‡πà‡∏≠‡∏à‡∏≤‡∏Å‡πÅ‡∏ñ‡∏ß‡∏ô‡∏µ‡πâ‡∏Ç‡∏∂‡πâ‡∏ô‡πÑ‡∏õ (0-based index)
WAIT_SEC     = 12
PAUSE_RANGE  = (0.6, 1.5)
RETRY_PER_URL= 2
SAVE_EVERY   = 50                        # ‡πÄ‡∏ã‡∏ü‡∏ó‡∏∏‡∏Å N ‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•

WFO_REGEX = re.compile(r"wfo-\d{7,}")

def detect_url_column(df: pd.DataFrame, df_fallback: pd.DataFrame=None) -> str:
    # ‡∏•‡∏≠‡∏á‡πÄ‡∏î‡∏≤‡πÉ‡∏ô df ‡∏Å‡πà‡∏≠‡∏ô
    for col in df.columns:
        if df[col].astype(str).str.contains(r"^https?://", na=False).any():
            return col
    # ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡πÄ‡∏à‡∏≠ ‡∏•‡∏≠‡∏á‡πÉ‡∏ä‡πâ df_fallback (‡πÄ‡∏ä‡πà‡∏ô species_url.csv)
    if df_fallback is not None:
        for col in df_fallback.columns:
            if df_fallback[col].astype(str).str.contains(r"^https?://", na=False).any():
                return col
    # ‡∏ä‡∏∑‡πà‡∏≠‡∏¢‡∏≠‡∏î‡∏ô‡∏¥‡∏¢‡∏°
    for c in ["url", "link", "href", "source_url", "page_url"]:
        if c in df.columns:
            return c
        if df_fallback is not None and c in df_fallback.columns:
            return c
    raise ValueError("‡∏´‡∏≤ '‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå URL' ‡πÑ‡∏°‡πà‡πÄ‡∏à‡∏≠‡πÉ‡∏ô‡πÑ‡∏ü‡∏•‡πå‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏ü‡∏•‡πå‡∏ï‡πâ‡∏ô‡∏â‡∏ö‡∏±‡∏ö")

def build_driver() -> webdriver.Chrome:
    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1366,900")
    opts.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
    )
    opts.add_argument("accept-language=th-TH,th;q=0.9,en-US;q=0.8,en;q=0.7")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    driver.set_page_load_timeout(25)
    return driver

def extract_wfo_from_html(html: str) -> str:
    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")

    # 1) p.intro > a / text
    for p in soup.select("p.intro"):
        for a in p.find_all("a"):
            href = a.get("href", "") or ""
            text = a.get_text(strip=True) or ""
            for candidate in (href, text):
                m = WFO_REGEX.search(candidate)
                if m:
                    return m.group(0)
        m = WFO_REGEX.search(p.get_text(" ", strip=True))
        if m:
            return m.group(0)

    # 2) ‡∏•‡∏¥‡∏á‡∏Å‡πå‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏ô‡πâ‡∏≤‡πÑ‡∏õ worldfloraonline
    for a in soup.select('a[href*="worldfloraonline.org"]'):
        href = a.get("href", "") or ""
        text = a.get_text(strip=True) or ""
        for candidate in (href, text):
            m = WFO_REGEX.search(candidate)
            if m:
                return m.group(0)

    # 3) ‡∏™‡πÅ‡∏Å‡∏ô‡∏ó‡∏±‡πâ‡∏á‡πÄ‡∏û‡∏à
    m = WFO_REGEX.search(soup.get_text(" ", strip=True))
    return m.group(0) if m else ""

def get_wfo_from_url(driver: webdriver.Chrome, url: str) -> str:
    for attempt in range(1, RETRY_PER_URL + 1):
        try:
            driver.get(url)
            try:
                WebDriverWait(driver, WAIT_SEC).until(
                    EC.any_of(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "p.intro")),
                        EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href*="worldfloraonline.org"]'))
                    )
                )
            except Exception:
                pass

            html = driver.page_source or ""
            if "{{" in html and "}}" in html:
                time.sleep(1.2)
                html = driver.page_source or ""

            wfo_id = extract_wfo_from_html(html)
            if wfo_id:
                return wfo_id
        except Exception:
            pass

        if attempt < RETRY_PER_URL:
            time.sleep(1.0 * attempt)
    return ""

# ====== MAIN (Resume) ======
# 1) ‡πÇ‡∏´‡∏•‡∏î‡πÑ‡∏ü‡∏•‡πå‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡πÄ‡∏î‡∏¥‡∏° + ‡πÑ‡∏ü‡∏•‡πå‡∏ï‡πâ‡∏ô‡∏â‡∏ö‡∏±‡∏ö‡πÑ‡∏ß‡πâ‡∏ä‡πà‡∏ß‡∏¢‡∏´‡∏≤ url_col
if not os.path.exists(RESUME_FILE):
    raise FileNotFoundError(f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡πÄ‡∏î‡∏¥‡∏°: {RESUME_FILE}")

df_out = pd.read_csv(RESUME_FILE)
df_in  = pd.read_csv(INPUT_BASE) if os.path.exists(INPUT_BASE) else None

url_col = detect_url_column(df_out, df_fallback=df_in)

# 2) ‡∏´‡∏≤ index ‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•: ‡∏ï‡∏±‡πâ‡∏á‡πÅ‡∏ï‡πà START_AT ‡∏Ç‡∏∂‡πâ‡∏ô‡πÑ‡∏õ ‡πÅ‡∏•‡∏∞ wfo_id ‡∏¢‡∏±‡∏á‡∏ß‡πà‡∏≤‡∏á
def is_empty(x):
    return (pd.isna(x)) or (str(x).strip() == "")

indices = [i for i in range(len(df_out))
           if i >= START_AT and is_empty(df_out.at[i, "wfo_id"])]

print(f"‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡∏ï‡πà‡∏≠ {len(indices)} ‡πÅ‡∏ñ‡∏ß (‡∏ï‡∏±‡πâ‡∏á‡πÅ‡∏ï‡πà index >= {START_AT})")

if not indices:
    print("‡πÑ‡∏°‡πà‡∏°‡∏µ‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏ó‡∏≥‡∏ï‡πà‡∏≠‡πÅ‡∏•‡πâ‡∏ß‡∏à‡πâ‡∏≤ üéâ")
else:
    driver = build_driver()
    try:
        for k, idx in enumerate(indices, start=1):
            url = str(df_out.at[idx, url_col]) if pd.notna(df_out.at[idx, url_col]) else ""
            if not re.match(r"^https?://", url):
                continue

            print(f"[{k}/{len(indices)}] idx={idx} ‚Üí {url}")
            wfo_id = get_wfo_from_url(driver, url)
            print("   ‚Üí", wfo_id if wfo_id else "(‡πÑ‡∏°‡πà‡∏û‡∏ö)")
            if wfo_id:  # ‡∏≠‡∏±‡∏õ‡πÄ‡∏î‡∏ï‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏û‡∏ö (‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏õ‡∏•‡πà‡∏≠‡∏¢‡∏ß‡πà‡∏≤‡∏á‡πÑ‡∏ß‡πâ)
                df_out.at[idx, "wfo_id"] = wfo_id

            # ‡πÄ‡∏ã‡∏ü‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏∞‡∏¢‡∏∞‡πÇ‡∏î‡∏¢‡πÑ‡∏°‡πà‡∏ó‡∏±‡∏ö‡∏Ñ‡πà‡∏≤‡πÄ‡∏î‡∏¥‡∏°‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏≠‡∏¢‡∏π‡πà‡πÅ‡∏•‡πâ‡∏ß (‡πÄ‡∏£‡∏≤‡πÄ‡∏û‡∏¥‡πà‡∏á‡πÄ‡∏ï‡∏¥‡∏°‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏Ñ‡πà‡∏≤‡∏ß‡πà‡∏≤‡∏á)
            if (k % SAVE_EVERY) == 0:
                df_out.to_csv(RESUME_FILE, index=False)
                print(f"   ‚úì ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ä‡∏±‡πà‡∏ß‡∏Ñ‡∏£‡∏≤‡∏ß: {RESUME_FILE}")

            time.sleep(random.uniform(*PAUSE_RANGE))
    finally:
        driver.quit()

    # ‡πÄ‡∏ã‡∏ü‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢
    df_out.to_csv(RESUME_FILE, index=False)
    print(f"\n‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ú‡∏•‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢ (‡∏≠‡∏±‡∏õ‡πÄ‡∏î‡∏ï‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡πÅ‡∏ñ‡∏ß‡∏ß‡πà‡∏≤‡∏á) ‚Üí {RESUME_FILE}")

‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏°‡∏ß‡∏•‡∏ú‡∏•‡∏ï‡πà‡∏≠ 3667 ‡πÅ‡∏ñ‡∏ß (‡∏ï‡∏±‡πâ‡∏á‡πÅ‡∏ï‡πà index >= 10000)
[1/3667] idx=10052 ‚Üí https://botany.dnp.go.th/eflora/floraspecies.html?tdcode=06001
   ‚Üí wfo-0001062631
[2/3667] idx=10053 ‚Üí https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=05997
   ‚Üí wfo-0001253378
[3/3667] idx=10054 ‚Üí https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=05998
   ‚Üí wfo-0000926427
[4/3667] idx=10055 ‚Üí https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=05999
   ‚Üí wfo-0001327571
[5/3667] idx=10056 ‚Üí https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=06000
   ‚Üí wfo-0001327576
[6/3667] idx=10057 ‚Üí https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=06001
   ‚Üí wfo-0001062631
[7/3667] idx=10058 ‚Üí https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=06002
   ‚Üí wfo-0001327579
[8/3667] idx=10059 ‚Üí https://botany.dnp.go.th/eflora/floraSpecies.html?tdcode=06003
   ‚Üí wfo-0001327581
[9/3667] idx=10060 ‚Üí https://botany.dnp.

Crawling all WFO-id from bkf already

# Data Cleaning

In [31]:
#df_name.to_csv("bkf_eflora_species_vol2_16_with_specific_name.csv", index=False)
df_bkf = pd.read_csv('bkf_eflora_species_new.csv', dtype=str)
display(df_bkf.head(3))
print(df_bkf.shape)

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,family_url,genus_url,species_url,scraped_at,specific_name
0,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum siamense (Craib) Tardieu,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Songkhla (type).","Known from 3 localities: the type-locality, an...","In small mats on damp sandy ground, edge of ma...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:33.566587+00:00,Myriophyllum siamense (Craib) Tardieu
1,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum tetrandrum Roxb.,This is currently accepted.,SOUTH-WESTERN: Prachuap Khiri Khan (Bang Sapha...,"E India (type), Indochina, Malay Peninsula.","In rather shallow, open water of ditches, cana...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:36.633398+00:00,Myriophyllum tetrandrum Roxb.
2,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum siamense (Craib) Tardieu,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Songkhla (type).","Known from 3 localities: the type-locality, an...","In small mats on damp sandy ground, edge of ma...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-19T18:17:39.543048+00:00,Myriophyllum siamense (Craib) Tardieu


(13719, 15)


In [32]:
df_wfo = pd.read_csv('species_url_wfo.csv')
display(df_wfo.head(3))
print(df_wfo.shape)

Unnamed: 0,specific_name,species_url,wfo_id
0,Myriophyllum siamense (Craib) Tardieu,https://botany.dnp.go.th/eflora/floraspecies.h...,
1,Myriophyllum tetrandrum Roxb.,https://botany.dnp.go.th/eflora/floraspecies.h...,wfo-0001261381
2,Myriophyllum siamense (Craib) Tardieu,https://botany.dnp.go.th/eflora/floraSpecies.h...,


(13719, 3)


#### Mapping WFO ID into df

In [28]:
import pandas as pd
from pathlib import Path

# ========= CONFIG =========
BKF_PATH = "bkf_eflora_species_new.csv"
WFO_PATH = "species_url_wfo.csv"
OUT_DIR  = "."

KEY_COLS = ["species_url", "specific_name"]   # ‡∏Ñ‡∏µ‡∏¢‡πå‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏°
ADD_COLS = ["wfo_id"]                         # ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏à‡∏≤‡∏Å‡∏ù‡∏±‡πà‡∏á WFO ‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡∏î‡∏∂‡∏á‡∏°‡∏≤

# ========= UTILS =========
def normalize_text(x: object) -> str:
    """‡πÅ‡∏õ‡∏•‡∏á‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏ï‡∏£‡∏¥‡∏á, ‡∏ï‡∏±‡∏î‡∏ä‡πà‡∏≠‡∏á‡∏ß‡πà‡∏≤‡∏á, ‡πÅ‡∏Å‡πâ non-breaking space, ‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡∏±‡∏ß‡∏û‡∏¥‡∏°‡∏û‡πå‡πÄ‡∏•‡πá‡∏Å"""
    if pd.isna(x):
        return ""
    s = str(x).replace("\u00A0", " ")  # non-breaking space -> space
    return s.strip().lower()

def normalize_df_keys(df: pd.DataFrame, key_cols):
    df = df.copy()
    for c in key_cols:
        if c not in df.columns:
            raise KeyError(f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå '{c}' ‡πÉ‡∏ô DataFrame")
        df[c + "_clean"] = df[c].apply(normalize_text)
    return df

# ========= LOAD =========
df_bkf = pd.read_csv(BKF_PATH)
df_wfo = pd.read_csv(WFO_PATH)

# ========= NORMALIZE KEYS =========
df_bkf = normalize_df_keys(df_bkf, KEY_COLS)
df_wfo = normalize_df_keys(df_wfo, KEY_COLS)

KEY_CLEAN = [c + "_clean" for c in KEY_COLS]

# ========= DIAGNOSTICS: DUP KEYS =========
def key_dups(df, key_cols, count_name):
    return (df
            .groupby(key_cols, dropna=False).size()
            .reset_index(name=count_name)
            .query(f"{count_name} > 1")
            .sort_values(count_name, ascending=False))

dup_bkf = key_dups(df_bkf, KEY_CLEAN, "count_bkf")
dup_wfo = key_dups(df_wfo, KEY_CLEAN, "count_wfo")

print(f"BKF rows               : {len(df_bkf)}")
print(f"WFO rows               : {len(df_wfo)}")
print(f"BKF duplicate keys     : {len(dup_bkf)} ‡∏ä‡∏∏‡∏î")
print(f"WFO duplicate keys     : {len(dup_wfo)} ‡∏ä‡∏∏‡∏î")

# ========= RESOLVE WFO MULTI-MAP =========
# ‡∏ñ‡πâ‡∏≤‡∏Ñ‡∏µ‡∏¢‡πå‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏°‡∏µ‡∏´‡∏•‡∏≤‡∏¢ wfo_id ‡πÉ‡∏´‡πâ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏ï‡∏±‡∏ß‡πÅ‡∏£‡∏Å‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà NaN (‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç‡πÑ‡∏î‡πâ‡∏ï‡∏≤‡∏°‡∏Å‡∏ï‡∏¥‡∏Å‡∏≤‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏∏‡∏ì ‡πÄ‡∏ä‡πà‡∏ô mode/most frequent)
def pick_first_non_null(s: pd.Series):
    s = s.dropna()
    return s.iloc[0] if len(s) else None

cols_to_keep = KEY_CLEAN + [c for c in ADD_COLS if c in df_wfo.columns]
df_wfo_grouped = (df_wfo
                  .sort_values(by=KEY_CLEAN)  # deterministic
                  .groupby(KEY_CLEAN, as_index=False)
                  .agg(**{col: (col, pick_first_non_null) for col in ADD_COLS}))

# ‡∏ï‡∏£‡∏ß‡∏à‡∏ß‡πà‡∏≤‡∏Ñ‡∏µ‡∏¢‡πå‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏Å‡∏±‡∏ô‡πÉ‡∏ô WFO ‡πÄ‡∏Ñ‡∏¢‡∏°‡∏µ‡∏´‡∏•‡∏≤‡∏¢‡∏Ñ‡πà‡∏≤ wfo_id ‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏°‡πà (‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô)
wfo_conflict = (df_wfo.groupby(KEY_CLEAN)["wfo_id"]
                .nunique().reset_index(name="n_wfo").query("n_wfo > 1"))

if len(wfo_conflict):
    print(f"‚ö†Ô∏è ‡∏û‡∏ö‡∏Ñ‡∏µ‡∏¢‡πå‡∏ó‡∏µ‡πà‡∏ú‡∏π‡∏Å‡∏´‡∏•‡∏≤‡∏¢ wfo_id ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î {len(wfo_conflict)} ‡∏ä‡∏∏‡∏î (‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏ï‡∏±‡∏ß‡πÅ‡∏£‡∏Å‡∏°‡∏≤‡πÉ‡∏´‡πâ)")
    # ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÑ‡∏ß‡πâ‡∏ï‡∏£‡∏ß‡∏à
    Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
    wfo_conflict.to_csv(Path(OUT_DIR, "wfo_key_conflicts.csv"), index=False)
    print("   ‚Üí ‡πÄ‡∏ã‡∏ü‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏Ñ‡∏µ‡∏¢‡πå‡∏Ç‡∏±‡∏î‡πÅ‡∏¢‡πâ‡∏á‡∏ó‡∏µ‡πà 'wfo_key_conflicts.csv'")

# ========= RESOLVE BKF MULTI-ROWS PER KEY =========
# ‡∏ñ‡πâ‡∏≤‡∏ù‡∏±‡πà‡∏á BKF ‡∏°‡∏µ‡∏´‡∏•‡∏≤‡∏¢‡πÅ‡∏ñ‡∏ß‡∏ï‡πà‡∏≠‡∏Ñ‡∏µ‡∏¢‡πå ‡πÉ‡∏´‡πâ ‚Äú‡πÄ‡∏Å‡πá‡∏ö‡πÅ‡∏ñ‡∏ß‡πÅ‡∏£‡∏Å‡πÑ‡∏ß‡πâ‚Äù ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÉ‡∏´‡πâ one-to-one ‡∏à‡∏£‡∏¥‡∏á‡∏Å‡πà‡∏≠‡∏ô merge
df_bkf_one = (df_bkf
              .sort_values(by=KEY_CLEAN)  # deterministic
              .drop_duplicates(subset=KEY_CLEAN, keep="first"))

print(f"BKF unique by key      : {len(df_bkf_one)} ‡πÅ‡∏ñ‡∏ß (‡∏à‡∏≤‡∏Å {len(df_bkf)})")
print(f"WFO unique by key      : {len(df_wfo_grouped)} ‡πÅ‡∏ñ‡∏ß (‡∏à‡∏≤‡∏Å {len(df_wfo)})")

# ========= STRICT MERGE (ONE-TO-ONE) =========
# validate='one_to_one' ‡∏à‡∏∞ error ‡∏ñ‡πâ‡∏≤‡πÄ‡∏ú‡∏•‡∏≠‡∏ó‡∏≥ many-to-many
df_merged = pd.merge(
    df_bkf_one,
    df_wfo_grouped,
    how="left",
    on=KEY_CLEAN,
    validate="one_to_one"
)

matched = df_merged[ADD_COLS[0]].notna().sum()
print(f"‚úÖ ‡πÅ‡∏°‡∏õ‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à           : {matched} / {len(df_merged)} ‡πÅ‡∏ñ‡∏ß")

# ========= POST: CLEANUP / OUTPUT =========
# ‡πÄ‡∏≠‡∏≤‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå *_clean ‡∏≠‡∏≠‡∏Å ‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏ã‡∏ü‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå
drop_clean = [c for c in df_merged.columns if c.endswith("_clean")]
df_out = df_merged.drop(columns=drop_clean)

out_ok = Path(OUT_DIR, "bkf_eflora_species_with_wfo_one2one.csv")
df_out.to_csv(out_ok, index=False)
print(f"üìÑ ‡πÄ‡∏ã‡∏ü‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå          : {out_ok}")

# ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏ä‡πà‡∏ß‡∏¢‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö
if len(dup_bkf):
    dup_bkf.to_csv(Path(OUT_DIR, "bkf_duplicate_keys.csv"), index=False)
    print("üìù ‡πÄ‡∏ã‡∏ü‡∏Ñ‡∏µ‡∏¢‡πå‡∏ã‡πâ‡∏≥‡∏ù‡∏±‡πà‡∏á BKF  : bkf_duplicate_keys.csv")
if len(dup_wfo):
    dup_wfo.to_csv(Path(OUT_DIR, "wfo_duplicate_keys.csv"), index=False)
    print("üìù ‡πÄ‡∏ã‡∏ü‡∏Ñ‡∏µ‡∏¢‡πå‡∏ã‡πâ‡∏≥‡∏ù‡∏±‡πà‡∏á WFO  : wfo_duplicate_keys.csv")

# ========= (OPTIONAL) ‡∏ñ‡πâ‡∏≤‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö‡πÉ‡∏´‡πâ‡πÅ‡∏ñ‡∏ß BKF ‚Äú‡∏Ñ‡∏á‡∏Ñ‡∏£‡∏ö‡πÄ‡∏ó‡πà‡∏≤‡πÄ‡∏î‡∏¥‡∏°‚Äù =========
# ‡πÉ‡∏ä‡πâ‡∏Å‡∏≤‡∏£ map ‡∏Ñ‡πà‡∏≤ wfo_id ‡∏Å‡∏•‡∏±‡∏ö‡πÄ‡∏Ç‡πâ‡∏≤ BKF ‡∏ï‡πâ‡∏ô‡∏â‡∏ö‡∏±‡∏ö‡∏ï‡∏≤‡∏°‡∏Ñ‡∏µ‡∏¢‡πå ‡πÇ‡∏î‡∏¢‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡∏Ñ‡∏π‡∏ì‡πÅ‡∏ñ‡∏ß
# (‡πÄ‡∏´‡∏°‡∏≤‡∏∞‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏Ñ‡∏∏‡∏ì‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÅ‡∏ñ‡∏ß‡πÄ‡∏ó‡πà‡∏≤ BKF ‡πÄ‡∏î‡∏¥‡∏°‡πÄ‡∏™‡∏°‡∏≠)
map_dict = (df_wfo_grouped
            .set_index(KEY_CLEAN)["wfo_id"]
            .to_dict())

df_bkf_full = df_bkf.copy()
df_bkf_full["wfo_id"] = df_bkf_full.set_index(KEY_CLEAN).index.map(map_dict)
df_bkf_full = df_bkf_full.drop(columns=[c for c in df_bkf_full.columns if c.endswith("_clean")])

out_full = Path(OUT_DIR, "bkf_eflora_species_with_wfo_preserve_rows.csv")
df_bkf_full.to_csv(out_full, index=False)
print(f"üìÑ ‡πÄ‡∏ß‡∏≠‡∏£‡πå‡∏ä‡∏±‡∏ô‡∏Ñ‡∏á‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÅ‡∏ñ‡∏ß  : {out_full} (‡πÅ‡∏ñ‡∏ß‡πÄ‡∏ó‡πà‡∏≤ BKF ‡πÄ‡∏î‡∏¥‡∏°)")

BKF rows               : 13719
WFO rows               : 13719
BKF duplicate keys     : 6251 ‡∏ä‡∏∏‡∏î
WFO duplicate keys     : 6251 ‡∏ä‡∏∏‡∏î
BKF unique by key      : 7389 ‡πÅ‡∏ñ‡∏ß (‡∏à‡∏≤‡∏Å 13719)
WFO unique by key      : 7389 ‡πÅ‡∏ñ‡∏ß (‡∏à‡∏≤‡∏Å 13719)
‚úÖ ‡πÅ‡∏°‡∏õ‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à           : 6571 / 7389 ‡πÅ‡∏ñ‡∏ß
üìÑ ‡πÄ‡∏ã‡∏ü‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå          : bkf_eflora_species_with_wfo_one2one.csv
üìù ‡πÄ‡∏ã‡∏ü‡∏Ñ‡∏µ‡∏¢‡πå‡∏ã‡πâ‡∏≥‡∏ù‡∏±‡πà‡∏á BKF  : bkf_duplicate_keys.csv
üìù ‡πÄ‡∏ã‡∏ü‡∏Ñ‡∏µ‡∏¢‡πå‡∏ã‡πâ‡∏≥‡∏ù‡∏±‡πà‡∏á WFO  : wfo_duplicate_keys.csv
üìÑ ‡πÄ‡∏ß‡∏≠‡∏£‡πå‡∏ä‡∏±‡∏ô‡∏Ñ‡∏á‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÅ‡∏ñ‡∏ß  : bkf_eflora_species_with_wfo_preserve_rows.csv (‡πÅ‡∏ñ‡∏ß‡πÄ‡∏ó‡πà‡∏≤ BKF ‡πÄ‡∏î‡∏¥‡∏°)


In [33]:
df_need_clean = pd.read_csv('bkf_eflora_species_with_wfo_preserve_rows.csv', dtype=str)
df_need_clean.head()

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,family_url,genus_url,species_url,scraped_at,specific_name,wfo_id
0,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum siamense (Craib) Tardieu,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Songkhla (type).","Known from 3 localities: the type-locality, an...","In small mats on damp sandy ground, edge of ma...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:33.566587+00:00,Myriophyllum siamense (Craib) Tardieu,
1,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum tetrandrum Roxb.,This is currently accepted.,SOUTH-WESTERN: Prachuap Khiri Khan (Bang Sapha...,"E India (type), Indochina, Malay Peninsula.","In rather shallow, open water of ditches, cana...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:36.633398+00:00,Myriophyllum tetrandrum Roxb.,wfo-0001261381
2,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum siamense (Craib) Tardieu,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Songkhla (type).","Known from 3 localities: the type-locality, an...","In small mats on damp sandy ground, edge of ma...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-19T18:17:39.543048+00:00,Myriophyllum siamense (Craib) Tardieu,
3,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum tetrandrum Roxb.,This is currently accepted.,SOUTH-WESTERN: Prachuap Khiri Khan (Bang Sapha...,"E India (type), Indochina, Malay Peninsula.","In rather shallow, open water of ditches, cana...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-19T18:17:42.567178+00:00,Myriophyllum tetrandrum Roxb.,wfo-0001261381
4,2,Haloragaceae,2 Haloragis,Haloragis,2.0,Haloragis micrantha (Thunb.) R.Br. ex Sieb. & ...,Gonocarpus micranthus,NORTH-EASTERN: Loei (Phu Kradueng).,"India, S & E China, N Vietnam, Hainan, Formosa...","In marshy mountain turf, moist places along mo...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:50.049475+00:00,Gonocarpus micranthus,wfo-0000715064


In [34]:
import pandas as pd

# ====== CONFIG ======
INPUT_FILE = "bkf_eflora_species_with_wfo_preserve_rows.csv"
OUTPUT_FILE = "bkf_eflora_species_dedup.csv"

# ====== LOAD DATA ======
df = pd.read_csv(INPUT_FILE)

# ====== STEP 1: ‡∏£‡∏∞‡∏ö‡∏∏‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ‡∏ï‡∏£‡∏ß‡∏à‡∏ã‡πâ‡∏≥ (‡∏¢‡∏Å‡πÄ‡∏ß‡πâ‡∏ô scraped_at) ======
cols_to_check = [c for c in df.columns if c != "scraped_at"]

# ====== STEP 2: ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏° ======
def normalize_str(x):
    """‡πÅ‡∏õ‡∏•‡∏á‡∏Ñ‡πà‡∏≤‡πÉ‡∏´‡πâ‡πÄ‡∏õ‡πá‡∏ô string, ‡∏ï‡∏±‡∏î‡∏ä‡πà‡∏≠‡∏á‡∏ß‡πà‡∏≤‡∏á, ‡πÅ‡∏Å‡πâ non-breaking space, ‡πÅ‡∏•‡∏∞‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡∏±‡∏ß‡∏û‡∏¥‡∏°‡∏û‡πå‡πÄ‡∏•‡πá‡∏Å"""
    if pd.isna(x):
        return ""
    s = str(x).replace("\u00A0", " ")  # ‡πÅ‡∏õ‡∏•‡∏á space ‡∏û‡∏¥‡πÄ‡∏®‡∏©‡πÉ‡∏´‡πâ‡πÄ‡∏õ‡πá‡∏ô space ‡∏õ‡∏Å‡∏ï‡∏¥
    return s.strip().lower()

# ====== STEP 3: ‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏™‡∏≥‡πÄ‡∏ô‡∏≤ cleaned ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ï‡∏£‡∏ß‡∏à‡∏ã‡πâ‡∏≥ ======
df_clean = df.copy()
for col in cols_to_check:
    df_clean[col] = df_clean[col].apply(normalize_str)

# ====== STEP 4: ‡∏´‡∏≤‡∏ß‡πà‡∏≤‡πÅ‡∏ñ‡∏ß‡πÑ‡∏´‡∏ô‡∏ã‡πâ‡∏≥ (‡∏ó‡∏∏‡∏Å‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏¢‡∏Å‡πÄ‡∏ß‡πâ‡∏ô scraped_at) ======
is_dup = df_clean.duplicated(subset=cols_to_check, keep=False)

# ‡∏ñ‡πâ‡∏≤‡∏≠‡∏¢‡∏≤‡∏Å‡∏î‡∏π‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏ã‡πâ‡∏≥‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î (‡∏Å‡πà‡∏≠‡∏ô‡∏•‡∏ö)
df_dupes = df[is_dup].copy()

# ====== STEP 5: ‡∏•‡∏ö‡πÅ‡∏ñ‡∏ß‡∏ã‡πâ‡∏≥‡πÇ‡∏î‡∏¢‡πÄ‡∏Å‡πá‡∏ö‡πÅ‡∏ñ‡∏ß‡πÅ‡∏£‡∏Å‡πÑ‡∏ß‡πâ ======
df_out = df[~df_clean.duplicated(subset=cols_to_check, keep='first')].reset_index(drop=True)

# ====== STEP 6: ‡πÅ‡∏™‡∏î‡∏á‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏• ======
print(f"‡∏Å‡πà‡∏≠‡∏ô‡∏•‡∏ö‡∏ã‡πâ‡∏≥: {len(df)} ‡πÅ‡∏ñ‡∏ß")
print(f"‡∏´‡∏•‡∏±‡∏á‡∏•‡∏ö‡∏ã‡πâ‡∏≥: {len(df_out)} ‡πÅ‡∏ñ‡∏ß")
print(f"‡∏•‡∏ö‡∏ã‡πâ‡∏≥‡πÑ‡∏õ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î: {len(df) - len(df_out)} ‡πÅ‡∏ñ‡∏ß")
print(f"‡πÉ‡∏ä‡πâ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ï‡∏£‡∏ß‡∏à‡∏ã‡πâ‡∏≥ (‡∏¢‡∏Å‡πÄ‡∏ß‡πâ‡∏ô scraped_at): {len(cols_to_check)} ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå")

# ====== STEP 7: ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå ======
df_out.to_csv(OUTPUT_FILE, index=False)
print(f"‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢: {OUTPUT_FILE}")

# ====== (‡∏ó‡∏≤‡∏á‡πÄ‡∏•‡∏∑‡∏≠‡∏Å) ‡πÅ‡∏™‡∏î‡∏á‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏ã‡πâ‡∏≥ ======
# print(df_dupes.head(20))

‡∏Å‡πà‡∏≠‡∏ô‡∏•‡∏ö‡∏ã‡πâ‡∏≥: 13719 ‡πÅ‡∏ñ‡∏ß
‡∏´‡∏•‡∏±‡∏á‡∏•‡∏ö‡∏ã‡πâ‡∏≥: 7469 ‡πÅ‡∏ñ‡∏ß
‡∏•‡∏ö‡∏ã‡πâ‡∏≥‡πÑ‡∏õ‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î: 6250 ‡πÅ‡∏ñ‡∏ß
‡πÉ‡∏ä‡πâ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ï‡∏£‡∏ß‡∏à‡∏ã‡πâ‡∏≥ (‡∏¢‡∏Å‡πÄ‡∏ß‡πâ‡∏ô scraped_at): 15 ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå
‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢: bkf_eflora_species_dedup.csv


In [36]:
df_output = pd.read_csv('bkf_eflora_species_dedup.csv', dtype=str)
display(df_output.head(3))
print(df_output.shape)

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,family_url,genus_url,species_url,scraped_at,specific_name,wfo_id
0,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum siamense (Craib) Tardieu,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Songkhla (type).","Known from 3 localities: the type-locality, an...","In small mats on damp sandy ground, edge of ma...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:33.566587+00:00,Myriophyllum siamense (Craib) Tardieu,
1,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum tetrandrum Roxb.,This is currently accepted.,SOUTH-WESTERN: Prachuap Khiri Khan (Bang Sapha...,"E India (type), Indochina, Malay Peninsula.","In rather shallow, open water of ditches, cana...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:36.633398+00:00,Myriophyllum tetrandrum Roxb.,wfo-0001261381
2,2,Haloragaceae,2 Haloragis,Haloragis,2.0,Haloragis micrantha (Thunb.) R.Br. ex Sieb. & ...,Gonocarpus micranthus,NORTH-EASTERN: Loei (Phu Kradueng).,"India, S & E China, N Vietnam, Hainan, Formosa...","In marshy mountain turf, moist places along mo...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:50.049475+00:00,Gonocarpus micranthus,wfo-0000715064


(7469, 16)


### Dataframe Deduplicate Already -> 7469 rows But there are 898 rows with no wfo-id

In [39]:
df_output[df_output['wfo_id'].isna() | (df_output['wfo_id'] == '')]

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,family_url,genus_url,species_url,scraped_at,specific_name,wfo_id
0,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum siamense (Craib) Tardieu,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Songkhla (type).","Known from 3 localities: the type-locality, an...","In small mats on damp sandy ground, edge of ma...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:33.566587+00:00,Myriophyllum siamense (Craib) Tardieu,
4,2,Rhizophoraceae,1 Rhizophora,Rhizophora,1.0,Rhizophora mucronata Poir.,This is currently accepted.,SOUTH-EASTERN: Chanthaburi; SOUTH-WESTERN: Pra...,"In the Old World tropics, occurring from the c...",Mangrove forests.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:18:19.615290+00:00,Rhizophora mucronata Poir.,
5,2,Rhizophoraceae,2 Bruguiera,Bruguiera,2.0,Bruguiera gymnorhiza (L.) Savigny,This is currently accepted.,"SOUTH-EASTERN: Chanthaburi, Trat; PENINSULAR: ...","In the Old World tropics, from S & E Africa th...",Mangrove forests.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:18:33.101996+00:00,Bruguiera gymnorhiza (L.) Savigny,
11,2,Rhizophoraceae,3 Ceriops,Ceriops,3.0,Ceriops decandra (Griff.) Ding Hou,This is currently accepted.,"SOUTH-EASTERN: Chon Buri (Si Racha), Chathabur...","India (type), Burma, Cambodia, S Vietnam, Male...",Mangrove forests.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:19:14.152028+00:00,Ceriops decandra (Griff.) Ding Hou,
32,2,Rosaceae,5 Stranvaesia,Stranvaesia,5.0,Stranvaesia nussia (Buch.-Ham. ex D.Don) Decne.,Photinia nussia,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-19T18:23:26.198148+00:00,Photinia nussia,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7193,16,Annonaceae,36. Uvaria,Uvaria,36.0,Uvaria ferruginea Buch.-Ham. ex Hook.f. & Thomson,This is currently accepted.,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-20T23:40:07.128790+00:00,Uvaria ferruginea Buch.-Ham. ex Hook.f. & Thomson,
7194,16,Annonaceae,36. Uvaria,Uvaria,36.0,Uvaria pauciovulata Hook.f. & Thomson,,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-20T23:40:29.298457+00:00,Uvaria pauciovulata Hook.f. & Thomson,
7196,16,Annonaceae,36. Uvaria,Uvaria,36.0,"Uvaria wrayi (King) L.L.Zhou, Y.C.F.Su & R.M.K...",This is currently accepted.,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-20T23:40:52.077334+00:00,"Uvaria wrayi (King) L.L.Zhou, Y.C.F.Su & R.M.K...",
7420,16,Annonaceae,5. Cananga (in part),Cananga (in part),5.0,Cananga odorata (Lam.) Hook.f. & Thomson,This is currently accepted.,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-21T00:02:51.997760+00:00,Cananga odorata (Lam.) Hook.f. & Thomson,


## Result from WFO 

In [5]:
df_wfo = pd.read_csv('result_wfo.csv', dtype=str)
df_wfo

Unnamed: 0,wfo_id,wfo_full_name,wfo_check,specific_name
0,wfo-0000373567,Myriophyllum siamense (Craib) Tardieu,Code/Plantae/Pteridobiotina/Angiosperms/Saxifr...,Myriophyllum siamense (Craib) Tardieu
1,wfo-0001261381,Myriophyllum tetrandrum Roxb.,Code/Plantae/Pteridobiotina/Angiosperms/Saxifr...,Myriophyllum tetrandrum Roxb.
2,wfo-0000373567,Myriophyllum siamense (Craib) Tardieu,Code/Plantae/Pteridobiotina/Angiosperms/Saxifr...,Myriophyllum siamense (Craib) Tardieu
3,wfo-0001261381,Myriophyllum tetrandrum Roxb.,Code/Plantae/Pteridobiotina/Angiosperms/Saxifr...,Myriophyllum tetrandrum Roxb.
4,wfo-0000706666,Gonocarpus micranthus Thunb.,Code/Plantae/Pteridobiotina/Angiosperms/Saxifr...,Gonocarpus micranthus
...,...,...,...,...
13714,wfo-0000394130,Popowia fusca King,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Popowia fusca King
13715,wfo-0000393922,Popowia pisocarpa (Blume) Endl. ex Walp.,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Popowia pisocarpa (Blume) Endl. ex Walp.
13716,wfo-0000394130,Popowia fusca King,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Popowia fusca King
13717,wfo-0000393922,Popowia pisocarpa (Blume) Endl. ex Walp.,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Popowia pisocarpa (Blume) Endl. ex Walp.


In [7]:
candidate_wfo = pd.read_csv('candidates_wfo.csv', dtype=str)
candidate_wfo

Unnamed: 0,score,wfo_id,wfo_full_name,wfo_check,specific_name
0,1,wfo-0000572747,Bruguiera gymnorhiza (L.) Lam. ex Savigny,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Bruguiera gymnorhiza (L.) Savigny
1,1,wfo-0000572747,Bruguiera gymnorhiza (L.) Lam. ex Savigny,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Bruguiera gymnorhiza (L.) Savigny
2,1,wfo-0000597936,Ceriops decandra (Griff.) W.Theob.,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Ceriops decandra (Griff.) Ding Hou
3,1,wfo-0000597936,Ceriops decandra (Griff.) W.Theob.,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Ceriops decandra (Griff.) Ding Hou
4,1,wfo-0000472091,Pellacalyx parkinsonii C.E.C.Fisch.,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Pellacalyx parkinsonii C.E.C.Fischer
...,...,...,...,...,...
3331,5,wfo-0000428715,Xylopia salicifolia Kunth,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Xylopia subdehiscens (King) J.Sinclair
3332,4,wfo-1000036272,Xylopia schroederi Sleumer ex Herter,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Xylopia subdehiscens (King) J.Sinclair
3333,3,wfo-1000025243,Xylopia sclerophylla D.M.Johnson & N.A.Murray,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Xylopia subdehiscens (King) J.Sinclair
3334,2,wfo-0000428695,Xylopia scortechinii King,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Xylopia subdehiscens (King) J.Sinclair


In [8]:
candidate_wfo.columns

Index(['score', 'wfo_id', 'wfo_full_name', 'wfo_check', 'specific_name'], dtype='object')

In [9]:
# ‡∏•‡∏ö‡πÅ‡∏ñ‡∏ß‡∏ã‡πâ‡∏≥‡πÇ‡∏î‡∏¢‡∏≠‡∏¥‡∏á‡∏à‡∏≤‡∏Å‡∏ä‡∏∑‡πà‡∏≠‡∏´‡∏£‡∏∑‡∏≠ ID
df_unique = candidate_wfo.drop_duplicates(subset=["specific_name", "wfo_id"], keep="first")

#df_unique.to_csv("candidates_wfo_cleaned.csv", index=False)
print(f"‚úÖ Removed duplicates ‚Üí saved {len(df_unique)} rows.")
df_unique

‚úÖ Removed duplicates ‚Üí saved 1829 rows.


Unnamed: 0,score,wfo_id,wfo_full_name,wfo_check,specific_name
0,1,wfo-0000572747,Bruguiera gymnorhiza (L.) Lam. ex Savigny,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Bruguiera gymnorhiza (L.) Savigny
2,1,wfo-0000597936,Ceriops decandra (Griff.) W.Theob.,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Ceriops decandra (Griff.) Ding Hou
4,1,wfo-0000472091,Pellacalyx parkinsonii C.E.C.Fisch.,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Pellacalyx parkinsonii C.E.C.Fischer
6,2,wfo-0001016263,Eriobotrya bengalensis Hook.f.,unplaced,Eriobotrya bengalensis (Roxb.) Hook.f.
7,1,wfo-0001017662,Eriobotrya bengalensis (Roxb.) Kurz,Code/Plantae/Pteridobiotina/Angiosperms/Rosale...,Eriobotrya bengalensis (Roxb.) Hook.f.
...,...,...,...,...,...
3321,5,wfo-0000428715,Xylopia salicifolia Kunth,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Xylopia subdehiscens (King) J.Sinclair
3322,4,wfo-1000036272,Xylopia schroederi Sleumer ex Herter,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Xylopia subdehiscens (King) J.Sinclair
3323,3,wfo-1000025243,Xylopia sclerophylla D.M.Johnson & N.A.Murray,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Xylopia subdehiscens (King) J.Sinclair
3324,2,wfo-0000428695,Xylopia scortechinii King,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Xylopia subdehiscens (King) J.Sinclair


In [15]:
dup_groups = (
    df_unique.groupby("specific_name")["wfo_id"]
    .nunique()
    .reset_index()
    .query("wfo_id > 1")
)

# ‡πÅ‡∏™‡∏î‡∏á‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ä‡∏∑‡πà‡∏≠‡∏ó‡∏µ‡πà‡∏ã‡πâ‡∏≥
multi_id_species = df_unique[df_unique["specific_name"].isin(dup_groups["specific_name"])]

# ‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•
print(f"üîç ‡∏û‡∏ö species ‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏°‡∏≤‡∏Å‡∏Å‡∏ß‡πà‡∏≤ 1 WFO ID: {dup_groups.shape[0]} ‡∏ä‡∏ô‡∏¥‡∏î")
display(multi_id_species)#[["specific_name", "wfo_id", "wfo_full_name"]])

# ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ú‡∏•‡πÄ‡∏õ‡πá‡∏ô‡πÑ‡∏ü‡∏•‡πå‡πÉ‡∏´‡∏°‡πà (‡∏ñ‡πâ‡∏≤‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£)
multi_id_species.to_csv("name_multiple_candidate.csv", index=False)
#print("üíæ Saved as 'species_with_multiple_wfo_id.csv'")

üîç ‡∏û‡∏ö species ‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏°‡∏≤‡∏Å‡∏Å‡∏ß‡πà‡∏≤ 1 WFO ID: 288 ‡∏ä‡∏ô‡∏¥‡∏î


Unnamed: 0,score,wfo_id,wfo_full_name,wfo_check,specific_name
6,2,wfo-0001016263,Eriobotrya bengalensis Hook.f.,unplaced,Eriobotrya bengalensis (Roxb.) Hook.f.
7,1,wfo-0001017662,Eriobotrya bengalensis (Roxb.) Kurz,Code/Plantae/Pteridobiotina/Angiosperms/Rosale...,Eriobotrya bengalensis (Roxb.) Hook.f.
8,2,wfo-0001016263,Eriobotrya bengalensis Hook.f.,unplaced,Eriobotrya bengalensis
9,1,wfo-0001017662,Eriobotrya bengalensis (Roxb.) Kurz,Code/Plantae/Pteridobiotina/Angiosperms/Rosale...,Eriobotrya bengalensis
15,2,wfo-0000994970,Micromeles cuspidata (Bertol.) C.K.Schneid.,Code/Plantae/Pteridobiotina/Angiosperms/Rosale...,Micromeles cuspidata
...,...,...,...,...,...
3321,5,wfo-0000428715,Xylopia salicifolia Kunth,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Xylopia subdehiscens (King) J.Sinclair
3322,4,wfo-1000036272,Xylopia schroederi Sleumer ex Herter,Code/Plantae/Pteridobiotina/Angiosperms/Malpig...,Xylopia subdehiscens (King) J.Sinclair
3323,3,wfo-1000025243,Xylopia sclerophylla D.M.Johnson & N.A.Murray,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Xylopia subdehiscens (King) J.Sinclair
3324,2,wfo-0000428695,Xylopia scortechinii King,Code/Plantae/Pteridobiotina/Angiosperms/Magnol...,Xylopia subdehiscens (King) J.Sinclair
