# Data Cleaningbb

## Load Data

In [None]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from pathlib import Path
import re
from collections import Counter

# ========= CONFIG =========
BKF_PATH   = "bkf_eflora_species_dedup.csv"   # BKF ‡∏´‡∏•‡∏±‡∏á dedup
OUT7_PATH  = "output (7).csv"                 # ‡∏à‡∏≤‡∏Å WFO map tools
CAND_PATH  = "candidates (3).csv"             # candidates ‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ï‡∏¥‡∏°
OUT_DIR    = "."

In [None]:
# ========= HELPERS =========
def norm(x):
    if pd.isna(x): return ""
    return str(x).replace("\u00A0", " ").strip().lower()

def find_col(cols, candidates):
    """‡∏´‡∏≤‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡πÅ‡∏ö‡∏ö‡∏ó‡∏ô‡∏ó‡∏≤‡∏ô‡∏ï‡πà‡∏≠‡πÄ‡∏Ñ‡∏™‡∏ï‡∏±‡∏ß‡∏û‡∏¥‡∏°‡∏û‡πå"""
    cols_list = list(cols)
    # ‡∏´‡∏≤‡πÅ‡∏ö‡∏ö‡∏ï‡∏£‡∏á‡∏ä‡∏∑‡πà‡∏≠‡∏Å‡πà‡∏≠‡∏ô
    for c in candidates:
        if c in cols_list:
            return c
    # ‡∏´‡∏≤‡πÅ‡∏ö‡∏ö lower-case map
    lower_map = {c.lower(): c for c in cols_list}
    for c in candidates:
        if c.lower() in lower_map:
            return lower_map[c.lower()]
    return None

def nonempty_series_like(df, col):
    """‡∏Ñ‡∏∑‡∏ô Series ‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå (‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏°‡∏µ‡πÉ‡∏´‡πâ‡∏Ñ‡∏∑‡∏ô‡∏ã‡∏µ‡∏£‡∏µ‡∏™‡πå‡∏ß‡πà‡∏≤‡∏á) ‡πÅ‡∏•‡∏∞‡πÅ‡∏õ‡∏•‡∏á nan/None ‡πÄ‡∏õ‡πá‡∏ô '' """
    if col is None or col not in df.columns:
        return pd.Series("", index=df.index)
    return (df[col].astype(str)
                 .str.strip()
                 .replace({"nan": "", "None": "", "NaT": ""}))

def wfo_clean(x):
    """‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î WFO -> ‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö wfo-xxxx (‡∏ï‡∏±‡∏ß‡∏û‡∏¥‡∏°‡∏û‡πå‡πÄ‡∏•‡πá‡∏Å, ‡∏ï‡∏±‡∏î‡∏≠‡∏±‡∏Å‡∏Ç‡∏£‡∏∞‡πÅ‡∏õ‡∏•‡∏Å)"""
    if pd.isna(x): return ""
    s = str(x).strip()
    if s == "" or s.lower() in {"none","nan"}:
        return ""
    s2 = s.lower()
    if not s2.startswith("wfo-"):
        s2 = "wfo-" + s2
    s2 = re.sub(r"[^a-z0-9\-]", "", s2)
    return s2

def prefer(left, right):
    """‡πÄ‡∏•‡∏∑‡∏≠‡∏Å left ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏ß‡πà‡∏≤‡∏á ‡∏°‡∏¥‡∏â‡∏∞‡∏ô‡∏±‡πâ‡∏ô‡πÉ‡∏ä‡πâ right"""
    ls = left.astype(str).str.strip()
    rs = right.astype(str).str.strip()
    return np.where(ls != "", ls, rs)

def resolve_eff(df, colname, suffix):
    """‡∏Ñ‡∏∑‡∏ô‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏≠‡∏¢‡∏π‡πà‡∏à‡∏£‡∏¥‡∏á‡∏´‡∏•‡∏±‡∏á merge (‡πÄ‡∏ä‡πá‡∏Ñ‡∏ß‡πà‡∏≤‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡πÄ‡∏ï‡∏¥‡∏° suffix ‡∏´‡∏£‡∏∑‡∏≠‡πÑ‡∏°‡πà)"""
    if not colname:
        return None
    if f"{colname}{suffix}" in df.columns:
        return f"{colname}{suffix}"
    return colname if colname in df.columns else None

# ========= LOAD =========
df_bkf = pd.read_csv(BKF_PATH)
df_o7  = pd.read_csv(OUT7_PATH)
df_c   = pd.read_csv(CAND_PATH)

# ========= IDENTIFY KEY COLUMNS =========
bkf_spec = find_col(df_bkf.columns, ["specific_name","Specific_name","specific","name","species"])
bkf_wfo  = find_col(df_bkf.columns, ["wfo_id","wfo","wfo_code"])
bkf_url  = find_col(df_bkf.columns, ["species_url","Species_url","url"])  # optional
bkf_full = find_col(df_bkf.columns, ["wfo_full_name","wfo_name","full_name","accepted_name"])

if bkf_spec is None:
    raise KeyError("‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå specific_name ‡πÉ‡∏ô BKF")

if bkf_wfo is None:
    # ‡∏ñ‡πâ‡∏≤ BKF ‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå WFO ‡πÉ‡∏´‡πâ‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏ß‡πà‡∏≤‡∏á‡πÑ‡∏ß‡πâ
    df_bkf["wfo_id"] = np.nan
    bkf_wfo = "wfo_id"

o7_spec  = find_col(df_o7.columns, ["specific_name","specific","name","query","bkf_specific_name"])
o7_wfo   = find_col(df_o7.columns, ["wfo_id","wfo","wfoCode","WFO_ID"])
o7_full  = find_col(df_o7.columns, ["wfo_full_name","wfo_name","full_name","accepted_name","canonicalName"])

c_spec   = find_col(df_c.columns, ["specific_name","specific","name","query","bkf_specific_name"])
c_wfo    = find_col(df_c.columns, ["wfo_id","wfo","candidate_wfo_id","WFO_ID"])
c_full   = find_col(df_c.columns, ["wfo_full_name","wfo_name","full_name","accepted_name","canonicalName"])

# ========= NORMALIZE KEY & WFO =========
df_bkf["specific_name_clean"] = df_bkf[bkf_spec].apply(norm)
if bkf_url: df_bkf["species_url_clean"] = df_bkf[bkf_url].apply(norm)

if o7_spec: df_o7["specific_name_clean"] = df_o7[o7_spec].apply(norm)
if c_spec:  df_c["specific_name_clean"]  = df_c[c_spec].apply(norm)

# ‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î WFO ‡πÉ‡∏ô‡∏ó‡∏∏‡∏Å‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏ó‡∏µ‡πà‡∏°‡∏µ
df_bkf[bkf_wfo] = nonempty_series_like(df_bkf, bkf_wfo).apply(wfo_clean)
if o7_wfo:
    df_o7[o7_wfo] = nonempty_series_like(df_o7, o7_wfo).apply(wfo_clean)
if c_wfo:
    df_c[c_wfo]   = nonempty_series_like(df_c, c_wfo).apply(wfo_clean)

# ========= STEP 0: BASELINE =========
missing_before = df_bkf[bkf_wfo].eq("") | df_bkf[bkf_wfo].isna()
print(f"[BASELINE] Missing WFO (BKF): {int(missing_before.sum())} / {len(df_bkf)}")

# ========= STEP 1: JOIN output(7) ‡∏î‡πâ‡∏ß‡∏¢ specific_name_clean =========
o7_use_cols = ["specific_name_clean"]
if o7_wfo:  o7_use_cols.append(o7_wfo)
if o7_full: o7_use_cols.append(o7_full)
df_o7_use = (df_o7[o7_use_cols].drop_duplicates("specific_name_clean")
             if o7_spec else pd.DataFrame(columns=o7_use_cols))

df1 = df_bkf.merge(df_o7_use, on="specific_name_clean", how="left", suffixes=("","_o7"))

# ========= STEP 2: PREP candidates ‚Äî ‡πÄ‡∏Å‡πá‡∏ö‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ä‡∏∑‡πà‡∏≠‡∏ó‡∏µ‡πà‡∏°‡∏µ WFO ‡πÄ‡∏î‡∏µ‡∏¢‡∏ß =========
if (c_spec is not None) and (c_wfo is not None):
    cg = df_c.dropna(subset=[c_wfo]).copy()
    cg[c_wfo] = cg[c_wfo].astype(str).str.strip()
    uniq_one_key = (
        cg.groupby("specific_name_clean")[c_wfo].nunique()
          .reset_index(name="n").query("n == 1")[["specific_name_clean"]]
    )
    c_use = (
        cg.merge(uniq_one_key, on="specific_name_clean", how="inner")
          .drop_duplicates(subset=["specific_name_clean"])
    )
    c_use_cols = ["specific_name_clean", c_wfo]
    if c_full: c_use_cols.append(c_full)
    c_use = c_use[c_use_cols]
else:
    c_use = pd.DataFrame(columns=["specific_name_clean"])

df2 = df1.merge(c_use, on="specific_name_clean", how="left", suffixes=("","_cand"))

# ========= STEP 2.1: RESOLVE EFFECTIVE NAMES ‡∏´‡∏•‡∏±‡∏á merge =========
o7_wfo_eff  = resolve_eff(df2, o7_wfo,  "_o7")
o7_full_eff = resolve_eff(df2, o7_full, "_o7")
c_wfo_eff   = resolve_eff(df2, c_wfo,   "_cand")
c_full_eff  = resolve_eff(df2, c_full,  "_cand")

# ========= STEP 3: ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å wfo_id ‡∏ï‡∏≤‡∏°‡∏•‡∏≥‡∏î‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç =========
bkf_val = nonempty_series_like(df2, bkf_wfo)
o7_val  = nonempty_series_like(df2, o7_wfo_eff) if o7_wfo_eff else pd.Series("", index=df2.index)
c_val   = nonempty_series_like(df2, c_wfo_eff)  if c_wfo_eff  else pd.Series("", index=df2.index)

bkf_has = bkf_val != ""
o7_has  = o7_val  != ""
c_has   = c_val   != ""

df2["wfo_id_final"] = np.select(
    [bkf_has, (~bkf_has) & o7_has, (~bkf_has) & (~o7_has) & c_has],
    [bkf_val, o7_val, c_val],
    default=""
)

df2["wfo_source"] = np.select(
    [bkf_has, (~bkf_has) & o7_has, (~bkf_has) & (~o7_has) & c_has],
    ["bkf_original","output7_exact","candidates_unique"],
    default="unfilled"
)

# ========= STEP 3.1: FLAG CONFLICTS =========
def diff_nonempty(a, b):
    a = a.astype(str).str.strip(); b = b.astype(str).str.strip()
    return (a != "") & (b != "") & (a != b)

df2["conflict_bkf_vs_o7"]   = diff_nonempty(bkf_val, o7_val)
df2["conflict_bkf_vs_cand"] = diff_nonempty(bkf_val, c_val)
df2["conflict_o7_vs_cand"]  = diff_nonempty(o7_val, c_val)
df2["any_conflict"] = df2[["conflict_bkf_vs_o7","conflict_bkf_vs_cand","conflict_o7_vs_cand"]].any(axis=1)


# ========= STEP 4: wfo_full_name = ‡πÉ‡∏ä‡πâ‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏à‡∏≤‡∏Å output(7) ‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô =========
# ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡∏≠‡πâ‡∏≤‡∏á‡∏≠‡∏¥‡∏á‡∏à‡∏≤‡∏Å‡πÑ‡∏ü‡∏•‡πå output(7): wfo_id -> wfo_full_name
# ‡∏´‡∏°‡∏≤‡∏¢‡πÄ‡∏´‡∏ï‡∏∏: ‡πÇ‡∏Ñ‡πâ‡∏î‡∏Å‡πà‡∏≠‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡∏ô‡∏µ‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∞‡∏≠‡∏≤‡∏î o7_wfo ‡πÅ‡∏•‡πâ‡∏ß (wfo_clean)
if o7_wfo and o7_full:
    # ‡∏™‡∏£‡πâ‡∏≤‡∏á lookup ‡πÇ‡∏î‡∏¢‡∏≠‡∏≤‡∏®‡∏±‡∏¢ wfo_id ‡∏ó‡∏µ‡πà‡∏™‡∏∞‡∏≠‡∏≤‡∏î‡πÅ‡∏•‡πâ‡∏ß
    o7_id_name = (
        df_o7[[o7_wfo, o7_full]]
        .dropna(subset=[o7_wfo])
        .copy()
    )
    o7_id_name[o7_wfo]  = o7_id_name[o7_wfo].astype(str).str.strip()
    o7_id_name[o7_full] = o7_id_name[o7_full].astype(str).str.strip()

    # ‡∏Å‡∏£‡∏ì‡∏µ‡∏´‡∏ô‡∏∂‡πà‡∏á wfo_id ‡∏°‡∏µ‡∏´‡∏•‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠ ‡πÉ‡∏´‡πâ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏ï‡∏±‡∏ß‡πÅ‡∏£‡∏Å (‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡πÄ‡∏õ‡πá‡∏ô .agg('first') ‡∏Å‡πá‡πÑ‡∏î‡πâ)
    o7_id_name = (
        o7_id_name[o7_id_name[o7_wfo] != ""]
        .drop_duplicates(subset=[o7_wfo], keep="first")
    )

    # ‡∏ó‡∏≥ dict ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö map
    o7_map = dict(zip(o7_id_name[o7_wfo], o7_id_name[o7_full]))

    # ‡∏ï‡∏±‡πâ‡∏á‡∏Ñ‡πà‡∏≤ final ‡∏ä‡∏∑‡πà‡∏≠‡πÄ‡∏ï‡πá‡∏°‡πÇ‡∏î‡∏¢ map ‡∏à‡∏≤‡∏Å wfo_id_final ‡∏ó‡∏∏‡∏Å‡πÅ‡∏ñ‡∏ß
    df2["wfo_full_name_final"] = df2["wfo_id_final"].map(o7_map).fillna("")

else:
    # ‡∏ñ‡πâ‡∏≤‡πÑ‡∏ü‡∏•‡πå output(7) ‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô ‡πÉ‡∏´‡πâ‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡πà‡∏≤‡∏ß‡πà‡∏≤‡∏á‡πÑ‡∏ß‡πâ‡∏Å‡πà‡∏≠‡∏ô
    df2["wfo_full_name_final"] = ""

# ========= STEP 5: ‡∏à‡∏±‡∏î‡∏£‡∏π‡∏õ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå & ‡πÑ‡∏°‡πà‡∏•‡∏ö‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏°‡∏≤‡∏ï‡∏£‡∏ê‡∏≤‡∏ô‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î =========
df_out = df2.copy()
df_out["wfo_id"] = df_out["wfo_id_final"].replace({"": np.nan})
df_out["wfo_full_name"] = df_out["wfo_full_name_final"].replace({"": np.nan})

# ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏•‡∏ö (‡πÄ‡∏ß‡πâ‡∏ô‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏°‡∏≤‡∏ï‡∏£‡∏ê‡∏≤‡∏ô‡πÑ‡∏ß‡πâ)
drop_cols = []
for col in [
    o7_wfo_eff, o7_full_eff, c_wfo_eff, c_full_eff,
    "wfo_id_final", "wfo_full_name_final",
    "specific_name_clean", "species_url_clean",
    "conflict_bkf_vs_o7","conflict_bkf_vs_cand","conflict_o7_vs_cand","any_conflict"
]:
    if col and (col in df_out.columns) and (col not in ["wfo_id","wfo_full_name"]):
        drop_cols.append(col)

df_out = df_out.drop(columns=drop_cols, errors="ignore")

# ========= STEP 6: SAVE MAIN =========
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
out_enriched = Path(OUT_DIR, "bkf_wfo_priority_bkf_then_output_then_unique_cand.csv")
df_out.to_csv(out_enriched, index=False)
print(f"Saved enriched: {out_enriched}")

unmatched = df_out[df_out["wfo_id"].isna() | (df_out["wfo_id"].astype(str).str.strip()=="")]
out_unmatched = Path(OUT_DIR, "bkf_wfo_unmatched_after_priority.csv")
unmatched.to_csv(out_unmatched, index=False)
print(f"Unmatched rows: {len(unmatched)}  -> {out_unmatched}")

# ========= STEP 7: AUDIT / DIAGNOSTIC =========
audit_cols = [bkf_spec, bkf_wfo, "specific_name_clean"]
if bkf_full: audit_cols.append(bkf_full)
if o7_wfo_eff:  audit_cols.append(o7_wfo_eff)
if o7_full_eff: audit_cols.append(o7_full_eff)
if c_wfo_eff:   audit_cols.append(c_wfo_eff)
if c_full_eff:  audit_cols.append(c_full_eff)

# unique & keep order
audit_cols = [c for i,c in enumerate(audit_cols) if c and (audit_cols.index(c) == i)]

df_audit = df2[audit_cols + [
    "wfo_id_final","wfo_source",
    "conflict_bkf_vs_o7","conflict_bkf_vs_cand","conflict_o7_vs_cand","any_conflict",
    "wfo_full_name_final"
]].copy()

# ‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡∏ä‡∏∑‡πà‡∏≠‡∏´‡∏±‡∏ß‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡πÉ‡∏´‡πâ‡∏≠‡πà‡∏≤‡∏ô‡∏á‡πà‡∏≤‡∏¢
rename_map = {}
if bkf_spec: rename_map[bkf_spec] = "bkf_specific_name"
if bkf_wfo:  rename_map[bkf_wfo]  = "bkf_wfo_id"
if o7_wfo_eff:  rename_map[o7_wfo_eff]  = "o7_wfo_id"
if c_wfo_eff:   rename_map[c_wfo_eff]   = "cand_wfo_id"
if bkf_full:    rename_map[bkf_full]    = "bkf_wfo_full_name"
if o7_full_eff: rename_map[o7_full_eff] = "o7_wfo_full_name"
if c_full_eff:  rename_map[c_full_eff]  = "cand_wfo_full_name"
rename_map["wfo_full_name_final"] = "wfo_full_name_final"

df_audit.rename(columns=rename_map, inplace=True)

out_audit = Path(OUT_DIR, "bkf_wfo_audit_conflicts.csv")
df_audit.to_csv(out_audit, index=False)
print(f"Audit conflicts file: {out_audit}")

# Name -> multi WFO (‡∏Ñ‡∏ß‡∏£ 0 ‡∏ñ‡πâ‡∏≤ unique ‡∏à‡∏£‡∏¥‡∏á)
if "specific_name_clean" in df2.columns:
    dup_name = (df2.assign(_wfo=df2["wfo_id_final"].replace({"": np.nan}))
                   .dropna(subset=["_wfo"])
                   .groupby("specific_name_clean")["_wfo"].nunique()
                   .reset_index().query("_wfo > 1"))
    out_dup_name = Path(OUT_DIR, "audit_name_to_multiple_wfo.csv")
    dup_name.to_csv(out_dup_name, index=False)
    print(f"Name -> multi-WFO list: {out_dup_name}")
else:
    print("Name -> multi-WFO list: NA")

# WFO -> ‡∏´‡∏•‡∏≤‡∏¢‡∏ä‡∏∑‡πà‡∏≠ (‡∏û‡∏ö‡∏ö‡πà‡∏≠‡∏¢‡πÉ‡∏ô synonym)
wfo_counts = (df2.assign(_wfo=df2["wfo_id_final"].replace({"": np.nan}))
                .dropna(subset=["_wfo"])
                .groupby("_wfo")["specific_name_clean"].nunique()
                .reset_index(name="n_names").sort_values("n_names", ascending=False))
out_wfo_multi = Path(OUT_DIR, "audit_wfo_to_multiple_names.csv")
wfo_counts.to_csv(out_wfo_multi, index=False)
print(f"WFO -> multi-names list: {out_wfo_multi}")

# ========= STEP 8: SUMMARY =========
total = len(df_bkf)
filled = total - len(unmatched)
src_counts = Counter(df2["wfo_source"])
conflicts = int(df2["any_conflict"].sum())

print("\n=== COVERAGE & SOURCE ===")
print(f"Total BKF rows: {total}")
print(f"Filled (any source): {filled}  |  Coverage: {filled/total:.1%}")
for k in ["bkf_original","output7_exact","candidates_unique","unfilled"]:
    print(f"  {k:18s}: {src_counts.get(k,0)}")
print(f"Any conflicts flagged: {conflicts}")

[BASELINE] Missing WFO (BKF): 898 / 7469
Saved enriched: bkf_wfo_priority_bkf_then_output_then_unique_cand.csv
Unmatched rows: 145  -> bkf_wfo_unmatched_after_priority.csv
Audit conflicts file: bkf_wfo_audit_conflicts.csv
Name -> multi-WFO list: audit_name_to_multiple_wfo.csv
WFO -> multi-names list: audit_wfo_to_multiple_names.csv

=== COVERAGE & SOURCE ===
Total BKF rows: 7469
Filled (any source): 7324  |  Coverage: 98.1%
  bkf_original      : 6571
  output7_exact     : 679
  candidates_unique : 74
  unfilled          : 145
Any conflicts flagged: 642


In [2]:
# ‡∏î‡∏π‡∏ú‡∏•‡∏´‡∏•‡∏±‡∏á map ‡∏Ñ‡∏£‡∏ö (‡∏Å‡πà‡∏≠‡∏ô export)
display(df2[["specific_name_clean", "wfo_id_final", "wfo_source", "wfo_full_name_final"]].head())

Unnamed: 0,specific_name_clean,wfo_id_final,wfo_source,wfo_full_name_final
0,myriophyllum siamense (craib) tardieu,wfo-0000373567,output7_exact,Myriophyllum siamense (Craib) Tardieu
1,myriophyllum tetrandrum roxb.,wfo-0001261381,bkf_original,Myriophyllum tetrandrum Roxb.
2,gonocarpus micranthus,wfo-0000715064,bkf_original,
3,rhizophora apiculata blume,wfo-0001131596,bkf_original,Rhizophora apiculata Blume
4,rhizophora mucronata poir.,wfo-0001131556,output7_exact,Rhizophora mucronata Poir.


In [3]:
# ‡∏î‡∏π‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏°‡∏µ WFO ID ‡πÅ‡∏•‡πâ‡∏ß ‡πÅ‡∏ï‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ full name
missing_full = df2[
    (df2["wfo_full_name_final"].isna() | (df2["wfo_full_name_final"].astype(str).str.strip() == "")) &
    (df2["wfo_id_final"].astype(str).str.strip() != "")
]

# ‡πÅ‡∏™‡∏î‡∏á‡∏ö‡∏≤‡∏á‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏™‡∏≥‡∏Ñ‡∏±‡∏ç‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÄ‡∏ä‡πá‡∏Ñ
cols_show = ["specific_name_clean", "wfo_id_final", "wfo_source", "wfo_full_name_final"]
print(f"Missing full name but has WFO ID: {len(missing_full)} rows")
display(missing_full[cols_show].head(20))

Missing full name but has WFO ID: 1509 rows


Unnamed: 0,specific_name_clean,wfo_id_final,wfo_source,wfo_full_name_final
2,gonocarpus micranthus,wfo-0000715064,bkf_original,
5,bruguiera gymnorhiza (l.) savigny,wfo-0000572747,candidates_unique,
11,ceriops decandra (griff.) ding hou,wfo-0000597936,candidates_unique,
13,pellacalyx parkinsonii c.e.c.fischer,wfo-0000472091,bkf_original,
20,biophytum umbraculum,wfo-0000565517,bkf_original,
27,brackenridgea elegantissima,wfo-0000570704,bkf_original,
29,campylospermum serratum,wfo-0000705577,bkf_original,
38,eriobotrya bengalensis,wfo-0001017659,bkf_original,
39,rhaphiolepis indica (l.) lindl. ex ker,wfo-0001016273,bkf_original,
41,micromeles cuspidata,wfo-0000996758,bkf_original,


In [4]:
missing_full.to_csv("bkf_missing_fullname_with_wfoid.csv", index=False)
print("Saved -> bkf_missing_fullname_with_wfoid.csv")

Saved -> bkf_missing_fullname_with_wfoid.csv


In [5]:
# ‡πÄ‡∏≠‡∏≤‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ó‡∏µ‡πà‡∏°‡∏≤‡∏à‡∏≤‡∏Å BKF ‡πÄ‡∏î‡∏¥‡∏°
missing_full_bkf = missing_full.query('wfo_source == "bkf_original"').copy()
print(missing_full_bkf.shape)
display(missing_full_bkf[cols_show].head(20))

(1435, 29)


Unnamed: 0,specific_name_clean,wfo_id_final,wfo_source,wfo_full_name_final
2,gonocarpus micranthus,wfo-0000715064,bkf_original,
13,pellacalyx parkinsonii c.e.c.fischer,wfo-0000472091,bkf_original,
20,biophytum umbraculum,wfo-0000565517,bkf_original,
27,brackenridgea elegantissima,wfo-0000570704,bkf_original,
29,campylospermum serratum,wfo-0000705577,bkf_original,
38,eriobotrya bengalensis,wfo-0001017659,bkf_original,
39,rhaphiolepis indica (l.) lindl. ex ker,wfo-0001016273,bkf_original,
41,micromeles cuspidata,wfo-0000996758,bkf_original,
43,pyrus pyrifolia,wfo-0001017269,bkf_original,
45,rubus alpestris blume,wfo-0000990767,bkf_original,


In [7]:
# -*- coding: utf-8 -*-
import time, random
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, WebDriverException

# ===== CONFIG =====
BASE_URL = "https://www.worldfloraonline.org/taxon/"
MAX_ATTEMPTS = 4           # retry ‡∏ï‡πà‡∏≠‡∏£‡∏´‡∏±‡∏™
WAIT_SEC    = 15           # ‡∏£‡∏≠‡πÇ‡∏´‡∏•‡∏î element
SAVE_EVERY  = 10           # ‡πÄ‡∏ã‡∏ü‡∏ú‡∏•‡∏ó‡∏∏‡∏Å 25 ‡πÅ‡∏ñ‡∏ß

# ===== Chrome Options =====
chrome_opts = Options()
chrome_opts.add_argument("--headless=new")  # ‡∏ñ‡πâ‡∏≤‡∏≠‡∏¢‡∏≤‡∏Å‡πÄ‡∏´‡πá‡∏ô‡∏´‡∏ô‡πâ‡∏≤‡∏à‡∏≠‡∏à‡∏£‡∏¥‡∏á comment ‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î‡∏ô‡∏µ‡πâ‡∏≠‡∏≠‡∏Å
chrome_opts.add_argument("--no-sandbox")
chrome_opts.add_argument("--disable-dev-shm-usage")
chrome_opts.add_argument("--disable-gpu")
chrome_opts.add_argument("--disable-blink-features=AutomationControlled")
chrome_opts.add_argument("--window-size=1280,900")
chrome_opts.add_argument("--lang=en-US")
chrome_opts.add_argument("--disable-features=NetworkService")
chrome_opts.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_opts)
driver.set_page_load_timeout(WAIT_SEC + 5)

# ===== Helper =====
def extract_name_from_html(html: str):
    soup = BeautifulSoup(html, "lxml")
    sci = soup.select_one("em[itemprop='scientificName'].taxonName")
    if not sci:
        sci = soup.select_one("em[itemprop='scientificName']") or soup.select_one("em.taxonName")
    auth = soup.select_one("span[itemprop='scientificNameAuthorship']")
    if not sci:
        return None
    sci_name = sci.get_text(strip=True)
    auth_name = auth.get_text(strip=True) if auth else ""
    return f"{sci_name} {auth_name}".strip()

def scrape_one_wfo(wfo_id: str):
    url = BASE_URL + str(wfo_id).strip()
    attempt = 0
    while attempt < MAX_ATTEMPTS:
        attempt += 1
        try:
            driver.get(url)
            WebDriverWait(driver, WAIT_SEC).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "em[itemprop='scientificName'], em.taxonName"))
            )
            time.sleep(0.8)
            full_name = extract_name_from_html(driver.page_source)
            if full_name:
                return full_name, "ok"
            else:
                raise TimeoutException("no_scientificName_tag")
        except (TimeoutException, WebDriverException) as e:
            backoff = min(2.0 * attempt, 6.0) + random.uniform(0.5, 1.5)
            msg = str(e)
            if "ERR_CONNEC" in msg or "ERR_TIMED_OUT" in msg or "ERR_FAILED" in msg:
                backoff += 2.0
            time.sleep(backoff)
            last_err = f"retry_{attempt}: {e}"
            continue
        except Exception as e:
            return None, f"error: {e}"
    return None, last_err if 'last_err' in locals() else "unknown_error"

# ===== ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏° dataframe =====
mask_target = (
    (missing_full_bkf["wfo_id_final"].astype(str).str.strip() != "") &
    (missing_full_bkf["wfo_full_name_final"].isna() |
     (missing_full_bkf["wfo_full_name_final"].astype(str).str.strip() == ""))
)
targets = missing_full_bkf[mask_target].copy()
targets["scraped_name"] = ""
targets["scrape_status"] = ""

print(f"ü™¥ Total to scrape: {len(targets)} rows")

# ===== ‡∏£‡∏±‡∏ô‡∏à‡∏£‡∏¥‡∏á‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î =====
for i, (idx, row) in enumerate(targets.iterrows(), start=1):
    wfo_id = str(row["wfo_id_final"]).strip()
    if not wfo_id:
        targets.at[idx, "scrape_status"] = "empty_wfo_id"
        continue

    full_name, status = scrape_one_wfo(wfo_id)
    targets.at[idx, "scraped_name"] = full_name if full_name else ""
    targets.at[idx, "scrape_status"] = status

    if status == "ok" and full_name:
        missing_full_bkf.loc[missing_full_bkf["wfo_id_final"] == wfo_id, "wfo_full_name_final"] = full_name

    # ‡∏û‡∏±‡∏Å‡∏£‡∏∞‡∏´‡∏ß‡πà‡∏≤‡∏á‡πÇ‡∏´‡∏•‡∏î‡∏´‡∏ô‡πâ‡∏≤ (‡∏™‡∏∏‡πà‡∏°‡πÄ‡∏•‡πá‡∏Å‡∏ô‡πâ‡∏≠‡∏¢)
    time.sleep(random.uniform(1.0, 2.2))

    # ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ä‡∏±‡πà‡∏ß‡∏Ñ‡∏£‡∏≤‡∏ß‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏∞‡∏¢‡∏∞
    if i % SAVE_EVERY == 0:
        temp_out = f"bkf_scrape_progress_{i}.csv"
        targets.to_csv(temp_out, index=False)
        print(f"üíæ Progress saved ({i}/{len(targets)}) -> {temp_out}")

# ===== ‡∏õ‡∏¥‡∏î browser =====
driver.quit()

# ===== ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏ú‡∏•‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢ =====
targets.to_csv("bkf_wfo_scrape_all_results.csv", index=False)
missing_full_bkf.to_csv("bkf_full_after_scrape.csv", index=False)

print("‚úÖ DONE scraping all missing_full_bkf")
print(targets["scrape_status"].value_counts())
print("Saved -> bkf_full_after_scrape.csv")



ü™¥ Total to scrape: 1435 rows
üíæ Progress saved (10/1435) -> bkf_scrape_progress_10.csv
üíæ Progress saved (20/1435) -> bkf_scrape_progress_20.csv
üíæ Progress saved (30/1435) -> bkf_scrape_progress_30.csv
üíæ Progress saved (40/1435) -> bkf_scrape_progress_40.csv
üíæ Progress saved (50/1435) -> bkf_scrape_progress_50.csv
üíæ Progress saved (60/1435) -> bkf_scrape_progress_60.csv
üíæ Progress saved (70/1435) -> bkf_scrape_progress_70.csv
üíæ Progress saved (80/1435) -> bkf_scrape_progress_80.csv
üíæ Progress saved (90/1435) -> bkf_scrape_progress_90.csv
üíæ Progress saved (100/1435) -> bkf_scrape_progress_100.csv
üíæ Progress saved (110/1435) -> bkf_scrape_progress_110.csv
üíæ Progress saved (120/1435) -> bkf_scrape_progress_120.csv
üíæ Progress saved (130/1435) -> bkf_scrape_progress_130.csv
üíæ Progress saved (140/1435) -> bkf_scrape_progress_140.csv
üíæ Progress saved (150/1435) -> bkf_scrape_progress_150.csv
üíæ Progress saved (160/1435) -> bkf_scrape_progress_16

In [27]:
import pandas as pd 
targets = pd.read_csv("../bkf_wfo_scrape_all_results.csv")
targets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1435 entries, 0 to 1434
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   volume                   1435 non-null   int64  
 1   family_name              1435 non-null   object 
 2   genus_name               1435 non-null   object 
 3   genus_label              1435 non-null   object 
 4   genus_index_parsed       1302 non-null   float64
 5   species_scientific_name  1435 non-null   object 
 6   accepted_name            1431 non-null   object 
 7   thailand                 1435 non-null   object 
 8   distribution             1405 non-null   object 
 9   ecology                  1378 non-null   object 
 10  family_url               1435 non-null   object 
 11  genus_url                1435 non-null   object 
 12  species_url              1435 non-null   object 
 13  scraped_at               1435 non-null   object 
 14  specific_name           

In [10]:
missing_full_bkf = pd.read_csv("../Processed data/bkf_full_after_scrape.csv")
missing_full_bkf

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,...,wfo_full_name,wfo_id_cand,wfo_full_name_cand,wfo_id_final,wfo_source,conflict_bkf_vs_o7,conflict_bkf_vs_cand,conflict_o7_vs_cand,any_conflict,wfo_full_name_final
0,2,Haloragaceae,2 Haloragis,Haloragis,2.0,Haloragis micrantha (Thunb.) R.Br. ex Sieb. & ...,Gonocarpus micranthus,NORTH-EASTERN: Loei (Phu Kradueng).,"India, S & E China, N Vietnam, Hainan, Formosa...","In marshy mountain turf, moist places along mo...",...,Gonocarpus micranthus Thunb.,,,wfo-0000715064,bkf_original,True,False,False,True,Haloragis micrantha R.Br. ex Siebold & Zucc.
1,2,Rhizophoraceae,7 Pellacalyx,Pellacalyx,7.0,Pellacalyx parkinsonii C.E.C.Fischer,This is currently accepted.,"PENINSULAR: Phuket, Surat Thani, Nakhon Si Tha...",Burma (type).,"In lowland evergreen or bamboo forests, up to ...",...,,wfo-0000472091,Pellacalyx parkinsonii C.E.C.Fisch.,wfo-0000472091,bkf_original,False,False,False,False,Pellacalyx parkinsonii C.E.C.Fisch.
2,2,Oxalidaceae,2 Biophytum,Biophytum,2.0,Biophytum petersianum Klotzsch,Biophytum umbraculum,"NORTHERN: Chiang Mai (Doi Inthanon), Chiang Ra...","Tropical Africa (type), Madagascar, tropical S...",Preferring areas with a dry season and helioph...,...,Biophytum umbraculum Welw.,,,wfo-0000565517,bkf_original,True,False,False,True,Biophytum petersianum Klotzsch
3,2,Ochnaceae,2 Brackenridgea,Brackenridgea,2.0,Brackenridgea hookeri (Planch.) A.Gray,Brackenridgea elegantissima,SOUTH-EASTERN: Trat (Ko Chang); PENINSULAR: Su...,"Andaman Islands, the Malay Peninsula (type), B...","In primary, evergreen forests, from sea level ...",...,Brackenridgea elegantissima (Wall.) Kanis,,,wfo-0000570704,bkf_original,True,False,False,True,Brackenridgea hookeri (Planch.) A.Gray
4,2,Ochnaceae,3 Gomphia,Gomphia,3.0,Gomphia serrata (Gaertn.) Kanis,Campylospermum serratum,"NORTH-EASTERN: Udon Thani; EASTERN: Surin, Si ...","South peninsular India, Ceylon (type), Laos, C...","From sea level up to 1,200 m alt., in moist, p...",...,Campylospermum serratum (Gaertn.) Bittrich & M...,,,wfo-0000705577,bkf_original,True,False,False,True,Gomphia serrata (Gaertn.) Kanis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430,16,Annonaceae,14. Goniothalamus,Goniothalamus,14.0,Goniothalamus sukhirinensis Leerat.,This is currently accepted.,"PENINSULAR: Narathiwat (Sukhirin, Ban Ya De vi...",Endemic.,Shady and moist areas of tropical rain forests...,...,,wfo-1000065421,"Goniothalamus sukhirinensis Leerat., Chalermgl...",wfo-1000065421,bkf_original,False,False,False,False,"Goniothalamus sukhirinensis Leerat., Chalermgl..."
1431,16,Annonaceae,14. Goniothalamus,Goniothalamus,14.0,Goniothalamus macrophyllus (Blume) Hook.f. & T...,This is currently accepted.,SOUTH-WESTERN: Phetchaburi; PENINSULAR: Ranong...,"Peninsular Malaysia, Singapore, Sumatra, Borne...","Variably dry or wet, primary and secondary, lo...",...,,wfo-0000706521,Goniothalamus macrophyllus Hook.f. & Thomson,wfo-0000706521,bkf_original,False,False,False,False,Goniothalamus macrophyllus Hook.f. & Thomson
1432,16,Annonaceae,14. Goniothalamus,Goniothalamus,14.0,Goniothalamus roseipetalus Leerat.,This is currently accepted.,"PENINSULAR: Yala, Narathiwat (Cha Nae, Du Son ...",Endemic.,Shady and moist areas of tropical rain forests...,...,,wfo-1000065420,"Goniothalamus roseipetalus Leerat., Chalermgli...",wfo-1000065420,bkf_original,False,False,False,False,"Goniothalamus roseipetalus Leerat., Chalermgli..."
1433,16,Annonaceae,24. Monoon,Monoon,24.0,Monoon simiarum (Wall. ex Hook.f. & Thomson) B...,This is currently accepted.,"NORTHERN: Chiang Mai, Lampang, Tak, Sukhothai,...","India, Bhutan, Bangladesh, Myanmar, China, Lao...","Dry evergreen and deciduous forests, 275‚Äí1050 ...",...,,wfo-0001334317,Monoon simiarum (Buch.-Ham. ex Hook.f. & Thoms...,wfo-0001334317,bkf_original,False,False,False,False,Monoon simiarum (Buch.-Ham. ex Hook.f. & Thoms...


In [28]:
missing_full_bkf[missing_full_bkf['wfo_full_name_final'].isna() | (missing_full_bkf['wfo_full_name_final'].astype(str).str.strip() == "")]

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,...,wfo_full_name,wfo_id_cand,wfo_full_name_cand,wfo_id_final,wfo_source,conflict_bkf_vs_o7,conflict_bkf_vs_cand,conflict_o7_vs_cand,any_conflict,wfo_full_name_final
9,2,Rosaceae,10 Rubus,Rubus,10.0,Rubus alpestris Blume,This is currently accepted.,NORTHERN: Chiang Mai (Fang).,"N Vietnam, Java (type).","In lower montane forests, 1,000‚Äì2,000 m alt.",...,Rubus alpestris Blume,,,wfo-0000990767,bkf_original,True,False,False,True,
417,4,Leguminosae-Caesalpinioideae,8 Caesalpinia,Caesalpinia,8.0,Caesalpinia pubescens (Desf.) Hattink,Mezoneuron pubescens,PENINSULAR: Surat Thani.,"S Vietnam, Malay Archipelago (type from Java).",In thickets. Flowering: August.,...,,,,wfo-0000199400,bkf_original,False,False,False,False,
418,4,Leguminosae-Caesalpinioideae,8 Caesalpinia,Caesalpinia,8.0,Caesalpinia cucullata Roxb.,Mezoneuron cucullatum,NORTHERN: Lampang; NORTH-EASTERN: Loei; EASTER...,"India (Assam ‚Äì type), Sikkim, Nepal, Andaman I...",In evergreen forests along streams and on frin...,...,Mezoneuron cucullatum (Roxb.) Wight & Arn.,,,wfo-0000187461,bkf_original,True,False,False,True,
419,4,Leguminosae-Caesalpinioideae,8 Caesalpinia,Caesalpinia,8.0,Caesalpinia godefroyana Kuntze,Biancaea godefroyana,"EASTERN: Nakhon Ratchasima, Buriram; CENTRAL: ...","Cambodia, S Vietnam (type).","In scrub jungle, arid and sandy soils. Floweri...",...,Biancaea godefroyana (Kuntze) Molinari & Mayta,,,wfo-0000199378,bkf_original,True,False,False,True,
420,4,Leguminosae-Caesalpinioideae,10 Koompassia,Koompassia,10.0,Koompassia malaccensis Maingay ex Benth.,This is currently accepted.,"PENINSULAR: Yala; Trang (Chong). Frequently, o...","Malay Peninsula (type), Sumatra, Borneo.","Lowland evergreen forests, up to 700 m alt., a...",...,,wfo-0000193202,Koompassia malaccensis Maingay,wfo-0000193202,bkf_original,False,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103,10,Moraceae,4. Ficus,Ficus,4.0,URO12. Ficus consociata Blume,This is currently accepted.,"SOUTH-EASTERN: Chanthaburi, Trat; PENINSULAR: ...","Burma, Vietnam, Cambodia, Peninsular and Borne...","Evergreen forests, to 1,000 m alt.",...,,,,wfo-0000687905,bkf_original,False,False,False,False,
1104,10,Moraceae,4. Ficus,Ficus,4.0,URO14. Ficus cucurbitina King,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Narathiwat.","Peninsular and Bornean Malaysia (type), Indone...","Evergreen forests, at low altitudes.",...,,,,wfo-0000687982,bkf_original,False,False,False,False,
1105,10,Moraceae,4. Ficus,Ficus,4.0,URO15. Ficus curtipes Corner,This is currently accepted.,"NORTHERN: Mae Hong Son, Chiang Mai, Lamphun; N...","Sikkim, Bhutan, NE India, Bangladesh (type), B...","Evergreen, dry evergreen and mixed deciduous f...",...,,,,wfo-0000688008,bkf_original,False,False,False,False,
1106,10,Moraceae,4. Ficus,Ficus,4.0,SYC3. Ficus cuspidata Reinw. ex Blume,This is currently accepted.,"PENINSULAR: Phangnga, Nakhon Si Thammarat.","Peninsular Malaysia, Indonesia (type, Sumatra,...","Evergreen forests, 700‚Äì1,200 m alt.",...,,,,wfo-0000688009,bkf_original,False,False,False,False,


‡πÅ‡∏Å‡πâ‡∏™‡πà‡∏ß‡∏ô‡∏ó‡∏µ‡πà‡∏î‡∏∂‡∏á‡∏°‡∏≤‡πÑ‡∏°‡πà‡∏Ñ‡∏£‡∏ö

In [None]:
# -*- coding: utf-8 -*-
import os, re, time, random
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, WebDriverException

# ============== PATHS (‡∏õ‡∏£‡∏±‡∏ö‡πÑ‡∏î‡πâ‡∏ñ‡πâ‡∏≤‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô) =================
# Notebook ‡∏≠‡∏¢‡∏π‡πà‡∏ó‡∏µ‡πà: /Users/.../dsi314/Scripts/
# ‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏•‡∏±‡∏Å‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏≠‡∏±‡∏õ‡πÄ‡∏î‡∏ï‡∏ó‡∏±‡∏ö:
FINAL_PATH = "../Processed data/bkf_full_after_scrape.csv"
# ‡πÑ‡∏ü‡∏•‡πå‡∏ú‡∏• scrape ‡∏£‡∏≤‡∏¢‡πÅ‡∏ñ‡∏ß (‡πÄ‡∏Ñ‡∏¢‡∏£‡∏±‡∏ô‡πÅ‡∏•‡πâ‡∏ß): ‡πÉ‡∏´‡πâ‡πÄ‡∏Å‡πá‡∏ö‡πÑ‡∏ß‡πâ‡∏ó‡∏µ‡πà‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå Processed data ‡πÄ‡∏ä‡πà‡∏ô‡∏Å‡∏±‡∏ô (‡∏à‡∏∞ resume ‡πÑ‡∏î‡πâ)
RESULTS_PATH = "../bkf_wfo_scrape_all_results.csv"
# ‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå log
LOG_DIR = "/Users/thanwaratkeratipasuwat/Desktop/dsi314/results/log"
# ==========================================================

# ===== CONFIG =====
BASE_URL      = "https://www.worldfloraonline.org/taxon/"
MAX_ATTEMPTS  = 4           # retry ‡∏ï‡πà‡∏≠‡∏£‡∏´‡∏±‡∏™
WAIT_SEC      = 20          # ‡∏£‡∏≠‡πÇ‡∏´‡∏•‡∏î element
SAVE_EVERY    = 10          # ‡πÄ‡∏ã‡∏ü‡∏ú‡∏•‡∏ó‡∏∏‡∏Å 10 ‡πÅ‡∏ñ‡∏ß
SLEEP_EACH    = (1.2, 2.5)  # ‡∏´‡∏ô‡πà‡∏ß‡∏á‡∏™‡∏±‡πâ‡∏ô‡∏ï‡πà‡∏≠‡πÅ‡∏ñ‡∏ß
SLEEP_BLOCK   = (6.0, 8.0)  # ‡∏´‡∏ô‡πà‡∏ß‡∏á‡∏¢‡∏≤‡∏ß‡∏ó‡∏∏‡∏Å 20 ‡πÅ‡∏ñ‡∏ß

# ---------- Utils ----------
def ensure_dirs():
    os.makedirs(os.path.dirname(FINAL_PATH), exist_ok=True)
    os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
    os.makedirs(LOG_DIR, exist_ok=True)

def make_driver():
    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--window-size=1280,900")
    opts.add_argument("--lang=en-US")
    opts.add_argument("--disable-features=NetworkService")
    opts.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    d = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    d.set_page_load_timeout(WAIT_SEC + 10)
    try:
        d.execute_cdp_cmd("Page.enable", {})
    except Exception:
        pass
    return d

def safe_quit(drv):
    try:
        drv.quit()
    except Exception:
        pass

def extract_name_from_html(html: str):
    soup = BeautifulSoup(html, "lxml")
    sci = soup.select_one("em[itemprop='scientificName'].taxonName")
    if not sci:
        sci = soup.select_one("em[itemprop='scientificName']") or soup.select_one("em.taxonName")
    auth = soup.select_one("span[itemprop='scientificNameAuthorship']")
    if not sci:
        return None
    sci_name  = sci.get_text(strip=True)
    auth_name = auth.get_text(strip=True) if auth else ""
    full = f"{sci_name} {auth_name}".strip()
    full = re.sub(r"\s+", " ", full).strip()
    return full

# -------- Health check (‡∏Å‡∏±‡∏ô‡πÄ‡∏ß‡πá‡∏ö‡∏•‡πà‡∏°/‡πÄ‡∏ô‡πá‡∏ï‡∏™‡∏∞‡∏î‡∏∏‡∏î) --------
def wfo_is_up(driver, timeout=WAIT_SEC):
    try:
        driver.set_page_load_timeout(timeout)
        driver.get("https://www.worldfloraonline.org/")
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        html = driver.page_source or ""
        return len(html) > 500
    except Exception:
        return False
    finally:
        driver.set_page_load_timeout(WAIT_SEC + 10)

# -------- Scrape 1 ‡∏£‡∏´‡∏±‡∏™ (‡∏ó‡∏ô‡∏ó‡∏≤‡∏ô) --------
def scrape_one_wfo(driver, wfo_id: str):
    url = BASE_URL + str(wfo_id).strip()
    attempt, last_err = 0, "unknown_error"
    while attempt < MAX_ATTEMPTS:
        attempt += 1
        try:
            if not wfo_is_up(driver):
                time.sleep(10 + random.uniform(0.5, 1.5))
                continue
            driver.get(url)
            WebDriverWait(driver, WAIT_SEC).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "em[itemprop='scientificName'], em.taxonName, body"))
            )
            time.sleep(0.8)
            full_name = extract_name_from_html(driver.page_source)
            if full_name:
                return full_name, "ok"
            # ‡πÄ‡∏Å‡πá‡∏ö‡∏™‡∏Å‡∏£‡∏µ‡∏ô‡∏ä‡πá‡∏≠‡∏ï‡πÑ‡∏ß‡πâ‡∏î‡∏π
            try:
                driver.save_screenshot(os.path.join(LOG_DIR, f"wfo_{wfo_id}_no_name.png"))
            except Exception:
                pass
            raise TimeoutException("no_scientificName_tag")
        except (TimeoutException, WebDriverException) as e:
            last_err = f"retry_{attempt}: {e}"
            try:
                driver.save_screenshot(os.path.join(LOG_DIR, f"wfo_{wfo_id}_try{attempt}.png"))
            except Exception:
                pass
            backoff = min(5.0 * attempt, 20.0) + random.uniform(0.5, 1.5)
            if any(tok in str(e) for tok in ["ERR_SOCKET", "ERR_CONNEC", "ERR_TIMED_OUT", "ERR_FAILED"]):
                backoff += 5.0
            time.sleep(backoff)
            # ‡∏£‡∏µ‡∏™‡∏ï‡∏≤‡∏£‡πå‡∏ó driver ‡∏Å‡∏•‡∏≤‡∏á‡∏ó‡∏≤‡∏á‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡∏ó‡∏µ‡πà 2 ‡πÅ‡∏•‡∏∞‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢
            if attempt in (2, MAX_ATTEMPTS):
                safe_quit(driver)
                time.sleep(1.0)
                driver = make_driver()
                time.sleep(2.0 + random.uniform(0.5, 1.5))
            continue
        except Exception as e:
            last_err = f"error: {e}"
            break
    return None, last_err

# ----------------- MAIN -----------------
ensure_dirs()

# 1) ‡πÇ‡∏´‡∏•‡∏î‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏•‡∏±‡∏Å (bkf_full_after_scrape.csv)
if not os.path.exists(FINAL_PATH):
    raise FileNotFoundError(f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏•‡∏±‡∏Å: {FINAL_PATH}")

missing_full_bkf = pd.read_csv(FINAL_PATH)

# 2) ‡πÇ‡∏´‡∏•‡∏î‡∏ú‡∏•‡πÄ‡∏Å‡πà‡∏≤ (bkf_wfo_scrape_all_results.csv) ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ
if os.path.exists(RESULTS_PATH):
    prev_results = pd.read_csv(RESULTS_PATH)
    # ensure cols
    if "wfo_id_final" not in prev_results.columns:
        # ‡πÄ‡∏ú‡∏∑‡πà‡∏≠‡πÄ‡∏Ñ‡∏¢‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÉ‡∏ä‡πâ‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏≠‡∏∑‡πà‡∏ô
        raise ValueError("RESULTS_PATH ‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå 'wfo_id_final' ‡πÅ‡∏•‡∏∞ 'scrape_status'")
    # ‡πÄ‡∏≠‡∏≤‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ó‡∏µ‡πà status != ok
    need_ids = prev_results.loc[prev_results["scrape_status"].astype(str).str.lower() != "ok", "wfo_id_final"].dropna().astype(str).str.strip().unique().tolist()
else:
    prev_results = None
    need_ids = []

# 3) ‡∏ô‡∏¥‡∏¢‡∏≤‡∏° target
if need_ids:
    # ‡πÉ‡∏ä‡πâ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏ó‡∏µ‡πà‡πÄ‡∏Ñ‡∏¢ fail
    mask_target = missing_full_bkf["wfo_id_final"].astype(str).str.strip().isin(need_ids)
else:
    # fallback: ‡πÄ‡∏≠‡∏≤‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏ä‡∏∑‡πà‡∏≠‡πÄ‡∏ï‡πá‡∏°
    mask_target = (
        (missing_full_bkf["wfo_id_final"].astype(str).str.strip() != "") &
        (missing_full_bkf["wfo_full_name_final"].isna() |
         (missing_full_bkf["wfo_full_name_final"].astype(str).str.strip() == ""))
    )

targets = missing_full_bkf[mask_target].copy()
targets = targets.reset_index(drop=False).rename(columns={"index":"orig_index"})
# ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ú‡∏• (‡∏£‡∏≠‡∏á‡∏£‡∏±‡∏ö resume)
if "scraped_name" not in targets.columns:
    targets["scraped_name"] = ""
if "scrape_status" not in targets.columns:
    targets["scrape_status"] = ""

print(f"ü™¥ Total to (re)scrape: {len(targets)} rows")

# 4) ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏° driver ‡πÅ‡∏•‡∏∞‡∏£‡∏±‡∏ô
driver = make_driver()
bad_rows = []
done = 0

try:
    for i, row in targets.iterrows():
        idx_global = int(row["orig_index"])
        wfo_id = str(row["wfo_id_final"]).strip() if "wfo_id_final" in row else ""

        print(f"‚Üí Row {idx_global}: fetching {wfo_id} ...")
        if not wfo_id:
            targets.at[i, "scrape_status"] = "empty_wfo_id"
            bad_rows.append({"row": idx_global, "wfo_id": "", "status": "empty_wfo_id"})
            continue

        full_name, status = scrape_one_wfo(driver, wfo_id)
        targets.at[i, "scraped_name"] = full_name if full_name else ""
        targets.at[i, "scrape_status"] = status

        if status == "ok" and full_name:
            # ‡∏≠‡∏±‡∏õ‡πÄ‡∏î‡∏ï‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏•‡∏±‡∏Å‡πÉ‡∏ô‡∏´‡∏ô‡πà‡∏ß‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡∏à‡∏≥
            missing_full_bkf.loc[missing_full_bkf["wfo_id_final"] == wfo_id, "wfo_full_name_final"] = full_name
        else:
            bad_rows.append({"row": idx_global, "wfo_id": wfo_id, "status": status})

        # ‡∏´‡∏ô‡πà‡∏ß‡∏á
        time.sleep(random.uniform(*SLEEP_EACH))
        if (i+1) % 20 == 0:
            time.sleep(random.uniform(*SLEEP_BLOCK))

        done += 1
        if (done % SAVE_EVERY == 0) or (i+1 == len(targets)):
            # 5) ‡πÄ‡∏ã‡∏ü progress ‚Üí ‡∏ó‡∏±‡πâ‡∏á‡πÑ‡∏ü‡∏•‡πå‡∏ú‡∏•‡∏£‡∏ß‡∏° ‡πÅ‡∏•‡∏∞‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏•‡∏±‡∏Å
            try:
                # ‡∏£‡∏ß‡∏°‡∏ú‡∏•‡πÉ‡∏´‡∏°‡πà‡∏Å‡∏±‡∏ö‡∏ú‡∏•‡πÄ‡∏Å‡πà‡∏≤ (‡∏ñ‡πâ‡∏≤‡∏°‡∏µ) ‡πÇ‡∏î‡∏¢‡∏≠‡∏¥‡∏á wfo_id_final
                if prev_results is not None:
                    # drop duplicate key ‡πÅ‡∏•‡πâ‡∏ß concat
                    keep_cols = ["wfo_id_final", "scraped_name", "scrape_status"]
                    new_part = targets[keep_cols].copy()
                    # ‡∏•‡∏ö‡∏ã‡πâ‡∏≥‡πÉ‡∏ô prev ‡∏ï‡∏≤‡∏° key
                    prev_results = prev_results.drop_duplicates(subset=["wfo_id_final"], keep="last")
                    new_part   = new_part.drop_duplicates(subset=["wfo_id_final"], keep="last")
                    merged = pd.concat([prev_results[keep_cols], new_part], ignore_index=True)
                    merged = merged.drop_duplicates(subset=["wfo_id_final"], keep="last")
                    merged.to_csv(RESULTS_PATH, index=False, encoding="utf-8-sig")
                else:
                    # ‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡πÄ‡∏Ñ‡∏¢‡∏°‡∏µ‡∏ú‡∏•‡πÄ‡∏Å‡πà‡∏≤ ‚Üí ‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÉ‡∏´‡∏°‡πà‡∏à‡∏≤‡∏Å targets
                    targets[["wfo_id_final","scraped_name","scrape_status"]].drop_duplicates(subset=["wfo_id_final"]).to_csv(
                        RESULTS_PATH, index=False, encoding="utf-8-sig"
                    )

                # ‡πÄ‡∏ã‡∏ü‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏•‡∏±‡∏Å‡∏ó‡∏±‡∏ö
                missing_full_bkf.to_csv(FINAL_PATH, index=False, encoding="utf-8-sig")

                # ‡πÄ‡∏ã‡∏ü bad rows log
                if bad_rows:
                    pd.DataFrame(bad_rows).to_csv(os.path.join(LOG_DIR, "bkf_bad_rows_rescrape.csv"), index=False)

                print(f"üíæ Progress saved ‚Üí\n  - {RESULTS_PATH}\n  - {FINAL_PATH}\n  ({done}/{len(targets)})")
            except Exception as e:
                print("‚ö†Ô∏è Save error:", e)

finally:
    safe_quit(driver)

print("‚úÖ DONE rescraping failed rows.")
if bad_rows:
    print(f"‚ö†Ô∏è ‡∏¢‡∏±‡∏á‡∏°‡∏µ {len(bad_rows)} ‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏î‡∏∂‡∏á‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ ‚Äî ‡∏î‡∏π {os.path.join(LOG_DIR, 'bkf_bad_rows_rescrape.csv')}")
else:
    print("üéâ ‡∏î‡∏∂‡∏á‡∏Ñ‡∏£‡∏ö‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡πÅ‡∏•‡πâ‡∏ß")

## Candidates Clean

In [2]:
import pandas as pd

In [5]:
df_cleaned = pd.read_csv("../Raw data/bkf_wfo_priority_bkf_then_output_then_unique_cand.csv")
df_cleaned.head()

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,family_url,genus_url,species_url,scraped_at,specific_name,wfo_id,wfo_full_name,wfo_source
0,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum siamense (Craib) Tardieu,This is currently accepted.,"PENINSULAR: Nakhon Si Thammarat, Songkhla (type).","Known from 3 localities: the type-locality, an...","In small mats on damp sandy ground, edge of ma...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:33.566587+00:00,Myriophyllum siamense (Craib) Tardieu,wfo-0000373567,Myriophyllum siamense (Craib) Tardieu,output7_exact
1,2,Haloragaceae,1 Myriophyllum,Myriophyllum,1.0,Myriophyllum tetrandrum Roxb.,This is currently accepted.,SOUTH-WESTERN: Prachuap Khiri Khan (Bang Sapha...,"E India (type), Indochina, Malay Peninsula.","In rather shallow, open water of ditches, cana...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:36.633398+00:00,Myriophyllum tetrandrum Roxb.,wfo-0001261381,Myriophyllum tetrandrum Roxb.,bkf_original
2,2,Haloragaceae,2 Haloragis,Haloragis,2.0,Haloragis micrantha (Thunb.) R.Br. ex Sieb. & ...,Gonocarpus micranthus,NORTH-EASTERN: Loei (Phu Kradueng).,"India, S & E China, N Vietnam, Hainan, Formosa...","In marshy mountain turf, moist places along mo...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:17:50.049475+00:00,Gonocarpus micranthus,wfo-0000715064,,bkf_original
3,2,Rhizophoraceae,1 Rhizophora,Rhizophora,1.0,Rhizophora apiculata Blume,This is currently accepted.,CENTRAL: Chon Buri (Si Racha); SOUTH-EASTERN: ...,In tropical SE Asia throughout Malesia (type) ...,Mangrove forests.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:18:15.874773+00:00,Rhizophora apiculata Blume,wfo-0001131596,Rhizophora apiculata Blume,bkf_original
4,2,Rhizophoraceae,1 Rhizophora,Rhizophora,1.0,Rhizophora mucronata Poir.,This is currently accepted.,SOUTH-EASTERN: Chanthaburi; SOUTH-WESTERN: Pra...,"In the Old World tropics, occurring from the c...",Mangrove forests.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:18:19.615290+00:00,Rhizophora mucronata Poir.,wfo-0001131556,Rhizophora mucronata Poir.,output7_exact


#### Manual Check WFO-ID of Those Flowers

In [8]:
import os
import pandas as pd

# ‡πÄ‡∏£‡∏≤‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ô‡πÇ‡∏ü‡∏•‡πÄ‡∏î‡∏≠‡∏£‡πå scripts/
# ‡∏î‡∏±‡∏á‡∏ô‡∏±‡πâ‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏¢‡πâ‡∏≠‡∏ô‡∏Ç‡∏∂‡πâ‡∏ô‡πÑ‡∏õ‡∏´‡∏ô‡∏∂‡πà‡∏á‡∏£‡∏∞‡∏î‡∏±‡∏ö‡πÑ‡∏õ‡∏´‡∏≤ data_processed/
save_dir = "../Processed data"

# ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏°‡∏≤‡∏à‡∏≤‡∏Å candidates_unique
df_unican = df_cleaned[df_cleaned['wfo_source'] == 'candidates_unique']

# ‡∏Å‡∏≥‡∏´‡∏ô‡∏î path ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÑ‡∏ü‡∏•‡πå
save_path = os.path.join(save_dir, "check_unicandidates.csv")

# ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÑ‡∏ü‡∏•‡πå
df_unican.to_csv(save_path, index=False, encoding='utf-8-sig')

print(f"‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢‡∏ó‡∏µ‡πà: {os.path.abspath(save_path)}")

‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢‡∏ó‡∏µ‡πà: /Users/thanwaratkeratipasuwat/Desktop/dsi314/Processed data/check_unicandidates.csv


In [11]:
df_unican.head()

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,family_url,genus_url,species_url,scraped_at,specific_name,wfo_id,wfo_full_name,wfo_source
5,2,Rhizophoraceae,2 Bruguiera,Bruguiera,2.0,Bruguiera gymnorhiza (L.) Savigny,This is currently accepted.,"SOUTH-EASTERN: Chanthaburi, Trat; PENINSULAR: ...","In the Old World tropics, from S & E Africa th...",Mangrove forests.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:18:33.101996+00:00,Bruguiera gymnorhiza (L.) Savigny,wfo-0000572747,,candidates_unique
11,2,Rhizophoraceae,3 Ceriops,Ceriops,3.0,Ceriops decandra (Griff.) Ding Hou,This is currently accepted.,"SOUTH-EASTERN: Chon Buri (Si Racha), Chathabur...","India (type), Burma, Cambodia, S Vietnam, Male...",Mangrove forests.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:19:14.152028+00:00,Ceriops decandra (Griff.) Ding Hou,wfo-0000597936,,candidates_unique
134,2,Connaraceae,4 Rourea (in part),Rourea (in part),4.0,Rourea sp.,This is currently accepted.,SOUTH-EASTERN: Chanthaburi.,,Evergreen forests. Fruiting April‚ÄìJune.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:36:30.322756+00:00,Rourea sp.,wfo-4000033545,,candidates_unique
145,2,Connaraceae,6 Connarus,Connarus,6.0,Connarus monocarpus L.,This is currently accepted.,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:37:53.618260+00:00,Connarus monocarpus L.,wfo-0000618164,,candidates_unique
148,2,Apostasiaceae,1 Neuwiedia,Neuwiedia,1.0,Neuwiedia zollingeri Reichb.f.,This is currently accepted.,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-19T18:38:43.553646+00:00,Neuwiedia zollingeri Reichb.f.,wfo-0000251829,,candidates_unique


In [None]:
# -*- coding: utf-8 -*-
import os, re, time, random
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, WebDriverException

# ======================================================
# ‡∏ï‡∏±‡πâ‡∏á‡∏Ñ‡πà‡∏≤‡∏û‡∏≤‡∏ò‡πÄ‡∏≠‡∏≤‡∏ï‡πå‡∏û‡∏∏‡∏ï (‡∏õ‡∏£‡∏±‡∏ö‡πÑ‡∏î‡πâ‡∏ï‡∏≤‡∏°‡πÇ‡∏Ñ‡∏£‡∏á‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÇ‡∏õ‡∏£‡πÄ‡∏à‡∏Å‡∏ï‡πå)
OUTPUT_PATH = "../data_processed/unican_fillname.csv"
# ‡∏ï‡∏±‡πâ‡∏á‡∏Ñ‡πà‡∏≤‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏á‡∏≤‡∏ô
BASE_URL     = "https://www.worldfloraonline.org/taxon/"
MAX_ATTEMPTS = 4
WAIT_SEC     = 15
SAVE_EVERY   = 10   # ‡πÄ‡∏ã‡∏ü‡∏ó‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏ó‡∏∏‡∏Å ‡πÜ 10 ‡πÅ‡∏ñ‡∏ß
# ======================================================

# ---- df_unican ‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå 'wfo_id', 'wfo_full_name'
# ‡∏™‡∏°‡∏°‡∏ï‡∏¥ df_unican ‡∏°‡∏µ‡∏≠‡∏¢‡∏π‡πà‡πÅ‡∏•‡πâ‡∏ß‡πÉ‡∏ô‡∏´‡∏ô‡πà‡∏ß‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡∏à‡∏≥
assert {"wfo_id", "wfo_full_name"}.issubset(df_unican.columns), "df_unican ‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå wfo_id ‡πÅ‡∏•‡∏∞ wfo_full_name"

# ===== Chrome Options =====
chrome_opts = Options()
chrome_opts.add_argument("--headless=new")
chrome_opts.add_argument("--no-sandbox")
chrome_opts.add_argument("--disable-dev-shm-usage")
chrome_opts.add_argument("--disable-gpu")
chrome_opts.add_argument("--disable-blink-features=AutomationControlled")
chrome_opts.add_argument("--window-size=1280,900")
chrome_opts.add_argument("--lang=en-US")
chrome_opts.add_argument("--disable-features=NetworkService")
chrome_opts.add_argument(
    "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_opts)
driver.set_page_load_timeout(WAIT_SEC + 5)

def extract_name_from_html(html: str):
    soup = BeautifulSoup(html, "lxml")
    sci = soup.select_one("em[itemprop='scientificName'].taxonName")
    if not sci:
        sci = soup.select_one("em[itemprop='scientificName']") or soup.select_one("em.taxonName")
    auth = soup.select_one("span[itemprop='scientificNameAuthorship']")
    if not sci:
        return None
    sci_name  = sci.get_text(strip=True)
    auth_name = auth.get_text(strip=True) if auth else ""
    full = f"{sci_name} {auth_name}".strip()
    full = re.sub(r"\s+", " ", full).strip()
    return full

def scrape_one_wfo(wfo_id: str):
    url = BASE_URL + str(wfo_id).strip()
    attempt, last_err = 0, "unknown_error"
    while attempt < MAX_ATTEMPTS:
        attempt += 1
        try:
            driver.get(url)
            WebDriverWait(driver, WAIT_SEC).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "em[itemprop='scientificName'], em.taxonName"))
            )
            time.sleep(0.8)
            full_name = extract_name_from_html(driver.page_source)
            if full_name:
                return full_name, "ok"
            raise TimeoutException("no_scientificName_tag")
        except (TimeoutException, WebDriverException) as e:
            last_err = f"retry_{attempt}: {e}"
            backoff = min(2.0 * attempt, 6.0) + random.uniform(0.5, 1.5)
            if any(err in str(e) for err in ["ERR_CONNEC", "ERR_TIMED_OUT", "ERR_FAILED"]):
                backoff += 2.0
            time.sleep(backoff)
        except Exception as e:
            last_err = f"error: {e}"
            break
    return None, last_err

# ====== ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏° DataFrame ‡πÅ‡∏•‡∏∞ resume ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏¥‡∏° ======
df_unican["wfo_id"] = df_unican["wfo_id"].astype(str).str.strip()
if "wfo_full_name" not in df_unican.columns:
    df_unican["wfo_full_name"] = ""

if os.path.exists(OUTPUT_PATH):
    prev = pd.read_csv(OUTPUT_PATH)
    # ‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡πÄ‡∏Ñ‡∏¢‡πÄ‡∏ï‡∏¥‡∏°‡∏ä‡∏∑‡πà‡∏≠‡πÅ‡∏•‡πâ‡∏ß‡∏°‡∏≤‡∏ó‡∏±‡∏ö (‡∏≠‡∏¥‡∏á wfo_id)
    prev = prev.drop_duplicates(subset=["wfo_id"], keep="last")
    df_unican = (
        df_unican.drop_duplicates(subset=["wfo_id"], keep="last")
        .merge(prev[["wfo_id", "wfo_full_name"]], on="wfo_id", how="left", suffixes=("", "_prev"))
    )
    df_unican["wfo_full_name"] = df_unican.apply(
        lambda r: r["wfo_full_name"] if isinstance(r["wfo_full_name"], str) and r["wfo_full_name"].strip()
        else (r["wfo_full_name_prev"] if isinstance(r["wfo_full_name_prev"], str) else ""),
        axis=1
    )
    df_unican = df_unican.drop(columns=["wfo_full_name_prev"])

# ====== ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢‡∏ó‡∏µ‡πà‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏ä‡∏∑‡πà‡∏≠‡πÄ‡∏ï‡πá‡∏° ======
mask = (df_unican["wfo_id"] != "") & (
    df_unican["wfo_full_name"].isna() | (df_unican["wfo_full_name"].astype(str).str.strip() == "")
)
targets_idx = list(df_unican[mask].index)
print(f"ü™¥ Total to fill: {len(targets_idx)} rows")

# ====== ‡∏£‡∏±‡∏ô‡∏à‡∏£‡∏¥‡∏á‡πÅ‡∏•‡∏∞‡πÄ‡∏ã‡∏ü‡∏ó‡∏∏‡∏Å ‡πÜ 10 ‡πÅ‡∏ñ‡∏ß ======
done = 0
try:
    for i, idx in enumerate(targets_idx, start=1):
        wfo_id = df_unican.at[idx, "wfo_id"]
        if not wfo_id:
            continue

        full_name, status = scrape_one_wfo(wfo_id)
        if status == "ok" and full_name:
            df_unican.at[idx, "wfo_full_name"] = full_name

        time.sleep(random.uniform(1.0, 2.2))
        done += 1

        if (done % SAVE_EVERY == 0) or (i == len(targets_idx)):
            os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
            df_unican.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")
            print(f"üíæ Saved ‚Üí {OUTPUT_PATH}  ({done}/{len(targets_idx)})")
finally:
    try:
        driver.quit()
    except Exception:
        pass

print("‚úÖ DONE. Final file:", os.path.abspath(OUTPUT_PATH))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unican["wfo_id"] = df_unican["wfo_id"].astype(str).str.strip()


ü™¥ Total to fill: 74 rows
üíæ Saved ‚Üí ../data_processed/unican_fillname.csv  (10/74)
üíæ Saved ‚Üí ../data_processed/unican_fillname.csv  (20/74)
üíæ Saved ‚Üí ../data_processed/unican_fillname.csv  (30/74)
üíæ Saved ‚Üí ../data_processed/unican_fillname.csv  (40/74)
üíæ Saved ‚Üí ../data_processed/unican_fillname.csv  (50/74)
üíæ Saved ‚Üí ../data_processed/unican_fillname.csv  (60/74)
üíæ Saved ‚Üí ../data_processed/unican_fillname.csv  (70/74)
üíæ Saved ‚Üí ../data_processed/unican_fillname.csv  (74/74)
‚úÖ DONE. Final file: /Users/thanwaratkeratipasuwat/Desktop/dsi314/data_processed/unican_fillname.csv


#### ‡πÄ‡∏ô‡∏∑‡πà‡∏≠‡∏á‡∏à‡∏≤‡∏Å‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡πÄ‡∏Å‡πá‡∏ö‡∏°‡∏≤‡πÑ‡∏°‡πà‡∏´‡∏°‡∏î ‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ‡πÄ‡∏Å‡πá‡∏ö‡πÉ‡∏´‡∏°‡πà

In [23]:
# -*- coding: utf-8 -*-
import os, re, time, random
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, WebDriverException

# ======================================================
# PATH & CONFIG (‡∏õ‡∏£‡∏±‡∏ö‡πÑ‡∏î‡πâ)
OUTPUT_PATH     = "../Processed data/unican_fillname.csv"
LOG_DIR         = "../results/log"
BASE_URL        = "https://www.worldfloraonline.org/taxon/"
MAX_ATTEMPTS    = 4
WAIT_SEC        = 20   # ‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ß‡∏•‡∏≤‡∏£‡∏≠‡πÉ‡∏´‡πâ‡∏ó‡∏ô‡πÄ‡∏ô‡πá‡∏ï‡∏ä‡πâ‡∏≤‡∏Ç‡∏∂‡πâ‡∏ô
SAVE_EVERY      = 10   # ‡πÄ‡∏ã‡∏ü‡∏ó‡∏±‡∏ö‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏µ‡∏¢‡∏ß‡∏ó‡∏∏‡∏Å ‡πÜ 10 ‡πÅ‡∏ñ‡∏ß
START_FROM_ROW  = None # ‡πÄ‡∏ä‡πà‡∏ô 34 ‡∏à‡∏∞‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ó‡∏≥‡∏ï‡∏±‡πâ‡∏á‡πÅ‡∏ï‡πà‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà 34 ‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡πâ‡∏ô‡πÑ‡∏õ
SLEEP_EACH_MIN  = (1.0, 2.2)  # ‡∏û‡∏±‡∏Å‡∏™‡∏±‡πâ‡∏ô‡πÅ‡∏ï‡πà‡∏•‡∏∞‡πÅ‡∏ñ‡∏ß
SLEEP_EACH_20   = (6.0, 8.0)  # ‡∏û‡∏±‡∏Å‡∏¢‡∏≤‡∏ß‡∏ó‡∏∏‡∏Å‡πÜ 20 ‡πÅ‡∏ñ‡∏ß
# ======================================================

# ---- df_unican ‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå 'wfo_id', 'wfo_full_name'
assert {"wfo_id", "wfo_full_name"}.issubset(df_unican.columns), "df_unican ‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå wfo_id ‡πÅ‡∏•‡∏∞ wfo_full_name"

# ===== Utilities =====
def ensure_dirs():
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    os.makedirs(LOG_DIR, exist_ok=True)

def make_driver():
    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--window-size=1280,900")
    opts.add_argument("--lang=en-US")
    opts.add_argument("--disable-features=NetworkService")
    opts.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    d = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    d.set_page_load_timeout(WAIT_SEC + 10)
    try:
        d.execute_cdp_cmd("Page.enable", {})
    except Exception:
        pass
    return d

def safe_quit(drv):
    try:
        drv.quit()
    except Exception:
        pass

def extract_name_from_html(html: str):
    soup = BeautifulSoup(html, "lxml")
    sci = soup.select_one("em[itemprop='scientificName'].taxonName")
    if not sci:
        sci = soup.select_one("em[itemprop='scientificName']") or soup.select_one("em.taxonName")
    auth = soup.select_one("span[itemprop='scientificNameAuthorship']")
    if not sci:
        return None
    sci_name  = sci.get_text(strip=True)
    auth_name = auth.get_text(strip=True) if auth else ""
    full = f"{sci_name} {auth_name}".strip()
    full = re.sub(r"\s+", " ", full).strip()
    return full

# ===== Driver (‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ï‡∏±‡∏ß‡πÅ‡∏£‡∏Å‡πÑ‡∏ß‡πâ‡∏Å‡πà‡∏≠‡∏ô) =====
ensure_dirs()
driver = make_driver()

def scrape_one_wfo(wfo_id: str):
    """‡πÇ‡∏´‡∏•‡∏î‡∏´‡∏ô‡πâ‡∏≤ WFO, ‡∏Ñ‡∏∑‡∏ô (full_name, status='ok') ‡∏´‡∏£‡∏∑‡∏≠ (None, 'retry_.../error')"""
    global driver
    url = BASE_URL + str(wfo_id).strip()
    attempt, last_err = 0, "unknown_error"
    while attempt < MAX_ATTEMPTS:
        attempt += 1
        try:
            driver.get(url)
            WebDriverWait(driver, WAIT_SEC).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "em[itemprop='scientificName'], em.taxonName"))
            )
            time.sleep(0.8)
            full_name = extract_name_from_html(driver.page_source)
            if full_name:
                return full_name, "ok"
            raise TimeoutException("no_scientificName_tag")
        except (TimeoutException, WebDriverException) as e:
            last_err = f"retry_{attempt}: {e}"
            # ‡πÄ‡∏ã‡∏ü‡∏™‡∏Å‡∏£‡∏µ‡∏ô‡∏ä‡πá‡∏≠‡∏ï‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏µ‡∏ö‡∏±‡∏Å
            try:
                driver.save_screenshot(os.path.join(LOG_DIR, f"wfo_{wfo_id}_try{attempt}.png"))
            except Exception:
                pass
            # backoff ‡∏´‡∏ô‡∏±‡∏Å‡∏Ç‡∏∂‡πâ‡∏ô‡∏ñ‡πâ‡∏≤‡πÄ‡∏õ‡πá‡∏ô network error
            backoff = min(3.0 * attempt, 12.0) + random.uniform(0.5, 1.5)
            if any(err in str(e) for err in ["ERR_CONNEC", "ERR_TIMED_OUT", "ERR_FAILED"]):
                backoff += 3.0
            time.sleep(backoff)

            # ‡∏£‡∏µ‡∏™‡∏ï‡∏≤‡∏£‡πå‡∏ó‡πÑ‡∏î‡∏£‡πÄ‡∏ß‡∏≠‡∏£‡πå‡∏Å‡∏•‡∏≤‡∏á‡∏ó‡∏≤‡∏á‡∏Å‡∏±‡∏ô session ‡∏Ñ‡πâ‡∏≤‡∏á
            if attempt in (3, MAX_ATTEMPTS):
                safe_quit(driver)
                time.sleep(1.0)
                driver = make_driver()
                time.sleep(2.0 + random.uniform(0.5, 1.5))
            continue
        except Exception as e:
            last_err = f"error: {e}"
            break
    return None, last_err

# ====== ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏° DataFrame ‡πÅ‡∏•‡∏∞ resume ‡∏ñ‡πâ‡∏≤‡∏°‡∏µ‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏¥‡∏° ======
df_unican = df_unican.copy()
df_unican["wfo_id"] = df_unican["wfo_id"].astype(str).str.strip()
if "wfo_full_name" not in df_unican.columns:
    df_unican["wfo_full_name"] = ""

if os.path.exists(OUTPUT_PATH):
    prev = pd.read_csv(OUTPUT_PATH)
    prev = prev.drop_duplicates(subset=["wfo_id"], keep="last")
    df_unican = (
        df_unican.drop_duplicates(subset=["wfo_id"], keep="last")
        .merge(prev[["wfo_id", "wfo_full_name"]], on="wfo_id", how="left", suffixes=("", "_prev"))
    )

    def _pick(a, b):
        a = a if isinstance(a, str) else ("" if pd.isna(a) else str(a))
        b = b if isinstance(b, str) else ("" if pd.isna(b) else str(b))
        return a.strip() if a.strip() else b.strip()

    df_unican["wfo_full_name"] = df_unican.apply(lambda r: _pick(r["wfo_full_name"], r["wfo_full_name_prev"]), axis=1)
    df_unican = df_unican.drop(columns=["wfo_full_name_prev"])

# ====== ‡∏´‡∏≤ row ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ï‡πâ‡∏ô‡∏≠‡∏±‡∏ï‡πÇ‡∏ô‡∏°‡∏±‡∏ï‡∏¥‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏Å‡∏≥‡∏´‡∏ô‡∏î ======
if START_FROM_ROW is None:
    null_mask = df_unican["wfo_full_name"].isna() | (df_unican["wfo_full_name"].astype(str).str.strip() == "")
    null_indices = list(df_unican[null_mask].index)
    START_FROM_ROW = (null_indices[0] if null_indices else len(df_unican))

print(f"‚ñ∂Ô∏è START_FROM_ROW = {START_FROM_ROW}")

# ====== ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢: ‡∏ß‡πà‡∏≤‡∏á ‡πÅ‡∏•‡∏∞ index >= START_FROM_ROW ======
mask_null = df_unican["wfo_full_name"].isna() | (df_unican["wfo_full_name"].astype(str).str.strip() == "")
mask_idx  = df_unican.index >= START_FROM_ROW
targets_idx = list(df_unican[mask_null & mask_idx].index)
targets_idx.sort()
print(f"ü™¥ Total to (re)fill: {len(targets_idx)} rows")

# ====== ‡∏£‡∏±‡∏ô‡∏à‡∏£‡∏¥‡∏á‡πÅ‡∏•‡∏∞‡πÄ‡∏ã‡∏ü‡∏ó‡∏∏‡∏Å ‡πÜ 10 ‡πÅ‡∏ñ‡∏ß ======
bad_rows = []
done = 0
try:
    for i, idx in enumerate(targets_idx, start=1):
        wfo_id = df_unican.at[idx, "wfo_id"]
        print(f"‚Üí Row {idx}: fetching {wfo_id} ...")
        if not wfo_id:
            bad_rows.append({"row": int(idx), "wfo_id": "", "status": "empty_wfo_id"})
            continue

        full_name, status = scrape_one_wfo(wfo_id)
        if status == "ok" and full_name:
            df_unican.at[idx, "wfo_full_name"] = full_name
            print(f"   ‚úì {full_name}")
        else:
            print(f"   ‚ö†Ô∏è {status}")
            bad_rows.append({"row": int(idx), "wfo_id": wfo_id, "status": status})

        # ‡∏û‡∏±‡∏Å‡∏™‡∏±‡πâ‡∏ô‡∏ó‡∏∏‡∏Å‡πÅ‡∏ñ‡∏ß
        time.sleep(random.uniform(*SLEEP_EACH_MIN))
        # ‡∏û‡∏±‡∏Å‡∏¢‡∏≤‡∏ß‡∏ó‡∏∏‡∏Å ‡πÜ 20 ‡πÅ‡∏ñ‡∏ß ‡∏Å‡∏±‡∏ô rate-limit
        if i % 20 == 0:
            time.sleep(random.uniform(*SLEEP_EACH_20))

        done += 1
        if (done % SAVE_EVERY == 0) or (i == len(targets_idx)):
            ensure_dirs()
            df_unican.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")
            print(f"üíæ Saved ‚Üí {OUTPUT_PATH}  ({done}/{len(targets_idx)})")
            if bad_rows:
                pd.DataFrame(bad_rows).to_csv(os.path.join(LOG_DIR, "unican_bad_rows.csv"), index=False)
finally:
    safe_quit(driver)

if bad_rows:
    print(f"‚ö†Ô∏è ‡∏°‡∏µ {len(bad_rows)} ‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏î‡∏∂‡∏á‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ ‡∏î‡∏π‡πÑ‡∏ü‡∏•‡πå {os.path.join(LOG_DIR, 'unican_bad_rows.csv')}")
else:
    print("‚úÖ ‡∏î‡∏∂‡∏á‡∏Ñ‡∏£‡∏ö‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î")

print("üìÑ Final file:", os.path.abspath(OUTPUT_PATH))

‚ñ∂Ô∏è START_FROM_ROW = 63
ü™¥ Total to (re)fill: 10 rows
‚Üí Row 63: fetching wfo-0000981823 ...
   ‚úì Hetaeria anomala Lindl.
‚Üí Row 64: fetching wfo-0000981817 ...
   ‚úì Hetaeria affinis (Griff.) Seidenf. & Ormerod
‚Üí Row 65: fetching wfo-0000296473 ...
   ‚úì Rhomboda moulmeinensis (C.S.P.Parish & Rchb.f.) Ormerod
‚Üí Row 66: fetching wfo-0000335342 ...
   ‚úì Zeuxine nervosa (Wall. ex Lindl.) Benth. ex Trimen
‚Üí Row 67: fetching wfo-0000335291 ...
   ‚úì Zeuxine flava (Wall. ex Lindl.) Trimen
‚Üí Row 68: fetching wfo-0000667948 ...
   ‚úì Engelhardia spicata Lechen ex Blume
‚Üí Row 69: fetching wfo-4000030761 ...
   ‚úì Polyosma Blume
‚Üí Row 70: fetching wfo-4000011405 ...
   ‚úì Dichapetalum Thouars
‚Üí Row 71: fetching wfo-0000510878 ...
   ‚úì Abelmoschus manihot (L.) Medik.
‚Üí Row 72: fetching wfo-0000774811 ...
   ‚úì Barringtonia acutangula Gaertn.
üíæ Saved ‚Üí ../Processed data/unican_fillname.csv  (10/10)
‚úÖ ‡∏î‡∏∂‡∏á‡∏Ñ‡∏£‡∏ö‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î
üìÑ Final fil

Move unican_fillname.csv to Processed data Folder

In [24]:
df_unican

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,family_url,genus_url,species_url,scraped_at,specific_name,wfo_id,wfo_full_name,wfo_source
0,2,Rhizophoraceae,2 Bruguiera,Bruguiera,2.0,Bruguiera gymnorhiza (L.) Savigny,This is currently accepted.,"SOUTH-EASTERN: Chanthaburi, Trat; PENINSULAR: ...","In the Old World tropics, from S & E Africa th...",Mangrove forests.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:18:33.101996+00:00,Bruguiera gymnorhiza (L.) Savigny,wfo-0000572747,Bruguiera gymnorhiza (L.) Lam. ex Savigny,candidates_unique
1,2,Rhizophoraceae,3 Ceriops,Ceriops,3.0,Ceriops decandra (Griff.) Ding Hou,This is currently accepted.,"SOUTH-EASTERN: Chon Buri (Si Racha), Chathabur...","India (type), Burma, Cambodia, S Vietnam, Male...",Mangrove forests.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:19:14.152028+00:00,Ceriops decandra (Griff.) Ding Hou,wfo-0000597936,Ceriops decandra (Griff.) W.Theob.,candidates_unique
2,2,Connaraceae,4 Rourea (in part),Rourea (in part),4.0,Rourea sp.,This is currently accepted.,SOUTH-EASTERN: Chanthaburi.,,Evergreen forests. Fruiting April‚ÄìJune.,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:36:30.322756+00:00,Rourea sp.,wfo-4000033545,Rourea Aubl.,candidates_unique
3,2,Connaraceae,6 Connarus,Connarus,6.0,Connarus monocarpus L.,This is currently accepted.,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-19T18:37:53.618260+00:00,Connarus monocarpus L.,wfo-0000618164,Connarus monocarpus Wight & Arn.,candidates_unique
4,2,Apostasiaceae,1 Neuwiedia,Neuwiedia,1.0,Neuwiedia zollingeri Reichb.f.,This is currently accepted.,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraSpecies.h...,2025-10-19T18:38:43.553646+00:00,Neuwiedia zollingeri Reichb.f.,wfo-0000251829,Neuwiedia zollingeri Rchb.f.,candidates_unique
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,13,Juglandaceae,2. Engelhardia,Engelhardia,2.0,Engelhardia spicata Lesch. ex Blume,This is currently accepted.,e-Flora of Thailand,,,https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-20T21:21:36.557538+00:00,Engelhardia spicata Lesch. ex Blume,wfo-0000667948,Engelhardia spicata Lechen ex Blume,candidates_unique
69,13,Polyosmaceae,Polyosma,Polyosma,,Polyosma sp.,This is currently accepted.,"NORTH-EASTERN: Loei (Phu Kradueng, Wang Saphun...","Indochina, China.","In primary seasonal evergreen forests, along s...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-20T21:24:37.210065+00:00,Polyosma sp.,wfo-4000030761,Polyosma Blume,candidates_unique
70,14,Dichapetalaceae,Dichapetalum,Dichapetalum,,Dichapetalum sp. 1,This is currently accepted.,PENINSULAR: Nakhon Si Thammarat (Khao Luang).,Endemic.,"Wet evergreen forests, ca 750 m alt.",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-20T21:37:07.149762+00:00,Dichapetalum sp. 1,wfo-4000011405,Dichapetalum Thouars,candidates_unique
71,14,Malvaceae,1. Abelmoschus,Abelmoschus,1.0,Abelmoschus manihot (L.) Medik. var. pungens (...,This is currently accepted.,"NORTHERN: Mae Hong Son (Khun Yuam, Mae Sariang...","India to S China, Malesia.","Cultivated, sometimes escaping from cultivatio...",https://botany.dnp.go.th/eflora/florafamily.ht...,https://botany.dnp.go.th/eflora/floragenus.htm...,https://botany.dnp.go.th/eflora/floraspecies.h...,2025-10-20T22:01:55.431146+00:00,Abelmoschus manihot (L.) Medik. var. pungens (...,wfo-0000510878,Abelmoschus manihot (L.) Medik.,candidates_unique


In [25]:
df_unican[df_unican['wfo_full_name'].isna() | (df_unican['wfo_full_name'].astype(str).str.strip() == "")]

Unnamed: 0,volume,family_name,genus_name,genus_label,genus_index_parsed,species_scientific_name,accepted_name,thailand,distribution,ecology,family_url,genus_url,species_url,scraped_at,specific_name,wfo_id,wfo_full_name,wfo_source


‡πÄ‡∏≠‡∏≤‡πÑ‡∏õ‡πÄ‡∏ä‡πá‡∏Ñ‡πÅ‡∏°‡∏ô‡∏ô‡∏ß‡∏•‡∏ï‡πà‡∏≠

‡πÄ‡∏ä‡πá‡∏Ñ‡πÄ‡∏™‡∏£‡πá‡∏à‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏´‡∏•‡∏∑‡∏≠‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£ ‡∏£‡∏ß‡∏°‡∏ó‡∏∏‡∏Å‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÅ‡∏•‡πâ‡∏ß‡∏Å‡πá multi-candidates ‡∏Ñ‡πà‡∏≠‡∏¢‡∏ó‡∏≥