In [21]:
# =========================
# Option A: Jupyter script
# =========================

import csv
import re
import sys
import time
import json
import math
import queue
import shutil
import string
import threading
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional, Iterable
from urllib.parse import urljoin, urlparse, urlunparse

import requests

# -------------------------
# Windows + Playwright fix
# -------------------------
# Playwright uses subprocesses under the hood. On Windows, SelectorEventLoop often breaks that.
# Setting Proactor policy allows subprocess support for newly created loops.
import asyncio
if sys.platform.startswith("win"):
    try:
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
    except Exception:
        pass


# -------------------------
# User config (edit these)
# -------------------------
INPUT_CSV  = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\companies_with_website_2.csv"
OUTPUT_CSV = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\management_teams.csv"
DEBUG_DIR  = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\debug_html"

MAX_VISITS_PER_COMPANY = 80
SLEEP_S = 0.35

USE_PLAYWRIGHT = True           # set False to disable Playwright entirely
PLAYWRIGHT_TIMEOUT_MS = 25000   # per page
REQUEST_TIMEOUT_S = 25

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

KEYWORDS = (
    "board", "director", "directors", "management", "leadership", "team",
    "executive", "executives", "governance", "corporate", "about", "company",
    "our-team", "people", "staff", "key-people"
)

TEAM_CATEGORY_HINTS = (
    "team_category/board-of-directors",
    "team_category/management",
    "team_category/management-team",
    "team_category/leadership",
    "team_category/executive",
    "team_category/executives",
)

GUESS_PATHS = (
    "/board/",
    "/board",
    "/management-team/",
    "/management-team",
    "/team/",
    "/team",
    "/leadership/",
    "/leadership",
    "/directors-and-management/",
    "/directors-and-management",
    "/corporate-governance/",
    "/corporate-governance",
    "/about/board-of-directors/",
    "/about/board-of-directors",
    "/about-us/board-of-directors/",
    "/about-us/board-of-directors",
    "/investors/corporate-governance/",
    "/investors/corporate-governance",
)

SITEMAP_STARTS = ("/sitemap.xml", "/sitemap_index.xml", "/wp-sitemap.xml", "/wp-sitemap.xml?orderby=url")


# -------------------------
# Output schema
# -------------------------
@dataclass
class PersonRow:
    ticker: str
    company: str
    name: str
    title: str
    group: str          # "board", "management", or "unknown"
    profile_url: str
    source_url: str


# -------------------------
# Utilities
# -------------------------
def safe_filename(s: str) -> str:
    keep = string.ascii_letters + string.digits + "._-"
    out = "".join(c if c in keep else "_" for c in s)
    return out[:180]

def save_debug(debug_dir: Path, label: str, url: str, text: str) -> None:
    try:
        debug_dir.mkdir(parents=True, exist_ok=True)
        fn = f"{label}_{safe_filename(urlparse(url).netloc + urlparse(url).path)}.html"
        (debug_dir / fn).write_text(text or "", encoding="utf-8", errors="ignore")
    except Exception:
        pass

def normalize_website(raw: str) -> str:
    raw = (raw or "").strip()
    if not raw:
        return ""
    raw = raw.replace("http://", "").replace("https://", "").strip("/")
    # basic sanity
    if " " in raw:
        raw = raw.split()[0]
    return raw

def url_variants(domain: str) -> list[str]:
    """Return URL roots to try for a given domain."""
    domain = normalize_website(domain)
    if not domain:
        return []
    roots = []
    for scheme in ("https://", "http://"):
        roots.append(scheme + domain + "/")
        if not domain.startswith("www."):
            roots.append(scheme + "www." + domain + "/")
    # de-dupe in order
    seen = set()
    out = []
    for r in roots:
        if r not in seen:
            seen.add(r)
            out.append(r)
    return out

def canonicalize_url(u: str) -> str:
    """Remove fragments, normalize trailing slash (keep for root), preserve query."""
    try:
        p = urlparse(u)
        p = p._replace(fragment="")
        path = p.path or "/"
        if path != "/" and path.endswith("/"):
            path = path[:-1]
        p = p._replace(path=path)
        return urlunparse(p)
    except Exception:
        return u

def looks_js_blocked(html: str) -> bool:
    h = (html or "").lower()
    if len(h) < 800:
        return True
    needles = ("enable javascript", "please enable javascript", "requires javascript", "javascript required")
    return any(n in h for n in needles)

def detect_group_from_url(u: str) -> str:
    u = (u or "").lower()
    if any(k in u for k in ("board", "director", "governance")):
        return "board"
    if any(k in u for k in ("management", "leadership", "executive", "team")):
        return "management"
    return "unknown"


# -------------------------
# CSV reading (tab or comma)
# -------------------------
def sniff_delimiter(path: str) -> str:
    sample = Path(path).read_text(encoding="utf-8", errors="ignore")[:4000]
    # prefer tab if it dominates
    if sample.count("\t") > sample.count(","):
        return "\t"
    return ","

def detect_columns(fieldnames: list[str]) -> tuple[str, str, str]:
    """Return (ticker_col, company_col, website_col) from headers."""
    norm = {h: re.sub(r"[^a-z0-9]+", "", (h or "").lower()) for h in fieldnames}
    def pick(*cands: str) -> Optional[str]:
        for h, hn in norm.items():
            if hn in cands:
                return h
        return None

    ticker = pick("ticker", "asxticker", "code", "symbol")
    company = pick("company", "companyname", "name", "issuer")
    website = pick("website", "url", "web", "site", "homepage")

    if not ticker:
        raise ValueError(f"Could not find a ticker column in CSV headers: {fieldnames}")
    if not company:
        raise ValueError(f"Could not find a company column in CSV headers: {fieldnames}")
    if not website:
        raise ValueError(f"Could not find a website/url column in CSV headers: {fieldnames}")

    return ticker, company, website


# -------------------------
# Fetching
# -------------------------
def fetch_requests(url: str) -> tuple[int, str, str, str]:
    """(status, content_type, final_url, html_text)"""
    try:
        r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT_S, allow_redirects=True)
        ctype = r.headers.get("content-type", "")
        text = r.text if isinstance(r.text, str) else (r.content or b"").decode("utf-8", "ignore")
        return r.status_code, ctype, r.url, text
    except Exception:
        return 0, "", url, ""

def fetch_playwright_threaded(url: str, timeout_ms: int = PLAYWRIGHT_TIMEOUT_MS) -> tuple[int, str, str, str]:
    """
    Run Playwright sync API in a background thread so it doesn't clash with Jupyter's running loop.
    Returns (status, content_type, final_url, html).
    """
    try:
        from playwright.sync_api import sync_playwright
    except Exception:
        return 0, "", url, ""

    q: "queue.Queue[tuple[int,str,str,str]]" = queue.Queue()

    def worker():
        try:
            # Ensure proactor policy is set for any loops created in this thread
            import asyncio as _asyncio
            if sys.platform.startswith("win"):
                try:
                    _asyncio.set_event_loop_policy(_asyncio.WindowsProactorEventLoopPolicy())
                except Exception:
                    pass

            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page()
                page.goto(url, wait_until="networkidle", timeout=timeout_ms)
                html = page.content() or ""
                final = page.url or url
                browser.close()
            q.put((200, "text/html", final, html))
        except Exception:
            q.put((0, "", url, ""))

    t = threading.Thread(target=worker, daemon=True)
    t.start()
    # Give a bit of buffer over timeout_ms
    t.join(timeout_ms / 1000.0 + 5)

    if not q.empty():
        return q.get()
    return 0, "", url, ""

def fetch_html(url: str, debug_dir: Path, label: str) -> tuple[int, str, str, str]:
    status, ctype, final_url, html = fetch_requests(url)
    if html:
        save_debug(debug_dir, f"{label}_req", final_url, html)

    global USE_PLAYWRIGHT
    if USE_PLAYWRIGHT and looks_js_blocked(html):
        p_status, p_ctype, p_final, p_html = fetch_playwright_threaded(final_url)
        # If playwright fetched something better, use it
        if p_html and len(p_html) > len(html or ""):
            status, ctype, final_url, html = p_status, p_ctype, p_final, p_html
            save_debug(debug_dir, f"{label}_pw", final_url, html)
        # If Playwright completely dies repeatedly, just turn it off and keep going
        if p_status == 0 and (html is None or html == ""):
            # disable to avoid spam errors
            USE_PLAYWRIGHT = False

    return status, ctype, final_url, html or ""


# -------------------------
# Sitemap parsing
# -------------------------
def parse_sitemap(xml_text: str) -> tuple[list[str], list[str]]:
    """Return (urls, sitemap_indexes)."""
    urls, maps = [], []
    if not xml_text:
        return urls, maps
    try:
        import xml.etree.ElementTree as ET
        root = ET.fromstring(xml_text.strip().encode("utf-8", "ignore"))
        tag = root.tag.lower()
        ns_strip = lambda t: t.split("}")[-1].lower() if "}" in t else t.lower()

        if ns_strip(tag).endswith("sitemapindex"):
            for sm in root.findall(".//{*}sitemap/{*}loc"):
                if sm.text:
                    maps.append(sm.text.strip())
        else:
            for loc in root.findall(".//{*}url/{*}loc"):
                if loc.text:
                    urls.append(loc.text.strip())
    except Exception:
        # super lightweight regex fallback
        locs = re.findall(r"<loc>(.*?)</loc>", xml_text, flags=re.I | re.S)
        for l in locs[:5000]:
            l = re.sub(r"\s+", " ", l).strip()
            if l:
                urls.append(l)
    return urls, maps

def collect_sitemap_urls(start_url: str, debug_dir: Path, depth_limit: int = 2, max_urls: int = 5000) -> set[str]:
    seen_maps = set()
    out_urls: set[str] = set()

    stack = [(start_url, 0)]
    while stack:
        sm_url, depth = stack.pop()
        if sm_url in seen_maps or depth > depth_limit:
            continue
        seen_maps.add(sm_url)

        status, ctype, final_url, xml_text = fetch_html(sm_url, debug_dir, f"sitemap_{depth}")
        if status and status >= 400:
            continue
        if "xml" not in (ctype or "").lower() and "<url" not in (xml_text or "").lower() and "<sitemap" not in (xml_text or "").lower():
            continue

        urls, maps = parse_sitemap(xml_text)
        for u in urls:
            out_urls.add(canonicalize_url(u))
            if len(out_urls) >= max_urls:
                return out_urls
        for m in maps:
            m = canonicalize_url(m)
            if m not in seen_maps:
                stack.append((m, depth + 1))

    return out_urls


# -------------------------
# Candidate URL building
# -------------------------
def keyword_score(url: str) -> int:
    u = (url or "").lower()
    return sum(1 for k in KEYWORDS if k in u)

def build_candidates(website_domain: str, debug_dir: Path) -> tuple[str, list[str]]:
    """
    Returns (home_final, candidates)
    - Tries several root variants for the home page.
    - Pulls sitemap URLs and ranks likely team pages.
    - Adds guessed common paths + FireFly-style team_category hints.
    """
    roots = url_variants(website_domain)
    home_final = ""
    best_home_html = ""
    best_root = ""

    for r in roots:
        st, ct, fin, html = fetch_html(r, debug_dir, "home")
        if st == 200 and len(html) > len(best_home_html):
            best_home_html = html
            home_final = fin
            best_root = fin if fin.endswith("/") else fin + "/"

    if not best_root:
        # if nothing worked, just use https root
        best_root = roots[0] if roots else ""
        home_final = best_root

    # sitemaps
    sitemap_pages: set[str] = set()
    for sm_path in SITEMAP_STARTS:
        sm_url = urljoin(best_root, sm_path.lstrip("/"))
        pages = collect_sitemap_urls(sm_url, debug_dir, depth_limit=2, max_urls=5000)
        sitemap_pages |= pages

    # rank sitemap pages by keyword score, keep top chunk
    ranked = sorted(sitemap_pages, key=lambda u: (keyword_score(u), -len(u)), reverse=True)
    ranked = [u for u in ranked if keyword_score(u) >= 2][:250]

    # guessed paths
    guessed = [urljoin(best_root, p.lstrip("/")) for p in GUESS_PATHS]
    teamcat = [urljoin(best_root, p.lstrip("/")) for p in TEAM_CATEGORY_HINTS]

    # also try http root with same paths (some sites behave oddly)
    http_root = ""
    if best_root.startswith("https://"):
        http_root = "http://" + best_root[len("https://"):]
    guessed_http = [urljoin(http_root, p.lstrip("/")) for p in GUESS_PATHS] if http_root else []
    teamcat_http = [urljoin(http_root, p.lstrip("/")) for p in TEAM_CATEGORY_HINTS] if http_root else []

    # combine + de-dupe
    candidates = []
    seen = set()
    for u in (ranked + guessed + teamcat + guessed_http + teamcat_http):
        cu = canonicalize_url(u)
        if cu not in seen:
            seen.add(cu)
            candidates.append(cu)

    return home_final, candidates


# -------------------------
# HTML -> people extraction
# -------------------------
def clean_text(s: str) -> str:
    s = re.sub(r"\s+", " ", (s or "")).strip()
    return s

def plausible_name(s: str) -> bool:
    s = clean_text(s)
    if len(s) < 5 or len(s) > 80:
        return False
    # reject obvious non-names
    bad = ("cookie", "privacy", "search", "menu", "read more", "investor", "careers")
    if any(b in s.lower() for b in bad):
        return False
    # allow letters, spaces, dots, apostrophes, hyphens
    if not re.fullmatch(r"[A-Za-z][A-Za-z\.\'\-\s]+", s):
        return False
    parts = s.split()
    if len(parts) < 2:
        return False
    # too many words is usually not a name
    if len(parts) > 5:
        return False
    # at least 2 capitalized-ish tokens
    caps = sum(1 for p in parts if p[:1].isupper())
    return caps >= 2

def extract_cards_basic(html: str, base_url: str) -> list[tuple[str, str, str]]:
    """
    Returns list of (name, title, profile_url)
    Heuristic: find repeated blocks with headings + nearby role text.
    """
    try:
        from bs4 import BeautifulSoup
    except Exception:
        raise RuntimeError("Missing dependency: bs4. Install with: pip install beautifulsoup4")

    soup = BeautifulSoup(html, "html.parser")

    # candidate containers (common in WP themes)
    containers = soup.find_all(
        lambda tag: tag.name in ("div", "article", "li", "section")
        and tag.get("class")
        and any(
            any(k in c.lower() for k in ("team", "member", "profile", "director", "management", "person", "staff"))
            for c in tag.get("class", [])
        )
    )

    out = []
    for c in containers:
        # name candidates
        name = ""
        for hn in ("h1", "h2", "h3", "h4", "strong"):
            h = c.find(hn)
            if h:
                t = clean_text(h.get_text(" ", strip=True))
                if plausible_name(t):
                    name = t
                    break
        if not name:
            continue

        # title candidates
        title = ""
        # class-based title
        tnode = c.find(lambda t: t.get("class") and any("title" in x.lower() or "role" in x.lower() or "position" in x.lower() for x in t.get("class")))
        if tnode:
            title = clean_text(tnode.get_text(" ", strip=True))
        if not title:
            # first short paragraph after name
            ps = c.find_all("p")
            for p in ps[:3]:
                tt = clean_text(p.get_text(" ", strip=True))
                if tt and len(tt) <= 120 and not plausible_name(tt):
                    title = tt
                    break

        # profile url
        profile = ""
        a = c.find("a", href=True)
        if a:
            profile = urljoin(base_url, a["href"])

        out.append((name, title, profile))

    # de-dupe by name
    dedup = {}
    for n, t, p in out:
        if n not in dedup:
            dedup[n] = (n, t, p)
        else:
            # keep the one with more title info or profile url
            old = dedup[n]
            score_old = (1 if old[1] else 0) + (1 if old[2] else 0)
            score_new = (1 if t else 0) + (1 if p else 0)
            if score_new > score_old:
                dedup[n] = (n, t, p)

    return list(dedup.values())

def extract_people(html: str, page_url: str) -> list[tuple[str, str, str]]:
    """
    Returns list of (name, title, profile_url)
    """
    base = page_url
    people = extract_cards_basic(html, base)

    # fallback: schema.org Person
    if not people:
        try:
            from bs4 import BeautifulSoup
        except Exception:
            return []
        soup = BeautifulSoup(html, "html.parser")
        items = soup.find_all(attrs={"itemtype": re.compile(r"schema\.org/Person", re.I)})
        out = []
        for it in items:
            name = ""
            title = ""
            nm = it.find(attrs={"itemprop": "name"})
            if nm:
                name = clean_text(nm.get_text(" ", strip=True))
            jt = it.find(attrs={"itemprop": "jobTitle"})
            if jt:
                title = clean_text(jt.get_text(" ", strip=True))
            a = it.find("a", href=True)
            profile = urljoin(base, a["href"]) if a else ""
            if plausible_name(name):
                out.append((name, title, profile))
        people = out

    return people


# -------------------------
# Company scrape
# -------------------------
def scrape_company(ticker: str, company: str, website: str, debug_dir: Path) -> list[PersonRow]:
    website = normalize_website(website)
    if not website:
        return []

    home_final, candidates = build_candidates(website, debug_dir)
    visited = 0
    rows: list[PersonRow] = []
    seen_people = set()

    for u in candidates:
        if visited >= MAX_VISITS_PER_COMPANY:
            break
        visited += 1

        st, ct, fin, html = fetch_html(u, debug_dir, f"visit{visited}")
        if st and st >= 400:
            continue
        if not html or len(html) < 500:
            continue

        # Only attempt parsing on likely team pages
        if keyword_score(fin) < 2 and keyword_score(u) < 2:
            continue

        people = extract_people(html, fin)
        if not people:
            continue

        group = detect_group_from_url(fin)
        for name, title, profile in people:
            key = (ticker, name)
            if key in seen_people:
                continue
            seen_people.add(key)
            rows.append(PersonRow(
                ticker=ticker,
                company=company,
                name=name,
                title=title or "",
                group=group,
                profile_url=profile or "",
                source_url=fin
            ))

        # if we got a decent set, stop early
        if len(rows) >= 8:
            # many boards are ~5-10 people, this is usually enough
            break

        time.sleep(SLEEP_S)

    # Debug summary
    (debug_dir / f"{ticker}_attempts.txt").write_text(
        json.dumps({
            "ticker": ticker,
            "company": company,
            "website": website,
            "home_final": home_final,
            "candidates_count": len(candidates),
            "visited": visited,
            "people_found": len(rows),
        }, indent=2),
        encoding="utf-8"
    )

    return rows


# -------------------------
# Runner
# -------------------------
def run_all(input_csv: str, output_csv: str, debug_dir: str) -> None:
    input_path = Path(input_csv)
    if not input_path.exists():
        raise FileNotFoundError(f"Input CSV not found: {input_csv}")

    debug_root = Path(debug_dir)
    debug_root.mkdir(parents=True, exist_ok=True)

    delim = sniff_delimiter(input_csv)
    out_rows: list[PersonRow] = []

    with open(input_csv, "r", encoding="utf-8", errors="ignore", newline="") as f:
        reader = csv.DictReader(f, delimiter=delim)
        if not reader.fieldnames:
            raise ValueError("Input CSV has no header row.")
        ticker_col, company_col, web_col = detect_columns(reader.fieldnames)

        for i, row in enumerate(reader, 1):
            ticker = (row.get(ticker_col) or "").strip()
            company = (row.get(company_col) or "").strip()
            website = (row.get(web_col) or "").strip()

            if not ticker or not company:
                continue

            company_debug = debug_root / ticker
            company_debug.mkdir(parents=True, exist_ok=True)

            print(f"\n=== {ticker} {company} ===")
            print(f"Website: {website}")

            people = scrape_company(ticker, company, website, company_debug)
            if not people:
                print(f"No people parsed for {ticker} ({company}). Visited up to {MAX_VISITS_PER_COMPANY}.")
            else:
                print(f"{ticker}: {len(people)} people")

            out_rows.extend(people)

    # write output
    out_path = Path(output_csv)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with open(out_path, "w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(asdict(PersonRow("", "", "", "", "", "", "")).keys()))
        w.writeheader()
        for r in out_rows:
            w.writerow(asdict(r))

    print(f"\nDone. Wrote {len(out_rows)} rows to: {out_path}")
    print(f"Debug dir: {debug_root}")


# -------------------------
# Execute
# -------------------------
run_all(INPUT_CSV, OUTPUT_CSV, DEBUG_DIR)



=== AGE Alligator Energy Ltd ===
Website: alligatorenergy.com.au
No people parsed for AGE (Alligator Energy Ltd). Visited up to 80.

=== AR3 Australian Rare Earths Ltd ===
Website: ar3.com.au
No people parsed for AR3 (Australian Rare Earths Ltd). Visited up to 80.

=== BM1 Ballard Mining Ltd ===
Website: ballardmining.com.au
No people parsed for BM1 (Ballard Mining Ltd). Visited up to 80.

=== BGD Barton Gold Holdings Ltd ===
Website: bartongold.com.au
No people parsed for BGD (Barton Gold Holdings Ltd). Visited up to 80.

=== BCA Black Canyon Ltd ===
Website: blackcanyon.com.au
No people parsed for BCA (Black Canyon Ltd). Visited up to 80.

=== BSX Blackstone Minerals Ltd ===
Website: blackstoneminerals.com.au
No people parsed for BSX (Blackstone Minerals Ltd). Visited up to 80.

=== CTM Centaurus Metals Ltd ===
Website: centaurus.com.au
No people parsed for CTM (Centaurus Metals Ltd). Visited up to 80.

=== COD Coda Minerals Ltd ===
Website: codaminerals.com
No people parsed for COD

In [25]:
# ============================================================
# Management / Board scraper (Notebook-friendly, no argparse)
# ============================================================
# Dependencies:
#   pip install requests beautifulsoup4 lxml
#
# What it does:
# - Reads a CSV with columns: Company, Ticker, Website (tab or comma OK)
# - For each company:
#   - collects candidate pages (sitemap + homepage links + common guesses)
#   - visits up to MAX_VISITS_PER_COMPANY pages
#   - extracts people via:
#       (1) JSON-LD schema.org Person
#       (2) text-pattern scanning (Name -> Title -> Bio)
#       (3) simple "card" containers + profile link follow
# - Writes one output CSV: management_teams.csv (or whatever path you set)
#
# Notes:
# - Playwright is OFF by default. Your env showed multiple Playwright loop/subprocess issues.
# - If you *really* need JS rendering later, I can give you an async-playwright notebook-safe add-on,
#   but get this working first; most ASX sites have the team pages in static HTML.
# ============================================================

from __future__ import annotations

import csv
import json
import os
import re
import time
import traceback
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional, Iterable
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# -------------------------
# CONFIG (global defaults)
# -------------------------
MAX_VISITS_PER_COMPANY = 80          # total page fetches per company (includes profiles)
MAX_SITEMAP_URLS = 8000              # cap for sitemap urls collected
MAX_HOME_LINKS = 600                 # cap for internal links harvested from home + relevant pages
MAX_CANDIDATES = 350                 # pages we will queue up initially
MAX_PROFILE_FOLLOWS = 30             # max profile pages to follow per company
SLEEP_S = 0.25                       # polite delay between requests
TIMEOUT_S = 25

# If True, saves fetched HTML files into DEBUG_DIR/<TICKER>/
SAVE_DEBUG_HTML = True

# If True, prints more detail
VERBOSE = True

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-AU,en;q=0.9",
}

URL_KEYWORDS = [
    "board", "director", "directors", "leadership", "management", "executive",
    "team", "our-team", "people", "about", "corporate", "governance", "company",
    "key-personnel", "key-person", "key-management", "key-management-personnel",
    "who-we-are", "our-people", "management-team", "board-and-management",
]
NEGATIVE_URL_HINTS = [
    "news", "media", "announcement", "announcements", "asx", "investor",
    "presentation", "presentations", "reports", "report", "download",
    "careers", "jobs", "vacancies", "events", "privacy", "terms", "cookie",
]
TITLE_HINTS = [
    "ceo", "chief", "managing", "director", "chair", "chairman", "chairperson",
    "non executive", "executive", "cfo", "coo", "cto", "cio", "secretary",
    "president", "vp", "general manager", "gm", "head of", "manager",
    "principal", "technical", "exploration", "geologist",
]

# -------------------------
# Data model
# -------------------------
@dataclass
class PersonRow:
    ticker: str
    company: str
    website: str
    name: str
    title: str
    bio: str
    profile_url: str
    source_url: str
    method: str


# -------------------------
# Small utilities
# -------------------------
def _norm_space(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()

def _safe_filename(s: str) -> str:
    s = re.sub(r"[^a-zA-Z0-9._-]+", "_", s)
    return s[:180].strip("_") or "page"

def _same_domain(a: str, b: str) -> bool:
    try:
        pa, pb = urlparse(a), urlparse(b)
        return pa.netloc.lower() == pb.netloc.lower() and pa.netloc != ""
    except Exception:
        return False

def _ensure_scheme(website: str) -> str:
    w = (website or "").strip()
    if not w:
        return ""
    if w.startswith("http://") or w.startswith("https://"):
        return w
    return "https://" + w

def _canonical(u: str) -> str:
    # remove fragments and trim trailing slashes (but keep root "/")
    try:
        p = urlparse(u)
        clean = p._replace(fragment="").geturl()
        if clean.endswith("/") and len(clean) > len(p.scheme + "://" + p.netloc + "/"):
            clean = clean.rstrip("/")
        return clean
    except Exception:
        return u

def sniff_dialect(path: str) -> csv.Dialect:
    sample = Path(path).read_text(encoding="utf-8", errors="ignore")[:8000]
    sniffer = csv.Sniffer()
    try:
        return sniffer.sniff(sample)
    except Exception:
        # fallback: tab is common in your files
        class TabDialect(csv.Dialect):
            delimiter = "\t"
            quotechar = '"'
            doublequote = True
            skipinitialspace = True
            lineterminator = "\n"
            quoting = csv.QUOTE_MINIMAL
        return TabDialect()

def detect_columns(fieldnames: list[str]) -> tuple[str, str, str]:
    # We expect at least Company, Ticker, Website
    lower = {c.lower().strip(): c for c in fieldnames}

    def pick(*cands: str) -> Optional[str]:
        for cand in cands:
            if cand in lower:
                return lower[cand]
        return None

    company_col = pick("company", "name", "company name", "company_name") or fieldnames[0]
    ticker_col = pick("ticker", "asx", "symbol", "code") or (fieldnames[1] if len(fieldnames) > 1 else fieldnames[0])
    web_col = pick("website", "web", "url", "site", "homepage", "domain")

    if not web_col:
        raise ValueError(f"Could not find a website/url column in CSV headers: {fieldnames}\n"
                         f"Expected something like: Website / URL / Domain.")

    return ticker_col, company_col, web_col

def keyword_score(text: str) -> int:
    t = (text or "").lower()
    score = 0
    for kw in URL_KEYWORDS:
        if kw in t:
            score += 2 if kw in ("board", "director", "directors", "management", "leadership") else 1
    for bad in NEGATIVE_URL_HINTS:
        if bad in t:
            score -= 2
    return score

def is_probable_title(line: str) -> bool:
    l = (line or "").strip()
    if not l or len(l) > 90:
        return False
    ll = l.lower()
    if any(h in ll for h in TITLE_HINTS):
        return True
    # heuristic: short Title Case (e.g. "Non Executive Director")
    if len(l) <= 45 and sum(1 for w in l.split() if w[:1].isupper()) >= 2:
        return True
    return False

def is_probable_name(line: str) -> bool:
    l = _norm_space(line)
    if not l:
        return False
    if len(l) < 5 or len(l) > 60:
        return False
    # exclude obvious headings / nav items
    bad = {"home", "contact", "corporate", "about", "projects", "investor", "news", "media", "governance"}
    if l.lower() in bad:
        return False
    # allow hyphens/apostrophes
    if not re.fullmatch(r"[A-Za-z][A-Za-z'.-]*(?:\s+[A-Za-z][A-Za-z'.-]*){1,3}", l):
        return False
    # at least two words, mostly capitalized
    parts = l.split()
    if len(parts) < 2:
        return False
    caps = sum(1 for p in parts if p[:1].isupper())
    return caps >= max(2, len(parts) - 1)

def clean_bio(text: str) -> str:
    t = _norm_space(text)
    # strip boilerplate
    t = re.sub(r"\b(read more|view profile|linkedin)\b.*$", "", t, flags=re.I).strip()
    return t

# -------------------------
# Fetching
# -------------------------
session = requests.Session()
session.headers.update(HEADERS)

def fetch(url: str) -> tuple[int, str, str, str]:
    """Return (status, content_type, final_url, text)"""
    try:
        r = session.get(url, timeout=TIMEOUT_S, allow_redirects=True)
        ctype = (r.headers.get("content-type") or "").lower()
        text = r.text if "text" in ctype or "html" in ctype or ctype == "" else ""
        return r.status_code, ctype, r.url, text
    except Exception:
        return 0, "", url, ""

def save_debug(debug_dir: Path, label: str, url: str, html: str) -> None:
    if not SAVE_DEBUG_HTML:
        return
    debug_dir.mkdir(parents=True, exist_ok=True)
    fname = f"{label}__{_safe_filename(url)}.html"
    (debug_dir / fname).write_text(html or "", encoding="utf-8", errors="ignore")

# -------------------------
# Sitemap collection
# -------------------------
def parse_sitemap_xml(xml_text: str) -> list[str]:
    # minimal and fast: regex <loc>...</loc>
    return re.findall(r"<loc>\s*(.*?)\s*</loc>", xml_text, flags=re.I)

def collect_sitemap_urls(base_url: str, debug_dir: Path) -> set[str]:
    """
    Tries common sitemap locations. Handles sitemap indexes recursively (depth <= 2).
    """
    start_points = [
        urljoin(base_url, "/sitemap.xml"),
        urljoin(base_url, "/sitemap_index.xml"),
        urljoin(base_url, "/wp-sitemap.xml"),
    ]

    seen_maps: set[str] = set()
    found_urls: set[str] = set()

    queue: list[tuple[str, int]] = [(u, 0) for u in start_points]

    while queue and len(found_urls) < MAX_SITEMAP_URLS:
        sm_url, depth = queue.pop(0)
        sm_url = _canonical(sm_url)
        if sm_url in seen_maps or depth > 2:
            continue
        seen_maps.add(sm_url)

        st, ctype, final, xml = fetch(sm_url)
        if VERBOSE:
            print(f"FETCH sitemap url={sm_url} -> status={st} ctype={ctype} len={len(xml or '')}")
        if st >= 400 or not xml:
            continue
        if "xml" not in ctype and "<loc" not in (xml.lower()[:5000]):
            continue

        save_debug(debug_dir, f"sitemap_d{depth}", final, xml)

        locs = parse_sitemap_xml(xml)
        # if this sitemap contains other sitemaps, recurse
        if any("sitemap" in (u.lower()) for u in locs) and depth < 2:
            for u in locs[:2000]:
                queue.append((u, depth + 1))
        else:
            for u in locs:
                found_urls.add(_canonical(u))
                if len(found_urls) >= MAX_SITEMAP_URLS:
                    break

        time.sleep(SLEEP_S)

    return found_urls

# -------------------------
# Link harvesting
# -------------------------
def extract_internal_links(html: str, page_url: str, max_links: int = 500) -> list[tuple[str, str]]:
    """Return list of (absolute_url, anchor_text)."""
    soup = BeautifulSoup(html or "", "lxml")
    out: list[tuple[str, str]] = []
    base = page_url
    for a in soup.find_all("a", href=True):
        href = (a.get("href") or "").strip()
        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("tel:"):
            continue
        abs_u = urljoin(base, href)
        if not _same_domain(abs_u, base):
            continue
        txt = _norm_space(a.get_text(" ", strip=True))
        out.append((_canonical(abs_u), txt))
        if len(out) >= max_links:
            break
    return out

def build_candidate_urls(website: str, debug_dir: Path) -> list[str]:
    base = _ensure_scheme(website)
    if not base:
        return []

    # fetch home
    st, ctype, final, html = fetch(base)
    if VERBOSE:
        print(f"FETCH home url={base} -> final={final} status={st} len={len(html or '')}")
    if html:
        save_debug(debug_dir, "home", final, html)

    # gather homepage internal links
    home_links = extract_internal_links(html or "", final, max_links=MAX_HOME_LINKS) if html else []
    # score them
    scored: dict[str, int] = {}
    for u, txt in home_links:
        s = keyword_score(u) + keyword_score(txt)
        if s <= 0:
            continue
        scored[u] = max(scored.get(u, -999), s)

    # sitemap urls (high value for WP sites)
    sitemap_urls = set()
    try:
        sitemap_urls = collect_sitemap_urls(final, debug_dir)
    except Exception:
        sitemap_urls = set()

    for u in sitemap_urls:
        s = keyword_score(u)
        if s > 0:
            scored[u] = max(scored.get(u, -999), s + 1)

    # common guesses (helps non-sitemap sites)
    guesses = [
        "/corporate/board-of-directors",
        "/corporate/management-team",
        "/corporate/directors-and-management",
        "/corporate/leadership",
        "/corporate/our-team",
        "/about/leadership",
        "/about/our-team",
        "/about/team",
        "/our-team",
        "/team",
        "/leadership",
        "/management",
        "/board",
        "/board-of-directors",
        "/directors",
        "/directors-and-management",
        "/board-and-management",
        "/about-us/board",
        "/about-us/management",
        "/company/board",
        "/company/management",
        "/team_category/board-of-directors",
        "/team_category/management",
        "/team_category/leadership",
    ]
    for g in guesses:
        u = _canonical(urljoin(final, g))
        scored[u] = max(scored.get(u, -999), keyword_score(u) + 3)

    # rank
    ranked = sorted(scored.items(), key=lambda kv: kv[1], reverse=True)
    urls = [u for (u, s) in ranked[:MAX_CANDIDATES]]

    # always include the homepage itself early (some sites have people on home)
    if final and final not in urls:
        urls.insert(0, final)

    return urls

# -------------------------
# Extraction: JSON-LD
# -------------------------
def _walk_json(obj) -> Iterable[dict]:
    if isinstance(obj, dict):
        yield obj
        for v in obj.values():
            yield from _walk_json(v)
    elif isinstance(obj, list):
        for it in obj:
            yield from _walk_json(it)

def extract_people_jsonld(soup: BeautifulSoup, source_url: str) -> list[tuple[str, str, str, str]]:
    """
    Returns list of (name, title, bio, profile_url)
    """
    out = []
    scripts = soup.find_all("script", attrs={"type": re.compile(r"ld\+json", re.I)})
    for sc in scripts:
        raw = sc.string or sc.get_text(strip=True) or ""
        raw = raw.strip()
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except Exception:
            continue

        for node in _walk_json(data):
            typ = node.get("@type") or node.get("type")
            if isinstance(typ, list):
                is_person = any(str(t).lower() == "person" for t in typ)
            else:
                is_person = str(typ).lower() == "person"

            if not is_person:
                continue

            name = _norm_space(node.get("name") or "")
            job = _norm_space(node.get("jobTitle") or node.get("description") or "")
            url = node.get("url") or ""
            if isinstance(url, dict):
                url = url.get("@id") or url.get("url") or ""
            profile = urljoin(source_url, str(url)) if url else ""

            # Try to use a more explicit description if present
            bio = _norm_space(node.get("description") or "")

            if is_probable_name(name):
                out.append((name, job, bio, profile))
    return out

# -------------------------
# Extraction: Text pattern
# -------------------------
def extract_people_textpattern(soup: BeautifulSoup, source_url: str) -> list[tuple[str, str, str, str]]:
    """
    Uses visible text order:
      Name
      Title
      Bio...
    """
    # remove scripts/styles
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    text = soup.get_text("\n", strip=True)
    lines = [_norm_space(x) for x in text.split("\n")]
    # prune junk lines
    lines = [x for x in lines if x and len(x) <= 220]

    out = []
    i = 0
    while i < len(lines) - 1:
        name = lines[i]
        if not is_probable_name(name):
            i += 1
            continue

        title = lines[i + 1] if i + 1 < len(lines) else ""
        # Sometimes it's "Name â Title" in one line
        if " - " in name or " â " in name:
            parts = re.split(r"\s[-â]\s", name, maxsplit=1)
            if len(parts) == 2 and is_probable_name(parts[0]) and is_probable_title(parts[1]):
                name, title = parts[0], parts[1]

        if not is_probable_title(title):
            i += 1
            continue

        # Bio: accumulate until next probable name or a hard stop
        bio_parts = []
        j = i + 2
        while j < len(lines):
            if is_probable_name(lines[j]) and (j + 1 < len(lines) and is_probable_title(lines[j + 1])):
                break
            # stop if we hit footer-ish blocks
            if lines[j].lower().startswith("Â© ") or lines[j].lower().startswith("copyright"):
                break
            bio_parts.append(lines[j])
            if len(" ".join(bio_parts)) > 1500:
                break
            j += 1

        bio = clean_bio(" ".join(bio_parts))
        out.append((name, title, bio, ""))

        i = j
    return out

# -------------------------
# Extraction: Card containers + profile links
# -------------------------
def extract_people_cards(soup: BeautifulSoup, source_url: str) -> list[tuple[str, str, str, str]]:
    """
    Looks for repeated "team member" blocks.
    Returns list of (name, title, bio, profile_url).
    """
    out = []
    # pick containers with likely class/id
    candidates = soup.find_all(
        lambda tag: tag.name in ("div", "section", "article", "li")
        and ("class" in tag.attrs or "id" in tag.attrs)
        and any(k in " ".join(tag.get("class", [])).lower() for k in ["team", "member", "profile", "director", "executive", "leadership"])
    )

    # If nothing matched, try some general cards
    if not candidates:
        candidates = soup.find_all(["article", "li"], limit=200)

    for block in candidates[:250]:
        txt = _norm_space(block.get_text(" ", strip=True))
        if not txt or len(txt) < 20:
            continue

        # try to find name in headings/strong
        name = ""
        for tag in block.find_all(["h1", "h2", "h3", "h4", "strong"], limit=8):
            t = _norm_space(tag.get_text(" ", strip=True))
            if is_probable_name(t):
                name = t
                break
        if not name:
            continue

        # title
        title = ""
        # common: <p>Title</p> right after heading
        for tag in block.find_all(["p", "span", "div"], limit=10):
            t = _norm_space(tag.get_text(" ", strip=True))
            if t and is_probable_title(t) and not is_probable_name(t):
                title = t
                break

        # profile link (if any)
        profile = ""
        a = block.find("a", href=True)
        if a:
            profile = urljoin(source_url, a["href"])

        # bio (short)
        bio = clean_bio(txt)
        out.append((name, title, bio, profile))

    # dedupe
    seen = set()
    deduped = []
    for n, t, b, p in out:
        key = (n.lower(), (t or "").lower())
        if key in seen:
            continue
        seen.add(key)
        deduped.append((n, t, b, p))
    return deduped

def extract_people_from_html(html: str, source_url: str) -> list[tuple[str, str, str, str, str]]:
    """
    Returns list of (name, title, bio, profile_url, method)
    """
    soup = BeautifulSoup(html or "", "lxml")

    people = []

    # JSON-LD first (highest precision when present)
    for n, t, b, p in extract_people_jsonld(soup, source_url):
        people.append((n, t, b, p, "jsonld"))

    # Card blocks (often good for company/team pages)
    for n, t, b, p in extract_people_cards(soup, source_url):
        people.append((n, t, b, p, "cards"))

    # Text-pattern (very robust for pages like Barton Gold)
    for n, t, b, p in extract_people_textpattern(soup, source_url):
        people.append((n, t, b, p, "textpattern"))

    # final dedupe by name
    seen = set()
    out = []
    for n, t, b, p, m in people:
        if not is_probable_name(n):
            continue
        key = n.lower()
        if key in seen:
            continue
        seen.add(key)
        out.append((n, t or "", b or "", p or "", m))
    return out

# -------------------------
# Main scrape per company
# -------------------------
def scrape_company(ticker: str, company: str, website: str, debug_dir: Path) -> list[PersonRow]:
    ticker = (ticker or "").strip()
    company = (company or "").strip()
    website = (website or "").strip()

    if not website:
        return []

    candidates = build_candidate_urls(website, debug_dir)

    if VERBOSE:
        print(f"\n=== {ticker} {company} ===")
        print(f"Website: {website}")
        print(f"Candidates: {len(candidates)} (showing top 12)")
        for u in candidates[:12]:
            print("  ", u)

    visited: set[str] = set()
    out_rows: list[PersonRow] = []
    profile_queue: list[str] = []
    profile_followed = 0

    # simple priority: process candidates in order; newly discovered profile links go first
    queue: list[str] = list(candidates)

    visits = 0
    while queue and visits < MAX_VISITS_PER_COMPANY:
        url = _canonical(queue.pop(0))
        if url in visited:
            continue
        visited.add(url)

        st, ctype, final, html = fetch(url)
        visits += 1

        if VERBOSE:
            print(f"VISIT {visits}/{MAX_VISITS_PER_COMPANY} url={url} -> status={st} final={final} len={len(html or '')}")

        if st >= 400 or not html or "text/html" not in (ctype or ""):
            time.sleep(SLEEP_S)
            continue

        save_debug(debug_dir, f"visit{visits}", final, html)

        # quick relevance check (avoid extracting from random pages)
        rel = keyword_score(final)
        # also look at title/h1 quickly
        try:
            soup0 = BeautifulSoup(html, "lxml")
            title = _norm_space((soup0.title.get_text(" ", strip=True) if soup0.title else ""))
            h1 = _norm_space((soup0.find("h1").get_text(" ", strip=True) if soup0.find("h1") else ""))
            rel += keyword_score(title) + keyword_score(h1)
        except Exception:
            pass

        if rel < 1 and visits > 6:
            # still harvest internal links lightly (in case nav pages are weird),
            # but don't spend time extracting people here
            links = extract_internal_links(html, final, max_links=120)
            for u, txt in links:
                sc = keyword_score(u) + keyword_score(txt)
                if sc >= 3 and u not in visited:
                    queue.append(u)
            time.sleep(SLEEP_S)
            continue

        people = extract_people_from_html(html, final)

        if people:
            for name, title, bio, profile_url, method in people:
                out_rows.append(PersonRow(
                    ticker=ticker,
                    company=company,
                    website=website,
                    name=name,
                    title=title,
                    bio=bio,
                    profile_url=profile_url,
                    source_url=final,
                    method=method
                ))
                # if we found profile links, follow them (often gives better title/bio)
                if profile_url and profile_url not in visited:
                    profile_queue.append(profile_url)

        # Follow profile pages with priority (but cap it)
        while profile_queue and profile_followed < MAX_PROFILE_FOLLOWS and visits < MAX_VISITS_PER_COMPANY:
            purl = _canonical(profile_queue.pop(0))
            if purl in visited:
                continue
            visited.add(purl)

            st2, ctype2, final2, html2 = fetch(purl)
            visits += 1
            profile_followed += 1

            if VERBOSE:
                print(f"PROFILE {profile_followed}/{MAX_PROFILE_FOLLOWS} url={purl} -> status={st2} final={final2} len={len(html2 or '')}")

            if st2 >= 400 or not html2 or "text/html" not in (ctype2 or ""):
                time.sleep(SLEEP_S)
                continue

            save_debug(debug_dir, f"profile{profile_followed}", final2, html2)

            people2 = extract_people_from_html(html2, final2)
            # profile page typically has 1 person; use it to overwrite if same name exists
            if people2:
                for name, title, bio, profile_url, method in people2[:2]:
                    # update if same person exists
                    replaced = False
                    for r in out_rows:
                        if r.name.lower() == name.lower():
                            if title and (not r.title or len(r.title) < 3):
                                r.title = title
                            if bio and (not r.bio or len(r.bio) < 30):
                                r.bio = bio
                            if final2:
                                r.source_url = final2
                            r.profile_url = final2
                            r.method = f"{r.method}+profile"
                            replaced = True
                            break
                    if not replaced:
                        out_rows.append(PersonRow(
                            ticker=ticker,
                            company=company,
                            website=website,
                            name=name,
                            title=title,
                            bio=bio,
                            profile_url=final2,
                            source_url=final2,
                            method=f"profile:{method}",
                        ))

            time.sleep(SLEEP_S)

        # expand queue using internal links from this relevant page
        links = extract_internal_links(html, final, max_links=250)
        for u, txt in links:
            sc = keyword_score(u) + keyword_score(txt)
            if sc >= 4 and u not in visited:
                # high-priority insert near front
                queue.insert(min(8, len(queue)), u)

        time.sleep(SLEEP_S)

        # If we already got a decent set, stop early
        if len(out_rows) >= 10 and visits >= 12:
            break

    # final dedupe by name
    uniq = {}
    for r in out_rows:
        key = r.name.lower()
        if key not in uniq:
            uniq[key] = r
        else:
            # keep the richer one
            if len(r.bio or "") > len(uniq[key].bio or ""):
                uniq[key] = r

    final_rows = list(uniq.values())
    if VERBOSE:
        print(f"Parsed people: {len(final_rows)} (visited {visits} pages)")
        for r in final_rows[:10]:
            print("  -", r.name, "|", r.title)

    return final_rows

# -------------------------
# Run all companies
# -------------------------
def run_all(input_csv: str, output_csv: str, debug_dir: str) -> None:
    in_path = Path(input_csv)
    if not in_path.exists():
        raise FileNotFoundError(f"Input CSV not found: {input_csv}")

    debug_root = Path(debug_dir) if debug_dir else None
    if debug_root:
        debug_root.mkdir(parents=True, exist_ok=True)

    dialect = sniff_dialect(str(in_path))

    out_rows: list[PersonRow] = []

    with open(in_path, "r", encoding="utf-8", errors="ignore", newline="") as f:
        reader = csv.DictReader(f, dialect=dialect)
        if not reader.fieldnames:
            raise ValueError("Input CSV has no header row.")

        ticker_col, company_col, web_col = detect_columns(reader.fieldnames)

        for idx, row in enumerate(reader, 1):
            ticker = _norm_space(row.get(ticker_col, ""))
            company = _norm_space(row.get(company_col, ""))
            website = _norm_space(row.get(web_col, ""))

            if not ticker and not company and not website:
                continue

            # per-company debug folder
            company_debug = (debug_root / ticker) if debug_root else Path(".") / "debug_html" / (ticker or f"row{idx}")
            company_debug.mkdir(parents=True, exist_ok=True)

            try:
                people = scrape_company(ticker, company, website, company_debug)
                out_rows.extend(people)
            except Exception as e:
                print(f"ERROR scraping {ticker} {company}: {e}")
                if VERBOSE:
                    traceback.print_exc()

    # write output
    out_path = Path(output_csv)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    fieldnames = list(asdict(PersonRow(
        ticker="", company="", website="", name="", title="", bio="",
        profile_url="", source_url="", method=""
    )).keys())

    with open(out_path, "w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in out_rows:
            w.writerow(asdict(r))

    print(f"\nDone. Wrote {len(out_rows)} rows to: {out_path}")
    print(f"Debug dir: {debug_root if debug_root else '(none)'}")

# ============================================================
# NOTEBOOK RUN (EDIT THESE THREE PATHS)
# ============================================================
# Example:
# INPUT_CSV = r"C:\...\companies_with_website.csv"
# OUTPUT_CSV = r"C:\...\management_teams.csv"
# DEBUG_DIR  = r"C:\...\debug_html"

# --- Set your paths here ---
INPUT_CSV = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\companies_with_website_2.csv"
OUTPUT_CSV = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\management_teams.csv"
DEBUG_DIR  = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\debug_html"

# Run:
run_all(INPUT_CSV, OUTPUT_CSV, DEBUG_DIR)


FETCH home url=https://alligatorenergy.com.au -> final=https://alligatorenergy.com.au/ status=200 len=75370
FETCH sitemap url=https://alligatorenergy.com.au/sitemap.xml -> status=200 ctype=application/xml len=0
FETCH sitemap url=https://alligatorenergy.com.au/sitemap_index.xml -> status=404 ctype=text/html; charset=utf-8 len=3835
FETCH sitemap url=https://alligatorenergy.com.au/wp-sitemap.xml -> status=404 ctype=text/html; charset=utf-8 len=3835

=== AGE Alligator Energy Ltd ===
Website: alligatorenergy.com.au
Candidates: 29 (showing top 12)
   https://alligatorenergy.com.au/
   https://alligatorenergy.com.au/corporate/board-of-directors
   https://alligatorenergy.com.au/corporate/directors-and-management
   https://alligatorenergy.com.au/team_category/board-of-directors
   https://alligatorenergy.com.au/board-of-directors
   https://alligatorenergy.com.au/directors-and-management
   https://alligatorenergy.com.au/corporate/management-team
   https://alligatorenergy.com.au/board-and-ma