In [4]:
# =========================
# RIU: Resource Definition / PFS flagger (v2)
# - Reads companies_with_website.csv (Company, Ticker, Website)
# - Crawls each company website (sitemap + light crawl) to find PDF links
# - Downloads Quarterly + Annual PDFs, extracts text, flags + summarizes evidence (<= 50 words)
# - Handles:
#   * invalid date parsing (no crash)
#   * PDFs where Content-Type isn't application/pdf (checks PDF signature)
#   * ASX "Access to this site" HTML wall by optionally using Playwright to fetch PDF bytes
# - Outputs: riu_stage_flags.csv + downloaded_reports/<TICKER>/*.pdf + debug_pages/<TICKER>/*.html
# =========================

import re
import time
import json
import hashlib
import shutil
import datetime as dt
from dataclasses import dataclass, asdict
from pathlib import Path
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# -------- Optional: Playwright (for ASX links that return "Access to this site") --------
PLAYWRIGHT_OK = False
try:
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_OK = True
except Exception:
    PLAYWRIGHT_OK = False

# -------- PDF text extraction --------
PDF_OK = False
try:
    from pypdf import PdfReader
    PDF_OK = True
except Exception:
    PDF_OK = False


# -------------------------
# USER SETTINGS
# -------------------------
INPUT_CSV  = r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\companies_with_website.csv"
OUT_CSV    = r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags.csv"

DOWNLOAD_DIR = Path(r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\downloaded_reports")
DEBUG_DIR    = Path(r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\debug_pages")

ONLY_TICKERS = None  # e.g. ["CHN"] for testing; else None for all

REQUEST_TIMEOUT = 25
SLEEP_S = 0.6
MAX_PAGES_TO_VISIT = 35
MAX_SITEMAP_URLS = 6000
MAX_SITEMAP_CHILDREN = 25

# If True, and a PDF link is on asx.com.au and we hit "Access to this site",
# the script will try Playwright to fetch the real PDF bytes.
USE_PLAYWRIGHT_FOR_ASX_PDFS = True

# Debug verbosity
VERBOSE = True

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)


# -------------------------
# KEYWORDS / HEURISTICS
# -------------------------
INTENT_RE = re.compile(
    r"\b(will|plan(?:s|ned)?|intend(?:s|ed)?|to\s+(?:commence|start|undertake|complete|deliver|progress|advance)|"
    r"target(?:s|ing)?|scheduled|expected|underway|next\s+quarter)\b",
    re.IGNORECASE
)

RESOURCE_RE_LIST = [
    r"\bresource definition\b",
    r"\bresource drilling\b",
    r"\binfill drilling\b",
    r"\bmineral resource\b",
    r"\bresource estimate\b",
    r"\bmaiden resource\b",
    r"\bJORC\b",
    r"\bMRE\b",
    r"\bupgrade\b.{0,40}\bresource\b",
]

PFS_RE_LIST = [
    r"\bpre[-\s]?feasibility\b",
    r"\bPFS\b",
    r"\bfeasibility study\b",
    r"\bDFS\b",
    r"\bdefinitive feasibility\b",
    r"\bscoping study\b",
]

RESOURCE_RE = re.compile("|".join(RESOURCE_RE_LIST), re.IGNORECASE)
PFS_RE      = re.compile("|".join(PFS_RE_LIST), re.IGNORECASE)

QUARTERLY_HINT = re.compile(r"(quarterly|appendix\s*4c|activities\s+report|quarter\s+report|3[-\s]?month)", re.IGNORECASE)
ANNUAL_HINT    = re.compile(r"(annual\s+report|appendix\s*4e|year\s+end|full\s+year|financial\s+report|annual\s+financial)", re.IGNORECASE)

REPORT_URL_HINT = re.compile(
    r"(investor|investors|asx|announce|announcement|release|news|media|report|results|financial|presentation|quarter|appendix|4c|4e)",
    re.IGNORECASE
)

PDF_EXT_RE = re.compile(r"\.pdf(\?|$)", re.IGNORECASE)


# -------------------------
# DATA MODEL
# -------------------------
@dataclass
class StageRow:
    Ticker: str
    Company: str
    Website: str
    Quarterly_URL: str
    Annual_URL: str
    Resource_Definition_Flag: str
    PFS_Flag: str
    Evidence_Summary_50w: str
    Evidence_Snippets: str
    Notes: str


# -------------------------
# HELPERS
# -------------------------
def log(msg: str):
    if VERBOSE:
        print(msg)

def safe_filename(s: str) -> str:
    # Keep filenames short for Windows + OneDrive
    s = re.sub(r"[^a-zA-Z0-9._-]+", "_", s)
    return s[:120]

def short_hash(s: str) -> str:
    return hashlib.md5(s.encode("utf-8", errors="ignore")).hexdigest()[:10]

def normalize_site(site: str) -> str:
    site = (site or "").strip()
    if not site:
        return ""
    if not site.startswith(("http://", "https://")):
        site = "https://" + site
    return site.rstrip("/")

def same_domain(base: str, url: str) -> bool:
    try:
        b = urlparse(base)
        u = urlparse(url)
        bn = b.netloc.lower()
        un = u.netloc.lower()
        return (bn == un) or un.endswith("." + bn) or bn.endswith("." + un)
    except Exception:
        return False

def is_asx_url(url: str) -> bool:
    try:
        host = urlparse(url).netloc.lower()
        return host.endswith("asx.com.au") or host.endswith("announcements.asx.com.au")
    except Exception:
        return False

def session() -> requests.Session:
    s = requests.Session()
    s.headers.update({"User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9"})
    return s

def fetch_text(sess: requests.Session, url: str) -> tuple[int, str, str]:
    try:
        r = sess.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        ctype = (r.headers.get("content-type") or "").lower()
        return r.status_code, ctype, r.text or ""
    except Exception:
        return 0, "", ""

def looks_like_pdf_bytes(first_bytes: bytes) -> bool:
    return first_bytes.startswith(b"%PDF-")

def download_via_requests(sess: requests.Session, url: str, outpath: Path) -> tuple[bool, str]:
    """
    Download using requests. Accept if:
      - URL endswith .pdf OR content-type includes pdf OR file starts with %PDF-
    """
    try:
        r = sess.get(url, timeout=REQUEST_TIMEOUT, stream=True, allow_redirects=True)
        status = r.status_code
        if status >= 400:
            return False, f"http_{status}"

        ctype = (r.headers.get("content-type") or "").lower()
        cd    = (r.headers.get("content-disposition") or "").lower()

        # Peek first chunk to detect PDF signature / ASX access wall
        it = r.iter_content(chunk_size=1024 * 64)
        first = next(it, b"") or b""

        # Detect ASX access wall
        if b"Access to this site" in first or b"Access to this Site" in first:
            return False, "asx_access_wall"

        is_pdf = (
            ("pdf" in ctype)
            or PDF_EXT_RE.search(url)
            or (".pdf" in cd)
            or looks_like_pdf_bytes(first)
        )
        if not is_pdf:
            return False, f"not_pdf_ctype={ctype[:40]}"

        outpath.parent.mkdir(parents=True, exist_ok=True)
        with open(outpath, "wb") as f:
            if first:
                f.write(first)
            for chunk in it:
                if chunk:
                    f.write(chunk)

        if not outpath.exists() or outpath.stat().st_size < 10_000:
            return False, "download_too_small"
        return True, "ok"

    except Exception as e:
        return False, f"exception_{type(e).__name__}"

def download_via_playwright(url: str, outpath: Path) -> tuple[bool, str]:
    """
    Fetch bytes via Playwright (helps with ASX access wall).
    """
    if not PLAYWRIGHT_OK:
        return False, "playwright_not_installed"

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(user_agent=USER_AGENT)
            page = context.new_page()

            resp = page.goto(url, wait_until="networkidle", timeout=REQUEST_TIMEOUT * 1000)
            content = page.content() or ""

            # If we landed on ASX access page, try clicking confirmation text/button if present.
            if "Access to this site" in content:
                # Try a few likely selectors / text patterns
                clicked = False
                candidates = [
                    "text=I confirm",
                    "text=I Confirm",
                    "text=confirm that any content",
                    "input[type=submit]",
                    "button:has-text('I confirm')",
                    "a:has-text('I confirm')",
                ]
                for sel in candidates:
                    try:
                        if page.locator(sel).count() > 0:
                            page.locator(sel).first.click(timeout=3000)
                            clicked = True
                            break
                    except Exception:
                        pass

                # Re-try navigation after confirming
                if clicked:
                    resp = page.goto(url, wait_until="networkidle", timeout=REQUEST_TIMEOUT * 1000)

            if resp is None:
                browser.close()
                return False, "no_response"

            headers = {k.lower(): v for k, v in (resp.headers or {}).items()}
            ctype = headers.get("content-type", "").lower()

            body = resp.body()
            if not body:
                browser.close()
                return False, "empty_body"

            if (("pdf" not in ctype) and (not looks_like_pdf_bytes(body[:8]))):
                # Still not a PDF
                browser.close()
                return False, f"not_pdf_playwright_ctype={ctype[:40]}"

            outpath.parent.mkdir(parents=True, exist_ok=True)
            outpath.write_bytes(body)

            browser.close()
            if outpath.stat().st_size < 10_000:
                return False, "pw_download_too_small"
            return True, "ok"

    except Exception as e:
        return False, f"pw_exception_{type(e).__name__}"

def download_file(sess: requests.Session, url: str, outpath: Path) -> tuple[bool, str]:
    """
    Try requests first. If ASX access wall and allowed, try Playwright.
    """
    ok, why = download_via_requests(sess, url, outpath)
    if ok:
        return True, "requests_ok"

    if why == "asx_access_wall" and USE_PLAYWRIGHT_FOR_ASX_PDFS and is_asx_url(url):
        ok2, why2 = download_via_playwright(url, outpath)
        return ok2, f"playwright_{why2}"

    return False, why

def extract_pdf_text(pdf_path: Path) -> str:
    if not PDF_OK:
        return ""
    try:
        reader = PdfReader(str(pdf_path))
        parts = []
        for p in reader.pages:
            t = p.extract_text() or ""
            if t:
                parts.append(t)
        return "\n".join(parts)
    except Exception:
        return ""

def split_sentences(text: str) -> list[str]:
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    sents = re.split(r"(?<=[\.\?\!])\s+|\n+", text)
    sents = [s.strip() for s in sents if s and len(s.strip()) > 20]
    return sents

def trim_to_50_words(s: str) -> str:
    words = s.split()
    if len(words) <= 50:
        return s.strip()
    return " ".join(words[:50]).strip() + "…"

def safe_dt(y: int, mo: int, d: int) -> dt.datetime | None:
    try:
        return dt.datetime(y, mo, d)
    except ValueError:
        return None

def guess_date_from_text(s: str) -> dt.datetime | None:
    s = s or ""

    # YYYY-MM-DD / YYYY/MM/DD
    m = re.search(r"(20\d{2})[-/](\d{1,2})[-/](\d{1,2})", s)
    if m:
        y, mo, d = map(int, m.groups())
        out = safe_dt(y, mo, d)
        if out:
            return out

    # DD MMM YYYY
    m = re.search(r"(\d{1,2})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+(20\d{2})", s, re.IGNORECASE)
    if m:
        d = int(m.group(1)); mon = m.group(2).lower(); y = int(m.group(3))
        months = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}
        out = safe_dt(y, months[mon[:3]], d)
        if out:
            return out

    # MMM YYYY
    m = re.search(r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+(20\d{2})", s, re.IGNORECASE)
    if m:
        mon = m.group(1).lower(); y = int(m.group(2))
        months = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}
        return dt.datetime(y, months[mon[:3]], 1)

    # YYYYMMDD
    m = re.search(r"(20\d{2})(\d{2})(\d{2})", s)
    if m:
        y, mo, d = map(int, m.groups())
        out = safe_dt(y, mo, d)
        if out:
            return out

    return None

def extract_pdf_links_from_html(base_url: str, html: str) -> list[dict]:
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.find_all("a", href=True):
        href = (a.get("href") or "").strip()
        if not href:
            continue
        absu = urljoin(base_url, href)
        label = " ".join(a.get_text(" ", strip=True).split())

        # Prefer explicit .pdf links, but also catch "download" style links that include "pdf" in query/path
        if PDF_EXT_RE.search(absu) or ("pdf" in absu.lower() and ("download" in absu.lower() or "media" in absu.lower() or "uploads" in absu.lower())):
            out.append({
                "url": absu,
                "label": label,
                "date_guess": guess_date_from_text(label + " " + absu),
                "host": "asx" if is_asx_url(absu) else "site"
            })
    return out

def best_doc_candidate(cands: list[dict], want: str) -> dict | None:
    if not cands:
        return None

    # Prefer site-hosted PDFs over ASX-hosted PDFs (ASX may block automation)
    site_first = sorted(cands, key=lambda c: 0 if c.get("host") == "site" else 1)

    if want == "quarterly":
        filt = [c for c in site_first if QUARTERLY_HINT.search((c.get("label","") + " " + c.get("url","")))]
    else:
        filt = [c for c in site_first if ANNUAL_HINT.search((c.get("label","") + " " + c.get("url","")))]

    if not filt:
        filt = site_first[:]  # fallback

    def key(c):
        d = c.get("date_guess")
        return d or dt.datetime(1970, 1, 1)

    filt.sort(key=key, reverse=True)
    return filt[0]

def get_sitemap_urls(sess: requests.Session, site: str) -> list[str]:
    starts = ["/sitemap.xml", "/sitemap_index.xml", "/wp-sitemap.xml"]
    all_urls = []

    for p in starts:
        sm = site + p
        status, ctype, txt = fetch_text(sess, sm)
        time.sleep(SLEEP_S)

        if status >= 400 or not txt or "xml" not in ctype:
            continue

        soup = BeautifulSoup(txt, "xml")
        sitemaps = [loc.get_text(strip=True) for loc in soup.find_all("sitemap") for loc in loc.find_all("loc")]  # type: ignore

        if sitemaps:
            for child in sitemaps[:MAX_SITEMAP_CHILDREN]:
                st, ct, t2 = fetch_text(sess, child)
                time.sleep(SLEEP_S)
                if st >= 400 or not t2 or "xml" not in ct:
                    continue
                s2 = BeautifulSoup(t2, "xml")
                locs = [loc.get_text(strip=True) for loc in s2.find_all("loc")]
                for u in locs:
                    if REPORT_URL_HINT.search(u):
                        all_urls.append(u)
                if len(all_urls) >= MAX_SITEMAP_URLS:
                    return all_urls[:MAX_SITEMAP_URLS]
        else:
            locs = [loc.get_text(strip=True) for loc in soup.find_all("loc")]
            for u in locs:
                if REPORT_URL_HINT.search(u):
                    all_urls.append(u)
            if all_urls:
                return all_urls[:MAX_SITEMAP_URLS]

    return all_urls[:MAX_SITEMAP_URLS]

def discover_report_pages(site: str) -> list[str]:
    paths = [
        "/investors", "/investor", "/investor-centre",
        "/asx-announcements", "/announcements", "/asx-releases",
        "/reports", "/financial-reports", "/results", "/news", "/media",
        "/investors/asx-announcements", "/investors/announcements", "/investors/reports",
    ]
    return [site + p for p in paths]

def pick_latest_quarterly_and_annual_from_site(sess: requests.Session, site: str, debug_dir: Path) -> tuple[dict|None, dict|None, list[str]]:
    notes = []
    pdf_candidates = []

    debug_dir.mkdir(parents=True, exist_ok=True)

    pages = get_sitemap_urls(sess, site)
    if pages:
        notes.append(f"sitemap_pages={len(pages)}")
    else:
        notes.append("no_sitemap_or_no_filtered_urls")
        pages = discover_report_pages(site)

    visited = set()
    queue = []

    for u in pages[:250]:
        if REPORT_URL_HINT.search(u):
            queue.append(u)

    while queue and len(visited) < MAX_PAGES_TO_VISIT:
        url = queue.pop(0)
        if url in visited:
            continue
        if not same_domain(site, url):
            continue

        visited.add(url)
        st, ct, html = fetch_text(sess, url)
        time.sleep(SLEEP_S)

        if not html or st >= 400:
            continue

        # Safer debug filename (hash-based)
        dbg_name = f"{len(visited):02d}_{short_hash(url)}_{safe_filename(urlparse(url).path)}.html"
        (debug_dir / dbg_name).write_text(html, encoding="utf-8", errors="ignore")

        pdfs = extract_pdf_links_from_html(url, html)
        for p in pdfs:
            p["source_page"] = url
            pdf_candidates.append(p)

        soup = BeautifulSoup(html, "lxml")
        new_links = 0
        for a in soup.find_all("a", href=True):
            href = (a.get("href") or "").strip()
            if not href:
                continue
            absu = urljoin(url, href)
            if absu in visited:
                continue
            if not same_domain(site, absu):
                continue
            if REPORT_URL_HINT.search(absu):
                queue.append(absu)
                new_links += 1
                if new_links >= 30:
                    break

    notes += [f"visited_pages={len(visited)}", f"pdf_candidates_raw={len(pdf_candidates)}"]

    if not pdf_candidates:
        return None, None, notes + ["no_pdf_links_found"]

    # de-dup by URL
    seen = set()
    dedup = []
    for c in pdf_candidates:
        u = c.get("url")
        if not u or u in seen:
            continue
        seen.add(u)
        if not c.get("date_guess"):
            c["date_guess"] = guess_date_from_text((c.get("label","") + " " + u))
        dedup.append(c)

    q = best_doc_candidate(dedup, want="quarterly")
    a = best_doc_candidate(dedup, want="annual")

    if q and a and q.get("url") == a.get("url"):
        annuals = [c for c in dedup if ANNUAL_HINT.search((c.get("label","") + " " + c.get("url","")))]
        if annuals:
            annuals.sort(key=lambda c: c.get("date_guess") or dt.datetime(1970,1,1), reverse=True)
            a = annuals[0]

    notes.append(f"pdf_candidates_dedup={len(dedup)}")
    return q, a, notes

def evidence_from_text(text: str) -> tuple[bool, bool, list[str]]:
    sents = split_sentences(text)
    resource_hits = []
    pfs_hits = []

    for s in sents:
        has_intent = bool(INTENT_RE.search(s))
        if RESOURCE_RE.search(s):
            if has_intent or ("program" in s.lower()) or ("next" in s.lower()):
                resource_hits.append(s)
        if PFS_RE.search(s):
            if has_intent or ("study" in s.lower()) or ("engineering" in s.lower()):
                pfs_hits.append(s)

    resource_hits = sorted(resource_hits, key=len)[:2]
    pfs_hits      = sorted(pfs_hits, key=len)[:2]

    return (len(resource_hits) > 0), (len(pfs_hits) > 0), (resource_hits + pfs_hits)

def process_company(ticker: str, company: str, website: str, base_out: Path, base_debug: Path) -> StageRow:
    site = normalize_site(website)
    sess = session()
    tkr = (ticker or "").strip().upper()

    if not site or not tkr:
        return StageRow(tkr, company, website, "", "", "N", "N", "", "", "missing ticker or website")

    company_out = base_out / tkr
    company_dbg = base_debug / tkr
    company_out.mkdir(parents=True, exist_ok=True)
    company_dbg.mkdir(parents=True, exist_ok=True)

    q_doc, a_doc, notes = pick_latest_quarterly_and_annual_from_site(sess, site, company_dbg)

    q_url = q_doc["url"] if q_doc else ""
    a_url = a_doc["url"] if a_doc else ""

    resource_flag = "N"
    pfs_flag = "N"
    snippets = []
    summary = ""
    note_txt = "; ".join(notes)

    # Save a small debug JSON about what we picked
    try:
        dbg = {"ticker": tkr, "site": site, "quarterly": q_doc, "annual": a_doc, "notes": notes}
        (company_dbg / "picked_docs.json").write_text(json.dumps(dbg, default=str, indent=2), encoding="utf-8")
    except Exception:
        pass

    def handle_doc(doc: dict|None, doc_tag: str):
        nonlocal resource_flag, pfs_flag, snippets, summary, note_txt
        if not doc:
            note_txt += f"; no_{doc_tag}_doc"
            return

        url = (doc.get("url") or "").strip()
        if not url:
            note_txt += f"; empty_{doc_tag}_url"
            return

        date_guess = doc.get("date_guess")
        datestr = date_guess.strftime("%Y%m%d") if isinstance(date_guess, dt.datetime) else "unknown"
        pdf_path = company_out / f"{tkr}_{doc_tag}_{datestr}.pdf"

        log(f"  - {doc_tag}: {url}")
        ok, why = download_file(sess, url, pdf_path)
        time.sleep(SLEEP_S)

        if not ok:
            note_txt += f"; download_failed_{doc_tag}({why})"
            return
        else:
            note_txt += f"; downloaded_{doc_tag}({why})"

        txt = extract_pdf_text(pdf_path)
        if not txt.strip():
            note_txt += f"; no_text_{doc_tag}_pdf"
            return

        r_ok, p_ok, hits = evidence_from_text(txt)
        if r_ok:
            resource_flag = "Y"
        if p_ok:
            pfs_flag = "Y"
        if hits:
            snippets.extend([f"[{doc_tag.upper()}] {h}" for h in hits])

    handle_doc(q_doc, "quarterly")
    handle_doc(a_doc, "annual")

    if snippets:
        combined = " ".join(snippets)
        summary = trim_to_50_words(combined)
        snippet_txt = "\n".join(snippets[:6])
    else:
        snippet_txt = ""

    return StageRow(
        Ticker=tkr,
        Company=company,
        Website=website,
        Quarterly_URL=q_url,
        Annual_URL=a_url,
        Resource_Definition_Flag=resource_flag,
        PFS_Flag=pfs_flag,
        Evidence_Summary_50w=summary,
        Evidence_Snippets=snippet_txt,
        Notes=note_txt
    )

def read_companies(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, sep=None, engine="python")
    df.columns = [c.strip() for c in df.columns]

    colmap = {c.lower(): c for c in df.columns}
    def pick(*names):
        for n in names:
            if n in colmap:
                return colmap[n]
        return None

    c_company = pick("company", "name", "co", "issuer")
    c_ticker  = pick("ticker", "asx", "code", "symbol")
    c_web     = pick("website", "web", "url", "site")

    if not c_company or not c_ticker or not c_web:
        raise ValueError(f"Missing required columns. Found: {list(df.columns)}. Need Company/Ticker/Website (or similar).")

    df = df.rename(columns={c_company:"Company", c_ticker:"Ticker", c_web:"Website"})
    df["Ticker"] = df["Ticker"].astype(str).str.strip()
    df["Company"] = df["Company"].astype(str).str.strip()
    df["Website"] = df["Website"].astype(str).str.strip()
    return df

def run_all():
    DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    DEBUG_DIR.mkdir(parents=True, exist_ok=True)

    if USE_PLAYWRIGHT_FOR_ASX_PDFS and not PLAYWRIGHT_OK:
        print("NOTE: Playwright not installed, ASX links may fail. Install with: pip install playwright; python -m playwright install chromium")

    df = read_companies(INPUT_CSV)
    if ONLY_TICKERS:
        df = df[df["Ticker"].str.upper().isin([t.upper() for t in ONLY_TICKERS])].copy()

    out_rows = []
    total = len(df)

    for idx, r in df.iterrows():
        tkr = str(r["Ticker"]).strip().upper()
        company = str(r["Company"]).strip()
        web = str(r["Website"]).strip()

        print(f"\n[{len(out_rows)+1}/{total}] {tkr} - {company} ({web})")

        try:
            row = process_company(tkr, company, web, DOWNLOAD_DIR, DEBUG_DIR)
        except Exception as e:
            row = StageRow(
                Ticker=tkr, Company=company, Website=web,
                Quarterly_URL="", Annual_URL="",
                Resource_Definition_Flag="N", PFS_Flag="N",
                Evidence_Summary_50w="", Evidence_Snippets="",
                Notes=f"FATAL_{type(e).__name__}:{e}"
            )

        out_rows.append(asdict(row))

    out_df = pd.DataFrame(out_rows)
    out_df.to_csv(OUT_CSV, index=False, encoding="utf-8")
    print(f"\nDone. Wrote: {OUT_CSV}")
    print(f"Downloaded reports folder: {DOWNLOAD_DIR}")

    hits = out_df[(out_df["Resource_Definition_Flag"]=="Y") | (out_df["PFS_Flag"]=="Y")].copy()
    print(f"Flagged companies: {len(hits)} / {len(out_df)}")
    return out_df, hits

# ---- RUN ----
out_df, hits_df = run_all()
hits_df



[1/91] RMS - Ramelius Resources Ltd (rameliusresources.com.au)
  - quarterly: https://www.rameliusresources.com.au/wp-content/uploads/bsk-pdf-manager/2026/01/3016456.pdf
  - annual: https://www.rameliusresources.com.au/wp-content/uploads/bsk-pdf-manager/2025/10/2025-Annual-Report-1.pdf

[2/91] FFM - FireFly Metals Ltd (fireflymetals.com.au)

[3/91] CHN - Chalice Mining Ltd (chalicemining.com)
  - quarterly: https://chalicemining.com/wp-content/uploads/2026/01/61309309.pdf
  - annual: https://chalicemining.com/wp-content/uploads/2025/09/61286187-2.pdf

[4/91] RXL - Rox Resources Ltd (roxresources.com.au)

[5/91] NMG - New Murchison Gold Ltd (newmurchgold.com.au)

[6/91] MEI - Meteoric Resources NL (meteoric.com.au)

[7/91] SVL - Silver Mines Ltd (silvermines.com.au)
  - quarterly: https://www.silvermines.com.au/wp-content/uploads/2025/11/06rb480zcsf5jd.pdf
  - annual: https://www.silvermines.com.au/wp-content/uploads/2025/10/06pwx1cz93f4cj.pdf

[8/91] STK - Strickland Metals Ltd (stric

CAUTION: startxref found while searching for %%EOF. The file might be truncated and some data might not be read.
EOF marker not found


  - annual: https://brightstarresources.com.au/wp-content/uploads/2025/04/20221221_WA-gold-results-boost-ambitions-for-Brightstar-flagship.pdf


CAUTION: startxref found while searching for %%EOF. The file might be truncated and some data might not be read.
EOF marker not found



[14/91] WWI - West Wits Mining Ltd (westwitsmining.com)
  - quarterly: http://westwitsmining.com/wp-content/uploads/2021/11/Quarterly-Report-July-Sep-2021.pdf
  - annual: https://westwitsmining.com/wp-content/uploads/2026/02/WWI-Corporate-Presentation-February-2026.pdf

[15/91] CTM - Centaurus Metals Ltd (centaurus.com.au)

[16/91] MM8 - Medallion Metals Ltd (medallionmetals.com.au)
  - quarterly: https://medallionmetals.com.au/wp-content/uploads/2025/09/Medallion-Metals-Group-Purchase-Order-TCs.pdf
  - annual: https://medallionmetals.com.au/wp-content/uploads/2025/09/Medallion-Metals-Group-Purchase-Order-TCs.pdf

[17/91] BGD - Barton Gold Holdings Ltd (bartongold.com.au)
  - quarterly: https://bartongold.com.au/wp-content/uploads/BartonGoldProspectus14May21.pdf
  - annual: https://bartongold.com.au/wp-content/uploads/BartonGoldProspectus14May21.pdf

[18/91] RHI - Red Hill Minerals Ltd (redhillminerals.com.au)

[19/91] HCH - Hot Chili Ltd (hotchili.net.au)
  - quarterly: https://www.h

Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 40 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong pointing object 83 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)
Ignoring wrong pointing object 91 0 (offset 0)
Ignoring wrong pointing object 98 0 (offset 0)
Ignoring wrong pointing object 100 0 (offset 0)
Ignoring wrong pointing object 107 0 (offset 0)
Ignoring wron

  - annual: https://www.cauldronenergy.com.au/wp-content/uploads/2025/04/20250225_CXU_Investor-Pres_RIU-Explorers.pdf


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 40 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong pointing object 83 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)
Ignoring wrong pointing object 91 0 (offset 0)
Ignoring wrong pointing object 98 0 (offset 0)
Ignoring wrong pointing object 100 0 (offset 0)
Ignoring wrong pointing object 107 0 (offset 0)
Ignoring wron


[55/91] AR3 - Australian Rare Earths Ltd (ar3.com.au)

[56/91] OR3 - Ore Resources Ltd (oreresources.com.au)

[57/91] NMT - Neometals Ltd (neometals.com.au)
  - quarterly: https://www.neometals.com.au/wp-content/uploads/2025/10/2025_Neometals-Sustainability-Report-FINAL.pdf
  - annual: https://www.neometals.com.au/wp-content/uploads/2025/10/2025_Neometals-Sustainability-Report-FINAL.pdf

[58/91] PGO - Pacgold Ltd (pacgold.com.au)

[59/91] CAE - Cannindah Resources Ltd (cannindah.com.au)
  - quarterly: https://cannindah.com.au/wp-content/uploads/2023/08/Noosa-mining-powerpoint.pdf
  - annual: https://cannindah.com.au/wp-content/uploads/2023/08/Noosa-mining-powerpoint.pdf

[60/91] ORD - Ordell Minerals Ltd (ordellminerals.com.au)
  - quarterly: https://www.ordellminerals.com.au/downloads/reports/ord_qa202512.pdf
  - annual: https://www.ordellminerals.com.au/downloads/reports/ord_af2025.pdf

[61/91] GA8 - GoldArc Resources Ltd (goldarcres.com.au)

[62/91] HMX - Hammer Metals Ltd (hammerm

Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)


  - annual: https://hammermetals.com.au/wp-content/uploads/Hammer-Shareholder-Communications-Aug-2022.pdf


Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)



[63/91] IPT - Impact Minerals Ltd (impactminerals.com.au)

[64/91] DYM - Dynamic Metals Ltd (dynamicmetals.com.au)

[65/91] CPN - Caspin Resources Ltd (caspin.com.au)
  - quarterly: https://www.caspin.com.au/wp-content/uploads/2023/11/Website-Terms-of-Use.pdf


Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)


  - annual: https://www.caspin.com.au/wp-content/uploads/2023/11/Website-Terms-of-Use.pdf


Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)



[66/91] WIN - WIN Metals Ltd (winmetals.com.au)

[67/91] MAG - Magmatic Resources Ltd (magmaticresources.com)
  - quarterly: https://magmaticresources.com/mag/wp-content/uploads/2020/02/Eagle-Research-Advisory-Report-Feb-2020.pdf
  - annual: https://magmaticresources.com/mag/wp-content/uploads/2020/02/Eagle-Research-Advisory-Report-Feb-2020.pdf

[68/91] IDA - Indiana Resources Ltd (indianaresources.com.au)
  - quarterly: https://indianaresources.com.au/wp-content/uploads/2025/06/IDA-company-profile-2025-June-1.pdf


Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)


  - annual: https://indianaresources.com.au/wp-content/uploads/2025/06/IDA-company-profile-2025-June-1.pdf


Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)



[69/91] PGM - Platina Resources Ltd (platinaresources.com.au)

[70/91] ALM - Alma Metals Ltd (almametals.com.au)
  - quarterly: https://almametals.com.au/wp-content/uploads/2024/09/Alma-Metals-Research-Report_September-2024.pdf
  - annual: https://almametals.com.au/wp-content/uploads/2024/09/Alma-Metals-Research-Report_September-2024.pdf

[71/91] ADG - Adelong Gold Ltd (adelonggold.com)
  - quarterly: https://adelonggold.com/wp-content/uploads/ADGTargetMarketDetermination21May24.pdf
  - annual: https://adelonggold.com/wp-content/uploads/ADGTargetMarketDetermination21May24.pdf

[72/91] TG6 - TG Metals Ltd (tgmetals.com.au)

[73/91] JAV - Javelin Minerals Ltd (javelinminerals.com.au)
  - quarterly: https://javelinminerals.com.au/wp-content/uploads/2026/01/MLB-Jan22_26.pdf
  - annual: https://javelinminerals.com.au/wp-content/uploads/2026/01/MLB-Jan22_26.pdf

[74/91] SGA - Sarytogan Graphite Ltd (sarytogangraphite.com.au)

[75/91] VMS - Venari Minerals NL (venariminerals.com)

[76/91] AZ

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\julian.diaz\\OneDrive - XENITH CONSULTING PTY LTD\\Documents\\01_BD\\96_2026_RIU_Conference_Perth\\riu_stage_flags.csv'

In [9]:
# =========================
# RIU: REBUILD CSV from already-downloaded PDFs (NO crawling, NO downloading)
# - Scans downloaded_reports/<TICKER>/*.pdf
# - Picks latest quarterly + annual per ticker
# - Extracts text (pypdf -> fallback PyMuPDF)
# - Flags Resource Definition + PFS, writes riu_stage_flags_REBUILT.csv and _hits.csv
# =========================

import re
import datetime as dt
from pathlib import Path
from dataclasses import dataclass, asdict

import pandas as pd

# --- PDF text extractors (try pypdf first, then fitz) ---
def extract_pdf_text_pypdf(pdf_path: Path) -> str:
    try:
        from pypdf import PdfReader
        reader = PdfReader(str(pdf_path))
        parts = []
        for p in reader.pages:
            t = p.extract_text() or ""
            if t.strip():
                parts.append(t)
        return "\n".join(parts)
    except Exception:
        return ""

def extract_pdf_text_fitz(pdf_path: Path) -> str:
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(str(pdf_path))
        parts = []
        for page in doc:
            t = page.get_text("text") or ""
            if t.strip():
                parts.append(t)
        doc.close()
        return "\n".join(parts)
    except Exception:
        return ""

def extract_pdf_text(pdf_path: Path) -> str:
    txt = extract_pdf_text_pypdf(pdf_path)
    if txt.strip():
        return txt
    return extract_pdf_text_fitz(pdf_path)

# -------------------------
# SETTINGS (EDIT THESE TWO)
# -------------------------
DOWNLOAD_DIR = Path(r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\downloaded_reports")
OUT_CSV_REBUILT = r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_REBUILT.csv"

# -------------------------
# KEYWORDS / HEURISTICS (same spirit as original)
# -------------------------
INTENT_RE = re.compile(
    r"\b(will|plan(?:s|ned)?|intend(?:s|ed)?|to\s+(?:commence|start|undertake|complete|deliver|progress|advance)|"
    r"target(?:s|ing)?|scheduled|expected|underway|next\s+quarter|H[12]|FY\d{2})\b",
    re.IGNORECASE
)

RESOURCE_RE_LIST = [
    r"\bresource definition\b",
    r"\bresource drilling\b",
    r"\binfill drilling\b",
    r"\bmineral resource\b",
    r"\bresource estimate\b",
    r"\bmaiden resource\b",
    r"\bJORC\b",
    r"\bMRE\b",
    r"\bupgrade\b.{0,40}\bresource\b",
]
PFS_RE_LIST = [
    r"\bpre[-\s]?feasibility\b",
    r"\bPFS\b",
    r"\bfeasibility study\b",
    r"\bDFS\b",
    r"\bdefinitive feasibility\b",
    r"\bscoping study\b",
]

RESOURCE_RE = re.compile("|".join(RESOURCE_RE_LIST), re.IGNORECASE)
PFS_RE      = re.compile("|".join(PFS_RE_LIST), re.IGNORECASE)

# -------------------------
# UTILS
# -------------------------
def split_sentences(text: str) -> list[str]:
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    sents = re.split(r"(?<=[\.\?\!])\s+|\n+", text)
    return [s.strip() for s in sents if s and len(s.strip()) > 20]

def trim_to_50_words(s: str) -> str:
    words = s.split()
    if len(words) <= 50:
        return s.strip()
    return " ".join(words[:50]).strip() + "…"

def evidence_from_text(text: str) -> tuple[bool, bool, list[str]]:
    sents = split_sentences(text)
    resource_hits = []
    pfs_hits = []

    for s in sents:
        has_intent = bool(INTENT_RE.search(s))
        if RESOURCE_RE.search(s):
            if has_intent or ("program" in s.lower()) or ("next" in s.lower()):
                resource_hits.append(s)
        if PFS_RE.search(s):
            if has_intent or ("study" in s.lower()) or ("engineering" in s.lower()):
                pfs_hits.append(s)

    resource_hits = sorted(resource_hits, key=len)[:2]
    pfs_hits      = sorted(pfs_hits, key=len)[:2]
    return (len(resource_hits) > 0), (len(pfs_hits) > 0), (resource_hits + pfs_hits)

def parse_date_from_filename(name: str) -> dt.datetime | None:
    """
    Supports:
      - 20250131 (YYYYMMDD)
      - 2025-01-31 or 2025_01_31
      - fallback: None
    """
    s = name
    m = re.search(r"(20\d{2})[.\-_]?(0[1-9]|1[0-2])[.\-_]?([0-2]\d|3[01])", s)
    if m:
        y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
        try:
            return dt.datetime(y, mo, d)
        except ValueError:
            return None
    return None

def doc_tag_from_filename(name: str) -> str:
    n = name.lower()
    # match your saved pattern: TKR_quarterly_YYYYMMDD.pdf etc.
    if "quarter" in n or "_qtr" in n or "4c" in n:
        return "quarterly"
    if "annual" in n or "4e" in n or "fy" in n:
        return "annual"
    # if unknown, keep it as "other"
    return "other"

def pick_latest_pdf(files: list[Path]) -> Path | None:
    if not files:
        return None
    def key(p: Path):
        d = parse_date_from_filename(p.name)
        mtime = dt.datetime.fromtimestamp(p.stat().st_mtime)
        # prefer parsed date if present; else use modified time
        return d or mtime
    return sorted(files, key=key, reverse=True)[0]

@dataclass
class StageRow:
    Ticker: str
    Quarterly_PDF: str
    Annual_PDF: str
    Resource_Definition_Flag: str
    PFS_Flag: str
    Evidence_Summary_50w: str
    Evidence_Snippets: str
    Notes: str

# -------------------------
# MAIN: rebuild
# -------------------------
def rebuild_from_downloads(download_dir: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    if not download_dir.exists():
        raise FileNotFoundError(f"DOWNLOAD_DIR not found: {download_dir}")

    rows = []
    tickers = sorted([p for p in download_dir.iterdir() if p.is_dir()])

    print(f"Found ticker folders: {len(tickers)} in {download_dir}")

    for i, tdir in enumerate(tickers, start=1):
        ticker = tdir.name.upper().strip()
        pdfs = list(tdir.glob("*.pdf"))
        if not pdfs:
            rows.append(asdict(StageRow(ticker, "", "", "N", "N", "", "", "no_pdfs_in_folder")))
            continue

        quarterly_files = [p for p in pdfs if doc_tag_from_filename(p.name) == "quarterly"]
        annual_files    = [p for p in pdfs if doc_tag_from_filename(p.name) == "annual"]

        # If your filenames don't contain quarterly/annual, fall back to "newest 2"
        q_pdf = pick_latest_pdf(quarterly_files) if quarterly_files else None
        a_pdf = pick_latest_pdf(annual_files) if annual_files else None

        notes = []
        if q_pdf is None and a_pdf is None:
            # fallback: use two newest PDFs
            newest = sorted(pdfs, key=lambda p: p.stat().st_mtime, reverse=True)[:2]
            if newest:
                q_pdf = newest[0]
                a_pdf = newest[1] if len(newest) > 1 else newest[0]
                notes.append("fallback_used_newest_2_pdfs")
        else:
            if q_pdf is None and pdfs:
                q_pdf = pick_latest_pdf(pdfs)
                notes.append("no_quarterly_named_pdf_used_latest_any")
            if a_pdf is None and pdfs:
                a_pdf = pick_latest_pdf(pdfs)
                notes.append("no_annual_named_pdf_used_latest_any")

        print(f"[{i}/{len(tickers)}] {ticker} | q={q_pdf.name if q_pdf else 'None'} | a={a_pdf.name if a_pdf else 'None'}")

        resource_flag = "N"
        pfs_flag = "N"
        snippets = []

        def scan_one(p: Path, tag: str):
            nonlocal resource_flag, pfs_flag, snippets, notes
            if p is None:
                return
            txt = extract_pdf_text(p)
            if not txt.strip():
                notes.append(f"no_text_{tag}")
                return
            r_ok, p_ok, hits = evidence_from_text(txt)
            if r_ok:
                resource_flag = "Y"
            if p_ok:
                pfs_flag = "Y"
            if hits:
                snippets.extend([f"[{tag.upper()}] {h}" for h in hits])

        scan_one(q_pdf, "quarterly")
        scan_one(a_pdf, "annual")

        if snippets:
            summary = trim_to_50_words(" ".join(snippets))
            snippet_txt = "\n".join(snippets[:6])
        else:
            summary = ""
            snippet_txt = ""

        rows.append(asdict(StageRow(
            Ticker=ticker,
            Quarterly_PDF=str(q_pdf) if q_pdf else "",
            Annual_PDF=str(a_pdf) if a_pdf else "",
            Resource_Definition_Flag=resource_flag,
            PFS_Flag=pfs_flag,
            Evidence_Summary_50w=summary,
            Evidence_Snippets=snippet_txt,
            Notes="; ".join(notes) if notes else ""
        )))

    df = pd.DataFrame(rows).sort_values("Ticker").reset_index(drop=True)
    hits = df[(df["Resource_Definition_Flag"]=="Y") | (df["PFS_Flag"]=="Y")].copy()
    return df, hits

# ---- RUN ----
out_df, hits_df = rebuild_from_downloads(DOWNLOAD_DIR)

print("\nRebuilt rows:", len(out_df), "| hits:", len(hits_df))
out_df.head(5)




Found ticker folders: 91 in C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\downloaded_reports
[2/91] ADG | q=ADG_quarterly_unknown.pdf | a=ADG_annual_unknown.pdf
[4/91] ALM | q=ALM_quarterly_unknown.pdf | a=ALM_annual_unknown.pdf
[7/91] ARL | q=ARL_quarterly_20250301.pdf | a=ARL_annual_20190630.pdf
[10/91] AZ9 | q=AZ9_quarterly_20250102.pdf | a=AZ9_annual_20250528.pdf
[11/91] AZY | q=AZY_quarterly_20251201.pdf | a=AZY_annual_20200312.pdf
[13/91] BGD | q=BGD_quarterly_unknown.pdf | a=BGD_annual_unknown.pdf
[15/91] BNR | q=BNR_quarterly_20231231.pdf | a=BNR_annual_20231201.pdf
[17/91] BTR | q=BTR_quarterly_20250420.pdf | a=BTR_annual_20250420.pdf


CAUTION: startxref found while searching for %%EOF. The file might be truncated and some data might not be read.
EOF marker not found
CAUTION: startxref found while searching for %%EOF. The file might be truncated and some data might not be read.
EOF marker not found


[18/91] CAE | q=CAE_quarterly_unknown.pdf | a=CAE_annual_unknown.pdf
[19/91] CHN | q=CHN_quarterly_20251226.pdf | a=CHN_annual_unknown.pdf
[20/91] CMO | q=CMO_quarterly_20260210.pdf | a=CMO_annual_20260210.pdf
[21/91] CNB | q=CNB_quarterly_20260101.pdf | a=CNB_annual_20260101.pdf
[23/91] CPN | q=CPN_quarterly_unknown.pdf | a=CPN_annual_unknown.pdf


Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 40 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong pointing object 83 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)
Ignoring wrong po

[26/91] CXU | q=CXU_quarterly_20250420.pdf | a=CXU_annual_20250420.pdf


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 40 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong pointing object 83 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)
Ignoring wrong pointing object 91 0 (offset 0)
Ignoring wrong pointing object 98 0 (offset 0)
Ignoring wrong pointing object 100 0 (offset 0)
Ignoring wrong pointing object 107 0 (offset 0)
Ignoring wron

[27/91] DEV | q=DEV_quarterly_20251201.pdf | a=DEV_annual_20250601.pdf
[31/91] FAL | q=FAL_quarterly_20260120.pdf | a=FAL_annual_unknown.pdf
[33/91] FLG | q=FLG_quarterly_20250102.pdf | a=FLG_annual_20251001.pdf
[37/91] GG8 | q=GG8_quarterly_20260131.pdf | a=GG8_annual_20260131.pdf
[38/91] GHM | q=GHM_quarterly_20251120.pdf | a=GHM_annual_20251120.pdf
[39/91] HAS | q=HAS_quarterly_unknown.pdf | a=HAS_annual_unknown.pdf
[40/91] HCH | q=HCH_quarterly_20260204.pdf | a=HCH_annual_20260204.pdf
[41/91] HMG | q=HMG_quarterly_20260129.pdf | a=HMG_annual_20260209.pdf


Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)


[42/91] HMX | q=HMX_quarterly_unknown.pdf | a=HMX_annual_unknown.pdf
[43/91] HRN | q=HRN_quarterly_20190819.pdf | a=HRN_annual_20190819.pdf


Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)


[45/91] IDA | q=IDA_quarterly_unknown.pdf | a=IDA_annual_unknown.pdf


Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)


[49/91] JAV | q=JAV_quarterly_20260122.pdf | a=JAV_annual_20260122.pdf
[50/91] KOB | q=KOB_quarterly_unknown.pdf | a=KOB_annual_unknown.pdf
[55/91] MAG | q=MAG_quarterly_unknown.pdf | a=MAG_annual_unknown.pdf
[56/91] MAT | q=MAT_quarterly_unknown.pdf | a=MAT_annual_unknown.pdf
[57/91] MAU | q=MAU_quarterly_20260204.pdf | a=MAU_annual_20260204.pdf
[59/91] MM8 | q=MM8_quarterly_unknown.pdf | a=MM8_annual_unknown.pdf
[60/91] MMA | q=MMA_quarterly_20230301.pdf | a=MMA_annual_20250601.pdf
[62/91] NMT | q=NMT_quarterly_20251020.pdf | a=NMT_annual_20251020.pdf
[63/91] NSM | q=NSM_quarterly_unknown.pdf | a=NSM_annual_unknown.pdf
[65/91] ORD | q=ORD_quarterly_20251201.pdf | a=ORD_annual_unknown.pdf
[66/91] PC2 | q=PC2_quarterly_20251125.pdf | a=PC2_annual_20251125.pdf
[71/91] RDM | q=RDM_quarterly_unknown.pdf | a=RDM_annual_20250328.pdf
[73/91] RMS | q=RMS_quarterly_20260129.pdf | a=RMS_annual_20251020.pdf
[74/91] RNU | q=RNU_quarterly_20191120.pdf | a=RNU_annual_20191120.pdf
[75/91] RNX | q=RN

Unnamed: 0,Ticker,Quarterly_PDF,Annual_PDF,Resource_Definition_Flag,PFS_Flag,Evidence_Summary_50w,Evidence_Snippets,Notes
0,AAR,,,N,N,,,no_pdfs_in_folder
1,ADG,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,N,,,
2,AGE,,,N,N,,,no_pdfs_in_folder
3,ALM,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,Y,Y,[QUARTERLY] The results of the drilling will f...,[QUARTERLY] The results of the drilling will f...,
4,AM5,,,N,N,,,no_pdfs_in_folder


In [10]:
tmp = OUT_CSV_REBUILT
out_df.to_csv(tmp, index=False, encoding="utf-8")
hits_df.to_csv(tmp.replace(".csv", "_hits.csv"), index=False, encoding="utf-8")
print("Saved:", tmp)
print("Saved:", tmp.replace(".csv", "_hits.csv"))


Saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_REBUILT.csv
Saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_REBUILT_hits.csv


In [11]:
# ============================================================
# RIU - PASS 2 (Retry only the "missing/no-info" companies)
# - Reads:
#     1) companies_with_website.csv  (Ticker, Company, Website)
#     2) riu_stage_flags_REBUILT.csv (your 91-row rebuilt results)
# - Identifies rows that look "missing" (no evidence, no PDFs, or no_text errors)
# - Crawls ONLY those websites, finds likely PDFs, downloads a few, extracts text,
#   flags Resource Definition / PFS, summarizes evidence
# - Merges back into a compiled 91-row output
#
# Notes:
# - Uses PyMuPDF (fitz) for extraction (quiet, robust).
# - Does NOT require admin rights.
# - OCR is OPTIONAL (off by default). If you enable it, it avoids --clean so no "unpaper" required.
# ============================================================

import re
import time
import subprocess
import datetime as dt
from dataclasses import dataclass, asdict
from pathlib import Path
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

import fitz  # PyMuPDF
from tqdm import tqdm


# -------------------------
# PATHS (EDIT THESE)
# -------------------------
COMPANIES_CSV = r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\companies_with_website.csv"
REBUILT_CSV   = r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_REBUILT.csv"

# Where to put NEW downloads from this retry run
RETRY_DOWNLOAD_DIR = Path(r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\downloaded_reports_retry")
RETRY_DEBUG_DIR    = Path(r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\debug_pages_retry")

# Output (write to a NEW filename to avoid OneDrive/Excel lock pain)
OUT_FINAL_CSV = r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.csv"
OUT_RETRY_ONLY_CSV = OUT_FINAL_CSV.replace(".csv", "_retry_only.csv")
OUT_FINAL_HITS_CSV = OUT_FINAL_CSV.replace(".csv", "_hits.csv")

# -------------------------
# RUNTIME SETTINGS
# -------------------------
REQUEST_TIMEOUT = 25
SLEEP_S = 0.4
MAX_PAGES_TO_VISIT = 35       # per company crawl budget
MAX_SITEMAP_URLS = 4000       # per company (filtered)
MAX_SITEMAP_CHILDREN = 25
MAX_PDFS_TO_SCAN = 3          # scan top N candidate PDFs

# Optional OCR (OFF by default; only used when extracted text is empty)
# If you enable this, it will call the `ocrmypdf` executable.
DO_OCR_IF_NO_TEXT = False

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# -------------------------
# KEYWORDS / HEURISTICS
# -------------------------
INTENT_RE = re.compile(
    r"\b(will|plan(?:s|ned)?|intend(?:s|ed)?|to\s+(?:commence|start|undertake|complete|deliver|progress|advance)|"
    r"target(?:s|ing)?|scheduled|expected|underway|next\s+quarter)\b",
    re.IGNORECASE
)

RESOURCE_RE_LIST = [
    r"\bresource definition\b",
    r"\bresource drilling\b",
    r"\binfill drilling\b",
    r"\bmineral resource\b",
    r"\bresource estimate\b",
    r"\bmaiden resource\b",
    r"\bJORC\b",
    r"\bMRE\b",
    r"\bupgrade\b.{0,40}\bresource\b",
]
PFS_RE_LIST = [
    r"\bpre[-\s]?feasibility\b",
    r"\bPFS\b",
    r"\bfeasibility study\b",
    r"\bDFS\b",
    r"\bdefinitive feasibility\b",
    r"\bscoping study\b",
]

RESOURCE_RE = re.compile("|".join(RESOURCE_RE_LIST), re.IGNORECASE)
PFS_RE      = re.compile("|".join(PFS_RE_LIST), re.IGNORECASE)

QUARTERLY_HINT = re.compile(r"(quarterly|appendix\s*4c|activities\s+report|quarter\s+report|3[-\s]?month|appendix\s*4d)", re.IGNORECASE)
ANNUAL_HINT    = re.compile(r"(annual\s+report|appendix\s*4e|year\s+end|full\s+year|financial\s+report|annual\s+financial|annual\s+results)", re.IGNORECASE)

REPORT_URL_HINT = re.compile(
    r"(investor|investors|asx|announce|announcement|release|news|media|report|results|financial|presentation|quarter|appendix|4c|4e)",
    re.IGNORECASE
)

PDF_EXT_RE = re.compile(r"\.pdf(\?|$)", re.IGNORECASE)
PRESENTATION_HINT = re.compile(r"(presentation|investor[-\s_]*pres|deck|slides)", re.IGNORECASE)


# -------------------------
# UTILITIES
# -------------------------
def is_blank(x) -> bool:
    if x is None:
        return True
    s = str(x)
    return (s.strip() == "") or (s.strip().lower() == "nan")

def safe_filename(s: str) -> str:
    return re.sub(r"[^a-zA-Z0-9._-]+", "_", s)[:180]

def normalize_site(site: str) -> str:
    site = (site or "").strip()
    if not site:
        return ""
    if not site.startswith(("http://", "https://")):
        site = "https://" + site
    return site.rstrip("/")

def same_domain(base: str, url: str) -> bool:
    try:
        b = urlparse(base)
        u = urlparse(url)
        return (b.netloc.lower() == u.netloc.lower()) or (u.netloc.lower().endswith(b.netloc.lower()))
    except Exception:
        return False

def make_session() -> requests.Session:
    s = requests.Session()
    s.headers.update({"User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9"})
    return s

def fetch_text(sess: requests.Session, url: str) -> tuple[int, str, str]:
    try:
        r = sess.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        ctype = (r.headers.get("content-type") or "").lower()
        return r.status_code, ctype, r.text or ""
    except Exception:
        return 0, "", ""

def download_file(sess: requests.Session, url: str, outpath: Path) -> bool:
    try:
        r = sess.get(url, timeout=REQUEST_TIMEOUT, stream=True, allow_redirects=True)
        if r.status_code >= 400:
            return False
        ctype = (r.headers.get("content-type") or "").lower()
        if ("pdf" not in ctype) and (not PDF_EXT_RE.search(url)):
            return False
        outpath.parent.mkdir(parents=True, exist_ok=True)
        with open(outpath, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 64):
                if chunk:
                    f.write(chunk)
        return outpath.exists() and outpath.stat().st_size > 10_000
    except Exception:
        return False

def extract_pdf_text_fitz(pdf_path: Path) -> str:
    try:
        doc = fitz.open(str(pdf_path))
        parts = []
        for page in doc:
            t = page.get_text("text") or ""
            if t.strip():
                parts.append(t)
        doc.close()
        return "\n".join(parts)
    except Exception:
        return ""

def try_ocr(pdf_in: Path, pdf_out: Path) -> bool:
    """
    OCR optional. Avoids --clean so no 'unpaper' needed.
    Requires: ocrmypdf executable + tesseract + ghostscript available.
    """
    cmd = [
        "ocrmypdf",
        "--deskew",
        "--rotate-pages",
        "--optimize", "1",
        "--skip-text",
        str(pdf_in),
        str(pdf_out),
    ]
    try:
        p = subprocess.run(cmd, capture_output=True, text=True)
        return (p.returncode == 0) and pdf_out.exists() and pdf_out.stat().st_size > 10_000
    except Exception:
        return False

def split_sentences(text: str) -> list[str]:
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    sents = re.split(r"(?<=[\.\?\!])\s+|\n+", text)
    return [s.strip() for s in sents if s and len(s.strip()) > 20]

def trim_to_50_words(s: str) -> str:
    words = s.split()
    if len(words) <= 50:
        return s.strip()
    return " ".join(words[:50]).strip() + "…"

def safe_dt(y, m, d):
    try:
        return dt.datetime(y, m, d)
    except Exception:
        return None

def guess_date_from_text(s: str):
    s = s or ""

    # YYYY-MM-DD or YYYY/MM/DD
    m = re.search(r"(20\d{2})[-/](\d{1,2})[-/](\d{1,2})", s)
    if m:
        y, mo, d = map(int, m.groups())
        x = safe_dt(y, mo, d)
        if x:
            return x

    # DD MMM YYYY
    m = re.search(r"(\d{1,2})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+(20\d{2})", s, re.IGNORECASE)
    if m:
        d = int(m.group(1)); mon = m.group(2).lower(); y = int(m.group(3))
        months = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}
        x = safe_dt(y, months[mon[:3]], d)
        if x:
            return x

    # MMM YYYY
    m = re.search(r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+(20\d{2})", s, re.IGNORECASE)
    if m:
        mon = m.group(1).lower(); y = int(m.group(2))
        months = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}
        return dt.datetime(y, months[mon[:3]], 1)

    # YYYYMMDD in URL/file (validate!)
    m = re.search(r"(20\d{2})(\d{2})(\d{2})", s)
    if m:
        y, mo, d = map(int, m.groups())
        x = safe_dt(y, mo, d)
        if x:
            return x

    return None

def extract_pdf_links_from_html(base_url: str, html: str) -> list[dict]:
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.find_all("a", href=True):
        href = (a.get("href") or "").strip()
        if not href:
            continue
        absu = urljoin(base_url, href)
        if not PDF_EXT_RE.search(absu):
            continue
        label = " ".join(a.get_text(" ", strip=True).split())
        out.append({
            "url": absu,
            "label": label,
            "date_guess": guess_date_from_text(label + " " + absu),
            "source_page": base_url
        })
    return out

def get_sitemaps_from_robots(sess: requests.Session, site: str) -> list[str]:
    robots = site + "/robots.txt"
    st, ct, txt = fetch_text(sess, robots)
    time.sleep(SLEEP_S)
    if st >= 400 or not txt:
        return []
    sitemaps = []
    for line in txt.splitlines():
        if line.lower().startswith("sitemap:"):
            sm = line.split(":", 1)[1].strip()
            if sm:
                sitemaps.append(sm)
    return sitemaps

def get_sitemap_urls(sess: requests.Session, site: str) -> list[str]:
    starts = ["/sitemap.xml", "/sitemap_index.xml", "/wp-sitemap.xml"]
    all_urls = []

    # robots first (often points to the real sitemap)
    candidate_sitemaps = get_sitemaps_from_robots(sess, site) + [site + p for p in starts]

    seen_sm = set()
    for sm in candidate_sitemaps:
        if sm in seen_sm:
            continue
        seen_sm.add(sm)

        status, ctype, txt = fetch_text(sess, sm)
        time.sleep(SLEEP_S)
        if status >= 400 or not txt:
            continue
        if ("xml" not in ctype) and ("<urlset" not in txt.lower()) and ("<sitemapindex" not in txt.lower()):
            continue

        soup = BeautifulSoup(txt, "xml")

        # sitemap index?
        sitemap_locs = [loc.get_text(strip=True) for loc in soup.find_all("sitemap") for loc in loc.find_all("loc")]
        if sitemap_locs:
            for child in sitemap_locs[:MAX_SITEMAP_CHILDREN]:
                st2, ct2, t2 = fetch_text(sess, child)
                time.sleep(SLEEP_S)
                if st2 >= 400 or not t2:
                    continue
                s2 = BeautifulSoup(t2, "xml")
                locs = [loc.get_text(strip=True) for loc in s2.find_all("loc")]
                for u in locs:
                    if REPORT_URL_HINT.search(u):
                        all_urls.append(u)
                if len(all_urls) >= MAX_SITEMAP_URLS:
                    return all_urls[:MAX_SITEMAP_URLS]
        else:
            # plain sitemap
            locs = [loc.get_text(strip=True) for loc in soup.find_all("loc")]
            for u in locs:
                if REPORT_URL_HINT.search(u):
                    all_urls.append(u)
            if all_urls:
                return all_urls[:MAX_SITEMAP_URLS]

    return all_urls[:MAX_SITEMAP_URLS]

def discover_report_pages(site: str) -> list[str]:
    paths = [
        "/investors", "/investor", "/investor-centre", "/investor-centre/asx-announcements",
        "/asx-announcements", "/announcements", "/asx-releases", "/asx-reports",
        "/reports", "/financial-reports", "/results", "/news", "/media",
        "/investors/asx-announcements", "/investors/announcements", "/investors/reports",
    ]
    return [site + p for p in paths]

def score_pdf_candidate(c: dict) -> float:
    label = (c.get("label") or "")
    url = (c.get("url") or "")
    text = f"{label} {url}"
    score = 0.0

    if QUARTERLY_HINT.search(text):
        score += 50
    if ANNUAL_HINT.search(text):
        score += 45
    if REPORT_URL_HINT.search(text):
        score += 15
    if PRESENTATION_HINT.search(text):
        score -= 10  # keep it, but lower priority

    d = c.get("date_guess")
    if isinstance(d, dt.datetime):
        # recency bump: more recent => higher
        days_ago = (dt.datetime.utcnow() - d).days
        score += max(0, 25 - min(25, days_ago / 30.0))  # gentle bump

    return score

def evidence_from_text(text: str) -> tuple[bool, bool, list[str]]:
    sents = split_sentences(text)
    resource_hits, pfs_hits = [], []

    for s in sents:
        has_intent = bool(INTENT_RE.search(s))
        if RESOURCE_RE.search(s) and (has_intent or ("program" in s.lower()) or ("next" in s.lower())):
            resource_hits.append(s)
        if PFS_RE.search(s) and (has_intent or ("study" in s.lower()) or ("engineering" in s.lower())):
            pfs_hits.append(s)

    resource_hits = sorted(resource_hits, key=len)[:2]
    pfs_hits      = sorted(pfs_hits, key=len)[:2]
    return (len(resource_hits) > 0), (len(pfs_hits) > 0), (resource_hits + pfs_hits)


# -------------------------
# INPUT READERS
# -------------------------
def read_companies(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, sep=None, engine="python")
    df.columns = [c.strip() for c in df.columns]
    colmap = {c.lower(): c for c in df.columns}

    def pick(*names):
        for n in names:
            if n in colmap:
                return colmap[n]
        return None

    c_company = pick("company", "name", "co", "issuer")
    c_ticker  = pick("ticker", "asx", "code", "symbol")
    c_web     = pick("website", "web", "url", "site")

    if not c_company or not c_ticker or not c_web:
        raise ValueError(f"Missing required columns. Found: {list(df.columns)}. Need Company/Ticker/Website (or similar).")

    df = df.rename(columns={c_company: "Company", c_ticker: "Ticker", c_web: "Website"})
    df["Ticker"] = df["Ticker"].astype(str).str.strip().str.upper()
    df["Company"] = df["Company"].astype(str).str.strip()
    df["Website"] = df["Website"].astype(str).str.strip()
    return df

def read_rebuilt(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]
    df["Ticker"] = df["Ticker"].astype(str).str.strip().str.upper()
    return df


# -------------------------
# MISSING/RETRY DETECTOR
# -------------------------
def needs_retry(row) -> bool:
    notes = (str(row.get("Notes") or "")).lower()
    summ  = str(row.get("Evidence_Summary_50w") or "")
    snip  = str(row.get("Evidence_Snippets") or "")
    rflag = str(row.get("Resource_Definition_Flag") or "N").upper()
    pflag = str(row.get("PFS_Flag") or "N").upper()
    qpdf  = str(row.get("Quarterly_PDF") or "")
    apdf  = str(row.get("Annual_PDF") or "")

    no_evidence = (is_blank(summ) and is_blank(snip) and rflag != "Y" and pflag != "Y")
    no_pdfs = (is_blank(qpdf) and is_blank(apdf))
    bad_text = ("no_text" in notes) or ("extract_failed" in notes)
    missing = ("no_pdfs_in_folder" in notes) or ("no_pdf" in notes) or ("download_failed" in notes)

    return no_evidence and (no_pdfs or bad_text or missing)


# -------------------------
# WEB RETRY CORE
# -------------------------
@dataclass
class RetryRow:
    Ticker: str
    Company: str
    Website: str
    Quarterly_URL: str
    Annual_URL: str
    Quarterly_PDF: str
    Annual_PDF: str
    Resource_Definition_Flag: str
    PFS_Flag: str
    Evidence_Summary_50w: str
    Evidence_Snippets: str
    Notes: str

def crawl_for_pdfs(sess: requests.Session, site: str, debug_dir: Path) -> tuple[list[dict], list[str]]:
    notes = []
    debug_dir.mkdir(parents=True, exist_ok=True)

    pages = get_sitemap_urls(sess, site)
    if pages:
        notes.append(f"sitemap_pages={len(pages)}")
    else:
        notes.append("no_sitemap_or_no_filtered_urls")
        pages = discover_report_pages(site)

    visited = set()
    queue = []

    for u in pages[:250]:
        if REPORT_URL_HINT.search(u):
            queue.append(u)

    pdf_candidates = []

    while queue and len(visited) < MAX_PAGES_TO_VISIT:
        url = queue.pop(0)
        if url in visited:
            continue
        if not same_domain(site, url):
            continue
        visited.add(url)

        st, ct, html = fetch_text(sess, url)
        time.sleep(SLEEP_S)
        if st >= 400 or not html:
            continue

        # Save debug HTML snapshot (optional but helpful)
        try:
            (debug_dir / f"{len(visited):02d}_{safe_filename(url)}.html").write_text(html, encoding="utf-8", errors="ignore")
        except Exception:
            pass

        # Extract PDFs
        pdfs = extract_pdf_links_from_html(url, html)
        pdf_candidates.extend(pdfs)

        # Enqueue more internal links (limited)
        soup = BeautifulSoup(html, "lxml")
        new_links = 0
        for a in soup.find_all("a", href=True):
            href = (a.get("href") or "").strip()
            if not href:
                continue
            absu = urljoin(url, href)
            if absu in visited:
                continue
            if not same_domain(site, absu):
                continue
            if REPORT_URL_HINT.search(absu):
                queue.append(absu)
                new_links += 1
                if new_links >= 25:
                    break

    # Dedup
    seen = set()
    dedup = []
    for c in pdf_candidates:
        u = c.get("url")
        if not u or u in seen:
            continue
        seen.add(u)
        if not c.get("date_guess"):
            c["date_guess"] = guess_date_from_text((c.get("label","") + " " + u))
        c["score"] = score_pdf_candidate(c)
        dedup.append(c)

    notes.append(f"visited_pages={len(visited)}")
    notes.append(f"pdf_candidates_raw={len(pdf_candidates)}")
    notes.append(f"pdf_candidates_dedup={len(dedup)}")

    # Sort best-first
    dedup.sort(key=lambda x: x.get("score", 0.0), reverse=True)
    return dedup, notes

def pick_quarterly_annual(cands: list[dict]) -> tuple[dict|None, dict|None, str]:
    """
    Try to pick best quarterly + annual. If we can't, pick top two distinct PDFs.
    """
    if not cands:
        return None, None, "no_pdf_candidates"

    q = None
    a = None

    for c in cands:
        t = f"{c.get('label','')} {c.get('url','')}"
        if (q is None) and QUARTERLY_HINT.search(t):
            q = c
        if (a is None) and ANNUAL_HINT.search(t):
            a = c
        if q and a:
            break

    note = ""
    if q and a and q.get("url") == a.get("url"):
        a = None

    if (q is None) and (a is None):
        note = "no_quarterly_or_annual_hints_used_top2"
        q = cands[0]
        a = cands[1] if len(cands) > 1 else None
    elif q is None and a is not None:
        note = "no_quarterly_hint_used_best_other_as_quarterly"
        # pick another best distinct
        q = next((x for x in cands if x.get("url") != a.get("url")), a)
    elif a is None and q is not None:
        note = "no_annual_hint_used_best_other_as_annual"
        a = next((x for x in cands if x.get("url") != q.get("url")), None)

    return q, a, note

def process_retry_company(ticker: str, company: str, website: str) -> RetryRow:
    tkr = (ticker or "").strip().upper()
    site = normalize_site(website)

    if not tkr or not site:
        return RetryRow(tkr, company, website, "", "", "", "", "N", "N", "", "", "missing ticker/website")

    sess = make_session()
    out_dir = RETRY_DOWNLOAD_DIR / tkr
    dbg_dir = RETRY_DEBUG_DIR / tkr
    out_dir.mkdir(parents=True, exist_ok=True)
    dbg_dir.mkdir(parents=True, exist_ok=True)

    cands, notes = crawl_for_pdfs(sess, site, dbg_dir)
    if not cands:
        return RetryRow(tkr, company, website, "", "", "", "", "N", "N", "", "", "; ".join(notes + ["no_pdf_links_found"]))

    q_doc, a_doc, pick_note = pick_quarterly_annual(cands)
    if pick_note:
        notes.append(pick_note)

    # We'll scan up to MAX_PDFS_TO_SCAN from the TOP of the candidate list
    # (this improves hit rate when websites label things inconsistently)
    scan_list = []
    if q_doc:
        scan_list.append(q_doc)
    if a_doc and (a_doc.get("url") not in {x.get("url") for x in scan_list}):
        scan_list.append(a_doc)
    # fill with additional top candidates
    for c in cands:
        if len(scan_list) >= MAX_PDFS_TO_SCAN:
            break
        if c.get("url") and c.get("url") not in {x.get("url") for x in scan_list}:
            scan_list.append(c)

    resource_flag = "N"
    pfs_flag = "N"
    snippets = []

    quarterly_url = q_doc.get("url") if q_doc else ""
    annual_url    = a_doc.get("url") if a_doc else ""

    quarterly_pdf = ""
    annual_pdf = ""

    for idx, doc in enumerate(scan_list, start=1):
        url = doc.get("url","")
        if not url:
            continue

        # Decide a tag for filename
        tag = "other"
        t = f"{doc.get('label','')} {url}"
        if QUARTERLY_HINT.search(t):
            tag = "quarterly"
        elif ANNUAL_HINT.search(t):
            tag = "annual"

        d = doc.get("date_guess")
        datestr = d.strftime("%Y%m%d") if isinstance(d, dt.datetime) else "unknown"
        pdf_path = out_dir / f"{tkr}_{tag}_{datestr}_{idx}.pdf"

        ok = True
        if not pdf_path.exists() or pdf_path.stat().st_size < 10_000:
            ok = download_file(sess, url, pdf_path)
            time.sleep(SLEEP_S)

        if not ok:
            notes.append(f"download_failed_{tag}_{idx}")
            continue
        else:
            notes.append(f"downloaded_{tag}_{idx}")

        txt = extract_pdf_text_fitz(pdf_path)

        # OCR option if no text
        if DO_OCR_IF_NO_TEXT and not txt.strip():
            ocr_out = pdf_path.with_name(pdf_path.stem + "_ocr.pdf")
            if try_ocr(pdf_path, ocr_out):
                notes.append(f"ocr_ok_{tag}_{idx}")
                txt = extract_pdf_text_fitz(ocr_out)
                # Prefer OCR'd file for paths
                pdf_path = ocr_out
            else:
                notes.append(f"ocr_failed_{tag}_{idx}")

        if not txt.strip():
            notes.append(f"no_text_{tag}_{idx}")
            continue

        r_ok, p_ok, hits = evidence_from_text(txt)
        if r_ok:
            resource_flag = "Y"
        if p_ok:
            pfs_flag = "Y"

        if hits:
            for h in hits:
                snippets.append(f"[{tag.upper()}] {h}")

        # capture best pdf path for quarterly/annual if it matches tag or if blank
        if tag == "quarterly" and not quarterly_pdf:
            quarterly_pdf = str(pdf_path)
        if tag == "annual" and not annual_pdf:
            annual_pdf = str(pdf_path)

        # stop early if both flags hit
        if resource_flag == "Y" and pfs_flag == "Y":
            break

    # If we never got a quarterly/annual pdf path, fall back to first scanned
    if not quarterly_pdf and scan_list:
        quarterly_pdf = str((RETRY_DOWNLOAD_DIR / tkr / f"{tkr}_other_unknown_1.pdf")) if False else ""
    # (We keep it blank unless we have a real file path above; avoids lying.)

    summary = ""
    snippet_txt = ""
    if snippets:
        combined = " ".join(snippets)
        summary = trim_to_50_words(combined)
        snippet_txt = "\n".join(snippets[:8])

    return RetryRow(
        Ticker=tkr,
        Company=company,
        Website=website,
        Quarterly_URL=quarterly_url,
        Annual_URL=annual_url,
        Quarterly_PDF=quarterly_pdf,
        Annual_PDF=annual_pdf,
        Resource_Definition_Flag=resource_flag,
        PFS_Flag=pfs_flag,
        Evidence_Summary_50w=summary,
        Evidence_Snippets=snippet_txt,
        Notes="; ".join(notes)
    )


# -------------------------
# MERGE + RUN
# -------------------------
def merge_results(companies_df: pd.DataFrame, rebuilt_df: pd.DataFrame, retry_df: pd.DataFrame) -> pd.DataFrame:
    """
    Start from companies list (91). Add rebuilt fields. Then overwrite/augment with retry results where present.
    """
    base = companies_df.copy()

    # join rebuilt on Ticker
    base = base.merge(rebuilt_df, on="Ticker", how="left", suffixes=("", "_rebuilt"))

    # join retry on Ticker
    base = base.merge(retry_df, on=["Ticker", "Company", "Website"], how="left", suffixes=("", "_retry"))

    # For each output column, if retry has a non-blank value, use it; else keep rebuilt.
    cols = [
        "Quarterly_URL", "Annual_URL",
        "Quarterly_PDF", "Annual_PDF",
        "Resource_Definition_Flag", "PFS_Flag",
        "Evidence_Summary_50w", "Evidence_Snippets",
        "Notes"
    ]

    for c in cols:
        c_retry = c + "_retry"
        if c_retry in base.columns:
            base[c] = base[c].where(base[c_retry].apply(is_blank), base[c_retry])

    # Clean up helper columns
    drop_cols = [c for c in base.columns if c.endswith("_rebuilt") or c.endswith("_retry")]
    base = base.drop(columns=drop_cols, errors="ignore")

    # Ensure consistent column order
    desired = ["Ticker","Company","Website","Quarterly_URL","Annual_URL","Quarterly_PDF","Annual_PDF",
               "Resource_Definition_Flag","PFS_Flag","Evidence_Summary_50w","Evidence_Snippets","Notes"]
    for c in desired:
        if c not in base.columns:
            base[c] = ""
    base = base[desired]

    return base

def run_retry_pass():
    RETRY_DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    RETRY_DEBUG_DIR.mkdir(parents=True, exist_ok=True)

    companies = read_companies(COMPANIES_CSV)
    rebuilt = read_rebuilt(REBUILT_CSV)

    # Ensure rebuilt has all expected columns
    for col in ["Quarterly_PDF","Annual_PDF","Resource_Definition_Flag","PFS_Flag","Evidence_Summary_50w","Evidence_Snippets","Notes"]:
        if col not in rebuilt.columns:
            rebuilt[col] = ""

    # Combine for retry detection (needs website + company)
    combined = companies.merge(rebuilt, on="Ticker", how="left")

    # Identify the “missing/no-info” set
    retry_set = combined[combined.apply(needs_retry, axis=1)].copy()
    print(f"Retrying {len(retry_set)} companies out of {len(companies)} total.\n")

    retry_rows = []
    checkpoint_path = Path(OUT_FINAL_CSV).with_suffix(".checkpoint_retry.csv")

    for _, r in tqdm(retry_set.iterrows(), total=len(retry_set)):
        tkr = r["Ticker"]
        co  = r["Company"]
        web = r["Website"]

        print(f"\n[RETRY] {tkr} - {co} ({web})")
        row = process_retry_company(tkr, co, web)
        retry_rows.append(asdict(row))

        # checkpoint every ~5 companies (so crashes don't wipe progress)
        if len(retry_rows) % 5 == 0:
            pd.DataFrame(retry_rows).to_csv(checkpoint_path, index=False, encoding="utf-8")
            print(f"Checkpoint saved: {checkpoint_path}")

    retry_df = pd.DataFrame(retry_rows)
    retry_df.to_csv(OUT_RETRY_ONLY_CSV, index=False, encoding="utf-8")
    print(f"\nWrote retry-only results: {OUT_RETRY_ONLY_CSV}")

    final_df = merge_results(companies, rebuilt, retry_df)
    final_df.to_csv(OUT_FINAL_CSV, index=False, encoding="utf-8")
    print(f"Wrote FINAL compiled results: {OUT_FINAL_CSV}")

    hits = final_df[(final_df["Resource_Definition_Flag"]=="Y") | (final_df["PFS_Flag"]=="Y")].copy()
    hits.to_csv(OUT_FINAL_HITS_CSV, index=False, encoding="utf-8")
    print(f"Wrote hits-only file: {OUT_FINAL_HITS_CSV}  (hits={len(hits)})")

    return final_df, retry_df, hits


# ---- RUN IT ----
final_df, retry_df, hits_df = run_retry_pass()
hits_df.head(20)


Retrying 46 companies out of 91 total.



  0%|                                                                                           | 0/46 [00:00<?, ?it/s]


[RETRY] FFM - FireFly Metals Ltd (fireflymetals.com.au)


  2%|█▊                                                                                 | 1/46 [00:15<11:21, 15.14s/it]


[RETRY] RXL - Rox Resources Ltd (roxresources.com.au)


  4%|███▌                                                                               | 2/46 [00:23<08:23, 11.43s/it]


[RETRY] NMG - New Murchison Gold Ltd (newmurchgold.com.au)


  7%|█████▍                                                                             | 3/46 [00:33<07:27, 10.40s/it]


[RETRY] MEI - Meteoric Resources NL (meteoric.com.au)


  9%|███████▏                                                                           | 4/46 [00:59<11:39, 16.65s/it]


[RETRY] AAR - Astral Resources NL (astralresources.com.au)


 11%|█████████                                                                          | 5/46 [01:32<15:23, 22.52s/it]

Checkpoint saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.checkpoint_retry.csv

[RETRY] BTR - Brightstar Resources Ltd (brightstarresources.com.au)


 13%|██████████▊                                                                        | 6/46 [02:14<19:28, 29.22s/it]


[RETRY] CTM - Centaurus Metals Ltd (centaurus.com.au)


 15%|████████████▋                                                                      | 7/46 [02:23<14:41, 22.61s/it]


[RETRY] RHI - Red Hill Minerals Ltd (redhillminerals.com.au)


 17%|██████████████▍                                                                    | 8/46 [02:25<10:09, 16.03s/it]


[RETRY] IVR - Investigator Silver Ltd (investres.com.au)


 20%|████████████████▏                                                                  | 9/46 [02:56<12:47, 20.74s/it]


[RETRY] BM1 - Ballard Mining Ltd (ballardmining.com.au)


 22%|█████████████████▊                                                                | 10/46 [03:25<13:55, 23.20s/it]

Checkpoint saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.checkpoint_retry.csv

[RETRY] HRZ - Horizon Minerals Ltd (horizonminerals.com.au)


 24%|███████████████████▌                                                              | 11/46 [04:02<16:04, 27.55s/it]


[RETRY] PLA - Pacific Lime & Cement Ltd (placltd.com)


 26%|█████████████████████▍                                                            | 12/46 [04:11<12:22, 21.84s/it]


[RETRY] AUE - Aurum Resources Ltd (aurumres.com.au)


 28%|███████████████████████▏                                                          | 13/46 [04:27<11:07, 20.23s/it]


[RETRY] DLI - Delta Lithium Ltd (deltalithium.com.au)


 30%|████████████████████████▉                                                         | 14/46 [04:57<12:17, 23.05s/it]


[RETRY] GBZ - GBM Resources Ltd (gbmr.com.au)


 33%|██████████████████████████▋                                                       | 15/46 [05:09<10:07, 19.58s/it]

Checkpoint saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.checkpoint_retry.csv

[RETRY] DRE - Dreadnought Resources Ltd (dreadnoughtresources.com.au)


 35%|████████████████████████████▌                                                     | 16/46 [05:40<11:31, 23.04s/it]


[RETRY] AGE - Alligator Energy Ltd (alligatorenergy.com.au)


 37%|██████████████████████████████▎                                                   | 17/46 [06:11<12:17, 25.44s/it]


[RETRY] KSN - Kingston Resources Ltd (kingstonresources.com.au)


 39%|████████████████████████████████                                                  | 18/46 [06:22<09:49, 21.04s/it]


[RETRY] TM1 - Terra Metals Ltd (terrametals.com.au)


 41%|█████████████████████████████████▊                                                | 19/46 [07:23<14:54, 33.12s/it]


[RETRY] PTR - PTR Minerals Ltd (ptrminerals.com.au)


 43%|███████████████████████████████████▋                                              | 20/46 [07:59<14:45, 34.07s/it]

Checkpoint saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.checkpoint_retry.csv

[RETRY] LM8 - Lunnon Metals Ltd (lunnonmetals.com.au)


 46%|█████████████████████████████████████▍                                            | 21/46 [09:26<20:47, 49.92s/it]


[RETRY] YRL - Yandal Resources Ltd (yandalresources.com.au)


 48%|███████████████████████████████████████▏                                          | 22/46 [09:58<17:51, 44.66s/it]


[RETRY] BSX - Blackstone Minerals Ltd (blackstoneminerals.com.au)


 50%|█████████████████████████████████████████                                         | 23/46 [10:29<15:29, 40.42s/it]


[RETRY] COD - Coda Minerals Ltd (codaminerals.com)


 52%|██████████████████████████████████████████▊                                       | 24/46 [11:00<13:48, 37.65s/it]


[RETRY] CRS - Caprice Resources Ltd (capriceresources.com)


 54%|████████████████████████████████████████████▌                                     | 25/46 [12:23<17:53, 51.10s/it]

Checkpoint saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.checkpoint_retry.csv

[RETRY] RTR - Rumble Resources Ltd (rumbleresources.com.au)


 57%|██████████████████████████████████████████████▎                                   | 26/46 [12:32<12:50, 38.51s/it]


[RETRY] M79 - Mammoth Minerals Ltd (firetailresources.com.au)


 59%|████████████████████████████████████████████████▏                                 | 27/46 [12:52<10:29, 33.15s/it]


[RETRY] BCA - Black Canyon Ltd (blackcanyon.com.au)


 61%|█████████████████████████████████████████████████▉                                | 28/46 [14:47<17:18, 57.72s/it]


[RETRY] AR3 - Australian Rare Earths Ltd (ar3.com.au)


 63%|███████████████████████████████████████████████████▋                              | 29/46 [15:20<14:11, 50.11s/it]


[RETRY] OR3 - Ore Resources Ltd (oreresources.com.au)


 65%|█████████████████████████████████████████████████████▍                            | 30/46 [15:51<11:52, 44.55s/it]

Checkpoint saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.checkpoint_retry.csv

[RETRY] PGO - Pacgold Ltd (pacgold.com.au)


 67%|███████████████████████████████████████████████████████▎                          | 31/46 [16:02<08:34, 34.30s/it]


[RETRY] GA8 - GoldArc Resources Ltd (goldarcres.com.au)


 70%|█████████████████████████████████████████████████████████                         | 32/46 [16:11<06:13, 26.68s/it]


[RETRY] IPT - Impact Minerals Ltd (impactminerals.com.au)


 72%|██████████████████████████████████████████████████████████▊                       | 33/46 [16:38<05:48, 26.78s/it]


[RETRY] DYM - Dynamic Metals Ltd (dynamicmetals.com.au)


 74%|████████████████████████████████████████████████████████████▌                     | 34/46 [17:28<06:48, 34.01s/it]


[RETRY] WIN - WIN Metals Ltd (winmetals.com.au)


 76%|██████████████████████████████████████████████████████████████▍                   | 35/46 [18:01<06:10, 33.67s/it]

Checkpoint saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.checkpoint_retry.csv

[RETRY] PGM - Platina Resources Ltd (platinaresources.com.au)


 78%|████████████████████████████████████████████████████████████████▏                 | 36/46 [18:17<04:41, 28.15s/it]


[RETRY] TG6 - TG Metals Ltd (tgmetals.com.au)


 80%|█████████████████████████████████████████████████████████████████▉                | 37/46 [19:01<04:58, 33.13s/it]


[RETRY] SGA - Sarytogan Graphite Ltd (sarytogangraphite.com.au)


 83%|███████████████████████████████████████████████████████████████████▋              | 38/46 [19:16<03:39, 27.48s/it]


[RETRY] VMS - Venari Minerals NL (venariminerals.com)


 85%|█████████████████████████████████████████████████████████████████████▌            | 39/46 [19:50<03:26, 29.56s/it]


[RETRY] LAT - Latitude 66 Ltd (lat66.com)


 87%|███████████████████████████████████████████████████████████████████████▎          | 40/46 [20:06<02:33, 25.54s/it]

Checkpoint saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.checkpoint_retry.csv

[RETRY] IVG - InVert Graphite Ltd (invertgraphite.com.au)


 89%|█████████████████████████████████████████████████████████████████████████         | 41/46 [20:14<01:40, 20.17s/it]


[RETRY] FMR - FMR Resources Ltd (fmrresources.com.au)


 91%|██████████████████████████████████████████████████████████████████████████▊       | 42/46 [20:23<01:07, 16.89s/it]


[RETRY] ATT - Altitude Minerals Ltd (coppersearch.com.au)


 93%|████████████████████████████████████████████████████████████████████████████▋     | 43/46 [20:38<00:49, 16.35s/it]


[RETRY] YAR - Yari Minerals Ltd (yariminerals.com.au)


 96%|██████████████████████████████████████████████████████████████████████████████▍   | 44/46 [21:13<00:43, 21.95s/it]


[RETRY] AM5 - Antares Metals Ltd (antaresmetals.com.au)


 98%|████████████████████████████████████████████████████████████████████████████████▏ | 45/46 [21:45<00:24, 24.95s/it]

Checkpoint saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.checkpoint_retry.csv

[RETRY] SLM - Solis Minerals Ltd (solisminerals.com)


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [22:18<00:00, 29.09s/it]



Wrote retry-only results: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL_retry_only.csv
Wrote FINAL compiled results: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.csv
Wrote hits-only file: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL_hits.csv  (hits=26)


Unnamed: 0,Ticker,Company,Website,Quarterly_URL,Annual_URL,Quarterly_PDF,Annual_PDF,Resource_Definition_Flag,PFS_Flag,Evidence_Summary_50w,Evidence_Snippets,Notes
0,RMS,Ramelius Resources Ltd,rameliusresources.com.au,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,Y,Y,[QUARTERLY] mineral resource and there is no c...,[QUARTERLY] mineral resource and there is no c...,
2,CHN,Chalice Mining Ltd,chalicemining.com,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,Y,[QUARTERLY] 1.1.2 Pre-Feasibility Study [QUART...,[QUARTERLY] 1.1.2 Pre-Feasibility Study\n[QUAR...,
6,SVL,Silver Mines Ltd,silvermines.com.au,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,Y,Y,[QUARTERLY] targets were prepared by a Compete...,[QUARTERLY] targets were prepared by a Compete...,
9,AZY,Antipa Minerals Ltd,antipaminerals.com.au,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,Y,Y,"[QUARTERLY] intersections, confirming the targ...","[QUARTERLY] intersections, confirming the targ...",
13,WWI,West Wits Mining Ltd,westwitsmining.com,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,Y,[QUARTERLY] 2022 Scoping Study: 5 Development ...,[QUARTERLY] 2022 Scoping Study: 5 Development ...,no_quarterly_named_pdf_used_latest_any
16,BGD,Barton Gold Holdings Ltd,bartongold.com.au,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,Y,Y,[QUARTERLY] it intends to prioritise for explo...,[QUARTERLY] it intends to prioritise for explo...,
18,HCH,Hot Chili Ltd,hotchili.net.au,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,Y,[QUARTERLY] • commencement of the Costa Fuego ...,[QUARTERLY] • commencement of the Costa Fuego ...,
24,RNU,Renascor Resources Ltd,renascor.com.au,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,Y,[QUARTERLY] Siviour Definitive Feasibility Stu...,[QUARTERLY] Siviour Definitive Feasibility Stu...,
30,HRN,Horizon Gold Ltd,horizongold.com.au,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,Y,[QUARTERLY] In March 2016 a scoping study for ...,[QUARTERLY] In March 2016 a scoping study for ...,
32,ARL,Ardea Resources Ltd,ardearesources.com.au,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,Y,Y,[QUARTERLY] • Complete the infill resource def...,[QUARTERLY] • Complete the infill resource def...,


In [12]:
# ============================================================
# RIU - Scoping / Feasibility flagger (Local PDFs first + Web fallback)
# - Scans your existing downloaded PDFs for:
#     "Scoping Study", "Scoping Studies", "Feasibility"
# - If no local hits/no text, does a light web pass:
#     crawl site -> pick top PDFs -> download -> scan
# - Outputs compiled CSVs
#
# Requirements:
#   pip/conda install: requests beautifulsoup4 lxml pandas pymupdf tqdm
# ============================================================

import re
import time
import datetime as dt
from dataclasses import dataclass, asdict
from pathlib import Path
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
from tqdm import tqdm


# -------------------------
# PATHS (EDIT THESE)
# -------------------------
COMPANIES_CSV = r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\companies_with_website.csv"
# Optional: your prior compiled file (used only to carry-over previous columns if you want)
PRIOR_FINAL_CSV = r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_stage_flags_FINAL.csv"

# Scan these folders for already-downloaded PDFs
DOWNLOAD_DIRS = [
    Path(r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\downloaded_reports"),
    Path(r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\downloaded_reports_retry"),
]

# Web fallback downloads go here
FALLBACK_DOWNLOAD_DIR = Path(r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\downloaded_reports_scoping_fallback")
FALLBACK_DEBUG_DIR    = Path(r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\debug_pages_scoping_fallback")

OUT_FINAL = r"C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_scoping_feasibility_FINAL.csv"
OUT_HITS  = OUT_FINAL.replace(".csv", "_hits.csv")


# -------------------------
# RUNTIME SETTINGS
# -------------------------
REQUEST_TIMEOUT = 25
SLEEP_S = 0.4
MAX_PAGES_TO_VISIT = 30
MAX_SITEMAP_URLS = 3500
MAX_SITEMAP_CHILDREN = 25
MAX_FALLBACK_PDFS_TO_SCAN = 3     # only when local has no hits/no text

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)


# -------------------------
# KEYWORDS
# -------------------------
SCOPE_RE = re.compile(r"\bscoping\s+stud(?:y|ies)\b", re.IGNORECASE)
FEAS_RE  = re.compile(r"\bfeasibility\b", re.IGNORECASE)

# If you want to focus on "study context", keep a light intent cue
INTENT_RE = re.compile(
    r"\b(will|plan(?:s|ned)?|intend(?:s|ed)?|commence|start|complete|deliver|progress|advance|"
    r"target(?:s|ing)?|scheduled|expected|underway|next\s+quarter|FY\d{2}|CY\d{2})\b",
    re.IGNORECASE
)

REPORT_URL_HINT = re.compile(
    r"(investor|investors|asx|announce|announcement|release|news|media|report|results|financial|presentation|quarter|appendix|4c|4e)",
    re.IGNORECASE
)

PDF_EXT_RE = re.compile(r"\.pdf(\?|$)", re.IGNORECASE)
PRESENTATION_HINT = re.compile(r"(presentation|investor[-\s_]*pres|deck|slides)", re.IGNORECASE)


# -------------------------
# HELPERS
# -------------------------
def is_blank(x) -> bool:
    if x is None:
        return True
    s = str(x)
    return (s.strip() == "") or (s.strip().lower() == "nan")

def normalize_site(site: str) -> str:
    site = (site or "").strip()
    if not site:
        return ""
    if not site.startswith(("http://", "https://")):
        site = "https://" + site
    return site.rstrip("/")

def safe_filename(s: str) -> str:
    return re.sub(r"[^a-zA-Z0-9._-]+", "_", s)[:180]

def same_domain(base: str, url: str) -> bool:
    try:
        b = urlparse(base)
        u = urlparse(url)
        return (b.netloc.lower() == u.netloc.lower()) or (u.netloc.lower().endswith(b.netloc.lower()))
    except Exception:
        return False

def make_session() -> requests.Session:
    s = requests.Session()
    s.headers.update({"User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9"})
    return s

def fetch_text(sess: requests.Session, url: str) -> tuple[int, str, str]:
    try:
        r = sess.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        ctype = (r.headers.get("content-type") or "").lower()
        return r.status_code, ctype, r.text or ""
    except Exception:
        return 0, "", ""

def download_file(sess: requests.Session, url: str, outpath: Path) -> bool:
    try:
        r = sess.get(url, timeout=REQUEST_TIMEOUT, stream=True, allow_redirects=True)
        if r.status_code >= 400:
            return False
        ctype = (r.headers.get("content-type") or "").lower()
        if ("pdf" not in ctype) and (not PDF_EXT_RE.search(url)):
            return False
        outpath.parent.mkdir(parents=True, exist_ok=True)
        with open(outpath, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 64):
                if chunk:
                    f.write(chunk)
        return outpath.exists() and outpath.stat().st_size > 10_000
    except Exception:
        return False

def extract_pdf_text_fitz(pdf_path: Path) -> str:
    try:
        doc = fitz.open(str(pdf_path))
        parts = []
        for page in doc:
            t = page.get_text("text") or ""
            if t.strip():
                parts.append(t)
        doc.close()
        return "\n".join(parts)
    except Exception:
        return ""

def split_sentences(text: str) -> list[str]:
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    sents = re.split(r"(?<=[\.\?\!])\s+|\n+", text)
    return [s.strip() for s in sents if s and len(s.strip()) > 25]

def trim_to_50_words(s: str) -> str:
    words = s.split()
    if len(words) <= 50:
        return s.strip()
    return " ".join(words[:50]).strip() + "…"

def evidence_from_text_scoping_feas(text: str):
    sents = split_sentences(text)
    scope_hits = []
    feas_hits = []

    for s in sents:
        # Prefer sentences that look like "real statements" (intent cues),
        # but don't require it (many reports are bullet-ish).
        has_intent = bool(INTENT_RE.search(s))

        if SCOPE_RE.search(s):
            scope_hits.append((has_intent, s))
        if FEAS_RE.search(s):
            feas_hits.append((has_intent, s))

    # rank: intent-bearing first, then shorter
    def rank(hit):
        has_intent, sent = hit
        return (1 if has_intent else 0, -len(sent))

    scope_hits.sort(key=rank, reverse=True)
    feas_hits.sort(key=rank, reverse=True)

    scope_sents = [s for _, s in scope_hits[:2]]
    feas_sents  = [s for _, s in feas_hits[:2]]

    return (len(scope_sents) > 0), (len(feas_sents) > 0), (scope_sents + feas_sents)

def read_companies(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, sep=None, engine="python")
    df.columns = [c.strip() for c in df.columns]
    colmap = {c.lower(): c for c in df.columns}

    def pick(*names):
        for n in names:
            if n in colmap:
                return colmap[n]
        return None

    c_company = pick("company", "name", "co", "issuer")
    c_ticker  = pick("ticker", "asx", "code", "symbol")
    c_web     = pick("website", "web", "url", "site")

    if not c_company or not c_ticker or not c_web:
        raise ValueError(f"Missing required columns. Found: {list(df.columns)}. Need Company/Ticker/Website (or similar).")

    df = df.rename(columns={c_company: "Company", c_ticker: "Ticker", c_web: "Website"})
    df["Ticker"] = df["Ticker"].astype(str).str.strip().str.upper()
    df["Company"] = df["Company"].astype(str).str.strip()
    df["Website"] = df["Website"].astype(str).str.strip()
    return df

def build_pdf_index(download_dirs: list[Path]) -> dict[str, list[Path]]:
    """
    Map Ticker -> list of PDFs found under DOWNLOAD_DIR/TICKER/**.pdf
    """
    idx: dict[str, list[Path]] = {}
    for d in download_dirs:
        if not d.exists():
            continue
        for tkr_dir in d.iterdir():
            if not tkr_dir.is_dir():
                continue
            tkr = tkr_dir.name.strip().upper()
            pdfs = list(tkr_dir.rglob("*.pdf"))
            if not pdfs:
                continue
            idx.setdefault(tkr, []).extend(pdfs)
    # de-dupe paths
    for k in list(idx.keys()):
        seen = set()
        unique = []
        for p in idx[k]:
            sp = str(p)
            if sp in seen:
                continue
            seen.add(sp)
            unique.append(p)
        idx[k] = unique
    return idx


# -------------------------
# WEB FALLBACK (only for those with no local hits)
# -------------------------
def get_sitemaps_from_robots(sess: requests.Session, site: str) -> list[str]:
    st, ct, txt = fetch_text(sess, site + "/robots.txt")
    time.sleep(SLEEP_S)
    if st >= 400 or not txt:
        return []
    sitemaps = []
    for line in txt.splitlines():
        if line.lower().startswith("sitemap:"):
            sm = line.split(":", 1)[1].strip()
            if sm:
                sitemaps.append(sm)
    return sitemaps

def get_sitemap_urls(sess: requests.Session, site: str) -> list[str]:
    starts = ["/sitemap.xml", "/sitemap_index.xml", "/wp-sitemap.xml"]
    all_urls = []
    candidate_sitemaps = get_sitemaps_from_robots(sess, site) + [site + p for p in starts]

    seen_sm = set()
    for sm in candidate_sitemaps:
        if sm in seen_sm:
            continue
        seen_sm.add(sm)

        status, ctype, txt = fetch_text(sess, sm)
        time.sleep(SLEEP_S)
        if status >= 400 or not txt:
            continue
        if ("xml" not in ctype) and ("<urlset" not in txt.lower()) and ("<sitemapindex" not in txt.lower()):
            continue

        soup = BeautifulSoup(txt, "xml")
        sitemap_locs = [loc.get_text(strip=True) for loc in soup.find_all("sitemap") for loc in loc.find_all("loc")]

        if sitemap_locs:
            for child in sitemap_locs[:MAX_SITEMAP_CHILDREN]:
                st2, ct2, t2 = fetch_text(sess, child)
                time.sleep(SLEEP_S)
                if st2 >= 400 or not t2:
                    continue
                s2 = BeautifulSoup(t2, "xml")
                locs = [loc.get_text(strip=True) for loc in s2.find_all("loc")]
                for u in locs:
                    if REPORT_URL_HINT.search(u):
                        all_urls.append(u)
                if len(all_urls) >= MAX_SITEMAP_URLS:
                    return all_urls[:MAX_SITEMAP_URLS]
        else:
            locs = [loc.get_text(strip=True) for loc in soup.find_all("loc")]
            for u in locs:
                if REPORT_URL_HINT.search(u):
                    all_urls.append(u)
            if all_urls:
                return all_urls[:MAX_SITEMAP_URLS]

    return all_urls[:MAX_SITEMAP_URLS]

def discover_report_pages(site: str) -> list[str]:
    paths = [
        "/investors", "/investor", "/investor-centre", "/asx-announcements",
        "/announcements", "/asx-releases", "/reports", "/financial-reports",
        "/results", "/news", "/media", "/investors/reports", "/investors/announcements",
    ]
    return [site + p for p in paths]

def extract_pdf_links_from_html(base_url: str, html: str) -> list[dict]:
    soup = BeautifulSoup(html, "lxml")
    out = []
    for a in soup.find_all("a", href=True):
        href = (a.get("href") or "").strip()
        if not href:
            continue
        absu = urljoin(base_url, href)
        if not PDF_EXT_RE.search(absu):
            continue
        label = " ".join(a.get_text(" ", strip=True).split())
        out.append({"url": absu, "label": label, "source_page": base_url})
    return out

def score_pdf_candidate(c: dict) -> float:
    """
    For scoping/feasibility, investor presentations can still be very useful,
    so we only mildly penalize them.
    """
    label = (c.get("label") or "")
    url = (c.get("url") or "")
    text = f"{label} {url}".lower()

    score = 0.0
    if "quarter" in text or "4c" in text or "activities" in text:
        score += 15
    if "annual" in text or "4e" in text:
        score += 12
    if "report" in text or "results" in text:
        score += 10
    if "scoping" in text:
        score += 25
    if "feasib" in text:
        score += 22
    if PRESENTATION_HINT.search(text):
        score -= 3

    return score

def crawl_site_for_best_pdfs(sess: requests.Session, site: str, debug_dir: Path) -> tuple[list[dict], list[str]]:
    notes = []
    debug_dir.mkdir(parents=True, exist_ok=True)

    pages = get_sitemap_urls(sess, site)
    if pages:
        notes.append(f"sitemap_pages={len(pages)}")
    else:
        notes.append("no_sitemap_or_no_filtered_urls")
        pages = discover_report_pages(site)

    visited = set()
    queue = [u for u in pages[:250] if REPORT_URL_HINT.search(u)]
    pdf_candidates = []

    while queue and len(visited) < MAX_PAGES_TO_VISIT:
        url = queue.pop(0)
        if url in visited:
            continue
        if not same_domain(site, url):
            continue
        visited.add(url)

        st, ct, html = fetch_text(sess, url)
        time.sleep(SLEEP_S)
        if st >= 400 or not html:
            continue

        try:
            (debug_dir / f"{len(visited):02d}_{safe_filename(url)}.html").write_text(html, encoding="utf-8", errors="ignore")
        except Exception:
            pass

        pdf_candidates.extend(extract_pdf_links_from_html(url, html))

        soup = BeautifulSoup(html, "lxml")
        new_links = 0
        for a in soup.find_all("a", href=True):
            href = (a.get("href") or "").strip()
            if not href:
                continue
            absu = urljoin(url, href)
            if absu in visited:
                continue
            if not same_domain(site, absu):
                continue
            if REPORT_URL_HINT.search(absu):
                queue.append(absu)
                new_links += 1
                if new_links >= 20:
                    break

    # Dedup + score
    seen = set()
    dedup = []
    for c in pdf_candidates:
        u = c.get("url")
        if not u or u in seen:
            continue
        seen.add(u)
        c["score"] = score_pdf_candidate(c)
        dedup.append(c)

    dedup.sort(key=lambda x: x.get("score", 0.0), reverse=True)

    notes.append(f"visited_pages={len(visited)}")
    notes.append(f"pdf_candidates_raw={len(pdf_candidates)}")
    notes.append(f"pdf_candidates_dedup={len(dedup)}")
    return dedup, notes


# -------------------------
# OUTPUT MODEL
# -------------------------
@dataclass
class OutRow:
    Ticker: str
    Company: str
    Website: str

    # Local scan
    Local_PDF_Count: int
    Local_Scoping_Flag: str
    Local_Feasibility_Flag: str
    Local_Evidence_Summary_50w: str
    Local_Evidence_Snippets: str
    Local_Evidence_PDFs: str

    # Web fallback
    Web_Checked: str
    Web_Scoping_Flag: str
    Web_Feasibility_Flag: str
    Web_Evidence_Summary_50w: str
    Web_Evidence_Snippets: str
    Web_Evidence_PDFs: str
    Notes: str

    # Final combined flags
    Scoping_Flag: str
    Feasibility_Flag: str
    Evidence_Summary_50w: str
    Evidence_Snippets: str


# -------------------------
# CORE PER-COMPANY PROCESS
# -------------------------
def scan_local_pdfs(tkr: str, pdfs: list[Path]):
    scope_flag = "N"
    feas_flag = "N"
    snippets = []
    hit_pdfs = []

    for pdf_path in pdfs:
        txt = extract_pdf_text_fitz(pdf_path)
        if not txt.strip():
            continue

        s_ok, f_ok, hits = evidence_from_text_scoping_feas(txt)
        if s_ok:
            scope_flag = "Y"
        if f_ok:
            feas_flag = "Y"
        if hits:
            hit_pdfs.append(str(pdf_path))
            for h in hits:
                snippets.append(f"[LOCAL] {h}")

        # early exit if we have both
        if scope_flag == "Y" and feas_flag == "Y":
            # still okay to stop early; we just want evidence, not exhaustive counts
            break

    if snippets:
        summary = trim_to_50_words(" ".join(snippets))
        snippet_txt = "\n".join(snippets[:8])
    else:
        summary, snippet_txt = "", ""

    return scope_flag, feas_flag, summary, snippet_txt, "; ".join(hit_pdfs[:5])

def scan_web_fallback(tkr: str, company: str, website: str):
    site = normalize_site(website)
    if not site:
        return ("N","N","","","", "missing website")

    sess = make_session()
    out_dir = FALLBACK_DOWNLOAD_DIR / tkr
    dbg_dir = FALLBACK_DEBUG_DIR / tkr
    out_dir.mkdir(parents=True, exist_ok=True)
    dbg_dir.mkdir(parents=True, exist_ok=True)

    cands, notes = crawl_site_for_best_pdfs(sess, site, dbg_dir)
    if not cands:
        return ("N","N","","","", "; ".join(notes + ["no_pdf_links_found"]))

    # Download + scan top few
    scope_flag = "N"
    feas_flag = "N"
    snippets = []
    hit_pdfs = []

    for i, c in enumerate(cands[:MAX_FALLBACK_PDFS_TO_SCAN], start=1):
        url = c.get("url","")
        if not url:
            continue
        pdf_path = out_dir / f"{tkr}_fallback_{i}.pdf"
        ok = download_file(sess, url, pdf_path)
        time.sleep(SLEEP_S)
        if not ok:
            notes.append(f"download_failed_{i}")
            continue
        notes.append(f"downloaded_{i}")

        txt = extract_pdf_text_fitz(pdf_path)
        if not txt.strip():
            notes.append(f"no_text_{i}")
            continue

        s_ok, f_ok, hits = evidence_from_text_scoping_feas(txt)
        if s_ok:
            scope_flag = "Y"
        if f_ok:
            feas_flag = "Y"
        if hits:
            hit_pdfs.append(str(pdf_path))
            for h in hits:
                snippets.append(f"[WEB] {h}")

        if scope_flag == "Y" and feas_flag == "Y":
            break

    if snippets:
        summary = trim_to_50_words(" ".join(snippets))
        snippet_txt = "\n".join(snippets[:8])
    else:
        summary, snippet_txt = "", ""

    return scope_flag, feas_flag, summary, snippet_txt, "; ".join(hit_pdfs[:5]), "; ".join(notes)


def main():
    FALLBACK_DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    FALLBACK_DEBUG_DIR.mkdir(parents=True, exist_ok=True)

    companies = read_companies(COMPANIES_CSV)

    # Local PDF index
    pdf_index = build_pdf_index(DOWNLOAD_DIRS)

    # Optional: bring forward prior columns (not required)
    prior = None
    try:
        prior = pd.read_csv(PRIOR_FINAL_CSV)
        prior.columns = [c.strip() for c in prior.columns]
        prior["Ticker"] = prior["Ticker"].astype(str).str.strip().str.upper()
    except Exception:
        prior = None

    out_rows = []

    for _, r in tqdm(companies.iterrows(), total=len(companies)):
        tkr = r["Ticker"].strip().upper()
        company = r["Company"]
        website = r["Website"]

        local_pdfs = pdf_index.get(tkr, [])
        local_scope, local_feas, local_sum, local_snips, local_hit_pdfs = scan_local_pdfs(tkr, local_pdfs)

        notes = []
        web_checked = "N"
        web_scope = "N"
        web_feas = "N"
        web_sum = ""
        web_snips = ""
        web_hit_pdfs = ""

        # Web fallback only if local had no evidence
        if local_scope == "N" and local_feas == "N":
            web_checked = "Y"
            ws, wf, wsum, wsnips, whit, wnotes = scan_web_fallback(tkr, company, website)
            web_scope, web_feas = ws, wf
            web_sum, web_snips = wsum, wsnips
            web_hit_pdfs = whit
            if wnotes:
                notes.append(wnotes)

        # Final combined
        final_scope = "Y" if (local_scope == "Y" or web_scope == "Y") else "N"
        final_feas  = "Y" if (local_feas == "Y" or web_feas == "Y") else "N"

        combined_snips = "\n".join([x for x in [local_snips, web_snips] if x.strip()])
        combined_sum = trim_to_50_words(combined_snips.replace("\n", " ")) if combined_snips.strip() else ""

        out_rows.append(asdict(OutRow(
            Ticker=tkr,
            Company=company,
            Website=website,

            Local_PDF_Count=len(local_pdfs),
            Local_Scoping_Flag=local_scope,
            Local_Feasibility_Flag=local_feas,
            Local_Evidence_Summary_50w=local_sum,
            Local_Evidence_Snippets=local_snips,
            Local_Evidence_PDFs=local_hit_pdfs,

            Web_Checked=web_checked,
            Web_Scoping_Flag=web_scope,
            Web_Feasibility_Flag=web_feas,
            Web_Evidence_Summary_50w=web_sum,
            Web_Evidence_Snippets=web_snips,
            Web_Evidence_PDFs=web_hit_pdfs,
            Notes="; ".join(notes),

            Scoping_Flag=final_scope,
            Feasibility_Flag=final_feas,
            Evidence_Summary_50w=combined_sum,
            Evidence_Snippets=combined_snips
        )))

    out_df = pd.DataFrame(out_rows)

    # If you want to merge prior flags into this output, do it here (optional)
    if prior is not None:
        # Keep a few interesting columns if present
        keep_cols = [c for c in ["Resource_Definition_Flag","PFS_Flag","Quarterly_URL","Annual_URL","Quarterly_PDF","Annual_PDF"] if c in prior.columns]
        if keep_cols:
            out_df = out_df.merge(prior[["Ticker"] + keep_cols], on="Ticker", how="left")

    out_df.to_csv(OUT_FINAL, index=False, encoding="utf-8")
    hits_df = out_df[(out_df["Scoping_Flag"]=="Y") | (out_df["Feasibility_Flag"]=="Y")].copy()
    hits_df.to_csv(OUT_HITS, index=False, encoding="utf-8")

    print("Saved:", OUT_FINAL)
    print("Saved hits:", OUT_HITS, f"(hits={len(hits_df)}/{len(out_df)})")
    return out_df, hits_df


# ---- RUN ----
out_df, hits_df = main()
hits_df.head(25)


100%|████████████████████████████████████████████████████████████████████████████████| 91/91 [1:21:19<00:00, 53.62s/it]


Saved: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_scoping_feasibility_FINAL.csv
Saved hits: C:\Users\julian.diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\01_BD\96_2026_RIU_Conference_Perth\riu_scoping_feasibility_FINAL_hits.csv (hits=29/91)


Unnamed: 0,Ticker,Company,Website,Local_PDF_Count,Local_Scoping_Flag,Local_Feasibility_Flag,Local_Evidence_Summary_50w,Local_Evidence_Snippets,Local_Evidence_PDFs,Web_Checked,...,Scoping_Flag,Feasibility_Flag,Evidence_Summary_50w,Evidence_Snippets,Resource_Definition_Flag,PFS_Flag,Quarterly_URL,Annual_URL,Quarterly_PDF,Annual_PDF
0,RMS,Ramelius Resources Ltd,rameliusresources.com.au,2,Y,Y,[LOCAL] deposit was optimised and a Scoping St...,[LOCAL] deposit was optimised and a Scoping St...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,...,Y,Y,[LOCAL] deposit was optimised and a Scoping St...,[LOCAL] deposit was optimised and a Scoping St...,Y,Y,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
2,CHN,Chalice Mining Ltd,chalicemining.com,2,Y,Y,[LOCAL] the Scoping Study testwork phase. [LOC...,[LOCAL] the Scoping Study testwork phase.\n[LO...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,...,Y,Y,[LOCAL] the Scoping Study testwork phase. [LOC...,[LOCAL] the Scoping Study testwork phase.\n[LO...,N,Y,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
6,SVL,Silver Mines Ltd,silvermines.com.au,2,N,Y,"[LOCAL] in the Kramer Hills Project, Lustrum G...","[LOCAL] in the Kramer Hills Project, Lustrum G...",C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,...,N,Y,"[LOCAL] in the Kramer Hills Project, Lustrum G...","[LOCAL] in the Kramer Hills Project, Lustrum G...",Y,Y,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
7,STK,Strickland Metals Ltd,stricklandmetals.com.au,2,N,N,,,,Y,...,Y,Y,[WEB] Developers / near-producers are progress...,[WEB] Developers / near-producers are progress...,N,N,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
9,AZY,Antipa Minerals Ltd,antipaminerals.com.au,2,Y,Y,[LOCAL] confirming the Scoping Study metallurg...,[LOCAL] confirming the Scoping Study metallurg...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,...,Y,Y,[LOCAL] confirming the Scoping Study metallurg...,[LOCAL] confirming the Scoping Study metallurg...,Y,Y,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
10,MAU,Magnetic Resources NL,magres.com.au,2,N,N,,,,Y,...,N,Y,[WEB] We expect the updated Feasibility will a...,[WEB] We expect the updated Feasibility will a...,N,N,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
13,WWI,West Wits Mining Ltd,westwitsmining.com,1,Y,Y,[LOCAL] 2022 Scoping Study: 5 Development stag...,[LOCAL] 2022 Scoping Study: 5 Development stag...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,...,Y,Y,[LOCAL] 2022 Scoping Study: 5 Development stag...,[LOCAL] 2022 Scoping Study: 5 Development stag...,N,Y,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
16,BGD,Barton Gold Holdings Ltd,bartongold.com.au,2,N,Y,[LOCAL] and a Pre-Feasibility Study. [LOCAL] c...,[LOCAL] and a Pre-Feasibility Study.\n[LOCAL] ...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,...,N,Y,[LOCAL] and a Pre-Feasibility Study. [LOCAL] c...,[LOCAL] and a Pre-Feasibility Study.\n[LOCAL] ...,Y,Y,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
18,HCH,Hot Chili Ltd,hotchili.net.au,2,N,Y,[LOCAL] These funds place us in a strong posit...,[LOCAL] These funds place us in a strong posit...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,...,N,Y,[LOCAL] These funds place us in a strong posit...,[LOCAL] These funds place us in a strong posit...,N,Y,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
24,RNU,Renascor Resources Ltd,renascor.com.au,2,N,Y,[LOCAL] Siviour Definitive Feasibility Study P...,[LOCAL] Siviour Definitive Feasibility Study P...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,N,...,N,Y,[LOCAL] Siviour Definitive Feasibility Study P...,[LOCAL] Siviour Definitive Feasibility Study P...,N,Y,,,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...,C:\Users\julian.diaz\OneDrive - XENITH CONSULT...
