In [1]:
#!/usr/bin/env python3
"""
Jupyter-friendly GEMIS MEX downloader with pagination and filtering.

Reads a CSV like:

  TITLEID,RP_ID,DT_GRNT,DT_CEASED,STATUS,PTY_NAME,HANDLE_URL,UNIQ_ID

- RP_ID may contain multiple codes separated by commas.
- We only download reports whose RP_ID starts with 'CR' (MEX company reports).
- HANDLE_URL is usually a GEMIS simple-search URL, e.g.
    https://geoscience.nt.gov.au/gemis/ntgsjspui/simple-search?query=EL23571
- For each distinct CRxxxx-xxxx:
    * find the matching handle page (handle/1/NNNNN), following all
      pages of search results if needed
    * scrape ALL bitstreams and their SEQ from "Files in this Report"
    * download them into <out_dir>/<CR_no_dash>/SEQ_filename.ext

Output folder pattern:
  <workdir>/Reports-<CSV_stem>
"""

import csv
import os
import re
import sys
import time
from typing import Dict, Iterable, List, Tuple
from urllib.parse import unquote, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# ---------- HTTP session ----------

# One shared requests session used for every HTTP call in this notebook,
# carrying a User-Agent that identifies the downloader to the server.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = (
    "GEMIS-MEX-downloader/1.0 (Python; contact: your-email@example.com)"
)

# Timeout in seconds handed to each SESSION.get call.
TIMEOUT = 60


# ---------- small helpers ----------

def split_rp_ids(rp_field: str) -> List[str]:
    """
    Break a comma-separated RP_ID field into its individual report ids.

    'CR2009-0365, CR2008-0901' -> ['CR2009-0365', 'CR2008-0901']

    Surrounding whitespace is trimmed from each id and empty entries are
    dropped. An empty or missing field gives an empty list.
    """
    if not rp_field:
        return []
    return [code for code in map(str.strip, rp_field.split(",")) if code]


def normalise_cr(rp_id: str) -> str:
    """
    Convert a report id into a compact name that is safe as a folder name.

      CR2003-0423    -> CR20030423
      CR1994-0660(*) -> CR19940660

    Steps:
      * drop a trailing parenthesised marker such as '(*)'
      * remove dashes
      * remove characters Windows does not allow in file/folder names
      * trim surrounding whitespace

    Returns the cleaned id (possibly empty if nothing usable remains).
    """
    # CSV ids sometimes end in a marker like "(*)"; it is not part of the
    # report number and '*' cannot appear in a Windows path.
    core = re.sub(r"\s*\([^)]*\)\s*$", "", rp_id)
    core = core.replace("-", "")
    # Anything else that would make os.makedirs fail on Windows.
    core = re.sub(r'[<>:"/\\|?*]', "", core)
    return core.strip()


def fetch_html(url: str, max_attempts: int = 3, backoff: float = 5.0) -> BeautifulSoup:
    """
    Fetch a page and hand it back parsed with BeautifulSoup's html.parser.

    Any requests.RequestException (which includes the HTTPError raised by
    raise_for_status) triggers a retry. The wait before retry N is
    backoff * N seconds. After the final failed attempt the exception is
    re-raised to the caller.
    """
    failure = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = SESSION.get(url, timeout=TIMEOUT)
            response.raise_for_status()
            return BeautifulSoup(response.text, "html.parser")
        except requests.RequestException as exc:
            failure = exc
            if attempt == max_attempts:
                # Out of attempts: propagate the last error unchanged.
                raise
            pause = backoff * attempt
            print(
                f"  [warn] HTML request failed for {url} ({exc}); "
                f"retry {attempt}/{max_attempts} after {backoff*attempt:.0f}s"
            )
            time.sleep(pause)

    # Only reachable when the loop body never ran (max_attempts < 1).
    if failure:
        raise failure
    raise RuntimeError(f"Unknown error fetching {url}")



# ---------- GEMIS navigation ----------

def find_item_handle_url_from_search(search_url: str, rp_id: str) -> str:
    """
    Locate the handle page for a report on a GEMIS simple-search listing.

    Starting at search_url, each result page is scanned for text containing
    the report id. The first table row that contains the id and also holds a
    link with 'handle/1/' in its href wins, and that link is returned as an
    absolute URL. If a page does not mention the id at all, the "next" link
    is followed until the id is found or there are no more pages.

    A trailing parenthesised marker on rp_id (e.g. 'CR1994-0660(*)') is
    ignored when searching, so the bare report number is what gets matched.

    Raises
    ------
    RuntimeError
        If the id appears on a page but no usable table row / handle link
        can be found for it, or if the id is not found on any page.
    """
    # Search for the bare id; fall back to the raw value if stripping the
    # marker would leave nothing to search for.
    needle = re.sub(r"\s*\([^)]*\)\s*$", "", rp_id).strip() or rp_id

    visited = set()
    url = search_url

    # The visited set guards against pagination links that loop back.
    while url not in visited:
        visited.add(url)
        soup = fetch_html(url)

        # 1) Look at every mention of the id on this page, not only the first:
        #    an earlier mention may sit outside the results table.
        matches = soup.find_all(string=lambda s: s and needle in s)
        if matches:
            saw_row = False
            for node in matches:
                row = node.find_parent("tr")
                if row is None:
                    continue
                saw_row = True
                link = row.find("a", href=lambda h: h and "handle/1/" in h)
                if link is not None:
                    return urljoin(url, link["href"])

            if saw_row:
                raise RuntimeError(
                    f"Could not find handle link for {rp_id!r} on {url}"
                )
            raise RuntimeError(
                f"Found {rp_id!r} but could not locate its table row on {url}"
            )

        # 2) Id not mentioned here: move on to the next results page, if any.
        next_link = soup.find("a", string=lambda s: s and s.strip().lower() == "next")
        if not next_link or not next_link.get("href"):
            break

        url = urljoin(url, next_link["href"])

    # Every reachable page was scanned without finding the id.
    raise RuntimeError(f"Could not find RP_ID {rp_id!r} on search pages starting at {search_url}")


def ensure_handle_url(any_url: str, rp_id: str) -> str:
    """
    Return a handle URL for rp_id, given whatever URL the CSV supplied.

    A URL that contains '/handle/1/' and is not a simple-search URL is
    returned untouched. Anything else is treated as a search page and
    resolved through find_item_handle_url_from_search.

    Callers are expected to have filtered out non-GEMIS URLs beforehand.
    """
    looks_like_search = "simple-search" in any_url
    looks_like_handle = "/handle/1/" in any_url

    if looks_like_search or not looks_like_handle:
        return find_item_handle_url_from_search(any_url, rp_id)
    return any_url


def list_report_files(handle_url: str) -> List[dict]:
    """
    Given a handle URL (handle/1/NNNNN), return a list of dicts:

      {
        "url": full_download_url,
        "filename": stored_filename.ext,
        "seq": sequence_number (int or None),
        "handle_id": "NNNNN" (or None if it cannot be parsed)
      }

    Every <a href="...bitstream..."> link on the page is considered. The URL
    path is expected to look like /bitstream/1/NNNNN/SEQ/FILENAME.ext.

    - Each distinct download URL is reported once, even if the page links to
      it several times.
    - The filename is percent-decoded (so '%20' becomes a space) and any
      character that is not allowed in a Windows file name is replaced
      with '_'.

    The list is sorted by SEQ (entries without one go last), then filename.
    Errors raised by fetch_html propagate to the caller.
    """
    soup = fetch_html(handle_url)
    files: List[dict] = []
    seen_urls = set()

    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "bitstream" not in href:
            continue

        full_url = urljoin(handle_url, href)
        if full_url in seen_urls:
            # Same bitstream linked again; one entry is enough.
            continue
        seen_urls.add(full_url)

        path = urlparse(full_url).path  # e.g. /gemis/ntgsjspui/bitstream/1/81743/2/DIP001.zip
        parts = path.split("/")
        handle_id = None
        seq = None

        # Expect: .../bitstream/1/<HANDLE_ID>/<SEQ>/<FILENAME>
        # ('bitstream' may only occur inside a longer segment or the query
        # string, in which case seq/handle_id stay None.)
        if "bitstream" in parts:
            i = parts.index("bitstream")
            # parts[i+1] is usually "1"
            handle_id = parts[i + 2] if len(parts) > i + 2 else None
            seq_part = parts[i + 3] if len(parts) > i + 3 else None
            if seq_part and seq_part.isdigit():
                seq = int(seq_part)

        # Filename from the decoded path (query string ignored). basename is
        # applied after decoding so an encoded '/' cannot smuggle in a folder.
        filename = os.path.basename(unquote(path))
        if not filename:
            # Fall back to the link text.
            text = a.get_text(strip=True) or "file"
            filename = re.sub(r"\s+", "_", text)

        # Decoding (or link text) can introduce characters the file system
        # rejects; neutralise them rather than fail at download time.
        filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", filename)

        files.append(
            {
                "url": full_url,
                "filename": filename,
                "seq": seq,
                "handle_id": handle_id,
            }
        )

    # Sort by sequence number if present; 9999 pushes unknown SEQ to the end.
    files.sort(key=lambda f: (f["seq"] if f["seq"] is not None else 9999, f["filename"]))
    return files


def download_file(url: str, out_path: str, max_attempts: int = 3, backoff: float = 5.0) -> None:
    """
    Stream-download a single file to out_path.

    - Creates the parent folder of out_path if it has one.
    - Skips if the final file already exists and is > 0 bytes.
    - Downloads to out_path + '.part' and renames only on success, so a
      final file is never left half-written.
    - Retries on requests.RequestException up to max_attempts times, waiting
      backoff * attempt seconds between attempts.
    - Any other failure (e.g. a disk error or an interrupt) removes the
      partial file and is re-raised immediately.

    Raises
    ------
    requests.RequestException
        The last request error, once every attempt has failed.
    RuntimeError
        If no attempt was made at all (max_attempts < 1).
    """
    # os.makedirs("") raises, so only create a folder when the path has one.
    parent = os.path.dirname(out_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # If a *final* file exists and is non-empty, assume it's good
    if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
        print(f"  [skip] {out_path} (already exists)")
        return

    temp_path = out_path + ".part"
    last_error = None

    def discard_partial() -> None:
        # Best-effort removal; a missing or locked file is not an error here.
        try:
            os.remove(temp_path)
        except OSError:
            pass

    for attempt in range(1, max_attempts + 1):
        # Clean up any leftover partial file from previous attempts
        discard_partial()

        try:
            print(f"  [get ] {url} (attempt {attempt}/{max_attempts})")
            with SESSION.get(url, stream=True, timeout=TIMEOUT) as resp:
                resp.raise_for_status()
                with open(temp_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # Download completed successfully; move temp -> final
            os.replace(temp_path, out_path)
            print(f"  [save] {out_path}")
            return

        except requests.RequestException as e:
            last_error = e
            print(
                f"  [warn] download failed for {url} ({e}); "
                f"attempt {attempt}/{max_attempts}"
            )
            discard_partial()

            if attempt < max_attempts:
                sleep_seconds = backoff * attempt
                print(f"        retrying after {sleep_seconds:.0f}s...")
                time.sleep(sleep_seconds)

        except BaseException:
            # Not retryable (disk error, interrupt, ...): don't leave a stale
            # .part file behind, then let the caller see the real error.
            discard_partial()
            raise

    # If we get here, all attempts failed
    raise last_error if last_error else RuntimeError(f"Unknown error downloading {url}")



# ---------- CSV processing ----------

def iter_csv_reports(csv_path: str) -> Iterable[Tuple[str, str, int, str]]:
    """
    Walk the CSV and yield one tuple per CR report id:

        (rp_id, handle_url_from_csv, row_number, titleid)

    row_number counts data rows starting at 1 (the header is not counted).
    A row whose RP_ID holds several ids yields once per id. Ids that do not
    start with 'CR' (case-insensitive), e.g. GR642(*), are reported with a
    log line and skipped.
    """
    # utf-8-sig so a leading BOM does not end up inside the first header name.
    with open(csv_path, newline="", encoding="utf-8-sig") as handle:
        for row_no, record in enumerate(csv.DictReader(handle), start=1):
            titleid = (record.get("TITLEID") or "").strip()
            handle_url = (record.get("HANDLE_URL") or "").strip()

            for candidate in split_rp_ids(record.get("RP_ID") or ""):
                rp = candidate.strip()
                if not rp:
                    continue
                if rp.upper().startswith("CR"):
                    yield rp, handle_url, row_no, titleid
                else:
                    # GR*, etc. are not company reports.
                    print(f"[row {row_no}] Skipping non-CR report id {rp!r}")


def process_csv(csv_path: str, out_dir: str, delay: float = 0.0) -> None:
    """
    Drive the whole download run for one CSV.

    For every CR report id produced by iter_csv_reports:
      - resolve its handle URL (once per id; later rows reuse the result)
      - list every bitstream on the handle page
      - download each one to <out_dir>/<CR_no_dash>/SEQ_filename.ext

    Rows without a HANDLE_URL, or whose HANDLE_URL is not on
    geoscience.nt.gov.au, are skipped with a message. Failures while
    resolving, listing or downloading are written to stderr and the run
    carries on with the next item.

    If delay is non-zero the function sleeps that many seconds after each
    fresh handle resolution and after each report's downloads.
    """
    # rp_id -> handle URL, or None once the id is known not to be downloadable
    # (non-GEMIS URL, or handle resolution failed).
    handle_cache = {}

    for rp_id, handle_url_raw, row_no, titleid in iter_csv_reports(csv_path):
        if rp_id in handle_cache:
            handle_url = handle_cache[rp_id]
            if handle_url is None:
                # Already written off on an earlier row.
                continue
            print(f"[row {row_no}] {rp_id} ({titleid}): reusing {handle_url}")
        else:
            if not handle_url_raw:
                # Not cached: a later row for the same id may carry a URL.
                print(f"[row {row_no}] {rp_id}: no HANDLE_URL, skipping")
                continue

            # e.g. dpir STRIKE 'item-not-available' links
            if "geoscience.nt.gov.au" not in handle_url_raw.lower():
                print(
                    f"[row {row_no}] {rp_id}: HANDLE_URL not on GEMIS "
                    f"({handle_url_raw}), skipping"
                )
                handle_cache[rp_id] = None
                continue

            try:
                handle_url = ensure_handle_url(handle_url_raw, rp_id)
                handle_cache[rp_id] = handle_url
                print(f"[row {row_no}] {rp_id} ({titleid}): handle {handle_url}")
            except Exception as exc:
                print(
                    f"[row {row_no}] ERROR resolving handle for {rp_id}: {exc}",
                    file=sys.stderr,
                )
                handle_cache[rp_id] = None
                continue

            if delay:
                time.sleep(delay)

        if handle_url is None:
            continue

        # Every file (every SEQ) attached to this handle.
        try:
            files = list_report_files(handle_url)
        except Exception as exc:
            print(
                f"[row {row_no}] ERROR listing files for {rp_id} ({handle_url}): {exc}",
                file=sys.stderr,
            )
            continue

        if not files:
            print(f"[row {row_no}] {rp_id}: no downloadable files found")
            continue

        report_dir = os.path.join(out_dir, normalise_cr(rp_id))

        for entry in files:
            seq = entry["seq"]
            # SEQ becomes a two-digit prefix (01_, 02_, ...) when it is known.
            prefix = f"{seq:02d}_" if isinstance(seq, int) else ""
            out_path = os.path.join(report_dir, prefix + entry["filename"])

            try:
                download_file(entry["url"], out_path)
            except Exception as exc:
                print(
                    f"[row {row_no}] ERROR downloading {entry['url']} -> {out_path}: {exc}",
                    file=sys.stderr,
                )

        if delay:
            time.sleep(delay)


# ---------- convenience runner for Jupyter ----------

def run_downloader(csv_name: str, workdir: str, delay: float = 0.0) -> None:
    """
    Notebook entry point: locate the CSV, derive the output folder, run.

    Parameters
    ----------
    csv_name : str
        CSV file name inside workdir, with or without the ".csv" suffix
        (e.g. "Tenements2" or "Tenements2.csv").
    workdir : str
        Folder that holds the CSV; the Reports-<CSV stem> output folder is
        placed here too.
    delay : float, optional
        Seconds to pause between steps, forwarded to process_csv.

    Raises
    ------
    FileNotFoundError
        If neither csv_name nor csv_name + ".csv" exists in workdir.
    """
    workdir = os.path.abspath(workdir)
    base_path = os.path.join(workdir, csv_name)

    if os.path.exists(base_path):
        csv_path = base_path
    elif base_path.lower().endswith(".csv"):
        # Name already had the suffix, so there is nothing else to try.
        raise FileNotFoundError(f"CSV not found: {base_path}")
    else:
        # "Tenements2" given: fall back to "Tenements2.csv".
        alt = base_path + ".csv"
        if not os.path.exists(alt):
            raise FileNotFoundError(f"CSV not found: {base_path} or {alt}")
        csv_path = alt

    csv_stem, _suffix = os.path.splitext(os.path.basename(csv_path))
    out_dir = os.path.join(workdir, f"Reports-{csv_stem}")

    print(f"Working folder : {workdir}")
    print(f"CSV file       : {csv_path}")
    print(f"Output folder  : {out_dir}")

    process_csv(csv_path, out_dir, delay=delay)


In [2]:
# Run the downloader against the tenement CSV in the project folder.
CSV_NAME = "Tenements2"  # or "Tenements2.csv"
WORKDIR = r"C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy"

run_downloader(csv_name=CSV_NAME, workdir=WORKDIR, delay=0.0)


Working folder : C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy
CSV file       : C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Tenements2.csv
Output folder  : C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2
[row 1] Skipping non-CR report id 'GR670(*)'
[row 2] Skipping non-CR report id 'GR558(*)'
[row 3] CR2020-0572 (EL32287): handle https://geoscience.nt.gov.au/gemis/ntgsjspui/handle/1/91641
  [get ] https://geoscience.nt.gov.au/gemis/ntgsjspui/bitstream/1/91641/1/GDC_FrewenaAMAGRAD_FinalReport.pdf (attempt 1/3)
  [save] C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\CR20200572\01_GDC_FrewenaAMAGRAD_FinalReport.pdf
[row 3] Skipping non-CR report id 'GR577(*)'
[row 4] CR2025-0025 (ML33779): handle https://geoscience.nt.gov.au/gemis/ntgsj

[row 2814] ERROR resolving handle for CR1994-0660(*): Could not find RP_ID 'CR1994-0660(*)' on search pages starting at https://geoscience.nt.gov.au/gemis/ntgsjspui/simple-search?query=EL2653


[row 2815] CR1977-0063 (EL935): handle https://geoscience.nt.gov.au/gemis/ntgsjspui/handle/1/64390
  [get ] https://geoscience.nt.gov.au/gemis/ntgsjspui/bitstream/1/64390/1/CR19770063.pdf (attempt 1/3)
  [save] C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\CR19770063\01_CR19770063.pdf
[row 2815] CR1976-0076 (EL935): handle https://geoscience.nt.gov.au/gemis/ntgsjspui/handle/1/60467
  [get ] https://geoscience.nt.gov.au/gemis/ntgsjspui/bitstream/1/60467/1/CR19760076.pdf (attempt 1/3)
  [save] C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\00_Projects\96_NT-Gold-CaseStudy\Reports-Tenements2\CR19760076\01_CR19760076.pdf
[row 2815] CR1976-0075 (EL935): handle https://geoscience.nt.gov.au/gemis/ntgsjspui/handle/1/60466
  [get ] https://geoscience.nt.gov.au/gemis/ntgsjspui/bitstream/1/60466/1/CR19760075.pdf (attempt 1/3)
  [save] C:\Users\Julian.Diaz\OneDrive - XENITH CONSULTING PTY LTD\Documents\0