In [2]:
import os
import sys
import time
import argparse
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse, parse_qs

In [None]:
BASE_FILESERVER_URL = "https://www.ncei.noaa.gov/thredds-ocean/fileServer/ndbc"
BASE_CATALOG_URL = "https://www.ncei.noaa.gov/thredds-ocean/catalog/ndbc"
HEADERS = {'User-Agent': 'Mozilla/5.0'}

# Determine which dataset uses which structure
LAYOUT_MAP = {
    'cmanwx': 'year_month',
    'co-ops': 'year_month',
    'tao-buoy': 'year_month',
    'oceansites': 'nested',
    'tao-buoys-delayed-mode': 'nested',
    'tao-ctd': 'nested'
}

In [27]:
if True:
    dataset = 'tao-buoys-delayed-mode' #'oceansites'
    savedir = "d:/backup/NOAA/ndbc"
    layout = LAYOUT_MAP.get(dataset, "year_month")
    base_fileserver_url = f"{BASE_FILESERVER_URL}/{dataset}"
    catalog_root_url = f"{BASE_CATALOG_URL}/{dataset}"
    save_root = os.path.join(savedir, dataset)     

In [24]:
DOWNLOAD = False
def crawl_and_download(base_fileserver_url, catalog_url, save_root, dataset_name, relative_path=""):
    try:
        full_catalog_url = f"{catalog_url}/{relative_path}/catalog.html" if relative_path else f"{catalog_url}/catalog.html"
        print(f"[INDEX] Crawling: {full_catalog_url}")
        r = requests.get(full_catalog_url, headers=HEADERS, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'lxml')
        links = soup.find_all('a', href=True)
        print(f"[LINK] Found {links}")

        for a in links:
            href = a['href']
            print(f"[LINK] Found link: {href}")
            if href == "../":
                continue
            # Absolute URLs (external) → Skip
            if href.startswith("http://") or href.startswith("https://"):
                print(f"[SKIP] External link: {href}")
                continue

            # Internal server links starting with '/' → Skip
            if href.startswith("/"):
                print(f"[SKIP] Internal server link: {href}")
                continue

            if "dataset=ndbc" in href:
                query = urlparse(href).query
                parsed = parse_qs(query)
                dataset = parsed.get("dataset", [])
                if dataset:
                    file_path = dataset[0]
                    prefix = f"ndbc/{dataset_name}/"
                    if file_path.startswith(prefix):
                        file_relative_path = file_path[len(prefix):]
                    else:
                        file_relative_path = file_path
                    if file_relative_path.endswith(".nc"):
                        file_folder = os.path.dirname(file_relative_path)
                        filename = os.path.basename(file_relative_path)
                        if DOWNLOAD:
                            download_nested_file(base_fileserver_url, save_root, file_folder, filename)
                        else:
                            # Correct URL
                            url_path = '/'.join(file_folder.split(os.sep))  # turn backslash into slash for URL
                            url = f"{base_fileserver_url}/{url_path}/{filename}" if url_path else f"{base_fileserver_url}/{filename}"
                            local_path = os.path.join(save_root, file_folder, filename)
                            print(f"[DOWNLOAD FILE] {url} → {os.path.dirname(local_path)}")
                continue
             
            # Subdirectory (should end with catalog.html, e.g., DATA/catalog.html)
            if href.endswith("/catalog.html"):
                subfolder = href[:-len("/catalog.html")].strip('/')
                new_relative_path = '/'.join(filter(None, [relative_path, subfolder]))
                crawl_and_download(base_fileserver_url, catalog_url, save_root, dataset_name, new_relative_path)
                continue

            # Otherwise (unknown case) → skip
            print(f"[SKIP] Unknown or unsupported link: {href}")
    except Exception as e:
        print(f"[ERROR] Failed to access {full_catalog_url}: {e}")

def download_nested_file(base_fileserver_url, save_root, file_relative_path, filename):
    url_path = '/'.join(file_relative_path.split(os.sep))
    url = f"{base_fileserver_url}/{url_path}/{filename}" if url_path else f"{base_fileserver_url}/{filename}"
    local_path = os.path.join(save_root, file_relative_path, filename)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    if os.path.exists(local_path):
        print(f"[SKIP] {filename} already exists")
        return
    try:
        print(f"[DOWNLOADING] {url}")
        r = requests.get(url, headers=HEADERS, stream=True, timeout=15)
        r.raise_for_status()
        with open(local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"[SAVED] {local_path}")
    except Exception as e:
        print(f"[ERROR] Failed to download {url}: {e}")
        log_path = os.path.join(save_root, "data_lost_filelist.log")
        with open(log_path, 'a') as log_file:
            log_file.write(f"{url}\n")

In [28]:
crawl_and_download(base_fileserver_url, catalog_root_url, save_root, dataset, "")

[INDEX] Crawling: https://www.ncei.noaa.gov/thredds-ocean/catalog/ndbc/tao-buoys-delayed-mode/catalog.html
[LINK] Found [<a href="catalog.html?dataset=ndbc/tao-buoys-delayed-mode"><tt>Tropical Atmosphere Ocean (TAO) array of moored buoys delayed-mode full-resolution data</tt></a>, <a href="2020/catalog.html"><tt>2020/</tt></a>, <a href="2019/catalog.html"><tt>2019/</tt></a>, <a href="2018/catalog.html"><tt>2018/</tt></a>, <a href="2017/catalog.html"><tt>2017/</tt></a>, <a href="2016/catalog.html"><tt>2016/</tt></a>, <a href="2015/catalog.html"><tt>2015/</tt></a>, <a href="2014/catalog.html"><tt>2014/</tt></a>, <a href="2013/catalog.html"><tt>2013/</tt></a>, <a href="2012/catalog.html"><tt>2012/</tt></a>, <a href="2011/catalog.html"><tt>2011/</tt></a>, <a href="2010/catalog.html"><tt>2010/</tt></a>, <a href="2009/catalog.html"><tt>2009/</tt></a>, <a href="2008/catalog.html"><tt>2008/</tt></a>, <a href="2007/catalog.html"><tt>2007/</tt></a>, <a href="2006/catalog.html"><tt>2006/</tt></a>

KeyboardInterrupt: 