In [None]:
# LiDAR data this script was originally created to download:
# https://noaa-nos-coastal-lidar-pds.s3.amazonaws.com/dem/HI_Kahoo_Lanai_Maui_Molo_Oahu_DEM_2022_10335/index.html
# https://noaa-nos-coastal-lidar-pds.s3.amazonaws.com/dem/HI_BigIsland_DEM_2023_10336/index.html

In [1]:
import os
import pandas as pd
import requests
from pathlib import Path
import time
from datetime import datetime

In [2]:

# Run identifier (YYYYMMDD_HHMMSS); embedded in this run's log and retry filenames
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# Every path below is anchored at the current working directory of the notebook
notebook_dir = Path(".").resolve()

# Input CSV: the header row must contain "URL"; each following line is one full
# path to a downloadable file
csv_path = notebook_dir.joinpath("full.csv")

# Folder that receives the downloads; created if it does not exist yet
download_dir = notebook_dir.joinpath("dl")
download_dir.mkdir(exist_ok=True)

# Per-run output files, both stamped with the run identifier
output_csv = notebook_dir.joinpath(f"retry_{timestamp}.csv")
log_file = notebook_dir.joinpath(f"download_log_{timestamp}.txt")

In [3]:
# Read the master list of download URLs
df = pd.read_csv(csv_path)
if 'URL' not in df.columns:
    raise ValueError("CSV must contain a column titled 'URL'.")

# The expected local filename is the last "/"-separated segment of each URL
df['filename'] = df['URL'].map(lambda url: url.rsplit("/", 1)[-1])

# Names of the regular files currently sitting in the download folder
downloaded_files = {entry.name for entry in download_dir.iterdir() if entry.is_file()}

# Keep only the rows whose file is not in the download folder yet
already_present = df['filename'].isin(downloaded_files)
missing_df = df.loc[~already_present]

# Write the outstanding rows to this run's retry CSV
missing_df.to_csv(output_csv, index=False)

print(f"✅ Exported {len(missing_df)} missing files to: {output_csv.name}")


✅ Exported 33528 missing files to: retry_20250607_232309.csv


In [4]:
# Load the retry list (output_csv) that the missing-files cell wrote.
# To ignore that list and walk the full input instead, read csv_path here.
df = pd.read_csv(output_csv)

if 'URL' not in df.columns:
    raise ValueError("CSV must contain a column titled 'URL'.")

# (connect, read) timeouts in seconds. Without a timeout a single stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT = (10, 60)

# Open log file for appending
with open(log_file, 'a') as log:
    for url in df['URL']:
        # Timestamp is taken when the attempt starts and reused in the result line
        looptimestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = url.split("/")[-1]
        dest_path = download_dir / filename
        # Data is streamed into this sibling file first and only renamed to
        # dest_path once the transfer is complete. A failed or interrupted
        # download therefore never leaves a truncated file under the final
        # name, which the "already exists" check would otherwise skip forever.
        part_path = download_dir / f"{filename}.part"

        # Skip if file already exists
        if dest_path.exists():
            message = f"Already exists: {filename}"
            print(message)
            log.write(message + '\n')
            continue

        message = f"Downloading {filename}..."
        print(message)
        log.write(message + '\n')

        try:
            # Context manager releases the connection even when an error occurs
            with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                with open(part_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            # Publish the finished file under its real name
            part_path.replace(dest_path)
            message = f"{looptimestamp} Success: {filename}"
        except Exception as e:
            # Best effort: record the failure and move on to the next URL
            message = f"{looptimestamp} Failed: {filename} | {e}"
        finally:
            # Runs on failure and on KeyboardInterrupt too; after a successful
            # rename the partial file no longer exists and this is a no-op.
            if part_path.exists():
                part_path.unlink()

        # Pause between requests
        time.sleep(2)

        print(message)
        log.write(message + '\n')

Downloading tileindex_HI_Kahoo_Lanai_Maui_Molo_Oahu_DEM_2022.zip...
20250607_232310 Success: tileindex_HI_Kahoo_Lanai_Maui_Molo_Oahu_DEM_2022.zip
Downloading hi2022_kah_lanai_maui_molo_oahu_dem_m10335_met.xml...
20250607_232313 Success: hi2022_kah_lanai_maui_molo_oahu_dem_m10335_met.xml
Downloading hi2022_kah_lanai_maui_molo_oahu_dem_m10335_met_forHumans.html...
20250607_232316 Success: hi2022_kah_lanai_maui_molo_oahu_dem_m10335_met_forHumans.html
Downloading index.html...
20250607_232319 Success: index.html
Downloading 4QFJ750340.tif...
20250607_232323 Success: 4QFJ750340.tif
Downloading 4QFJ755335.tif...
20250607_232326 Success: 4QFJ755335.tif
Downloading 4QFJ755340.tif...
20250607_232331 Success: 4QFJ755340.tif
Downloading 4QFJ755345.tif...
20250607_232337 Success: 4QFJ755345.tif
Downloading 4QFJ755350.tif...
20250607_232340 Success: 4QFJ755350.tif
Downloading 4QFJ760330.tif...
20250607_232343 Success: 4QFJ760330.tif
Downloading 4QFJ760335.tif...
20250607_232346 Success: 4QFJ760335.

KeyboardInterrupt: 

In [5]:
# Closing reminder: re-running the notebook retries whatever is still missing
print(
    "Done with current list! "
    "Run this script again to retry downloads that don't already exist in the folder."
)

Done with current list! Run this script again to retry downloads that don't already exist in the folder.
