<a href="https://colab.research.google.com/github/gustavozanin/python-nao-programadores-2687000/blob/main/1_BAIXAR_IMAGENS_CBERS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, re, glob, requests
from pathlib import Path
from google.colab import drive

# ========= ADJUST HERE (your folder on Drive) =========
BASE_DIR = "/content/drive/MyDrive/SUZANO_MYBD/PDI_CBERS_SUZANO/1_CBERS_IMAGENS_BAIXADAS"
# Your catalog file; pick_catalog() prefers it when it exists.
CATALOG_EXACT = BASE_DIR + "/inpe_catalog_2025_9_25_14_24_43.txt"
# Where the per-prefix folders are saved (use another folder if you like).
OUTPUT_ROOT = BASE_DIR
# ======================================================

# Size of each streamed chunk and the timeout (seconds) given to requests.get.
CHUNK_BYTES = 1024 * 1024  # 1 MB
TIMEOUT_S = 180

# Scene prefix: CBERS_4A_WPM_<8 digits>_<3 digits>_<3 digits>_L<digit>.
RE_PREFIX = re.compile(
    r"(CBERS_4A_WPM_\d{8}_\d{3}_\d{3}_L\d)",
    flags=re.IGNORECASE,
)
# Band / sensor tokens recognised in a URL or file name (case-insensitive).
RE_BAND = re.compile(
    r"(BAND[0-9]|PAN10M|PAN5M|PAN|MUX|WFI|AWFI|B\d{1,2}|BAND_[0-9])",
    flags=re.IGNORECASE,
)

def ensure_drive_mounted():
    """Mount Google Drive at ``/content/drive`` using ``google.colab.drive``."""
    mount_point = '/content/drive'
    drive.mount(mount_point)

def pick_catalog():
    """Return the path of the catalog file to use.

    ``CATALOG_EXACT`` is returned when it exists. Otherwise the most recently
    modified file matching ``inpe_catalog_*.txt`` inside ``BASE_DIR`` is
    returned. The choice is printed in both cases.

    Raises:
        FileNotFoundError: if ``CATALOG_EXACT`` does not exist and no file in
            ``BASE_DIR`` matches the pattern.
    """
    if not os.path.exists(CATALOG_EXACT):
        pattern = os.path.join(BASE_DIR, "inpe_catalog_*.txt")
        matches = glob.glob(pattern)
        if len(matches) == 0:
            message = (
                f"Nenhum catálogo encontrado. Verifique a pasta:\n{BASE_DIR}\n"
                "ou ajuste CATALOG_EXACT/BASE_DIR."
            )
            raise FileNotFoundError(message)
        # Newest by modification time.
        chosen = max(matches, key=os.path.getmtime)
        print("Catálogo (mais recente) escolhido automaticamente:", chosen)
        return chosen

    print("Catálogo (exato) encontrado:", CATALOG_EXACT)
    return CATALOG_EXACT

def safe_filename_from_url(url: str) -> str:
    """Return the last path segment of *url*, without query string or fragment.

    The fragment (``#...``) and query (``?...``) are removed *before* the path
    is split on ``/``, so a slash inside them (e.g. ``?next=/a/b``) is not
    mistaken for a path separator.

    Args:
        url: The URL (or plain file name) to inspect.

    Returns:
        The text after the last ``/`` of the path part; an empty string when
        the path ends with ``/``.
    """
    path = url.split('#', 1)[0].split('?', 1)[0]
    return path.rsplit('/', 1)[-1]

def detect_prefix(text: str):
    """Return the first scene prefix matched by ``RE_PREFIX`` in *text*.

    Returns ``None`` when *text* contains no match. The matched text is
    returned as it appears in *text* (no case normalisation).
    """
    found = RE_PREFIX.search(text)
    if found is None:
        return None
    return found.group(1)

def detect_band(text: str):
    """Return the first band token matched by ``RE_BAND`` in *text*.

    The token is upper-cased and a ``BAND_`` spelling is collapsed to
    ``BAND`` (``band_3`` becomes ``BAND3``). Returns ``None`` when nothing
    matches.
    """
    found = RE_BAND.search(text)
    if found is None:
        return None
    token = found.group(1).upper()
    return token.replace("BAND_", "BAND")

def stream_download(url: str, dst_path: Path):
    """Stream *url* into *dst_path* through a temporary ``.part`` file.

    The body is written in ``CHUNK_BYTES`` chunks to ``<dst_path>.part`` and
    only moved onto *dst_path* once the transfer has finished, so *dst_path*
    never holds a half-written file. If anything fails, the ``.part`` file is
    removed before the error propagates.

    Args:
        url: Address to download.
        dst_path: Final location of the file; parent folders are created.

    Raises:
        requests.exceptions.RequestException: on connection problems,
            timeouts, or an HTTP error status (via ``raise_for_status``).
        OSError: if the file cannot be written, or if the number of bytes
            received differs from the ``Content-Length`` announced by the
            server.
    """
    tmp = dst_path.with_suffix(dst_path.suffix + ".part")
    tmp.parent.mkdir(parents=True, exist_ok=True)
    finished = False
    try:
        with requests.get(url, stream=True, timeout=TIMEOUT_S) as r:
            r.raise_for_status()
            written = 0
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=CHUNK_BYTES):
                    if chunk:
                        f.write(chunk)
                        written += len(chunk)

            # Content-Length counts bytes on the wire, so it is only
            # comparable when the body was not content-encoded (iter_content
            # yields decoded bytes).
            expected = (r.headers.get("Content-Length") or "").strip()
            encoding = (r.headers.get("Content-Encoding") or "").strip().lower()
            if expected.isdigit() and encoding in ("", "identity"):
                if written != int(expected):
                    raise OSError(
                        f"Download incompleto: {written} de {expected} bytes recebidos"
                    )
        # replace() overwrites an existing destination on every platform,
        # whereas rename() raises on Windows when the target exists.
        tmp.replace(dst_path)
        finished = True
    finally:
        if not finished:
            # Best-effort cleanup; never mask the error that brought us here.
            try:
                tmp.unlink()
            except OSError:
                pass

def read_urls(catalog_file: str):
    """Read the download URLs listed in *catalog_file*.

    Each line is stripped of surrounding whitespace; lines that start with
    ``http`` (case-insensitive) are kept, everything else is ignored. The
    file is decoded as ``utf-8-sig`` so that a leading UTF-8 byte-order mark
    does not hide the first URL; undecodable bytes are dropped.

    Args:
        catalog_file: Path of the text file with one URL per line.

    Returns:
        The URLs in file order.

    Raises:
        ValueError: if the file contains no line starting with ``http``.
    """
    with open(catalog_file, "r", encoding="utf-8-sig", errors="ignore") as f:
        stripped = (ln.strip() for ln in f)
        urls = [ln for ln in stripped if ln.lower().startswith("http")]
    if not urls:
        raise ValueError("Nenhuma URL válida no catálogo (linhas devem iniciar com http).")
    return urls

def download_images_from_catalog(catalog_file: str, output_root: str):
    """Download every URL of *catalog_file* into per-scene folders.

    For each URL the scene prefix and band are detected, the file is stored
    as ``<output_root>/<prefix>/<prefix>_<band><ext>`` (or
    ``<prefix>_<rest of the original name>`` when no band is recognised), and
    files that already exist with a non-zero size are skipped. Download
    errors are printed and the loop moves on to the next URL; a summary with
    the number of downloaded, existing, skipped and failed items is printed
    at the end.

    Args:
        catalog_file: Text file with one URL per line (see ``read_urls``).
        output_root: Folder under which the per-prefix folders are created.

    Raises:
        ValueError: propagated from ``read_urls`` when the catalog has no URL.
    """
    output_root = Path(output_root)
    output_root.mkdir(parents=True, exist_ok=True)

    urls = read_urls(catalog_file)
    total = len(urls)
    print(f"Total de URLs no catálogo: {total}")

    downloaded = existing = skipped = failed = 0

    for i, url in enumerate(urls, start=1):
        tag = f"[{i}/{total}]"
        original_name = safe_filename_from_url(url)

        # Inspect the file name first and fall back to the whole URL only if
        # the name has no match; otherwise a token in a directory or in the
        # query string could win over the one in the file name.
        prefix = detect_prefix(original_name) or detect_prefix(url)
        if not prefix:
            print(f"{tag} AVISO: sem prefixo CBERS_4A_WPM_... — pulando:\n  {url}")
            skipped += 1
            continue

        scene_dir = output_root / prefix
        scene_dir.mkdir(parents=True, exist_ok=True)

        band = detect_band(original_name) or detect_band(url)
        ext = os.path.splitext(original_name)[1] or ".tif"

        if band:
            dst_name = f"{prefix}_{band}{ext}"
        else:
            # No band recognised: keep the original name, without repeating
            # the prefix when the name already starts with it.
            clean_original = original_name
            if original_name.upper().startswith(prefix.upper()):
                clean_original = original_name[len(prefix):].lstrip("_-")
            dst_name = f"{prefix}_{clean_original}"

        dst_path = scene_dir / dst_name
        shown = dst_path.relative_to(output_root)

        if dst_path.exists() and dst_path.stat().st_size > 0:
            print(f"{tag} OK (já existe): {shown}")
            existing += 1
            continue

        # Deliberately best-effort: one failing URL must not stop the batch.
        try:
            print(f"{tag} Baixando → {shown}")
            stream_download(url, dst_path)
        except requests.exceptions.RequestException as e:
            print(f"{tag} ERRO HTTP: {e}")
            failed += 1
        except Exception as e:
            print(f"{tag} ERRO inesperado: {e}")
            failed += 1
        else:
            downloaded += 1

    print(
        f"Resumo: {downloaded} baixado(s), {existing} já existente(s), "
        f"{skipped} sem prefixo, {failed} com erro."
    )
    print("Concluído.")

# ===== Execution =====
# Module-level script steps, in order: mount Drive, pick the catalog file,
# then download everything it lists into OUTPUT_ROOT.
print("Montando Google Drive…")
ensure_drive_mounted()
print("Drive pronto.")

print("Verificando catálogo…")
# Kept as a module-level name so the chosen catalog path stays inspectable.
CATALOG_FILE = pick_catalog()

print("Iniciando download…")
download_images_from_catalog(CATALOG_FILE, OUTPUT_ROOT)


Montando Google Drive…
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive pronto.
Verificando catálogo…
Catálogo (exato) encontrado: /content/drive/MyDrive/SUZANO_MYBD/PDI_CBERS_SUZANO/1_CBERS_IMAGENS_BAIXADAS/inpe_catalog_2025_9_25_14_24_43.txt
Iniciando download…
Total de URLs no catálogo: 20
[1/20] Baixando → CBERS_4A_WPM_20250807_208_118_L2/CBERS_4A_WPM_20250807_208_118_L2_BAND0.tif
