In [0]:
%pip install beautifulsoup4
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Imports

import os
import requests
import json
import shutil
import time
from bs4 import BeautifulSoup
from datetime import datetime
from requests.adapters import    HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from pyspark.sql import SparkSession

# Funções Utilitárias
def log(msg: str, level: str = "INFO"):
    """Logger padronizado."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] [{level}] {msg}")

# Inicializar Spark
spark = SparkSession.builder.getOrCreate()

# Widgets
dbutils.widgets.text("catalog", "main", "1. Catálogo")
dbutils.widgets.text("schema", "default", "2. Schema")
# URL Api
dbutils.widgets.text("api_url", "https://opendatasus.saude.gov.br/api/3/action/package_search", "3. URL API") 
dbutils.widgets.text("target_years", "2024,2025", "4. Anos Alvo")

# Leitura
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")
api_url = dbutils.widgets.get("api_url")
years_input = dbutils.widgets.get("target_years")
target_years = [y.strip() for y in years_input.split(",") if y.strip()]

# Configuração 
volume_path = f"/Volumes/{catalog}/{schema}/raw/srag_downloads"
os.makedirs(volume_path, exist_ok=True)

# Log Inicial
log("--- INICIANDO CONFIGURAÇÃO DO PIPELINE SRAG ---")
log(f"Destino: {volume_path}")
log(f"Anos Alvo: {target_years}")

[2026-01-07 22:28:23] [INFO] --- INICIANDO CONFIGURAÇÃO DO PIPELINE SRAG ---
[2026-01-07 22:28:23] [INFO] Destino: /Volumes/srag_dev/raw/raw/srag_downloads
[2026-01-07 22:28:23] [INFO] Anos Alvo: ['2024', '2025']


In [0]:
# Suporte

def safe_request(url: str, params: dict = None) -> requests.Response:
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    try:
        response = session.get(url, params=params, timeout=60)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        log(f"Erro de conexão: {e}", level="ERROR")
        raise

def download_file_to_volume(url: str, volume_dir: str, file_name: str) -> dict:
    local_file_path = os.path.join(volume_dir, file_name)
    log(f"Baixando: {file_name}...", level="INFO")
    
    start_time = time.time()
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            with open(local_file_path, 'wb') as f:
                shutil.copyfileobj(r.raw, f, length=16*1024*1024)
        
        file_stats = os.stat(local_file_path)
        if file_stats.st_size < 1024:
            raise ValueError("Arquivo suspeito (< 1KB).")

        return {
            "file_name": file_name,
            "status": "SUCCESS",
            "file_size_mb": round(file_stats.st_size / (1024*1024), 2),
            "download_duration_sec": round(time.time() - start_time, 2)
        }
    except Exception as e:
        if os.path.exists(local_file_path): os.remove(local_file_path)
        log(f"Erro no download: {e}", level="ERROR")
        raise

def get_srag_links_from_html(years_list: list) -> dict:
    """
    Busca os links reais (S3) dos CSVs SRAG a partir das páginas de resource.
    """
    base_url = "https://dadosabertos.saude.gov.br"
    dataset_url = f"{base_url}/dataset/srag-2021-a-2024"

    log(f"Baixando HTML do dataset: {dataset_url}")
    dataset_html = safe_request(dataset_url).text
    dataset_soup = BeautifulSoup(dataset_html, "html.parser")

    # 1. Coleta links de resource
    resource_links = set()
    for a in dataset_soup.find_all("a", href=True):
        href = a["href"]
        if "/resource/" in href:
            if href.startswith("/"):
                href = base_url + href
            resource_links.add(href)

    log(f"Encontrados {len(resource_links)} resources")

    found = {}

    # 2. Visita cada resource
    for resource_url in resource_links:
        try:
            html = safe_request(resource_url).text
            soup = BeautifulSoup(html, "html.parser")

            # 3. Procura link S3
            for a in soup.find_all("a", href=True):
                href = a["href"]
                if "ckan.saude.gov.br/SRAG" in href and href.lower().endswith(".csv"):
                    for year in years_list:
                        if f"/{year}/" in href:
                            found[year] = href
                            log(f"CSV SRAG [{year}]: {href}")

        except Exception as e:
            log(f"Falha ao processar resource {resource_url}: {e}", level="WARNING")

    return found


In [0]:
import time

def run_extraction_pipeline(max_retries=3):
    log("=== INICIANDO PIPELINE DE EXTRAÇÃO SRAG ===")
    
    report_data = []
    
    # 1. Limpeza prévia do diretório de staging
    log("Verificando e limpando arquivos antigos no Staging...")
    files_cleaned = 0
    try:
        for f in os.listdir(volume_path):
            if f.startswith("srag_") and f.endswith(".csv"):
                os.remove(os.path.join(volume_path, f))
                files_cleaned += 1
        log(f"Limpeza concluída. {files_cleaned} arquivos removidos.")
    except Exception as e:
        log(f"Aviso: Erro ao limpar diretório: {str(e)}", level="WARNING")
    
    # 2. Obtenção dos Links
    links_map = get_srag_links_from_html(target_years)

    # 3. Validação de Regra de Negócio
    if not links_map:
        raise ValueError("Falha crítica: Não foi possível obter os links da API após várias tentativas.")
    
    if len(links_map) < len(target_years):
        missing = set(target_years) - set(links_map.keys())
        log(f"ATENÇÃO: Não foram encontrados links para os anos: {missing}", level="WARNING")

    # 4. Download e Ingestão
    for year, url in links_map.items():
        file_name = f"srag_{year}.csv"
        
        try:
            log(f"Iniciando processamento do ano: {year}")
            metadata = download_file_to_volume(url, volume_path, file_name)
            metadata['reference_year'] = year
            report_data.append(metadata)
            log(f"Sucesso: {file_name} ({metadata['file_size_mb']} MB)")
            
        except Exception as e:
            log(f"Falha crítica no ano {year}: {str(e)}. Pipeline interrompido.", level="CRITICAL")
            raise e 

    # 5. Relatório Final
    log("=== EXTRAÇÃO FINALIZADA COM SUCESSO ===")
    print("\n--- Relatório de Governança (JSON) ---")
    print(json.dumps(report_data, indent=4))
    
    return report_data

# Executa o Pipeline
try:
    execution_report = run_extraction_pipeline()
except Exception as e:
    print(f"\n❌ O Pipeline falhou após as tentativas: {e}")

[2026-01-07 22:28:23] [INFO] === INICIANDO PIPELINE DE EXTRAÇÃO SRAG ===
[2026-01-07 22:28:23] [INFO] Verificando e limpando arquivos antigos no Staging...
[2026-01-07 22:28:24] [INFO] Limpeza concluída. 0 arquivos removidos.
[2026-01-07 22:28:24] [INFO] Baixando HTML do dataset: https://dadosabertos.saude.gov.br/dataset/srag-2021-a-2024
[2026-01-07 22:28:24] [INFO] Encontrados 31 resources
[2026-01-07 22:28:33] [INFO] CSV SRAG [2025]: https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2025/INFLUD25-22-12-2025.csv
[2026-01-07 22:28:33] [INFO] CSV SRAG [2025]: https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2025/INFLUD25-22-12-2025.csv
[2026-01-07 22:28:36] [INFO] CSV SRAG [2024]: https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2024/INFLUD24-26-06-2025.csv
[2026-01-07 22:28:36] [INFO] CSV SRAG [2024]: https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2024/INFLUD24-26-06-2025.csv
[2026-01-07 22:28:43] [INFO] Iniciando processamento do ano: 2025
[2026-01