In [1]:
import os, sys, pathlib

# Java runtime used by Spark.
# NOTE(review): the original note recommends JDK 17, but the path below points
# to jdk-20 — confirm which JDK is actually intended.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-20"
os.environ["PATH"] = os.environ["JAVA_HOME"] + r"\bin;" + os.environ["PATH"]

# Hadoop winutils (required on Windows)
os.environ["HADOOP_HOME"] = r"C:\hadoop"
os.environ["PATH"] = os.environ["HADOOP_HOME"] + r"\bin;" + os.environ["PATH"]

# Use the kernel's/notebook's own Python for both the driver and the executors
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
os.environ["PYSPARK_PYTHON"] = sys.executable

# Stable temporary directory (avoids antivirus locking)
pathlib.Path(r"C:\spark-tmp").mkdir(parents=True, exist_ok=True)
os.environ["SPARK_LOCAL_DIRS"] = r"C:\spark-tmp"
os.environ["TMP"] = r"C:\spark-tmp"
os.environ["TEMP"] = r"C:\spark-tmp"


In [2]:
import sys, os
from pyspark.sql import SparkSession

def start_spark_in_notebook(app_name="cnpj-notebook"):
    """Create (or reuse) the notebook's SparkSession, with Delta Lake when possible.

    Args:
        app_name: Spark application name passed to the session builder.

    Returns:
        The SparkSession returned by ``getOrCreate()``. Delta Lake is enabled
        when the ``delta`` package is importable and the Delta-enabled session
        starts; otherwise a plain session is returned.

    A missing ``delta`` package falls back quietly. Any other failure while
    starting the Delta session still falls back (best effort, as before), but
    the reason is now printed instead of being swallowed.
    """
    base = (
        SparkSession.builder
        .appName(app_name)
        .config("spark.sql.session.timeZone","UTC")
        .config("spark.sql.legacy.timeParserPolicy","LEGACY")
        .config("spark.sql.sources.partitionOverwriteMode","dynamic")
        .config("spark.sql.shuffle.partitions","4")
        # make sure Spark uses this kernel's Python
        .config("spark.pyspark.python", sys.executable)
        .config("spark.pyspark.driver.python", sys.executable)
        # stability / diagnostics on Windows
        .config("spark.python.worker.reuse","false")
        .config("spark.python.worker.faulthandler.enabled","true")
        .config("spark.sql.execution.pyspark.udf.faulthandler.enabled","true")
        .config("spark.local.dir", os.environ.get("SPARK_LOCAL_DIRS", r"C:\spark-tmp"))
        .config("spark.network.timeout","300s")
        .config("spark.executor.heartbeatInterval","60s")
    )

    try:
        # Delta Lake is optional: only enabled when delta-spark is installed.
        from delta import configure_spark_with_delta_pip
    except ImportError:
        spark = base.getOrCreate()
        print("Spark sem Delta (Parquet) ✅")
        return spark

    try:
        spark = configure_spark_with_delta_pip(
            base.config("spark.databricks.delta.schema.autoMerge.enabled","true")
        ).getOrCreate()
    except Exception as exc:
        # Keep the original best-effort fallback, but show why Delta failed so
        # a broken installation is not mistaken for "Delta not installed".
        print(f"[WARN] Falha ao iniciar Spark com Delta ({exc!r}); tentando sem Delta.")
        spark = base.getOrCreate()
        print("Spark sem Delta (Parquet) ✅")
        return spark

    print("Spark com Delta Lake ✅")
    return spark

# Create the session once for the whole notebook; the bare name displays it.
spark = start_spark_in_notebook()
spark


Spark com Delta Lake ✅


In [5]:
import os, sys, re, io, zipfile, hashlib, shutil
from pathlib import Path
import requests
from datetime import datetime
from typing import List, Tuple, Optional, Dict

# Root of the directory listing that the month folders are scraped from.
BASE_URL = "https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/"
RAW_DIR  = Path("./data/_raw").resolve()
OUT_DIR  = Path("./data/dev_out").resolve()         # "dev" folder, kept apart from production output
LOG_DIR  = Path("./data/_metadata").resolve()
LOG_PATH = LOG_DIR / "ingestion_log_spark.parquet"  # ingestion log used only by the Spark flow

# Make sure every working directory exists before anything is written.
RAW_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)
LOG_DIR.mkdir(parents=True, exist_ok=True)

def month_dirs() -> List[str]:
    """List the ``YYYY-MM/`` sub-directories linked from ``BASE_URL``.

    Returns:
        Sorted, de-duplicated folder names, each with its trailing slash
        (e.g. ``'2025-09/'``).

    Raises:
        requests.HTTPError: if the listing request returns an error status.
            Without this check an error page would be parsed as "no months".
    """
    resp = requests.get(BASE_URL, timeout=60)
    resp.raise_for_status()
    # capture YYYY-MM/ (with the trailing slash)
    return sorted(set(re.findall(r'href="(\d{4}-\d{2}/)"', resp.text)))

def list_zip_urls(month:str) -> List[Tuple[str,str]]:
    """List ``(name, url)`` for every ``.zip`` linked from a month folder.

    Args:
        month: Month folder name, e.g. ``'2025-09/'``. A missing trailing slash
            is tolerated.

    Returns:
        One ``(href, absolute_url)`` tuple per distinct ``.zip`` link, in page
        order.

    Raises:
        requests.HTTPError: if the listing request returns an error status.
    """
    # Normalise to exactly one trailing slash so `url + name` is always valid.
    url = BASE_URL + month.rstrip("/") + "/"
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    zips = re.findall(r'href="([^"]+\.zip)"', resp.text, flags=re.IGNORECASE)
    # dict.fromkeys de-duplicates while keeping page order, so a file linked
    # twice is not downloaded and processed twice.
    return [(z, url + z) for z in dict.fromkeys(zips)]

def download(url:str, dest:Path) -> Dict[str,str]:
    """Download ``url`` to ``dest`` (HEAD + streamed GET) and return simple metadata.

    The body is streamed to a sibling ``<dest>.part`` file and moved over
    ``dest`` only after the transfer finishes, so an interrupted download never
    leaves a truncated file at ``dest``.

    Args:
        url: File URL.
        dest: Target path on local disk.

    Returns:
        Dict with ``etag``, ``last_modified`` and ``content_length`` (taken from
        the HEAD response, ``""`` when the header is absent) and
        ``content_md5`` (hex MD5 of the downloaded bytes).

    Raises:
        requests.HTTPError: if the HEAD or GET request returns an error status.
    """
    dest = Path(dest)
    part = dest.with_name(dest.name + ".part")
    m = hashlib.md5()
    with requests.Session() as s:
        # Follow redirects so the headers describe the file actually served.
        h = s.head(url, timeout=60, allow_redirects=True)
        h.raise_for_status()
        etag = h.headers.get("ETag","")
        lm   = h.headers.get("Last-Modified","")
        cl   = h.headers.get("Content-Length","")
        try:
            # Context manager releases the streamed connection deterministically.
            with s.get(url, timeout=600, stream=True) as r:
                r.raise_for_status()
                with open(part, "wb") as f:
                    for chunk in r.iter_content(1024*1024):
                        if chunk:
                            f.write(chunk)
                            m.update(chunk)
            part.replace(dest)
        except BaseException:
            # Remove the partial file on any failure (including Ctrl+C), then re-raise.
            part.unlink(missing_ok=True)
            raise
    return {
        "etag": etag, "last_modified": lm, "content_length": cl,
        "content_md5": m.hexdigest()
    }

def sniff_dataset_from_zipname(zip_name:str) -> str:
    """Guess a friendly dataset name from a zip file name (substring heuristic).

    The name is lower-cased and checked against a fixed, ordered list of
    fragments; the first fragment found decides the result. Returns
    ``"desconhecido"`` when no fragment matches.
    """
    lowered = zip_name.lower()
    # Order matters: earlier fragments win when more than one is present.
    rules = (
        ("empresa",  "empresas"),
        ("estabele", "estabelecimentos"),
        ("simples",  "dados_do_simples"),
        ("socio",    "socios"),
        ("pais",     "paises"),
        ("municip",  "municipios"),
        ("qualific", "qualificacoes_socios"),
        ("natureza", "naturezas_juridicas"),
        ("cnae",     "cnaes"),
    )
    return next(
        (dataset for fragment, dataset in rules if fragment in lowered),
        "desconhecido",
    )


In [6]:
from difflib import get_close_matches

# Column layout per dataset, in file order: a list of (column_name, type_name)
# pairs. The CSVs are read without a header, so the order here is what assigns
# names to columns.
SCHEMAS = {
    "empresas": {
        "columns": [
            ("cnpj_basico","string"),
            ("razao_social","string"),
            ("natureza_juridica","string"),
            ("qualificacao_responsavel","string"),
            # NOTE(review): read as a double. If the source writes this value
            # with a decimal comma, the CSV reader will not parse it — confirm
            # against a sample of the data.
            ("capital_social","double"),
            ("porte","string"),
            ("ente_federativo_responsavel","string"),
        ]
    },
    "estabelecimentos": {
        "columns": [
            ("cnpj_basico","string"),("cnpj_ordem","string"),("cnpj_dv","string"),
            ("identificador_matriz_filial","byte"),("nome_fantasia","string"),
            ("situacao_cadastral","string"),("data_situacao_cadastral","string"),
            ("motivo_situacao_cadastral","string"),("nome_cidade_exterior","string"),
            ("pais","string"),("data_inicio_atividade","string"),
            ("cnae_fiscal_principal","string"),("cnae_fiscal_secundaria","string"),
            ("tipo_logradouro","string"),("logradouro","string"),("numero","string"),
            ("complemento","string"),("bairro","string"),("cep","string"),("uf","string"),
            ("municipio","string"),("ddd_telefone_1","string"),("telefone_1","string"),
            ("ddd_telefone_2","string"),("telefone_2","string"),("ddd_fax","string"),
            ("fax","string"),("email","string"),("situacao_especial","string"),
            ("data_situacao_especial","string"),
        ]
    },
    "dados_do_simples": {
        "columns": [
            ("cnpj_basico","string"),("opcao_pelo_simples","string"),
            ("data_opcao_pelo_simples","string"),("data_exclusao_do_simples","string"),
            ("opcao_pelo_mei","string"),("data_opcao_pelo_mei","string"),
            ("data_exclusao_do_mei","string"),
        ]
    },
    "socios": {
        "columns": [
            ("cnpj_basico","string"),("identificador_socio","string"),
            ("nome_socio","string"),("cnpj_cpf_socio","string"),
            ("qualificacao_socio","string"),("data_entrada_sociedade","string"),
            ("pais","string"),("representante_legal","string"),
            ("nome_representante_legal","string"),
            ("qualificacao_representante_legal","string"),
            ("faixa_etaria","string"),
        ]
    },
    # Lookup tables: all share the same two-column (code, description) layout.
    "paises": {"columns":[("codigo","string"),("descricao","string")]},
    "municipios": {"columns":[("codigo","string"),("descricao","string")]},
    "qualificacoes_socios": {"columns":[("codigo","string"),("descricao","string")]},
    "naturezas_juridicas": {"columns":[("codigo","string"),("descricao","string")]},
    "cnaes": {"columns":[("codigo","string"),("descricao","string")]},
}

def pick_schema_for_zip(zip_name:str) -> Tuple[str, List[Tuple[str,str]]]:
    """Resolve the dataset name and column list to use for a zip file.

    Returns:
        ``(dataset, columns)`` where ``columns`` is the ``SCHEMAS`` entry for
        that dataset, or ``("desconhecido", [])`` when nothing matches.
    """
    label = sniff_dataset_from_zipname(zip_name)

    # 1) direct hit on the label sniffed from the zip name
    entry = SCHEMAS.get(label)
    if entry is not None:
        return label, entry["columns"]

    # 2) approximate match; note that it compares the sniffed label (not the
    #    raw zip name) against the known dataset names
    candidates = get_close_matches(label, list(SCHEMAS.keys()), n=1, cutoff=0.6)
    if not candidates:
        # fallback
        return "desconhecido", []

    nearest = candidates[0]
    return nearest, SCHEMAS[nearest]["columns"]


In [9]:
# 1) Discover the available months and show the most recent one
months = month_dirs()
months[-1]

'2025-09/'

In [11]:
# 2) List the .zip files of the target month
target_month = months[-1]      # e.g. '2025-09/'
zips = list_zip_urls(target_month)
zips[0]

('Cnaes.zip',
 'https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/2025-09/Cnaes.zip')

In [12]:
# 3) Pick a "light" ZIP for the test run, e.g. Cnaes.zip.
#    `chosen` stays None when no zip name contains "cnae".
chosen = None
for name, url in zips:
    if "cnae" in name.lower():
        chosen = (name, url)
        break
chosen

('Cnaes.zip',
 'https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/2025-09/Cnaes.zip')

In [13]:
# 4) Download the ZIP and inspect the returned metadata.
#    The local file name is prefixed with the month (without the trailing slash).
zip_name, zip_url = chosen
local_zip = RAW_DIR / f"{target_month.strip('/')}_{zip_name}"
meta = download(zip_url, local_zip)
local_zip, meta

(WindowsPath('C:/Users/edmar/MeusProjetos/white-cube/data-ingestion-mvp/data/_raw/2025-09_Cnaes.zip'),
 {'etag': '"563e-63ec7eb241b5c"',
  'last_modified': 'Sun, 14 Sep 2025 19:30:24 GMT',
  'content_length': '22078',
  'content_md5': '48c53ff64378f2546af1546b4024ac3e'})

In [14]:
# 5) Look at the files inside the ZIP
with zipfile.ZipFile(local_zip, "r") as z:
    members = z.namelist()
members

['F.K03200$Z.D50913.CNAECSV']

In [15]:
# Extract every file of the ZIP into a fresh, flat scratch directory, making
# sure each extracted file ends in ".csv".
TMP_EXTRACT = RAW_DIR / "_tmp_extract"
if TMP_EXTRACT.exists():
    shutil.rmtree(TMP_EXTRACT)
TMP_EXTRACT.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(local_zip, "r") as z:
    for m in z.infolist():
        if m.is_dir():
            continue
        # "odd" member names (no .csv extension) -> force .csv
        out_name = Path(m.filename).name
        if not out_name.lower().endswith(".csv"):
            out_name = out_name + ".csv"
        # Stream the member straight to its final name. ZipFile.extract() may
        # sanitise the member path, so recomputing the extracted location from
        # m.filename (as before) could point at a file that does not exist.
        with z.open(m) as member_stream, open(TMP_EXTRACT / out_name, "wb") as out_file:
            shutil.copyfileobj(member_stream, out_file)

sorted([p.name for p in TMP_EXTRACT.iterdir() if p.is_file()])


['F.K03200$Z.D50913.CNAECSV.csv']

In [None]:
# Take the first CSV (the CNAE dataset is expected to contain only one).
# Raises IndexError if the extraction directory holds no .csv file.
csv_path = sorted(TMP_EXTRACT.glob("*.csv"))[0]
csv_path

WindowsPath('C:/Users/edmar/MeusProjetos/white-cube/data-ingestion-mvp/data/_raw/_tmp_extract/F.K03200$Z.D50913.CNAECSV.csv')

In [19]:
# Define schema Spark a partir do dicionário (sem cabeçalho no arquivo)
from pyspark.sql.types import StructType, StructField, StringType, ByteType, DoubleType

# Resolve the dataset and its column list from the zip name; stop here when the
# dataset cannot be identified.
ds_name, schema_cols = pick_schema_for_zip(zip_name)
assert ds_name != "desconhecido", f"Schema não identificado para {zip_name}"

def to_spark_type(t:str):
    """Map a type name from ``SCHEMAS`` to a Spark data type instance.

    Args:
        t: Type name (case-insensitive). ``None`` or an empty string is
            accepted and treated like an unknown name, matching the helper of
            the same name defined later in this notebook.

    Returns:
        ``ByteType()`` for ``byte``/``int8``, ``DoubleType()`` for
        ``double``/``float64``/``float``/``decimal``, and ``StringType()`` for
        ``string``/``str`` and any unrecognised name.
    """
    t = (t or "").lower()
    if t in ("string","str"): return StringType()
    if t in ("byte","int8"): return ByteType()
    if t in ("double","float64","float","decimal"): return DoubleType()
    # fallback
    return StringType()

# Build the Spark schema from the (name, type) pairs; every column is nullable.
schema = StructType([StructField(col, to_spark_type(tp), True) for col,tp in schema_cols])

# Read the CSV without a header, ';' as separator (the usual layout of these
# files) and latin-1 encoding
df = (spark.read
      .format("csv")
      .option("header","false")
      .option("sep",";")
      .option("encoding","ISO-8859-1")
      .schema(schema)
      .load(str(csv_path)))

df.show(5, truncate=False)
df.printSchema()


+-------+---------------------------------------------------------+
|codigo |descricao                                                |
+-------+---------------------------------------------------------+
|0111301|Cultivo de arroz                                         |
|0111302|Cultivo de milho                                         |
|0111303|Cultivo de trigo                                         |
|0111399|Cultivo de outros cereais não especificados anteriormente|
|0112101|Cultivo de algodão herbáceo                              |
+-------+---------------------------------------------------------+
only showing top 5 rows
root
 |-- codigo: string (nullable = true)
 |-- descricao: string (nullable = true)



In [21]:
from pyspark.sql import functions as F

ref_month = target_month.strip("/")   # e.g. '2025-09'
df2 = df.withColumn("ref_month", F.lit(ref_month))

# Number of output files when writing (for this test: a single file):
df2 = df2.coalesce(1)

# Pick the output format to match your session:
USE_DELTA = False


# One output directory per dataset, partitioned by reference month.
dest = OUT_DIR / ds_name
if USE_DELTA:
    (df2.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("ref_month")
        .save(str(dest)))
else:
    (df2.write
        .mode("overwrite")
        .partitionBy("ref_month")
        .parquet(str(dest)))

print("gravado em:", dest, "| delta?" , USE_DELTA)


gravado em: C:\Users\edmar\MeusProjetos\white-cube\data-ingestion-mvp\data\dev_out\cnaes | delta? False


In [22]:
# Read back what was just written, using the same format flag, and check that
# the expected month partition is present.
if USE_DELTA:
    outdf = spark.read.format("delta").load(str(dest))
else:
    outdf = spark.read.parquet(str(dest))

outdf.where(F.col("ref_month")==ref_month).show(5, truncate=False)
outdf.select("ref_month").distinct().show()


+-------+---------------------------------------------------------+---------+
|codigo |descricao                                                |ref_month|
+-------+---------------------------------------------------------+---------+
|0111301|Cultivo de arroz                                         |2025-09  |
|0111302|Cultivo de milho                                         |2025-09  |
|0111303|Cultivo de trigo                                         |2025-09  |
|0111399|Cultivo de outros cereais não especificados anteriormente|2025-09  |
|0112101|Cultivo de algodão herbáceo                              |2025-09  |
+-------+---------------------------------------------------------+---------+
only showing top 5 rows
+---------+
|ref_month|
+---------+
|  2025-09|
+---------+



In [27]:
from pyspark.sql.types import StructType, StructField, TimestampType

# Schema of the ingestion log: one row per processed zip. Every field is a
# string except processed_at, which is a timestamp.
log_schema = (StructType()
    .add("file_url","string")
    .add("zip_name","string")
    .add("dataset","string")
    .add("ref_month","string")
    .add("etag","string")
    .add("last_modified","string")
    .add("content_length","string")
    .add("content_md5","string")
    .add("processed_at", TimestampType()))

def read_log() -> 'DataFrame':
    """Load the ingestion log from ``LOG_PATH``.

    Returns an empty DataFrame with ``log_schema`` when the path does not
    exist yet.
    """
    if not LOG_PATH.exists():
        return spark.createDataFrame([], log_schema)
    return spark.read.parquet(str(LOG_PATH))

def write_log_append(rows_df):
    """Append ``rows_df`` to the parquet log at ``LOG_PATH``."""
    # Collapse to one partition so each append adds a single file.
    single_partition = rows_df.coalesce(1)
    appender = single_partition.write.mode("append")
    appender.parquet(str(LOG_PATH))

# Load the current log and show the five most recent entries.
logdf = read_log()
logdf.orderBy("processed_at", ascending=False).show(5, truncate=False)


+--------------------------------------------------------------------------------------+---------+-------+---------+--------------------+-----------------------------+--------------+--------------------------------+--------------------------+
|file_url                                                                              |zip_name |dataset|ref_month|etag                |last_modified                |content_length|content_md5                     |processed_at              |
+--------------------------------------------------------------------------------------+---------+-------+---------+--------------------+-----------------------------+--------------+--------------------------------+--------------------------+
|https://arquivos.receitafederal.gov.br/dados/cnpj/dados_abertos_cnpj/2025-09/Cnaes.zip|Cnaes.zip|cnaes  |2025-09  |"563e-63ec7eb241b5c"|Sun, 14 Sep 2025 19:30:24 GMT|22078         |48c53ff64378f2546af1546b4024ac3e|2025-09-18 20:06:23.267002|
+---------------------------

In [24]:
# "Already processed" check (by content hash, or by ETag + length + last-modified, within the same month)
def already_processed(logdf, file_url, ref_month, content_md5, etag, last_modified, content_length) -> bool:
    """Tell whether the log already holds this exact file for this month.

    A log row matches when it has the same ``file_url`` and ``ref_month`` and
    either the same ``content_md5`` or the same
    (``etag``, ``last_modified``, ``content_length``) triple.

    The header triple is only consulted when all three values are non-empty.
    Missing headers are recorded as ``""``, and comparing empty strings would
    match any earlier row that also lacked them, even if the content changed.

    Returns:
        True when at least one matching log row exists.
    """
    same_file = logdf.content_md5 == content_md5
    if etag and last_modified and content_length:
        same_file = same_file | (
            (logdf.etag == etag) &
            (logdf.last_modified == last_modified) &
            (logdf.content_length == content_length)
        )
    cond = (logdf.file_url == file_url) & (logdf.ref_month == ref_month) & same_file
    # limit(1) lets Spark stop at the first matching row.
    return logdf.filter(cond).limit(1).count() > 0

# Check the zip downloaded above against the log loaded earlier.
ap = already_processed(logdf, zip_url, ref_month, meta["content_md5"], meta["etag"], meta["last_modified"], meta["content_length"])
ap


False

In [25]:
# If not processed yet, append a row to the log stamped with the current UTC time.
from datetime import timezone

if not ap:
    # Use a timezone-aware value: datetime.utcnow() is naive (and deprecated),
    # and PySpark interprets naive datetimes in the driver's local timezone,
    # which shifts the stored instant on any machine that is not running in UTC.
    now = datetime.now(timezone.utc)
    rows = [(zip_url, zip_name, ds_name, ref_month,
             meta["etag"], meta["last_modified"], meta["content_length"], meta["content_md5"], now)]
    to_append = spark.createDataFrame(rows, schema=log_schema)
    write_log_append(to_append)
    print("Log atualizado")
else:
    print("Já processado anteriormente; log mantido")


Log atualizado


In [26]:
# Reload the log and count entries per dataset and month.
logdf = read_log()
logdf.groupBy("dataset","ref_month").count().orderBy("dataset","ref_month").show(50, truncate=False)

+-------+---------+-----+
|dataset|ref_month|count|
+-------+---------+-----+
|cnaes  |2025-09  |1    |
+-------+---------+-----+



In [28]:
# === PIPELINE DO MÊS MAIS RECENTE (varre todos .zip) ===
import os, io, re, zipfile, shutil, hashlib
from pathlib import Path
from datetime import datetime
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, ByteType, DoubleType, TimestampType

# -------- helpers locais da célula (auto-contidos) ----------
def is_delta_enabled(spark) -> bool:
    """Report whether the session configuration mentions Delta in both places.

    Both ``spark.sql.extensions`` and ``spark.sql.catalog.spark_catalog`` must
    contain ``"delta"`` (case-insensitive). Any error while reading the
    configuration counts as "not enabled".
    """
    required_keys = ("spark.sql.extensions", "spark.sql.catalog.spark_catalog")
    try:
        return all(
            "delta" in (spark.conf.get(key) or "").lower()
            for key in required_keys
        )
    except Exception:
        return False

def to_spark_type(t:str):
    """Translate a schema type name into a Spark data type instance.

    The name is compared case-insensitively; ``None`` is treated as an empty
    name. Unknown names fall back to ``StringType()``.
    """
    key = (t or "").lower()
    if key in ("byte","int8"):
        return ByteType()
    if key in ("double","float64","float","decimal"):
        return DoubleType()
    # "string", "str" and every unrecognised name give the same result.
    return StringType()

# Schema of the ingestion log: one row per processed zip. Every field is a
# string except processed_at, which is a timestamp.
LOG_SCHEMA = (StructType()
    .add("file_url","string")
    .add("zip_name","string")
    .add("dataset","string")
    .add("ref_month","string")
    .add("etag","string")
    .add("last_modified","string")
    .add("content_length","string")
    .add("content_md5","string")
    .add("processed_at", TimestampType()))

def read_log_df():
    """Load the ingestion log from ``LOG_PATH``.

    Returns an empty DataFrame with ``LOG_SCHEMA`` when the path does not
    exist yet.
    """
    if not LOG_PATH.exists():
        return spark.createDataFrame([], LOG_SCHEMA)
    return spark.read.parquet(str(LOG_PATH))

def write_log_append(df):
    """Append ``df`` to the parquet log at ``LOG_PATH``."""
    # The log is small: collapse to one partition so each append adds one file.
    single_partition = df.coalesce(1)
    appender = single_partition.write.mode("append")
    appender.parquet(str(LOG_PATH))

def already_processed(logdf, file_url, ref_month, content_md5, etag, last_modified, content_length) -> bool:
    """Tell whether the log already holds this exact file for this month.

    A log row matches when it has the same ``file_url`` and ``ref_month`` and
    either the same ``content_md5`` or the same
    (``etag``, ``last_modified``, ``content_length``) triple.

    The header triple is only consulted when all three values are non-empty.
    Missing headers are recorded as ``""``, and comparing empty strings would
    match any earlier row that also lacked them, even if the content changed.

    Returns:
        True when at least one matching log row exists.
    """
    same_file = logdf.content_md5 == content_md5
    if etag and last_modified and content_length:
        same_file = same_file | (
            (logdf.etag == etag) &
            (logdf.last_modified == last_modified) &
            (logdf.content_length == content_length)
        )
    cond = (logdf.file_url == file_url) & (logdf.ref_month == ref_month) & same_file
    # limit(1) lets Spark stop at the first matching row.
    return logdf.filter(cond).limit(1).count() > 0

def extract_zip_to_csvs(local_zip: Path, extract_dir: Path) -> list[Path]:
    """Extract every file of a zip into a flat directory, forcing a ``.csv`` suffix.

    ``extract_dir`` is deleted and recreated first. Each member is written as
    ``extract_dir/<basename>``, with ``.csv`` appended when the basename does
    not already end in it (case-insensitive).

    Members are streamed directly to their final name instead of being
    extracted and then renamed. ``ZipFile.extract`` may sanitise the member
    path, so the location it wrote to could differ from one recomputed from
    ``member.filename``. If two members flatten to the same name, later ones
    get a numeric suffix (``name_1.csv``, ``name_2.csv``, ...) instead of
    overwriting the earlier file.

    Args:
        local_zip: Path of the zip archive.
        extract_dir: Directory to (re)create and fill.

    Returns:
        Sorted list of the files written.
    """
    extract_dir = Path(extract_dir)
    if extract_dir.exists():
        shutil.rmtree(extract_dir)
    extract_dir.mkdir(parents=True, exist_ok=True)

    csvs = []
    with zipfile.ZipFile(local_zip, "r") as z:
        for m in z.infolist():
            if m.is_dir():
                continue
            out_name = Path(m.filename).name
            if not out_name.lower().endswith(".csv"):
                out_name = out_name + ".csv"

            # Disambiguate members that collapse to the same flat name.
            dst = extract_dir / out_name
            stem, n = Path(out_name).stem, 1
            while dst in csvs:
                dst = extract_dir / f"{stem}_{n}.csv"
                n += 1

            with z.open(m) as member_stream, open(dst, "wb") as out_file:
                shutil.copyfileobj(member_stream, out_file)
            csvs.append(dst)
    return sorted(csvs)

# -------- pipeline starts here ----------
months = month_dirs()
if not months:
    raise SystemExit("Nenhum mês encontrado na raiz do site.")

target_month = months[-1]              # e.g. '2025-09/'
ref_month = target_month.strip("/")    # '2025-09'
print(f"➡️  Mês alvo: {ref_month}")

zips = list_zip_urls(target_month)
print(f"🧾 {len(zips)} arquivos .zip encontrados no mês {ref_month}.")

# Output format is hard-coded here; is_delta_enabled(), defined earlier in this
# cell, is not consulted.
USE_DELTA = False
print(f"💾 Formato de saída: {'Delta Lake' if USE_DELTA else 'Parquet'}")

# The log is read once, before the loop; counters and the error list feed the
# summary printed at the end.
logdf = read_log_df()
processed_count = 0
skipped_count = 0
errors = []

# Main loop: download each zip, skip the ones already logged, load the CSV(s)
# and write them to the dataset's output directory.
from datetime import timezone
from functools import reduce
from pyspark.sql import DataFrame as SDF

for zip_name, zip_url in zips:
    ds_name, schema_cols = pick_schema_for_zip(zip_name)
    print(f"\n— Baixando: {zip_name}  → dataset='{ds_name}'")

    local_zip = RAW_DIR / f"{ref_month}_{zip_name}"
    try:
        meta = download(zip_url, local_zip)
    except Exception as e:
        print(f"  [ERRO] Falha no download: {e}")
        errors.append((zip_name, "download", str(e)))
        continue

    # "already processed" check; a failure to query the log is not fatal
    try:
        if already_processed(logdf, zip_url, ref_month, meta["content_md5"], meta["etag"], meta["last_modified"], meta["content_length"]):
            print("  ✓ já processado anteriormente (mesmos metadados) — pulando.")
            skipped_count += 1
            continue
    except Exception as e:
        print(f"  [WARN] Não consegui consultar log (seguindo em frente): {e}")

    # schema
    if not schema_cols:
        print("  [WARN] Schema desconhecido — pulando este zip.")
        skipped_count += 1
        continue

    # extract and read the CSV(s)
    tmp_dir = RAW_DIR / f"_extract_{ref_month}_{Path(zip_name).stem}"
    try:
        csv_paths = extract_zip_to_csvs(local_zip, tmp_dir)
        if not csv_paths:
            print("  [WARN] ZIP sem CSVs legíveis — pulando.")
            skipped_count += 1
            continue

        # build the Spark schema
        schema = StructType([StructField(col, to_spark_type(tp), True) for col, tp in schema_cols])

        # concatenate every CSV of the zip (when there is more than one)
        df_parts = []
        for csv_path in csv_paths:
            df_part = (spark.read
                       .format("csv")
                       .option("header","false")
                       .option("sep",";")
                       .option("encoding","ISO-8859-1")   # these files are latin-1
                       .schema(schema)
                       .load(str(csv_path)))
            df_parts.append(df_part)
        df = reduce(SDF.unionByName, df_parts) if len(df_parts) > 1 else df_parts[0]

        # Several zips feed the same dataset directory (Empresas0.zip,
        # Empresas1.zip, ...). Partitioning by ref_month alone made every zip
        # overwrite the same partition, so only the last zip's rows survived.
        # Adding the source zip as a second partition column, with dynamic
        # partition overwrite, makes each zip replace only its own slice and
        # keeps re-runs idempotent.
        # NOTE(review): output written by earlier runs used the single-level
        # ref_month layout; clear those dataset directories before re-running,
        # since mixing both layouts under one root may not be readable — confirm.
        df2 = (df.withColumn("ref_month", F.lit(ref_month))
                 .withColumn("source_zip", F.lit(zip_name))
                 .coalesce(1))
        dest = OUT_DIR / ds_name

        writer = (df2.write
                  .mode("overwrite")
                  # set explicitly so the write does not depend on session config
                  .option("partitionOverwriteMode", "dynamic")
                  .partitionBy("ref_month", "source_zip"))
        if USE_DELTA:
            # NOTE(review): assumes a Delta Lake version that honours dynamic
            # partition overwrite — confirm before enabling USE_DELTA.
            writer.format("delta").save(str(dest))
        else:
            writer.parquet(str(dest))

        # Count before logging, so a failure here never leaves a log row behind.
        row_count = df2.count()

        # update the log; timezone-aware because PySpark reads naive datetimes
        # as driver-local time
        now = datetime.now(timezone.utc)
        rows = [(zip_url, zip_name, ds_name, ref_month,
                 meta["etag"], meta["last_modified"], meta["content_length"], meta["content_md5"], now)]
        to_append = spark.createDataFrame(rows, schema=LOG_SCHEMA)
        write_log_append(to_append)

        print(f"  ✅ ok: {zip_name} → {ds_name} (rows={row_count})")
        processed_count += 1

    except Exception as e:
        print(f"  [ERRO] Processando {zip_name}: {e}")
        errors.append((zip_name, "process", str(e)))
    finally:
        # The extracted CSVs are only needed until the write and count finish;
        # remove them so extraction directories do not pile up on disk.
        shutil.rmtree(tmp_dir, ignore_errors=True)

# Run summary: counters, then at most the first ten errors.
print("\n===== RESUMO =====")
print(f"✔️ processados: {processed_count}")
print(f"⏭️ pulados:     {skipped_count}")
print(f"❌ erros:        {len(errors)}")
if errors:
    for name, where, msg in errors[:10]:
        print(f"  - {name} @ {where}: {msg}")

# quick look at the final log, most recent entries first
final_log = read_log_df()
display(final_log.orderBy(F.col("processed_at").desc()))

➡️  Mês alvo: 2025-09
🧾 37 arquivos .zip encontrados no mês 2025-09.
💾 Formato de saída: Parquet

— Baixando: Cnaes.zip  → dataset='cnaes'
  ✓ já processado anteriormente (mesmos metadados) — pulando.

— Baixando: Empresas0.zip  → dataset='empresas'
  ✅ ok: Empresas0.zip → empresas (rows=24038054)

— Baixando: Empresas1.zip  → dataset='empresas'
  ✅ ok: Empresas1.zip → empresas (rows=4494860)

— Baixando: Empresas2.zip  → dataset='empresas'
  ✅ ok: Empresas2.zip → empresas (rows=4494860)

— Baixando: Empresas3.zip  → dataset='empresas'
  ✅ ok: Empresas3.zip → empresas (rows=4494860)

— Baixando: Empresas4.zip  → dataset='empresas'
  ✅ ok: Empresas4.zip → empresas (rows=4494860)

— Baixando: Empresas5.zip  → dataset='empresas'
  ✅ ok: Empresas5.zip → empresas (rows=4494860)

— Baixando: Empresas6.zip  → dataset='empresas'
  ✅ ok: Empresas6.zip → empresas (rows=4494860)

— Baixando: Empresas7.zip  → dataset='empresas'
  ✅ ok: Empresas7.zip → empresas (rows=4494860)

— Baixando: Empresas8

DataFrame[file_url: string, zip_name: string, dataset: string, ref_month: string, etag: string, last_modified: string, content_length: string, content_md5: string, processed_at: timestamp]