In [1]:
# 01 - Ingestão e Validação (dados processados 2019–2024)
# Fonte: projeto_aplicado_grupo_12/data/processed/

import os
import sys
import pandas as pd
from pathlib import Path

# Make local project modules importable when the notebook is executed from notebooks/.
# BASE_DIR is the parent of the current working directory; SRC_DIR is its "src" child.
BASE_DIR = Path.cwd().parent
SRC_DIR = BASE_DIR / "src"
for _import_root in (str(BASE_DIR), str(SRC_DIR)):
    # Skip entries already on sys.path so re-running the cell adds no duplicates.
    if _import_root in sys.path:
        continue
    sys.path.append(_import_root)

from src.io_utils import read_processed_concat
from src.cleaning import normalize_text_columns, parse_datetime, validate_coordinates

# Project directories, kept as plain strings built from the project root.
# Note: RAW_DIR points at data/processed (the already-processed CSVs), not a raw folder.
_project_root = str(BASE_DIR)
RAW_DIR = os.path.join(_project_root, "data", "processed")
INTERIM_DIR = os.path.join(_project_root, "data", "interim")
FIG_DIR = os.path.join(_project_root, "docs", "figures")

# Only the interim directory is created here (including missing parents);
# an already existing directory is not an error.
os.makedirs(INTERIM_DIR, exist_ok=True)

# Load and unify the data directly from the CSVs.
# read_processed_concat is imported from src.io_utils (not shown here); it receives the project root.
focos = read_processed_concat(str(BASE_DIR))

# Normalize text columns. Only the columns that actually exist in the frame are
# passed on: the previous guard used any(...) but still forwarded all four names,
# so a frame with just some of them handed nonexistent columns to the helper.
# NOTE(review): how normalize_text_columns treats missing columns is not visible here.
TEXT_COLUMNS = ["pais", "estado", "municipio", "bioma"]
text_columns_present = [col for col in TEXT_COLUMNS if col in focos.columns]
if text_columns_present:
    focos = normalize_text_columns(focos, text_columns_present)

# Parse datetime: use the first candidate column present, in priority order,
# and stop after that one so at most a single column is parsed.
for candidate in ["data_pas", "data", "timestamp", "datahora"]:
    if candidate in focos.columns:
        focos = parse_datetime(focos, candidate)
        break

# Validate coordinates, if they exist.
# NOTE(review): the "if they exist" handling is presumably inside validate_coordinates
# (src.cleaning) — confirm, since the call here is unconditional.
focos = validate_coordinates(focos)

# Persist the unified frame as a single CSV inside INTERIM_DIR, without the index column.
out_csv = os.path.join(INTERIM_DIR, "focos_2019_2024.csv")
focos.to_csv(out_csv, index=False)

# Basic metadata: total row count plus the dtype name of every column.
column_dtypes = {}
for column_name in focos.columns:
    column_dtypes[column_name] = str(focos[column_name].dtype)
meta = {"num_linhas": int(len(focos)), "colunas": column_dtypes}

# The dict is wrapped in a Series so pandas' JSON writer can serialize it;
# force_ascii=False keeps non-ASCII characters unescaped in the output file.
meta_path = os.path.join(INTERIM_DIR, "metadata.json")
meta_series = pd.Series(meta)
meta_series.to_json(meta_path, force_ascii=False)

# Last expression of the cell: preview of the first rows.
focos.head()


Unnamed: 0,month,Amazônia,Caatinga,Cerrado,Mata Atlântica,Pampa,Pantanal,nan,ACRE,ALAGOAS,...,Vargas,Vaupés,Vichada,Wanica,Yaracuy,Zamora Chinchipe,Zulia,Ñeembucú,day,focos
0,2019-01,1419.0,433.0,1213.0,608.0,20.0,337.0,8427.0,,,...,,,,,,,,,,
1,2019-02,1368.0,73.0,574.0,607.0,32.0,211.0,15019.0,,,...,,,,,,,,,,
2,2019-03,3383.0,58.0,936.0,661.0,82.0,93.0,14349.0,,,...,,,,,,,,,,
3,2019-04,1702.0,28.0,753.0,276.0,50.0,33.0,10269.0,,,...,,,,,,,,,,
4,2019-05,854.0,59.0,1719.0,240.0,23.0,68.0,3200.0,,,...,,,,,,,,,,


In [2]:
# Validation: compare the unified frame against the source CSVs
# (total number of rows and number of distinct column names).
import glob

csv_files = sorted(glob.glob(str(BASE_DIR / "data" / "processed" / "*.csv")))
rows_csv = 0
cols_set = set()
for p in csv_files:
    # Only the header is needed to collect column names, so no data rows are parsed.
    header_only = pd.read_csv(p, nrows=0)
    cols_set.update(header_only.columns.tolist())
    # Count physical lines minus the header line. The file is opened in a `with`
    # block so the handle is closed after each CSV instead of being leaked.
    # NOTE(review): a physical-line count differs from the record count if a file
    # has quoted fields containing newlines or blank lines — confirm the inputs do not.
    with open(p, 'r', encoding='utf-8', errors='ignore') as csv_handle:
        rows_csv += sum(1 for _ in csv_handle) - 1

rows_unificado = len(focos)
# One row per check: value derived from the CSVs vs. value from the unified frame.
validation = pd.DataFrame({
    "checagem": ["linhas_total", "num_colunas_unificadas"],
    "csvs": [rows_csv, len(cols_set)],
    "unificado": [rows_unificado, len(focos.columns)]
})
# Persist the comparison next to the other interim artifacts and display it.
validation_path = BASE_DIR / "data" / "interim" / "validation_ingestao.csv"
validation.to_csv(validation_path, index=False)
validation


Unnamed: 0,checagem,csvs,unificado
0,linhas_total,2318,2318
1,num_colunas_unificadas,241,241
