In [9]:
from pathlib import Path
import os
import sys
import shutil
import importlib
import subprocess
import yaml
import pandas as pd

# Ensure we run from the project root: the nearest directory, starting at the
# current working directory and walking upwards, that contains "configs".
start_dir = Path.cwd().resolve()
root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "configs").exists()
    ),
    None,
)
if root is None:
    # No ancestor holds "configs": stay in the starting directory instead of
    # silently switching to the filesystem root and putting it on sys.path.
    print(f"Advertencia: no se encontro la carpeta 'configs' desde {start_dir}; se usa el directorio actual")
    root = start_dir
os.chdir(root)

# Make the project importable (the `src` package is imported further down).
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

# Performance optimizations (best effort: any failure falls back to 1 worker)
banner = "=" * 60
print(banner)
print("INICIALIZANDO OPTIMIZACIONES")
print(banner)

try:
    from src.utils.performance import enable_pandas_performance, get_optimal_workers

    enable_pandas_performance()
    workers = get_optimal_workers()
    for line in (
        "Optimizaciones habilitadas",
        f"CPU cores disponibles: {os.cpu_count()}",
        f"Workers configurados: {workers}",
    ):
        print(line)
except Exception as exc:
    # The helpers are optional; report the problem and continue single-worker.
    print(f"No se pudieron cargar optimizaciones: {exc}")
    workers = 1

# Ensure geopandas is available for heatmap rendering.
# Only a failed import triggers the install; any other error raised while
# importing (which reinstalling would not fix) is left to surface directly.
try:
    import geopandas as gpd
except ImportError:
    # Install into the interpreter that is running this script.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "geopandas"])
    # Let the import system notice packages installed after start-up.
    importlib.invalidate_caches()
    import geopandas as gpd
print(f"geopandas {gpd.__version__}")

# Import the pipeline modules, then reload each one so that the latest
# changes on disk are picked up within the current interpreter session.
import src.etl.collapse_ruc as collapse_ruc
import src.metrics.geografia as geografia
import src.viz.figures as figures
import src.reporting.export_artifacts as export_artifacts

# Reload in the listed order; export_artifacts goes last.
for _mod in (collapse_ruc, geografia, figures, export_artifacts):
    importlib.reload(_mod)
del _mod

# Imported after the reloads, so the name comes from the reloaded module.
from src.reporting.export_artifacts import run_provincia

# Load the province catalogue from configs/provincias.yaml (empty when absent)
provincias_cfg_path = Path("configs", "provincias.yaml")
if provincias_cfg_path.exists():
    provincias_cfg = yaml.safe_load(provincias_cfg_path.read_text(encoding="utf-8")) or {}
else:
    provincias_cfg = {}

# A missing or empty "provincias" section is treated as an empty mapping
prov_map = provincias_cfg.get("provincias", {}) or {}

# Sort the provinces, keeping the "PROVINCIA" entry at the end when present
all_provs = sorted(name for name in prov_map if name != "PROVINCIA")
available_provinces = (all_provs + ["PROVINCIA"]) if "PROVINCIA" in prov_map else all_provs

# Numbered menu; the 1-based numbers are accepted by the selection prompt
print("\nProvincias disponibles:")
for number, name in enumerate(available_provinces, 1):
    print(f"  {number:02d}. {name}")

# Choose between running every province or a single one
print("\n" + "="*60)
answer = input("Ejecutar todas las provincias? (s/N): ")
run_all = answer.strip().lower() == "s"

# Parallel mode is only offered for a full run with more than one worker
use_parallel = False
selected_province = None
if run_all:
    if workers > 1:
        parallel_choice = input(f"Usar procesamiento PARALELO con {workers} workers? (S/n): ").strip().lower()
        # Anything other than an explicit "n" keeps parallel mode on
        use_parallel = parallel_choice != 'n'
        print(
            f"Modo PARALELO activado - {workers}x mas rapido!"
            if use_parallel
            else "Modo secuencial seleccionado"
        )
else:
    # Keep asking until the answer is a valid 1-based number or a listed name
    while True:
        choice = input("\nProvincia (numero o nombre): ").strip()
        if not choice:
            print("Debes ingresar una provincia valida.")
            continue
        if choice.isdigit():
            position = int(choice)
            if 1 <= position <= len(available_provinces):
                selected_province = available_provinces[position - 1]
                break
        elif choice.upper() in available_provinces:
            # Names are matched after upper-casing the typed text
            selected_province = choice.upper()
            break
        print("Provincia invalida, intenta otra vez.")

# Choose the year window: the full range, or one of the predefined intervals
interval_mode = input("\nUsar todo el intervalo 2000-2024? (s/N): ").strip().lower() == "s"

start_year, end_year = 2000, 2024
if not interval_mode:
    presets = {"1": (2000, 2009), "2": (2010, 2016), "3": (2017, 2024)}
    print("\nIntervalos predefinidos:")
    for key, (first, last) in presets.items():
        print(f"  {key}) {first}-{last}")
    # An empty answer selects preset 1
    preset_choice = input("Selecciona intervalo (1-3, Enter=2000-2009): ").strip() or "1"
    while preset_choice not in presets:
        print("Intervalo invalido, intenta otra vez.")
        preset_choice = input("Selecciona intervalo (1-3): ").strip()
    start_year, end_year = presets[preset_choice]

print(f"Intervalo: {start_year}-{end_year}")

# Public mode is always enabled - data/raw_filtrado.csv is not exported
public_mode = True
print("\nModo publico: activado (data anonimizado)")

def resolve_raw_path(province: str) -> str | None:
    """Return the ``raw_path`` configured for *province*, if any.

    The name is upper-cased before the lookup in the module-level
    ``prov_map``. ``None`` is returned when the province has no entry, when
    its entry is empty, or when the entry has no ``raw_path`` key.
    """
    entry = prov_map.get(province.upper())
    return entry.get("raw_path") if entry else None

def geo_path_for(province: str) -> Path:
    """Return the GeoJSON path to use for *province*.

    The province name is upper-cased, stripped and has its spaces replaced by
    underscores to form the folder name; the file inside is the lower-cased
    folder name with a ``.geojson`` suffix. When that file does not exist,
    the path of ``ECUADOR.geojson`` in the same base directory is returned
    instead (without checking that the fallback exists).
    """
    provinces_dir = Path("data", "geo", "provincias")
    folder_name = province.upper().strip().replace(" ", "_")
    candidate = provinces_dir / folder_name / (folder_name.lower() + ".geojson")
    return candidate if candidate.exists() else provinces_dir / "ECUADOR.geojson"

def build_configs_override(start_year: int, end_year: int, use_all_years: bool) -> Path:
    """Write a copy of the configs into ``notebooks/configs_override``.

    ``configs/global.yaml`` is loaded and, unless *use_all_years* is true, its
    ``window_start_year`` / ``window_end_year`` entries are set to
    *start_year* and *end_year* before it is dumped into the override folder.
    ``configs/provincias.yaml`` is copied over unchanged when it exists.

    Returns:
        The override directory (created if necessary).
    """
    source_dir = Path("configs")
    override_dir = Path("notebooks") / "configs_override"
    override_dir.mkdir(parents=True, exist_ok=True)

    global_src = source_dir / "global.yaml"
    global_cfg = yaml.safe_load(global_src.read_text(encoding="utf-8")) or {}
    if not use_all_years:
        # Restrict the window to the requested years
        global_cfg["window_start_year"] = int(start_year)
        global_cfg["window_end_year"] = int(end_year)
    dumped = yaml.safe_dump(global_cfg, sort_keys=False)
    (override_dir / "global.yaml").write_text(dumped, encoding="utf-8")

    provincias_src = source_dir / "provincias.yaml"
    if provincias_src.exists():
        contents = provincias_src.read_text(encoding="utf-8")
        (override_dir / "provincias.yaml").write_text(contents, encoding="utf-8")
    return override_dir

print("\n" + "="*60)
print("INICIANDO PROCESAMIENTO")
print("="*60 + "\n")

# Temporary config folder carrying the selected year window
cfg_dir = build_configs_override(start_year, end_year, interval_mode)
outputs = []

# Time the whole processing stage
from time import perf_counter
start_time = perf_counter()

if run_all and use_parallel:
    # Parallel mode
    print(f"Procesamiento PARALELO con {workers} workers\n")

    try:
        from src.utils.parallel import process_provinces_parallel

        raw_paths_dict = {}
        for prov in available_provinces:
            raw_paths_dict[prov] = resolve_raw_path(prov)

        results = process_provinces_parallel(
            provincias=available_provinces,
            configs_dir=str(cfg_dir),
            raw_dir="data/raw",
            raw_paths=raw_paths_dict,
            public_mode=public_mode,
            max_workers=workers
        )

        # Build the records separately so `outputs` is only replaced once
        # every result has been converted successfully.
        collected = []
        for r in results:
            collected.append({
                "province": r["provincia"],
                "output": r["output"],
                "status": r.get("status", "unknown"),
            })
        outputs = collected
    except Exception as exc:
        print(f"Error en procesamiento paralelo: {exc}")
        print("Recurriendo a procesamiento secuencial...")
        # Clearing the flag lets the sequential branch below take over
        use_parallel = False

if not use_parallel:
    # Sequential mode
    provinces_to_process = available_provinces if run_all else [selected_province]

    for prov in provinces_to_process:
        raw_path = resolve_raw_path(prov)
        geo_path = geo_path_for(prov)
        if not geo_path.exists():
            # Only a warning: the province is still processed
            print(f"GeoJSON no encontrado para {prov}: {geo_path}")

        try:
            out_base = run_provincia(
                prov,
                configs_dir=str(cfg_dir),
                raw_dir="data/raw",
                raw_path=raw_path,
                public_mode=public_mode,
            )
            record = {"province": prov, "output": str(out_base), "status": "success"}
        except Exception as exc:
            # A failing province is recorded and the loop continues
            print(f"Error en {prov}: {exc}")
            record = {"province": prov, "output": None, "status": "error"}
        outputs.append(record)

elapsed = perf_counter() - start_time

# Results summary
print("\n" + "="*60)
print("RESULTADOS")
print("="*60)

# Any status other than "success" counts as a failure
success_count = sum(1 for o in outputs if o.get("status") == "success")
error_count = len(outputs) - success_count

for item in outputs:
    status = "[OK]" if item.get("status") == "success" else "[ERROR]"
    # Failed runs store output=None; a .get() default only covers a missing
    # key, so use `or` to show the placeholder instead of "None".
    print(f"{status} {item['province']}: {item.get('output') or 'ERROR'}")

print(f"\nTiempo total: {elapsed:.2f}s ({elapsed/60:.2f} min)")
if outputs:
    print(f"Promedio: {elapsed/len(outputs):.2f}s por provincia")
print(f"Exitosas: {success_count}")
print(f"Fallidas: {error_count}")

if use_parallel and workers > 1 and outputs:
    # No sequential baseline was timed, so a measured speedup cannot be
    # reported (elapsed * workers / elapsed is always just `workers`).
    # Report the theoretical upper bound instead: one task per worker at most.
    speedup = min(workers, len(outputs))
    print(f"\nSpeedup teorico maximo: {speedup}x respecto a secuencial (no medido)")

# Remove the temporary override configs; removal errors are ignored
shutil.rmtree(cfg_dir, ignore_errors=True)
print("\nCleanup: configs_override eliminado")
print("\nProceso completado!")


INICIALIZANDO OPTIMIZACIONES
Optimizaciones habilitadas
CPU cores disponibles: 12
Workers configurados: 8
geopandas 1.1.2

Provincias disponibles:
  01. AZUAY
  02. BOLIVAR
  03. CARCHI
  04. CAÑAR
  05. CHIMBORAZO
  06. COTOPAXI
  07. EL ORO
  08. ESMERALDAS
  09. GALAPAGOS
  10. GUAYAS
  11. IMBABURA
  12. LOJA
  13. LOS RIOS
  14. MANABI
  15. MORONA SANTIAGO
  16. NAPO
  17. ORELLANA
  18. PASTAZA
  19. PICHINCHA
  20. SANTA ELENA
  21. SANTO DOMINGO DE LOS TSACHILAS
  22. SUCUMBIOS
  23. TUNGURAHUA
  24. ZAMORA CHINCHIPE
  25. PROVINCIA

Intervalo: 2000-2024

Modo publico: activado (data anonimizado)

INICIANDO PROCESAMIENTO

Limpieza previa: 4 carpetas __pycache__ eliminadas.
Iniciando pipeline para BOLIVAR (15 etapas)...
  01. Carga del raw
  02. Normalización de campos
  03. Filtro universo (SOCIEDAD)
  04. Filtro provincia
  05. Filtro codigos omitidos
  06. QC raw
  07. Colapso a RUC
  08. QC RUC
  09. Demografía y cohortes
  10. Cantones y geografía
  11. Macro sectores y ac