In [1]:
import warnings
import os, sys, platform
import glob
from astropy.units import UnitsWarning
from pathlib import Path

# Silence common warning categories raised by third-party libraries
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UnitsWarning)
# The message lists every category that was actually silenced above
print("🔇 Warnings silenciados: UserWarning, FutureWarning, UnitsWarning")
# Disable Rich to avoid compatibility problems
# NOTE(review): confirm that the consuming library really honours RICH_NO_RICH
os.environ["RICH_NO_RICH"] = "1"

# Make sure the 'src' directory is on the Python path.
# sys.path holds strings, so the membership test must compare the string form;
# comparing the Path object itself never matches and appends a duplicate entry
# every time the cell is re-run.
src_path = Path("src").resolve()
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))
from src.script_1_eb import main as run_script_1

# Detect local vs. AWS environment: the run is treated as SageMaker whenever
# the `sagemaker` package can be imported
try:
    import sagemaker
    is_sagemaker = True
except ImportError:
    is_sagemaker = False

# Count the TESS light-curve CSVs already on disk (Kepler count kept for reference)
#existing = len(glob.glob("/home/ec2-user/backup/data/raw/kepler/*.csv")) + len(glob.glob("/home/ec2-user/backup/data/raw/tess/*.csv"))
existing = len(glob.glob("/home/ec2-user/backup/data/raw/tess/*.csv"))
print(f"🗃️ Curvas ya existentes en disco: {existing}", flush=True)

# Run the download script with the options that fit the environment
if is_sagemaker:
    print("🔁 Ejecutando en SageMaker → catálogo completo")
    # TESS only, pending objects only
    run_script_1(mission="TESS", only_pending=True, max_workers=12)
else:
    print("💻 Ejecutando en entorno local → catálogo de prueba")
    run_script_1(use_sample=True)


📁 Usando RAW_DIR: /home/ec2-user/backup/data/raw


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
🗃️ Curvas ya existentes en disco: 0
🔁 Ejecutando en SageMaker → catálogo completo
[⬇] Descargando catálogos completos de Kepler y TESS...
[⬇] Descargando catálogo Kepler EB...
[📂] Cargando catálogo Kepler EB desde copia local...
[⬇] Descargando catálogo TESS EB...
[⬇] Generando CSV de entrada para descarga de curvas...
📝 CSV generado con 0 estrellas → data/lists/eb_ids.csv
[⬇] Descargando curvas de luz...

🚀 Procesando misión: TESS (only_pending=True)
📝 CSV generado con 0 estrellas → data/lists/eb_ids_tess_pendientes.csv
[⬇] Descargando 0 curvas en paralelo con 12 hilos...


🚀 Descargando curvas: 0it [00:00, ?it/s]


	📦 0 objetos procesados hasta ahora...
[✓] Descarga finalizada: 0 objetos procesados.
[⭢] Leyendo y fusionando curvas descargadas...





[📁] Usando 16 lotes ya existentes en /home/ec2-user/temp_batches
[⏳] Uniendo todos los lotes en /home/ec2-user/backup/processed/dataset_eb_tess.parquet...


📚 Uniendo lotes: 100%|██████████| 16/16 [03:14<00:00, 12.13s/it]


[✅] Dataset parquet construido → /home/ec2-user/backup/processed/dataset_eb_tess.parquet
🧹 Carpeta temporal eliminada: /home/ec2-user/temp_batches
[⏱] Tiempo total: 196.25 segundos


### BACKUPS ANTES DE APAGAR INSTANCIA

In [None]:
import shutil, os
from pathlib import Path

# Create the backup folder inside the SageMaker directory.
# parents=True so the cell also works when ~/SageMaker does not exist yet.
backup_dir = Path.home() / "SageMaker" / "backup"
backup_dir.mkdir(parents=True, exist_ok=True)
print(f"📁 Carpeta de backup creada en: {backup_dir}")

# Archive the raw light curves from their temporary location.
# NOTE(review): the download cell counts curves under
# /home/ec2-user/backup/data/raw/tess, while this path has no "data" segment —
# confirm which of the two locations is the right one.
#raw_dir_kepler = Path("/home/ec2-user/backup/raw/kepler")
raw_dir_tess = Path("/home/ec2-user/backup/raw/tess")
# The original tested an undefined name (`raw_dir`), which raised NameError
# before anything was archived; the directory being archived is the one to test.
if raw_dir_tess.exists():
    #shutil.make_archive(backup_dir / "curvas_raw_kepler", "zip", raw_dir_kepler)
    #print("✅ Copiado: curvas en backup/raw/kepler → curvas_raw_kepler.zip")
    shutil.make_archive(backup_dir / "curvas_raw_tess", "zip", raw_dir_tess)
    print("✅ Copiado: curvas en backup/raw/tess → curvas_raw_tess.zip")
else:
    print("⚠️ No se encontró backup/raw")

# Copy every processed parquet dataset.
# NOTE(review): this path is relative, so it only resolves when the working
# directory is the parent of "SageMaker" — confirm.
processed_dir = Path("SageMaker/astro_transformer/data/processed")
if processed_dir.exists():
    for file in processed_dir.glob("*.parquet"):
        shutil.copy(file, backup_dir)
        print(f"✅ Copiado: {file.name}")
else:
    print("⚠️ No se encontró data/processed")

# Copy the download-failure log if it exists
error_log = Path("SageMaker/astro_transformer/logs/fallos_descarga.csv")
if error_log.exists():
    shutil.copy(error_log, backup_dir)
    print("✅ Copiado: logs/fallos_descarga.csv")

# Archive the lightkurve cache if it exists
lk_cache = Path.home() / ".lightkurve" / "cache"
if lk_cache.exists():
    shutil.make_archive(backup_dir / "lightkurve_cache", "zip", lk_cache)
    print("✅ Copiado: ~/.lightkurve/cache → lightkurve_cache.zip")

print("🎉 Backup completado.")

In [None]:
# Clear the lightkurve cache: deletes everything under ~/.lightkurve/cache
!rm -rf ~/.lightkurve/cache/*

In [1]:
# Execute src/script_5a_normalize_ids.py with the `--mision vsx_tess` argument
%run src/script_5a_normalize_ids.py --mision vsx_tess


🔧 Procesando VSX_TESS → dataset_vsx_tess_labeled.parquet
📝 Procesando batch #1...
✅ Batch #1 guardado en dataset_vsx_tess_labeled_batch_0000.parquet (131072 filas)
📝 Procesando batch #2...
✅ Batch #2 guardado en dataset_vsx_tess_labeled_batch_0001.parquet (131072 filas)
📝 Procesando batch #3...
✅ Batch #3 guardado en dataset_vsx_tess_labeled_batch_0002.parquet (131072 filas)
📝 Procesando batch #4...
✅ Batch #4 guardado en dataset_vsx_tess_labeled_batch_0003.parquet (131072 filas)
📝 Procesando batch #5...
✅ Batch #5 guardado en dataset_vsx_tess_labeled_batch_0004.parquet (131072 filas)
📝 Procesando batch #6...
✅ Batch #6 guardado en dataset_vsx_tess_labeled_batch_0005.parquet (131072 filas)
📝 Procesando batch #7...
✅ Batch #7 guardado en dataset_vsx_tess_labeled_batch_0006.parquet (131072 filas)
📝 Procesando batch #8...
✅ Batch #8 guardado en dataset_vsx_tess_labeled_batch_0007.parquet (131072 filas)
📝 Procesando batch #9...
✅ Batch #9 guardado en dataset_vsx_tess_labeled_batch_0008.pa

In [1]:
import pandas as pd
# Quick sanity check of the labelled dataset: print its dimensions and how
# many rows fall into each value of the "clase_variable" column.
NORTH_TIC_PARQUET = "data/processed/dataset_vsx_tic_labeled_north.parquet"
df = pd.read_parquet(NORTH_TIC_PARQUET)
print(df.shape)
class_counts = df["clase_variable"].value_counts()
print(class_counts)


(27999, 9)
clase_variable
ROT                14332
EW                  2018
RS                  2012
DSCT|GDOR|SXPHE     1813
E                   1658
                   ...  
IN:                    1
NL+E                   1
Microlens              1
CWA                    1
EA/GS                  1
Name: count, Length: 115, dtype: int64


In [1]:
from src.utils.inspect_and_export_summary import inspect_and_export_summary

# Parquet datasets to summarise (south first, then north)
OUTPUT_PARQUET_SOUTH = "data/processed/dataset_vsx_tess_labeled_south.parquet"
OUTPUT_PARQUET_NORTH = "data/processed/dataset_vsx_tess_labeled_north.parquet"

# Export a CSV summary for each dataset, in the order listed
for parquet_path in (OUTPUT_PARQUET_SOUTH, OUTPUT_PARQUET_NORTH):
    inspect_and_export_summary(parquet_path, output_format="csv")



📁 Inspeccionando: data/processed/dataset_vsx_tess_labeled_south.parquet


🧮 Procesando por lotes: 24it [00:38,  1.60s/it]


✅ Resumen exportado a: data/processed/summary\dataset_vsx_tess_labeled_south_summary.csv

📁 Inspeccionando: data/processed/dataset_vsx_tess_labeled_north.parquet


🧮 Procesando por lotes: 434it [14:01,  1.94s/it]

✅ Resumen exportado a: data/processed/summary\dataset_vsx_tess_labeled_north_summary.csv





In [1]:
# Monitoriza el progreso de descarga de curvas ZTF
import time
from pathlib import Path

CURVES_DIR = Path("data/processed/ztf_curves")
TOTAL_OBJETIVO = 313_939  # Target number of CSV files; adjust if the goal changes

# Poll the output folder once a minute, reporting how many CSVs exist so far,
# and stop as soon as the target count has been reached.
while True:
    n_csv = sum(1 for _ in CURVES_DIR.glob("*.csv"))
    pct = 100 * n_csv / TOTAL_OBJETIVO
    print(f"CSV generados: {n_csv} / {TOTAL_OBJETIVO} ({pct:.2f}%)")
    if n_csv < TOTAL_OBJETIVO:
        time.sleep(60)  # wait one minute before checking again
        continue
    print("✅ Descarga completada.")
    break

CSV generados: 214537 / 313939 (68.34%)
CSV generados: 214577 / 313939 (68.35%)
CSV generados: 214651 / 313939 (68.37%)
CSV generados: 214711 / 313939 (68.39%)
CSV generados: 214755 / 313939 (68.41%)
CSV generados: 214835 / 313939 (68.43%)
CSV generados: 214876 / 313939 (68.45%)
CSV generados: 214960 / 313939 (68.47%)
CSV generados: 215001 / 313939 (68.48%)
CSV generados: 215040 / 313939 (68.50%)
CSV generados: 215079 / 313939 (68.51%)
CSV generados: 215118 / 313939 (68.52%)
CSV generados: 215156 / 313939 (68.53%)
CSV generados: 215194 / 313939 (68.55%)
CSV generados: 215230 / 313939 (68.56%)
CSV generados: 215312 / 313939 (68.58%)
CSV generados: 215356 / 313939 (68.60%)
CSV generados: 215399 / 313939 (68.61%)
CSV generados: 215481 / 313939 (68.64%)
CSV generados: 215520 / 313939 (68.65%)
CSV generados: 215608 / 313939 (68.68%)
CSV generados: 215643 / 313939 (68.69%)
CSV generados: 215681 / 313939 (68.70%)
CSV generados: 215720 / 313939 (68.71%)
CSV generados: 215756 / 313939 (68.73%)


: 

In [None]:
from pathlib import Path
import pyarrow.parquet as pq
import time
from IPython.display import clear_output

# Directory that holds the source parquet files listed below
INPUT_DIR = Path("data/processed")
# File names (relative to INPUT_DIR) whose progress is monitored
FILES = [
    "dataset_k2varcat_labeled_fixed.parquet",
    "dataset_vsx_tess_labeled_fixed.parquet",
    "dataset_gaia_complemented_normalized.parquet",
    "dataset_eb_kepler_labeled_fixed.parquet",
    "dataset_eb_tess_labeled_fixed.parquet",
    "dataset_vsx_tess_labeled_south.parquet",
    "dataset_vsx_tess_labeled_north.parquet",
    "dataset_vsx_tess_labeled_ampliado.parquet",
    "dataset_ztf_labeled.parquet"
]

def print_progress_bar(done, total, length=30):
    """Return (not print) a text progress bar such as ``[███---]  50.0%``.

    Args:
        done: Number of completed units.
        total: Total number of units; a falsy total is rendered as 0%.
        length: Width of the bar in characters.

    Returns:
        The bar between square brackets followed by the percentage,
        formatted with one decimal in a field of width 5.
    """
    pct = 0 if not total else done / total
    filled = int(length * pct)
    empty = length - filled
    bar = "".join(("█" * filled, "-" * empty))
    return f"[{bar}] {pct*100:5.1f}%"

# Rows per batch used to derive the expected number of batch files.
# NOTE(review): presumably this must match the batch size of the job that
# writes the batch_*.parquet files — confirm against that script.
batch_size = 1000

# Every 2 minutes, report per-file progress of the unification job:
#   * batches written so far to "<stem>_unify_tmp/batch_*.parquet"
#   * rows already present in "<stem>-unified.parquet", when that file exists
# The loop ends when no listed file is still in progress.
while True:
    output_lines = []  # one line per file still in progress; keeps the loop alive
    aviso_lines = []   # source files whose metadata could not be read
    for fname in FILES:
        path = INPUT_DIR / fname
        if not path.exists():
            continue  # only report source files that are present
        try:
            total_rows = pq.read_metadata(path).num_rows
        except Exception as exc:
            # Report the failure instead of silently skipping the file, so an
            # unreadable source is never mistaken for a completed one.
            aviso_lines.append(f"{fname}: no se pudo leer el fichero de origen ({exc})")
            continue
        total_batches = (total_rows + batch_size - 1) // batch_size
        tmp_dir = path.parent / f"{path.stem}_unify_tmp"
        unified_path = path.parent / f"{path.stem}-unified.parquet"

        # Read the unified file's metadata once and reuse it both for the
        # progress text and for the "already complete" check.
        unified_info = ""
        unified_exists = unified_path.exists()
        if unified_exists:
            try:
                unified_rows = pq.read_metadata(unified_path).num_rows
            except Exception:
                # Keep showing batch progress even if the unified file is unreadable
                unified_info = " | unified: error leyendo filas"
            else:
                if unified_rows == total_rows:
                    continue  # already complete, nothing to show
                pct_unified = 100 * unified_rows / total_rows if total_rows else 0
                unified_info = f" | unified: {unified_rows}/{total_rows} filas ({pct_unified:.2f}%)"

        if tmp_dir.exists():
            done_batches = len(list(tmp_dir.glob("batch_*.parquet")))
        else:
            done_batches = 0

        if done_batches < total_batches:
            progress_bar = print_progress_bar(done_batches, total_batches)
            output_lines.append(f"{fname}: {done_batches}/{total_batches} batches procesados ({done_batches*batch_size}/{total_rows} filas){unified_info} {progress_bar}")
        elif not unified_exists:
            # All batches are written but the unified file is still missing
            output_lines.append(f"{fname}: batches terminados pero falta el fichero unified.")
        elif unified_info:
            # The unified file exists but is incomplete (or unreadable)
            output_lines.append(f"{fname}: unified incompleto{unified_info}")

    clear_output(wait=True)
    if aviso_lines:
        print("\n".join(aviso_lines))
    if output_lines:
        print("\n".join(output_lines))
        print("⏳ Esperando 2 minutos para la siguiente comprobación...\n")
    else:
        if aviso_lines:
            print("⚠️ No quedan ficheros en proceso, pero hay ficheros de origen ilegibles.")
        else:
            print("✅ Todos los ficheros de la lista han sido completados.")
        break
    time.sleep(120)

dataset_eb_tess_labeled_fixed.parquet: 102426/120748 batches procesados (102426000/120747997 filas) [█████████████████████████-----]  84.8%
⏳ Esperando 2 minutos para la siguiente comprobación...



In [2]:
# Los otros parquet muy fragmentados se han compactado desde data/compactar_parquet_fragmentados.bat (con Duck DB)
# Este fichero era demasiado grande y lo hemos compactado con este script

import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pyarrow as pa
from tqdm import tqdm

# Configuration
SRC = "data/processed/dataset_eb_kepler_labeled_fixed-unified.parquet"
DST = "data/processed/dataset_eb_kepler_labeled_fixed-compact.parquet"
ROW_GROUP = 50000  # minimum number of buffered rows before a table is flushed
COMPRESSION = "snappy"


def _flush_buffer(writer, batches):
    """Write `batches` to DST as a single table and return the writer.

    The writer is created lazily from the first table's schema, so it is
    None until the first flush.
    """
    table = pa.Table.from_batches(batches)
    if writer is None:
        writer = pq.ParquetWriter(DST, table.schema, compression=COMPRESSION)
    writer.write_table(table, row_group_size=None)
    return writer


# Stream the source in small record batches and regroup them into larger tables
scanner = ds.dataset(SRC, format="parquet").scanner()
batch_iter = scanner.to_batches()

writer = None
buffer = []
buffer_rows = 0

print(f"📂 Origen:   {SRC}")
print(f"📦 Destino:  {DST}")
print(f"🔄 Compactando por lotes de {ROW_GROUP} filas...\n")

# try/finally guarantees the writer is closed even if reading or writing fails
# midway; without the close the parquet footer is never written.
try:
    for batch in tqdm(batch_iter, desc="Compactando"):
        buffer.append(batch)
        buffer_rows += batch.num_rows
        if buffer_rows >= ROW_GROUP:
            writer = _flush_buffer(writer, buffer)
            buffer = []
            buffer_rows = 0

    # Write whatever is left over after the last full group
    if buffer:
        writer = _flush_buffer(writer, buffer)
finally:
    if writer is not None:
        writer.close()

# Summary
if writer is None:
    # The source yielded no batches, so DST was never created
    print("⚠️ El origen no contiene filas; no se ha generado ningún fichero.")
else:
    metadata = pq.ParquetFile(DST).metadata
    print("\n📊 Resumen de salida:")
    print(f"• Row groups generados: {metadata.num_row_groups}")
    print(f"• Filas totales:        {metadata.num_rows:,}")
    print(f"• Columnas:             {metadata.num_columns}")
    print("✅ Compactación completada correctamente.")


📂 Origen:   data/processed/dataset_eb_kepler_labeled_fixed-unified.parquet
📦 Destino:  data/processed/dataset_eb_kepler_labeled_fixed-compact.parquet
🔄 Compactando por lotes de 50000 filas...



Compactando: 71771it [06:35, 181.51it/s]



📊 Resumen de salida:
• Row groups generados: 1434
• Filas totales:        71,735,011
• Columnas:             42
✅ Compactación completada correctamente.


In [5]:
from pathlib import Path
import pyarrow.parquet as pq
from src.utils.dataset_paths import DATASET_PATHS

# Gather (row groups, rows) for every parquet file that can be opened; files
# that fail are reported on their own line and left out of the totals.
recuento = []
for path in DATASET_PATHS:
    p = Path(path)
    try:
        pf = pq.ParquetFile(p)
        n_row_groups, n_rows = pf.num_row_groups, pf.metadata.num_rows
        print(f"{p.name}: {n_row_groups} row groups, {n_rows:,} filas")
        recuento.append((n_row_groups, n_rows))
    except Exception as e:
        print(f"{p.name}: error ({e})")

total_row_groups = sum(grupos for grupos, _ in recuento)
total_rows = sum(filas for _, filas in recuento)

print(f"\nTOTAL: {total_row_groups} row groups (batches), {total_rows:,} filas")
print("Este es el número de batches que recorrerá el script de validación.")

# Imprimir algunos ejemplos de filas sin cargar todo el dataset
# for path in DATASET_PATHS:
#     try:
#         pf = pq.ParquetFile(path)
#         print(f"\nEjemplos de filas de {path.name}:")
#         for i in range(min(3, pf.metadata.num_rows)):
#             row = pf.read_row_group(0).slice(i, 1).to_pandas()
#             print(row.to_string(index=False))
#     except Exception as e:
#         print(f"{path.name}: error al leer filas ({e})")
        

dataset_eb_kepler_labeled_fixed-compact-unified.parquet: 1434 row groups, 71,735,011 filas
dataset_eb_tess_labeled_fixed1-compact-unified.parquet: 1000 row groups, 49,975,930 filas
dataset_eb_tess_labeled_fixed2-compact-unified.parquet: 1048 row groups, 52,403,954 filas
dataset_eb_tess_labeled_fixed3-compact-unified.parquet: 181 row groups, 18,142,465 filas
dataset_k2varcat_labeled_fixed-unified2.parquet: 358 row groups, 17,895,159 filas
dataset_vsx_tess_labeled_fixed-unified.parquet: 40 row groups, 41,143,597 filas
dataset_gaia_complemented_normalized-unified.parquet: 29 row groups, 29,794,442 filas
dataset_vsx_tess_labeled_south-unified.parquet: 3 row groups, 3,095,437 filas
dataset_vsx_tess_labeled_north-compact-unified.parquet: 1135 row groups, 56,757,771 filas
dataset_vsx_tess_labeled_ampliado-compact-unified.parquet: 1149 row groups, 57,452,150 filas
dataset_ztf_labeled-unified.parquet: 12 row groups, 12,551,038 filas
dataset_asassn_gband-unified.parquet: 4049 row groups, 202,436

In [1]:
from src.utils.normalization_dict import normalize_label

# Raw class labels fed one by one to normalize_label below. Entries are kept
# verbatim, including compound ("|", "+", "/"), ":"-suffixed and
# whitespace-prefixed variants.
clases_prueba = [
    "EB", "DSCT|GDOR|SXPHE", "ROT", "SR", "VAR", "AP", "QP", "L", "Rotational", "DSCT",
    "RR Lyrae", "Irregular", "SRB", "RS", "YSO", "EW", "V1093HER", "LB", "EA", "ZZA",
    "RRAB", "DSCT:", "MISC", "UG", "RRC", "WD", "E", "UGSS", "Delta Scuti", "SRA",
    "UGSU", "UNKNOWN", "EP", " P", "UGZ", "BY", "RRC|EC", "CV", "V361HYA", "EA+BY+UV",
    "roAp", "BCEP+SPB", "ACV|roAm|roAp|SXARI", "CV:", "ELL/DW:", "EC|ESD", "SRS",
    "ACEP", "Variable", "S", "RRAB/BL:", "LC", "SRB:", "VAR:", "SR:", "EC|DSCT|ESD",
    "M", "DSCT|EC|ESD", "EC", "ESD|EC", "DSCTC", "ESD|CW-FU|CW-FO|EC", "UGSU+E",
    "ESD|ACV|ED", "EC|RRC|ESD", "BY:", "CW-FU|EC", "ED", "BCEP|DSCT", "EA:", "UV",
    "UGWZ", "NL", "SRD", "SRC", "RRAB/BL", "SR|EA", "ACV", "EC|BCEP|DSCT|DSCTr|ESD",
    "NL:", "SR|M", "ESD|CW-FU|EC", "LPV", "EW/RS", "RRD"
]

# Two-column report: original label on the left, normalized label on the right
print("Clase original".ljust(30), "→", "Clase normalizada")
print("-" * 55)

# Labels that normalize_label maps to "Unknown", in input order
unknowns = []
for clase in clases_prueba:
    norm = normalize_label(clase)
    print(clase.ljust(30), "→", norm)
    if norm != "Unknown":
        continue
    unknowns.append(clase)

if not unknowns:
    print("\nTodas las clases han sido normalizadas correctamente.")
else:
    print("\nClases clasificadas como Unknown:")
    for u in unknowns:
        print(" -", u)

Clase original                 → Clase normalizada
-------------------------------------------------------
EB                             → Eclipsing Binary
DSCT|GDOR|SXPHE                → Delta Scuti
ROT                            → Rotational
SR                             → Irregular
VAR                            → Variable
AP                             → Irregular
QP                             → Variable
L                              → Irregular
Rotational                     → Rotational
DSCT                           → Delta Scuti
RR Lyrae                       → RR Lyrae
Irregular                      → Irregular
SRB                            → Irregular
RS                             → Rotational
YSO                            → Young Stellar Object
EW                             → Eclipsing Binary
V1093HER                       → White Dwarf
LB                             → Irregular
EA                             → Eclipsing Binary
ZZA                            → White

In [None]:
# Progress snapshot: batches done so far, expected total and elapsed time
batches_procesados = 81787
total_estimado_batches = 4896250
tiempo_transcurrido_seg = 2 * 3600 + 15 * 60  # 2 h 15 min = 8100 s

# Average throughput so far (about 10.10 batches/s with the values above),
# extrapolated linearly over the batches that are still pending
velocidad_batches = batches_procesados / tiempo_transcurrido_seg
tiempo_restante_seg = (total_estimado_batches - batches_procesados) / velocidad_batches

# The same estimate expressed in hours and in days
tiempo_restante_horas = tiempo_restante_seg / 3600
tiempo_restante_dias = tiempo_restante_horas / 24

print(f"⏳ Tiempo restante estimado: {tiempo_restante_horas:.2f} horas")
print(f"⏳ Tiempo restante estimado: {tiempo_restante_dias:.2f} días")


⏳ Tiempo restante estimado: 133.35 horas
⏳ Tiempo restante estimado: 5.56 días
