# Descarga y consolidación de curvas Kepler y TESS (Script 1)

In [None]:
# Global notebook setup: ignore every warning and set the RICH_NO_RICH
# environment variable, then confirm that the setup ran.
import os
import warnings

warnings.simplefilter("ignore")
os.environ.update({"RICH_NO_RICH": "1"})
print("Configuración de entorno aplicada.")

In [None]:
# Install missing packages into the environment of the running kernel.
# torch and lightkurve are imported by the next cell; pyarrow is used by the
# Parquet cells further down (-q keeps pip's output quiet).
%pip install torch lightkurve
%pip install -q pyarrow

In [None]:
import torch
import lightkurve as lk

# Sanity check: show which versions of the two libraries the kernel picked up.
print(f"Torch: {torch.__version__}")
print(f"Lightkurve: {lk.__version__}")

### 📥 Script 1: descarga y consolidación de curvas de Kepler y TESS de la clase EB (entorno local o SageMaker)

In [None]:
# Run script 1 (src.fase1.script_1_kepler_tess_eb.main): the call arguments
# depend on whether the `sagemaker` package can be imported in this kernel.
import warnings
import os
import sys
import platform
import glob
from astropy.units import UnitsWarning
from pathlib import Path

# Silence these three warning categories for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UnitsWarning)
print("🔇 Warnings silenciados: UserWarning, FutureWarning")
os.environ["RICH_NO_RICH"] = "1"

# Make the local "src" directory importable.
src_path = Path("src").resolve()
# sys.path holds strings, so the membership test must use the string form;
# comparing the Path object itself never matches, which appended a duplicate
# entry every time this cell was re-run.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))
from src.fase1.script_1_kepler_tess_eb import main as run_script_1

# The environment is treated as SageMaker when the `sagemaker` package imports.
try:
    import sagemaker
except ImportError:
    is_sagemaker = False
else:
    is_sagemaker = True

# Count the CSV files already present under the two raw-data directories.
# The paths are absolute, so the count is 0 wherever they do not exist.
existing = (
    len(glob.glob("/home/ec2-user/backup/data/raw/kepler/*.csv"))
    + len(glob.glob("/home/ec2-user/backup/data/raw/tess/*.csv"))
)
print(f"🗃️ Curvas ya existentes en disco: {existing}", flush=True)

if is_sagemaker:
    print("🔁 Ejecutando en SageMaker → catálogo completo")
    # NOTE(review): only mission="Kepler" is requested in this branch even
    # though the message says "full catalogue" — confirm TESS is handled
    # elsewhere.
    run_script_1(mission="Kepler", only_pending=True)
else:
    print("💻 Ejecutando en entorno local → catálogo de prueba")
    run_script_1(use_sample=True)

In [None]:
from src.utils.cleanup_raw import cleanup_raw_data

# Limpiar los datos de prueba
#cleanup_raw_data('/home/ec2-user/backup/data/raw', confirm=True)

##### 📥 Comprobación de los ficheros de curvas Kepler y TESS (EB) mergeados y consolidados

In [None]:
import pyarrow.parquet as pq
from pathlib import Path

# For each mission, open the labelled Parquet file, list the column names of
# its Arrow schema and report whether "clase_variable" is one of them.
# Missing files and read errors are reported and do not stop the loop.
for name in ("kepler", "tess"):
    path = Path("data/processed") / f"dataset_eb_{name}_labeled.parquet"
    if path.exists():
        try:
            parquet_file = pq.ParquetFile(path)
            schema = parquet_file.schema_arrow
            columns = schema.names

            print(f"\n📦 {name.upper()} contiene {len(columns)} columnas:")
            print(columns)

            print(
                f"✅ 'clase_variable' está presente en {name.upper()}"
                if "clase_variable" in columns
                else f"⚠️  'clase_variable' NO está presente en {name.upper()}"
            )
        except Exception as e:
            print(f"❌ Error leyendo {path}: {e}")
    else:
        print(f"❌ Archivo no encontrado: {path}")

### ✅ FIX de la columna "clase_variable"

In [None]:
from pathlib import Path
import pyarrow.dataset as ds
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

def reparar_parquet_streaming(parquet_path: Path, output_path: Path, clase_default: str = "EB") -> None:
    """Copy a Parquet dataset in batches, appending a constant ``clase_variable`` column.

    The input is read in batches of 50,000 rows and each batch is written
    to ``output_path`` as soon as the new string column has been appended.

    Args:
        parquet_path: Parquet input passed to ``pyarrow.dataset.dataset``.
        output_path: Destination Parquet file, written with snappy compression.
            Should be a different path from ``parquet_path``, because the
            output is opened for writing before the input batches are read.
        clase_default: Value stored in every row of the new
            ``clase_variable`` column.

    Returns:
        None. The result is the file written at ``output_path``.
    """
    dataset = ds.dataset(parquet_path, format="parquet")

    # Take the schema from the dataset itself instead of pulling a sample
    # batch: this also works for an empty dataset (where next() on the batch
    # iterator raised StopIteration) and avoids starting a second scan.
    # NOTE(review): the field is appended unconditionally, so an input that
    # already has "clase_variable" would end up with two fields of that name —
    # confirm inputs never carry it.
    schema_nueva = dataset.schema.append(pa.field("clase_variable", pa.string()))

    print(f"[🔧] Reparando parquet: {parquet_path.name}")
    # The context manager closes the writer even if a batch fails midway.
    with pq.ParquetWriter(output_path, schema=schema_nueva, compression="snappy") as writer:
        fragmentos = dataset.to_batches(batch_size=50000)
        for batch in tqdm(fragmentos, desc="🛠️ Reparando por lotes"):
            # One label per row of this batch.
            columna_clase = pa.array([clase_default] * batch.num_rows, type=pa.string())
            batch_corregido = batch.append_column("clase_variable", columna_clase)
            writer.write_table(pa.Table.from_batches([batch_corregido], schema=schema_nueva))
    print(f"[✅] Reparación completada → {output_path}")

# Example usage: read the TESS dataset and write the copy with the added
# "clase_variable" column to the "_labeled" file next to it.
reparar_parquet_streaming(
    parquet_path=Path("data/processed/dataset_eb_tess.parquet"),
    output_path=Path("data/processed/dataset_eb_tess_labeled.parquet"),
)