In [13]:
import asyncio
from pathlib import Path
from playwright.async_api import async_playwright, Error, TimeoutError
import pandas as pd
import json, re, random, time
from datetime import datetime, timedelta


### Notas

Ejecuta la celda final (`await scrape_flights(...)`) tras instalar Playwright y los navegadores.


In [14]:
import sys, importlib.util
print(sys.executable)
print(importlib.util.find_spec("playwright"))

/Users/danilosuarezvargas/anaconda3/envs/nuevo_entorno/bin/python
ModuleSpec(name='playwright', loader=<_frozen_importlib_external.SourceFileLoader object at 0x1014bba90>, origin='/Users/danilosuarezvargas/anaconda3/envs/nuevo_entorno/lib/python3.11/site-packages/playwright/__init__.py', submodule_search_locations=['/Users/danilosuarezvargas/anaconda3/envs/nuevo_entorno/lib/python3.11/site-packages/playwright'])


In [15]:
# -------- CONFIGURACIÓN --------
BASE_URL = "https://www.avianca.com/es/"  # Home (para setear locale CO/ES)
BOOK_URL = "https://www.avianca.com/es/booking/select/"  # Flujo de compra
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

ORIGIN = "BOG"
DESTINOS = [
    "ADZ",
    "AEP",
    "AUA",
    "AUC",
    "ASU",
    "AXM",
    "BGA",
    "BAQ",
    "BCN",
    "BOS",
    "CDG",
    "CLO",
    "CCS",
    "CTG",
    "CUC",
    "CUN",
    "CUR",
    "CUZ",
    "DFW",
    "EJA",
    "EYP",
    "EZE",
    "FLL",
    "GEO",
    "GIG",
    "GRU",
    "GUA",
    "GYE",
    "HAV",
    "IAD",
    "IBE",
    "IPI",
    "JFK",
    "LET",
    "LHR",
    "LIM",
    "LPB",
    "MAD",
    "MAO",
    "MCO",
    "MDE",
    "MEX",
    "MIA",
    "MTR",
    "MVD",
    "NVA",
    "ORD",
    "PEI",
    "PPN",
    "PSO",
    "PTY",
    "PUJ",
    "RCH",
    "SAL",
    "SCL",
    "SDQ",
    "SJO",
    "SJU",
    "SMR",
    "TQO",
    "UIB",
    "UIO",
    "VUP",
    "VVC",
    "VVI",
    "XPL",
    "YUL",
    "YYZ",
    "TPA",
    "BSB",
]
FECHA_INICIO = "2026-02-01"
FECHA_FIN = "2026-03-31"  # inclusive
IDA_VUELTA = False  # True -> ida y vuelta; False -> solo ida

MIN_DELAY_BETWEEN_RESULTS_MS = 2000
MAX_DELAY_BETWEEN_RESULTS_MS = 5000
DEST_COOLDOWN_RANGE_MS = (11000, 18000)
LONG_PAUSE_EVERY = 15  # cada N búsquedas realiza un descanso largo
LONG_PAUSE_SECONDS = 40
DATE_COOLDOWN_SECONDS = 10
CAPTCHA_BACKOFF_SECONDS = 180  # descanso tras resolver captcha
CAPTCHA_BOOST_SEARCHES = 20  # destinos con cooldown alto tras captcha
CAPTCHA_DEST_COOLDOWN_RANGE_MS = (30000, 60000)
PROGRESS_PATH = Path("avianca_scrape_progress.json")
LOCAL_PARQUET_PATH = Path("avianca_busquedas_local.parquet")
RESUME_FROM_DATE = None  # e.g. "2026-01-15" para retomar desde cierta fecha
SAVE_PROGRESS_EVERY = 5
SHUFFLE_DESTINOS = False
MAX_NAVIGATION_RETRIES = 3
NAVIGATION_RETRY_DELAY_SECONDS = 45

# Parquet en OneLake / Fabric
PARQUET_PATH = (
    "abfss://<container>@onelake.dfs.fabric.microsoft.com/"
    "<workspaceid>/Files/avianca_busquedas.parquet"
)

# -------- HELPERS --------
def daterange(start_date, end_date):
    cur = start_date
    while cur <= end_date:
        yield cur
        cur += timedelta(days=1)


def normalize_ts(s):
    try:
        # Avianca suele devolver ISO 8601 con Z
        s = s.replace("Z", "")
        return datetime.fromisoformat(s)
    except Exception:
        return None


def extract_from_journeys(payload_json):
    """Extrae dicts normalizados de la respuesta de journeys/schedules."""
    rows = []
    for jpr in payload_json.get("journeyPriceResponses", []):
        for sch in jpr.get("schedules", []):
            sch_date = sch.get("date")
            availability = sch.get("availability")
            for jy in sch.get("journeys", []):
                origin = jy.get("origin")
                dest = jy.get("destination")
                fares = jy.get("fares", [])
                total_amt = fares[0].get("totalAmount") if fares else None
                available_seats = fares[0].get("availableSeats") if fares else None

                segments = jy.get("segments", [])
                stops = max(0, len(segments) - 1)
                for seg in segments:
                    tr = seg.get("transport", {}) or {}
                    rows.append(
                        {
                            "fecha_programada": sch_date,
                            "origen": origin,
                            "destino": dest,
                            "num_vuelo": tr.get("number"),
                            "matricula": tr.get("registration"),
                            "capacidad": tr.get("capacity"),
                            "std": seg.get("std"),
                            "sta": seg.get("sta"),
                            "duracion": seg.get("duration"),
                            "precio_total": total_amt,
                            "sillas_disponibles": available_seats,
                            "stops": stops,
                            "availability": availability,
                            "raw_cabin": jy.get("cabin"),
                        }
                    )
    return rows


def load_progress():
    if not PROGRESS_PATH.exists():
        return {}
    try:
        return json.loads(PROGRESS_PATH.read_text())
    except Exception:
        return {}


def save_progress(progress):
    PROGRESS_PATH.write_text(json.dumps(progress, indent=2, ensure_ascii=False))


def write_checkpoint_parquet(rows):
    if not rows:
        return
    df_tmp = pd.DataFrame(rows)
    if df_tmp.empty:
        return
    df_tmp = df_tmp.drop_duplicates(
        subset=[
            "num_vuelo",
            "origen",
            "destino",
            "std",
            "sta",
            "fecha_programada",
        ]
    )
    try:
        df_tmp.to_parquet(LOCAL_PARQUET_PATH, index=False)
        print(f"Checkpoint parquet actualizado ({len(df_tmp)} filas)")
    except Exception as exc:
        print(f"Advertencia: no se pudo actualizar el parquet: {exc}")


async def goto_with_retry(page, url, wait_until="load", timeout=60000, max_attempts=4):
    """Intenta cargar una URL varias veces degradando espera y desactivando HTTP/2."""
    last_exc = None
    for attempt in range(1, max_attempts + 1):
        current_wait = wait_until if attempt == 1 else "domcontentloaded"
        try:
            return await page.goto(url, wait_until=current_wait, timeout=timeout)
        except Error as exc:
            last_exc = exc
            if (
                ("ERR_HTTP2_PROTOCOL_ERROR" in str(exc) or "INTERNAL_ERROR" in str(exc))
                and attempt < max_attempts
            ):
                await page.wait_for_timeout(3000)
                continue
            raise
        except TimeoutError as exc:
            last_exc = exc
            if attempt < max_attempts:
                await page.wait_for_timeout(3000)
                continue
            raise
    if last_exc:
        raise last_exc

def build_search_url(origin, dest, date_out, ida_vuelta=False, date_back=None):
    """Construye la URL del flujo de resultados con los parámetros necesarios."""
    from urllib.parse import urlencode

    params = {
        "origin1": origin,
        "destination1": dest,
        "departure1": date_out.strftime("%Y-%m-%d"),
        "adt1": 1,
        "tng1": 0,
        "chd1": 0,
        "inf1": 0,
        "currency": "COP",
        "posCode": "CO",
        "tripType": "R" if ida_vuelta and date_back else "O",
        "lang": "es",
        "searchType": "normal",
    }
    if ida_vuelta and date_back:
        params.update(
            {
                "origin2": dest,
                "destination2": origin,
                "departure2": date_back.strftime("%Y-%m-%d"),
                "adt2": 1,
                "tng2": 0,
                "chd2": 0,
                "inf2": 0,
            }
        )
    return f"{BOOK_URL}?{urlencode(params)}"

async def resolve_booking_frame(page, retries=20, delay=1.0):
    """Espera a que el iframe del formulario esté listo y lo devuelve."""
    for _ in range(retries):
        try:
            await page.wait_for_selector("iframe", timeout=1000)
        except Exception:
            pass
        for frame in page.frames:
            frame_url = (frame.url or "").lower()
            if any(token in frame_url for token in ("booking", "fare", "select")):
                return frame
            try:
                title = (await frame.title()) if hasattr(frame, "title") else ""
            except Exception:
                title = ""
            if any(token in (title or "").lower() for token in ("booking", "select", "fare")):
                return frame
        await asyncio.sleep(delay)
    return page

async def ensure_booking_ready(frame):
    """Garantiza que las entradas del formulario estén visibles antes de interactuar."""
    candidate_inputs = [
        "input[placeholder*='Origen']",
        "input[placeholder*='From']",
        "input[name*='from']",
        "input[id*='from']",
    ]
    for _ in range(15):
        for selector in candidate_inputs:
            try:
                handle = await frame.wait_for_selector(selector, timeout=1000, state="visible")
            except Exception:
                handle = None
            if handle:
                return handle
        await asyncio.sleep(0.5)
    return None

async def wait_if_verification(page, *, timeout_minutes=20):
    """Si Avianca muestra verificación, espera a que el usuario la complete."""
    try:
        has_banner = (await page.locator("text=Verificación necesaria").count()) > 0
        has_robot = (await page.locator("text=No soy un robot").count()) > 0
    except Exception:
        return False
    if not (has_banner or has_robot):
        return False
    print("[!] Se detectó verificación (captcha). Marca 'No soy un robot' y pulsa 'Proceder' en la ventana de Chrome.")
    try:
        await page.wait_for_function(
            "() => !document.body || (!document.body.innerText.includes('Verificación necesaria') && !document.body.innerText.includes('No soy un robot'))",
            timeout=timeout_minutes * 60 * 1000,
        )
        print("[ok] Verificación completada, continúo...")
        return True
    except TimeoutError:
        print(f"[!] Timeout esperando verificación ({timeout_minutes} min).")
        return True

async def do_search(page, origin, dest, date_out, ida_vuelta=False, date_back=None):
    """Navega directo a la URL de resultados para evitar rellenar el formulario."""
    target_url = build_search_url(origin, dest, date_out, ida_vuelta=ida_vuelta, date_back=date_back)
    for nav_attempt in range(1, MAX_NAVIGATION_RETRIES + 1):
        try:
            await goto_with_retry(page, target_url, wait_until="domcontentloaded", timeout=60000)
            break
        except Error as exc:
            msg = str(exc)
            if "ERR_ABORTED" in msg or "105.1" in msg:
                print(f"Sin disponibilidad: {origin}->{dest} {date_out.isoformat()} (aviso 105.1)")
                return False
            raise
        except TimeoutError as exc:
            if nav_attempt >= MAX_NAVIGATION_RETRIES:
                print(f"Timeout navegando {origin}->{dest} {date_out.isoformat()} tras {nav_attempt} intentos: {exc}")
                return False
            delay = NAVIGATION_RETRY_DELAY_SECONDS * nav_attempt
            print(f"Timeout navegando {origin}->{dest} {date_out.isoformat()} (intento {nav_attempt}/{MAX_NAVIGATION_RETRIES}); reintento en {delay}s")
            await page.wait_for_timeout(delay * 1000)
    else:
        return False

    # Acepta cookies si aparece algún banner sobrepuesto
    try:
        await page.get_by_role("button", name=re.compile("Aceptar|Accept", re.I)).click(timeout=3000)
    except Exception:
        pass

    captcha_seen = await wait_if_verification(page)
    if captcha_seen:
        await goto_with_retry(page, target_url, wait_until="domcontentloaded", timeout=60000)
        if CAPTCHA_BACKOFF_SECONDS:
            print(f"[captcha] descanso {CAPTCHA_BACKOFF_SECONDS}s para reducir frecuencia...")
            await page.wait_for_timeout(CAPTCHA_BACKOFF_SECONDS * 1000)

    await page.wait_for_timeout(2500)
    return captcha_seen





async def scrape_flights(origin, destinos, start_date, end_date, ida_vuelta=False):
    all_rows = []
    if LOCAL_PARQUET_PATH.exists():
        try:
            df_existente = pd.read_parquet(LOCAL_PARQUET_PATH)
            if not df_existente.empty:
                all_rows.extend(df_existente.to_dict("records"))
                print(f"Se cargaron {len(all_rows)} filas previas desde {LOCAL_PARQUET_PATH}.")
        except Exception as exc:
            print(f"Advertencia: no se pudo leer {LOCAL_PARQUET_PATH}: {exc}")

    async with async_playwright() as p:
        context = await p.chromium.launch_persistent_context(
            "./perfil_avianca",
            executable_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-http2",
            ],
        )

        progress = load_progress()

        try:
            page = context.pages[0] if context.pages else await context.new_page()

            async def _connection_close(route):
                await route.continue_(headers={**route.request.headers, "Connection": "close"})

            await context.route("**/*", _connection_close)

            await goto_with_retry(page, BASE_URL + "?poscode=CO", wait_until="load", timeout=60000)
            await page.wait_for_timeout(1500)

            try:
                await page.get_by_role("button", name=re.compile("Aceptar|Accept", re.I)).click(timeout=4000)
            except Exception:
                pass

            captcha_boost_remaining = CAPTCHA_BOOST_SEARCHES if await wait_if_verification(page) else 0

            captured = []

            async def handle_response(resp):
                try:
                    if resp.request.resource_type not in ("xhr", "fetch"):
                        return
                    if resp.status != 200:
                        return
                    data = await resp.json()
                except Exception:
                    return
                if not isinstance(data, dict):
                    return
                if "journeyPriceResponses" not in data:
                    return
                rows = extract_from_journeys(data)
                if rows:
                    captured.extend(rows)

            page.on("response", lambda resp: asyncio.create_task(handle_response(resp)))

            start_dt = datetime.strptime(start_date, "%Y-%m-%d").date()
            end_dt = datetime.strptime(end_date, "%Y-%m-%d").date()

            dates = list(daterange(start_dt, end_dt))
            if RESUME_FROM_DATE:
                resume_dt = datetime.strptime(RESUME_FROM_DATE, "%Y-%m-%d").date()
                dates = [d for d in dates if d >= resume_dt]

            search_counter = 0

            eta_state = {'total': 0, 'done': 0, 'ema_s': None, 'start': time.perf_counter()}

            for dd in dates:
                dd_iso = dd.isoformat()
                day = progress.get(dd_iso, {}) or {}
                for dst in destinos:
                    if day.get(dst) is None:
                        eta_state['total'] += 1

            def fmt_seconds(sec):
                sec = int(max(0, sec))
                h, rem = divmod(sec, 3600)
                m, s = divmod(rem, 60)
                if h:
                    return f'{h}h {m:02d}m'
                if m:
                    return f'{m}m {s:02d}s'
                return f'{s}s'

            def maybe_print_eta(last_elapsed_s):
                if eta_state['total'] <= 0:
                    return
                eta_state['done'] += 1
                alpha = 0.2
                ema = eta_state['ema_s']
                ema = last_elapsed_s if ema is None else (alpha * last_elapsed_s + (1 - alpha) * ema)
                eta_state['ema_s'] = ema
                remaining = max(0, eta_state['total'] - eta_state['done'])
                eta_seconds = ema * remaining
                eta_end = datetime.now() + timedelta(seconds=eta_seconds)
                elapsed_total = time.perf_counter() - eta_state['start']
                done = eta_state['done']
                total = eta_state['total']
                print(f'[ETA] {done}/{total} | avg {ema:.1f}s | restante ~{fmt_seconds(eta_seconds)} | fin ~{eta_end:%Y-%m-%d %H:%M} | elapsed {fmt_seconds(elapsed_total)}')

            total = eta_state['total']
            if total > 0:
                print(f'[ETA] búsquedas pendientes: {total}')
            else:
                print('[ETA] no hay búsquedas pendientes en este rango (según progress).')

            for d in dates:
                date_iso = d.isoformat()
                destino_sequence = random.sample(destinos, len(destinos)) if SHUFFLE_DESTINOS else list(destinos)

                print(f"===== Fecha {date_iso} =====")
                for dest in destino_sequence:
                    already_done = progress.get(date_iso, {}).get(dest)
                    if already_done is not None:
                        print(f"{origin}->{dest} {date_iso}: ya registrado ({already_done} vuelos) — se omite")
                        continue

                    captured.clear()
                    dest_t0 = time.perf_counter()
                    attempt_count = 0
                    while attempt_count <= MAX_NAVIGATION_RETRIES:
                        captcha_seen = await do_search(page, origin, dest, d, ida_vuelta=ida_vuelta)
                        if captcha_seen:
                            captcha_boost_remaining = max(captcha_boost_remaining, CAPTCHA_BOOST_SEARCHES)
                        await page.wait_for_timeout(random.randint(MIN_DELAY_BETWEEN_RESULTS_MS, MAX_DELAY_BETWEEN_RESULTS_MS))

                        batch = captured.copy()
                        flights_count = len(batch)
                        if flights_count == 0:
                            attempt_count += 1
                            if attempt_count > MAX_NAVIGATION_RETRIES:
                                print(f"{origin}->{dest} {date_iso}: sin vuelos tras {attempt_count} intentos (posible aviso 105.1)")
                                break
                            retry_delay = NAVIGATION_RETRY_DELAY_SECONDS * attempt_count
                            print(f"{origin}->{dest} {date_iso}: sin vuelos, reintento {attempt_count}/{MAX_NAVIGATION_RETRIES} en {retry_delay}s")
                            captured.clear()
                            await page.wait_for_timeout(retry_delay * 1000)
                            continue
                        else:
                            print(f"{origin}->{dest} {date_iso}: {flights_count} vuelos capturados")
                            break

                    if not captured:
                        progress.setdefault(date_iso, {})[dest] = 0
                        maybe_print_eta(time.perf_counter() - dest_t0)
                        continue

                    for r in batch:
                        r["origen_solicitado"] = origin
                        r["destino_solicitado"] = dest
                        r["fecha_busqueda"] = date_iso
                        r["std_dt"] = normalize_ts(r.get("std") or "")
                        r["sta_dt"] = normalize_ts(r.get("sta") or "")
                        r["fecha_programada_dt"] = normalize_ts(r.get("fecha_programada") or "")

                    all_rows.extend(batch)
                    captured.clear()

                    progress.setdefault(date_iso, {})[dest] = flights_count
                    search_counter += 1

                    if SAVE_PROGRESS_EVERY and search_counter % SAVE_PROGRESS_EVERY == 0:
                        save_progress(progress)
                        write_checkpoint_parquet(all_rows)

                    if search_counter % LONG_PAUSE_EVERY == 0:
                        print(f"Pausa preventiva de {LONG_PAUSE_SECONDS}s tras {search_counter} búsquedas...")
                        await page.wait_for_timeout(LONG_PAUSE_SECONDS * 1000)

                    if captcha_boost_remaining > 0:
                        cooldown_ms = random.randint(*CAPTCHA_DEST_COOLDOWN_RANGE_MS)
                        captcha_boost_remaining -= 1
                        print(f"Cooldown (modo captcha) entre destinos ({dest}): {cooldown_ms/1000:.1f}s")
                    else:
                        cooldown_ms = random.randint(*DEST_COOLDOWN_RANGE_MS)
                        print(f"Cooldown entre destinos ({dest}): {cooldown_ms/1000:.1f}s")
                    await page.wait_for_timeout(cooldown_ms)
                    maybe_print_eta(time.perf_counter() - dest_t0)

                if DATE_COOLDOWN_SECONDS:
                    print(f"Descanso por fecha {date_iso}: {DATE_COOLDOWN_SECONDS}s")
                    await page.wait_for_timeout(DATE_COOLDOWN_SECONDS * 1000)

                save_progress(progress)
                write_checkpoint_parquet(all_rows)

        finally:
            save_progress(progress)
            write_checkpoint_parquet(all_rows)
            await context.close()

    df = pd.DataFrame(all_rows).drop_duplicates(
        subset=["num_vuelo", "origen", "destino", "std", "sta", "fecha_programada"]
    )
    return df



In [16]:
# Ejecutar en celdas separadas para evitar conflictos con el event loop de Jupyter.
df = await scrape_flights(ORIGIN, DESTINOS, FECHA_INICIO, FECHA_FIN, ida_vuelta=IDA_VUELTA)
print("Vuelos obtenidos:", len(df))
# Guardar a Parquet (si usas Fabric con abfss, asegúrate de tener montado el FS)
# df.to_parquet(PARQUET_PATH, index=False)
# Como alternativa local:
df.to_parquet("avianca_busquedas_local.parquet", index=False)
print("OK.")


Se cargaron 16987 filas previas desde avianca_busquedas_local.parquet.
Checkpoint parquet actualizado (16987 filas)


TimeoutError: Page.goto: Timeout 60000ms exceeded.
Call log:
  - navigating to "https://www.avianca.com/es/?poscode=CO", waiting until "load"


In [None]:
mask = (
    pd.notna(df["std_dt"]) &
    pd.notna(df["sta_dt"]) &
    (df["sta_dt"] > df["std_dt"]) &
    df["num_vuelo"].ne("001")
)

df_limpio = df.loc[mask].copy()
print("Vuelos limpios:", len(df_limpio))
df_limpio.head()
