# LATAM - scraping de ofertas de vuelos

### Notas
- Instala Playwright y los navegadores: `pip install playwright` y luego `playwright install chromium`.
- Ejecuta las celdas en orden; la última dispara `await scrape_latam_flights(...)`.
- Ajusta rutas de guardado o progreso si trabajas en otro entorno.

In [5]:
import asyncio
import json
import random
import re
import uuid
from datetime import datetime, timedelta, time
from pathlib import Path
from urllib.parse import urlencode

import pandas as pd
from playwright.async_api import async_playwright, TimeoutError, Error


In [7]:

# -------- CONFIGURATION --------
# Site coordinates used to build the home and offers URLs.
BASE_URL = "https://www.latamairlines.com"
MARKET_PATH = "co/es"  # country/language
OFFERS_PATH = "ofertas-vuelos"
# NOTE(review): this user agent advertises Chrome 120 on Windows, while
# EXTRA_HEADERS below advertises Chrome 126 on macOS - confirm the mismatch
# is intended.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Route(s) to search: one origin, one or more destinations.
ORIGIN = "BOG"
DESTINOS = ["MDE"]

# Inclusive date window to search, formatted as YYYY-MM-DD.
FECHA_INICIO = "2025-10-29"
FECHA_FIN = "2025-10-29"
IDA_VUELTA = False  # experimental support
RETORNO_OFFSET_DIAS = 3  # used only when IDA_VUELTA is True
# Time of day stamped on the outbound/inbound query parameters.
SEARCH_TIME_UTC = time(hour=17, minute=0)

# Passenger mix and fare options sent as query parameters.
ADULTOS = 1
NINOS = 0
BEBES = 0
CABIN_CLASS = "Economy"
REDEMPTION = "false"
SORT_ORDER = "RECOMMENDED"

# Navigation and wait budgets.
REQUEST_TIMEOUT_MS = 90000
NAVIGATION_RETRIES = 3
NAVIGATION_RETRY_DELAY_SECONDS = 35
WAIT_FOR_DATA_SECONDS = 45
DOM_FALLBACK_WAIT_MS = 60000  # max wait for flight cards to appear in the DOM
DOM_FALLBACK_EXTRA_DELAY_MS = 3000  # extra pause before reading the DOM

# Pacing between searches and how often progress is saved.
MIN_DELAY_BETWEEN_DEST_MS = 4000
MAX_DELAY_BETWEEN_DEST_MS = 9000
DATE_COOLDOWN_SECONDS = 12
LONG_PAUSE_EVERY = 12
LONG_PAUSE_SECONDS = 35
SAVE_PROGRESS_EVERY = 4

# Output locations (relative to the working directory) and debug switches.
PROGRESS_PATH = Path("latam_scrape_progress.json")
LOCAL_PARQUET_PATH = Path("latam_busquedas_local.parquet")
RAW_PAYLOAD_DIR = Path("latam_payloads")
SAVE_RAW_PAYLOADS = True
SAVE_PRELOADED_STATE = True
PRINT_DEBUG_FETCH = True
DEBUG_FETCH_LIMIT = 20
SAVE_DEBUG_FETCHES = True
SAVE_PAGE_HTML = True

HEADLESS = False  # set to True to go back to headless mode
BROWSER_CHANNEL = "chrome"  # use None for the default Chromium
BROWSER_ARGS = [
    "--disable-blink-features=AutomationControlled",
    "--disable-dev-shm-usage",
    "--disable-extensions",
    "--disable-infobars",
    "--no-sandbox",
    "--disable-background-networking",
]

# Extra HTTP headers applied to the browser context.
EXTRA_HEADERS = {
    "sec-ch-ua": '"Google Chrome";v="126", "Not.A/Brand";v="8", "Chromium";v="126"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "accept-language": "es-ES,es;q=0.9,en;q=0.8",
    "upgrade-insecure-requests": "1",
}

# Scripts registered on the context as init scripts.
INIT_JS_SCRIPTS = [
    "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});",
]

# Session warm-up: load the home page first and try to dismiss the cookie banner.
PRE_LOAD_HOME = True
COOKIE_BANNER_PATTERNS = ["Aceptar", "Aceptar todo", "Aceptar todas", "Accept", "Allow all"]
COOKIE_DISMISS_SELECTORS = [
    "button[data-testid='cookie-banner-accept']",
    "button[data-testid='cookie-banner-accept-all']",
]
COOKIE_WAIT_SECONDS = 3
SESSION_WARMUP_DELAY_MS = 3000

STORAGE_STATE_PATH = Path("latam_storage_state.json")
SAVE_STORAGE_STATE = False  # set to True to refresh cookies when finished

RESUME_FROM_DATE = None  # example: "2025-10-30"
BROWSER_LOCALE = "es-CO"
VIEWPORT = {"width": 1280, "height": 720}

# Keywords that mark an endpoint as interesting.
# NOTE(review): presumably matched against response URLs - the consumer is
# outside this section, confirm there.
INTERESTING_ENDPOINT_KEYWORDS = [
    "air-offers",
    "flight-offers",
    "availability",
    "journeys",
    "search",
    "offers",
]


In [8]:

# -------- HELPERS --------

def ensure_parent(path: Path) -> None:
    """Create the directory that will hold ``path`` when it is missing.

    A ``None`` path is accepted and silently ignored.
    """
    if path is not None:
        folder = path.parent
        if not folder.exists():
            folder.mkdir(parents=True, exist_ok=True)


def load_progress(path: Path = PROGRESS_PATH) -> dict:
    """Load the saved progress JSON from ``path``.

    Returns an empty dict when ``path`` is falsy, does not exist, or cannot
    be read or parsed.
    """
    if path and path.exists():
        try:
            raw_text = path.read_text(encoding="utf-8")
            return json.loads(raw_text)
        except Exception:
            # Best effort: an unreadable progress file means "start fresh".
            return {}
    return {}


def save_progress(progress: dict, path: Path = PROGRESS_PATH) -> None:
    """Write ``progress`` to ``path`` as indented, key-sorted UTF-8 JSON.

    Does nothing when ``path`` is falsy; the parent directory is created
    first when needed.
    """
    if path:
        ensure_parent(path)
        serialized = json.dumps(progress, indent=2, sort_keys=True)
        path.write_text(serialized, encoding="utf-8")


def write_checkpoint_parquet(rows, path: Path = LOCAL_PARQUET_PATH) -> None:
    """Write the collected rows to a Parquet checkpoint file.

    Nothing is written when ``rows`` is empty. Rows sharing the same search
    date, requested route, itinerary id and scheduled departure are
    de-duplicated, keeping the last occurrence.
    """
    if not rows:
        return
    frame = pd.DataFrame(rows)
    if frame.empty:
        return
    dedupe_keys = [
        "fecha_busqueda",
        "origen_solicitado",
        "destino_solicitado",
        "itinerario_id",
        "salida_programada",
    ]
    frame = frame.drop_duplicates(subset=dedupe_keys, keep="last")
    ensure_parent(path)
    frame.to_parquet(path, index=False)


def daterange(start_date, end_date):
    """Yield every day from ``start_date`` through ``end_date`` inclusive.

    Yields nothing when ``end_date`` is earlier than ``start_date``.
    """
    span_days = (end_date - start_date).days
    for offset in range(span_days + 1):
        yield start_date + timedelta(days=offset)


def iso_with_time(date_obj, search_time=SEARCH_TIME_UTC):
    """Combine a date and a time of day into ``YYYY-MM-DDTHH:MM:SS.000Z``.

    ``search_time`` may be a ``datetime.time`` or an ``"HH"`` / ``"HH:MM"``
    string; a missing minute part defaults to 0.
    """
    if isinstance(search_time, str):
        hour_text, *rest = search_time.split(":")
        search_time = time(
            hour=int(hour_text),
            minute=int(rest[0]) if rest else 0,
        )
    combined = datetime.combine(date_obj, search_time)
    return combined.strftime("%Y-%m-%dT%H:%M:%S.000Z")


def build_offers_url(origin, destination, departure_date, *, return_date=None, market_path=MARKET_PATH):
    """Build the flight-offers search URL for one route and date.

    The trip type is ``"RT"`` with an extra ``inbound`` parameter when
    ``return_date`` is given, otherwise ``"OW"``. Each call embeds a freshly
    generated ``exp_id`` UUID.
    """
    query = {
        "origin": origin,
        "destination": destination,
        "outbound": iso_with_time(departure_date, SEARCH_TIME_UTC),
        "adt": ADULTOS,
        "chd": NINOS,
        "inf": BEBES,
        "trip": "RT" if return_date else "OW",
        "cabin": CABIN_CLASS,
        "redemption": REDEMPTION,
        "sort": SORT_ORDER,
        "exp_id": str(uuid.uuid4()),
    }
    if return_date:
        # Appended last so the parameter order matches a one-way URL plus inbound.
        query["inbound"] = iso_with_time(return_date, SEARCH_TIME_UTC)
    endpoint = f"{BASE_URL}/{market_path}/{OFFERS_PATH}"
    return f"{endpoint}?{urlencode(query)}"


def coerce_float(value):
    """Convert a number or a formatted amount string to ``float``.

    Strings may carry currency symbols, spaces and grouping separators.
    Separator handling:

    * both ``,`` and ``.`` present: whichever appears last is the decimal
      mark and the other one is grouping (``"1.234,56"`` and ``"1,234.56"``
      both give ``1234.56``);
    * a single ``,``: treated as the decimal mark (``"12,5"`` -> ``12.5``);
    * several ``,`` or several ``.``: treated as grouping
      (``"1.234.567"`` -> ``1234567.0``);
    * a single ``.``: left as the decimal point.

    Returns ``None`` for ``None``, unsupported types, empty strings and
    strings that do not contain a parseable number.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value)
    if not isinstance(value, str):
        return None

    # Keep only digits, separators and the minus sign.
    cleaned = re.sub(r"[^0-9.,\-]", "", value)
    if not cleaned:
        return None

    commas = cleaned.count(",")
    dots = cleaned.count(".")
    if commas and dots:
        if cleaned.rfind(",") > cleaned.rfind("."):
            # European style: dots group thousands, comma is the decimal mark.
            cleaned = cleaned.replace(".", "").replace(",", ".")
        else:
            cleaned = cleaned.replace(",", "")
    elif commas == 1:
        cleaned = cleaned.replace(",", ".")
    elif commas > 1:
        cleaned = cleaned.replace(",", "")
    elif dots > 1:
        cleaned = cleaned.replace(".", "")

    try:
        return float(cleaned)
    except ValueError:
        return None


# Time-only ISO-8601 duration ("PT1H30M"); groups are hours, minutes, seconds.
DURATION_PATTERN = re.compile(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?")
# Same, with a mandatory leading day component ("P1DT2H", "P2D").
_DURATION_WITH_DAYS_PATTERN = re.compile(
    r"P(\d+)D(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?"
)


def parse_duration_minutes(value):
    """Convert a duration to whole minutes.

    Accepts a number (truncated with ``int``), a string of digits, or an
    ISO-8601 duration such as ``"PT1H30M"`` or ``"P1DT2H"`` (case
    insensitive). Seconds are rounded to the nearest minute, with 30 seconds
    or more rounding up.

    Returns ``None`` for ``None``, blank strings, unsupported types and
    strings in any other format.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return int(value)
    if not isinstance(value, str):
        return None

    stripped = value.strip().upper()
    if not stripped:
        return None
    if stripped.isdigit():
        return int(stripped)

    days_text = None
    match = DURATION_PATTERN.fullmatch(stripped)
    if match:
        hours_text, minutes_text, seconds_text = match.groups()
    else:
        match = _DURATION_WITH_DAYS_PATTERN.fullmatch(stripped)
        if not match:
            return None
        days_text, hours_text, minutes_text, seconds_text = match.groups()

    days = int(days_text or 0)
    hours = int(hours_text or 0)
    minutes = int(minutes_text or 0)
    seconds = int(seconds_text or 0)
    return (days * 24 + hours) * 60 + minutes + (1 if seconds >= 30 else 0)


def parse_datetime_candidate(value):
    """Pull a non-empty timestamp string out of ``value``.

    A string is returned stripped (or ``None`` when blank). A dict is probed
    recursively under a fixed list of keys, in order, and the first non-empty
    result wins. Anything else yields ``None``.
    """
    if not value:
        return None
    if isinstance(value, str):
        text = value.strip()
        return text if text else None
    if not isinstance(value, dict):
        return None
    probe_keys = ("iso", "isoString", "utc", "isoUtc", "dateTime", "value", "text", "formatted", "display")
    # The generator is lazy, so probing stops at the first key that resolves.
    resolved = (parse_datetime_candidate(value.get(key)) for key in probe_keys)
    return next((found for found in resolved if found), None)


def normalize_ts(value):
    """Parse an ISO-8601 timestamp into a ``datetime``.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing. Returns
    ``None`` for falsy input or anything ``datetime.fromisoformat`` rejects.
    """
    if not value:
        return None
    text = value
    if isinstance(text, str) and text.endswith("Z"):
        text = f"{text[:-1]}+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except Exception:
        return None
    return parsed


def get_airport_code(segment, role):
    """Return the airport code for ``role`` ("origin" / "destination").

    Lookup order:

    1. ``segment[role]`` as a dict, under "code", "iataCode", "iata" or
       "airportCode";
    2. the flat keys ``f"{role}Code"`` and ``f"{role}_code"``;
    3. ``segment[role]`` itself when it is a non-blank string (returned
       stripped), matching how the sibling helpers accept scalar nodes.
    """
    node = segment.get(role)
    if isinstance(node, dict):
        for key in ("code", "iataCode", "iata", "airportCode"):
            val = node.get(key)
            if val:
                return val
    flat = segment.get(f"{role}Code") or segment.get(f"{role}_code")
    if flat:
        return flat
    if isinstance(node, str) and node.strip():
        return node.strip()
    # Unchanged result when nothing usable was found.
    return flat


def get_carrier(segment, role="marketingCarrier"):
    """Return the airline code stored under ``role`` in ``segment``.

    Lookup order:

    1. ``segment[role]`` as a dict, under "code", "iataCode", "iata" or
       "carrierCode";
    2. the flat key ``f"{role}Code"``;
    3. ``segment[role]`` itself when it is a non-blank string (returned
       stripped), matching how the sibling helpers accept scalar nodes.

    Returns ``None`` when no code is found.
    """
    node = segment.get(role)
    if isinstance(node, dict):
        for key in ("code", "iataCode", "iata", "carrierCode"):
            val = node.get(key)
            if val:
                return val
    value = segment.get(f"{role}Code")
    if value:
        return value
    if isinstance(node, str) and node.strip():
        return node.strip()
    return None


def get_flight_number(segment):
    """Return the flight number of ``segment`` as a string, or ``None``.

    Several candidate keys are tried in order. A dict value is searched
    under "code", "number" and "value"; any other truthy value is
    stringified. As a last resort the carrier code is joined with
    "operatingFlightNumber" / "flightCode".
    """
    candidate_keys = ("flightNumber", "number", "operatingFlightNumber", "marketingFlightNumber", "flight")
    for key in candidate_keys:
        raw = segment.get(key)
        if isinstance(raw, dict):
            for nested_key in ("code", "number", "value"):
                nested = raw.get(nested_key)
                if nested:
                    return str(nested)
            # A dict without a usable entry: move on to the next candidate key.
            continue
        if raw:
            return str(raw)

    airline = get_carrier(segment, "marketingCarrier") or get_carrier(segment, "carrier")
    suffix = segment.get("operatingFlightNumber") or segment.get("flightCode")
    return f"{airline}{suffix}" if airline and suffix else None


def get_cabin(segment):
    """Return the cabin of ``segment``, or ``None`` when absent.

    Candidate keys are tried in order. A dict value is searched under
    "code", "name" and "value"; any other truthy value is returned as is.
    """
    for key in ("cabin", "cabinType", "cabinClass", "cabinName"):
        raw = segment.get(key)
        if isinstance(raw, dict):
            for nested_key in ("code", "name", "value"):
                nested = raw.get(nested_key)
                if nested:
                    return nested
            # Nothing usable inside this dict: try the next candidate key.
            continue
        if raw:
            return raw
    return None


def collect_segments(itinerary):
    """Gather the segment dicts of an itinerary into one flat list.

    Segments are read from the "segments" / "legs" lists of every entry
    under "bounds", "slices" and "journeys". When none are found there, the
    itinerary's own "segments" / "legs" list is used. Non-dict entries are
    skipped.
    """
    gathered = []
    for group_key in ("bounds", "slices", "journeys"):
        group = itinerary.get(group_key)
        if not isinstance(group, list):
            continue
        for entry in group:
            if isinstance(entry, dict):
                legs = entry.get("segments") or entry.get("legs") or []
            else:
                legs = []
            if isinstance(legs, list):
                gathered += [leg for leg in legs if isinstance(leg, dict)]
    if gathered:
        return gathered

    flat = itinerary.get("segments") or itinerary.get("legs")
    if isinstance(flat, list):
        gathered += [leg for leg in flat if isinstance(leg, dict)]
    return gathered


def extract_price_info(itinerary):
    """Heuristically extract price data from an itinerary.

    The "price" / "prices" subtree (or the whole itinerary when neither
    exists) is walked recursively:

    * ``currency``: the first currency / currencyCode encountered;
    * ``total``: the largest amount encountered anywhere in the subtree;
    * ``per_pax``: the amount of an adult passenger node, or the adult
      entry of a per-passenger breakdown.

    When no total is found, the first priced entry of "fares" is used. When
    no per-passenger price is found, the total is divided by ``ADULTOS``.

    Returns a dict with keys "currency", "total", "per_pax" and "raw" (the
    untouched "price" / "prices" value).
    """
    result = {"currency": None, "total": None, "per_pax": None, "raw": None}

    # Ids of dicts already visited, so a shared dict is processed only once.
    seen = set()

    def walk(node):
        if isinstance(node, (list, tuple)):
            for item in node:
                walk(item)
            return
        if not isinstance(node, dict):
            return
        node_id = id(node)
        if node_id in seen:
            return
        seen.add(node_id)

        currency = node.get("currency") or node.get("currencyCode")
        amount = node.get("amount") or node.get("total") or node.get("value") or node.get("totalAmount")
        if currency and not result["currency"]:
            result["currency"] = currency
        amount_value = coerce_float(amount)
        if amount_value is not None:
            if result["total"] is None or amount_value > result["total"]:
                result["total"] = amount_value
        passenger_type = node.get("type") or node.get("passengerType")
        # "type" is a generic key: it may hold a non-string value, which must
        # not be upper-cased.
        if (
            isinstance(passenger_type, str)
            and passenger_type.upper() in ("ADT", "ADULT", "PAX")
            and result["per_pax"] is None
        ):
            result["per_pax"] = amount_value
        for key in ("perPassenger", "perPassengerAmount", "totalByPassengerType"):
            value = node.get(key)
            if isinstance(value, dict):
                adt = value.get("ADT") or value.get("ADULT") or value.get("adult")
                if isinstance(adt, dict):
                    adt_amount = coerce_float(adt.get("amount") or adt.get("value"))
                    if adt_amount is not None:
                        result["per_pax"] = adt_amount
                    currency = adt.get("currency") or adt.get("currencyCode")
                    if currency and not result["currency"]:
                        result["currency"] = currency
                else:
                    adt_amount = coerce_float(adt)
                    if adt_amount is not None and result["per_pax"] is None:
                        result["per_pax"] = adt_amount
        for child in node.values():
            walk(child)

    walk(itinerary.get("price") or itinerary.get("prices") or itinerary)

    if result["total"] is None:
        fares = itinerary.get("fares")
        if isinstance(fares, list):
            for fare in fares:
                if not isinstance(fare, dict):
                    # Skip malformed entries instead of failing on .get().
                    continue
                amount_value = coerce_float(fare.get("amount") or fare.get("totalAmount"))
                if amount_value is not None:
                    result["total"] = amount_value
                    if not result["currency"]:
                        result["currency"] = fare.get("currency") or fare.get("currencyCode")
                    break

    if result["per_pax"] is None and result["total"] is not None and ADULTOS:
        result["per_pax"] = result["total"] / ADULTOS

    result["raw"] = itinerary.get("price") or itinerary.get("prices")
    return result


def extract_fare_brand(itinerary):
    """Return the fare brand of an itinerary, or ``None`` when unknown.

    The itinerary's own brand keys are checked first, then the brand keys of
    each dict in its "fares" list. Non-dict fare entries are skipped.
    """
    for key in ("fareBrand", "brandCode", "brandId", "brandName", "fareClass"):
        value = itinerary.get(key)
        if value:
            return value
    fares = itinerary.get("fares")
    if isinstance(fares, list):
        for fare in fares:
            if not isinstance(fare, dict):
                # Malformed entry: nothing to read from it.
                continue
            for key in ("brandCode", "brandId", "brandName", "fareBrand"):
                value = fare.get(key)
                if value:
                    return value
    return None


def find_itineraries(payload):
    """Collect every dict in ``payload`` that has a list under "bounds".

    The payload is traversed with an explicit stack (last in, first out).
    Matches are de-duplicated by "id" / "itineraryId" / "identifier", or by
    object identity when none of those is set.
    """
    found = []
    known_keys = set()
    pending = [payload]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            if isinstance(node.get("bounds"), list):
                declared_id = node.get("id") or node.get("itineraryId") or node.get("identifier")
                dedupe_key = declared_id or id(node)
                if dedupe_key not in known_keys:
                    known_keys.add(dedupe_key)
                    found.append(node)
            # Keep descending: itineraries may be nested inside other dicts.
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return found


def itinerary_to_row(itinerary, meta):
    """Flatten one journey-style itinerary into a single output row.

    ``meta`` supplies "search_date", "origin" and "destination". Returns
    ``None`` when the itinerary has no segments. Per-segment details are
    stored as a JSON string under "detalle_segmentos".
    """
    segments = collect_segments(itinerary)
    if not segments:
        return None

    def _when(seg, kind):
        # Accepts both the short key ("departure") and the long one ("departureDateTime").
        return parse_datetime_candidate(seg.get(kind) or seg.get(f"{kind}DateTime"))

    def _minutes_between(start_iso, end_iso):
        # Whole minutes between two timestamps, or None when either fails to parse.
        start_dt = normalize_ts(start_iso)
        end_dt = normalize_ts(end_iso)
        if start_dt and end_dt:
            return int((end_dt - start_dt).total_seconds() // 60)
        return None

    first_seg, last_seg = segments[0], segments[-1]
    departure_iso = _when(first_seg, "departure")
    arrival_iso = _when(last_seg, "arrival")

    segment_records = []
    flight_numbers = []
    cabins_seen = set()
    marketing_seen = set()
    operating_seen = set()

    for seg in segments:
        seg_dep = _when(seg, "departure")
        seg_arr = _when(seg, "arrival")
        duration_min = parse_duration_minutes(
            seg.get("duration") or seg.get("durationMinutes") or seg.get("durationInMinutes")
        )
        flight_number = get_flight_number(seg)
        marketing = get_carrier(seg, "marketingCarrier") or get_carrier(seg, "carrier")
        operating = get_carrier(seg, "operatingCarrier")
        cabin = get_cabin(seg)

        if flight_number:
            flight_numbers.append(flight_number)
        if marketing:
            marketing_seen.add(marketing)
        if operating:
            operating_seen.add(operating)
        if cabin:
            cabins_seen.add(cabin)

        # No explicit duration: derive it from the segment's own timestamps.
        if duration_min is None and seg_dep and seg_arr:
            duration_min = _minutes_between(seg_dep, seg_arr)

        segment_records.append(
            {
                "origen": get_airport_code(seg, "origin"),
                "destino": get_airport_code(seg, "destination"),
                "salida": seg_dep,
                "llegada": seg_arr,
                "numero_vuelo": flight_number,
                "operador_marketing": marketing,
                "operador_operating": operating,
                "cabina": cabin,
                "equipo": seg.get("equipment") or seg.get("aircraft") or seg.get("equipmentCode"),
                "duracion_min": duration_min,
            }
        )

    # Sum of the known segment durations; a zero sum counts as unknown.
    duration_total = sum((record.get("duracion_min") or 0) for record in segment_records) or None
    if duration_total is None and departure_iso and arrival_iso:
        duration_total = _minutes_between(departure_iso, arrival_iso)

    price_info = extract_price_info(itinerary)
    fare_brand = extract_fare_brand(itinerary)
    raw_price = price_info["raw"]

    return {
        "fecha_busqueda": meta["search_date"],
        "origen_solicitado": meta["origin"],
        "destino_solicitado": meta["destination"],
        "itinerario_id": itinerary.get("id") or itinerary.get("itineraryId") or itinerary.get("identifier"),
        "origen": get_airport_code(first_seg, "origin"),
        "destino": get_airport_code(last_seg, "destination"),
        "salida_programada": departure_iso,
        "llegada_programada": arrival_iso,
        "duracion_total_min": duration_total,
        "numero_segmentos": len(segment_records),
        "conexiones": max(0, len(segment_records) - 1),
        "cabinas": sorted(cabins_seen),
        "operadores_marketing": sorted(marketing_seen),
        "operadores_operating": sorted(operating_seen),
        "numeros_vuelo": flight_numbers,
        "precio_total": price_info["total"],
        "precio_por_pasajero": price_info["per_pax"],
        "moneda": price_info["currency"],
        "marca_tarifa": fare_brand,
        "detalle_segmentos": json.dumps(segment_records, ensure_ascii=True),
        "raw_price": json.dumps(raw_price, ensure_ascii=True) if raw_price is not None else None,
    }


def extract_rows_from_journey_payload(payload, meta):
    """Build one row per itinerary found in a journey-style payload.

    Itineraries that yield no row (``itinerary_to_row`` returned a falsy
    value) are left out.
    """
    candidates = (itinerary_to_row(itinerary, meta) for itinerary in find_itineraries(payload))
    return [row for row in candidates if row]


def extract_rows_from_bff_payload(payload, meta):
    """Build rows from a payload whose "content" is a list of offers.

    Each content item carries a "summary" (origin, destination, duration,
    brands) and an "itinerary" list of segments. One row is produced per
    brand of each item, all rows of an item sharing its segment details.
    Returns an empty list when "content" is not a list.
    """
    offers = payload.get("content")
    if not isinstance(offers, list):
        return []

    rows = []
    for offer in offers:
        summary = offer.get("summary") or {}
        legs = offer.get("itinerary") or []
        brands = summary.get("brands") or []
        origin_info = summary.get("origin") or {}
        destination_info = summary.get("destination") or {}

        departure_iso = origin_info.get("departure") or None
        arrival_iso = destination_info.get("arrival") or None
        duration_total = summary.get("duration")

        segment_records = []
        flight_numbers = []
        marketing_carriers = set()
        operating_carriers = set()
        cabins = set()

        for leg in legs:
            flight = leg.get("flight") or {}
            marketing = flight.get("airlineCode")
            operating = flight.get("operatingAirlineCode") or flight.get("flightOperator")
            number = flight.get("flightNumber")
            cabin = leg.get("cabinClass")

            if number is not None:
                number_text = str(number)
                # Prefix the airline code when known, e.g. "LA" + "4000".
                flight_numbers.append(f"{marketing}{number_text}" if marketing else number_text)
            if marketing:
                marketing_carriers.add(marketing)
            if operating:
                operating_carriers.add(operating)
            if cabin:
                cabins.add(cabin)

            segment_records.append(
                {
                    "origen": leg.get("origin"),
                    "destino": leg.get("destination"),
                    "salida": leg.get("departure"),
                    "llegada": leg.get("arrival"),
                    "numero_vuelo": number,
                    "operador_marketing": marketing,
                    "operador_operating": operating,
                    "cabina": cabin,
                    "equipo": leg.get("equipment"),
                    "duracion_min": leg.get("duration"),
                }
            )

        cabins_readable = sorted(cabins)

        # Fill gaps in the summary from the first / last segment.
        if segment_records:
            if not departure_iso:
                departure_iso = segment_records[0].get("salida")
            if not arrival_iso:
                arrival_iso = segment_records[-1].get("llegada")
            if duration_total is None:
                known = [
                    record.get("duracion_min")
                    for record in segment_records
                    if record.get("duracion_min") is not None
                ]
                if known and all(isinstance(item, (int, float)) for item in known):
                    duration_total = sum(int(item) for item in known)

        summary_operators = summary.get("flightOperators")
        if isinstance(summary_operators, list):
            marketing_carriers.update(op for op in summary_operators if op)

        fallback_origin = segment_records[0].get("origen") if segment_records else None
        fallback_destination = segment_records[-1].get("destino") if segment_records else None
        segment_count = len(segment_records) if segment_records else None
        connections = max(0, len(segment_records) - 1) if segment_records else None
        segments_json = json.dumps(segment_records, ensure_ascii=True)

        for brand in brands:
            price = brand.get("price") or {}
            cabin_label = (brand.get("cabin") or {}).get("label")
            if cabin_label:
                brand_cabins = sorted(set(cabins_readable) | {cabin_label})
            else:
                brand_cabins = cabins_readable

            offer_id = brand.get("offerId")
            if not offer_id:
                # Synthesize an id from the flight code (or the search) plus the brand.
                base_id = summary.get("flightCode") or f"{meta['search_date']}-{meta['origin']}-{meta['destination']}"
                offer_id = f"{base_id}-{brand.get('id') or len(rows)}"

            rows.append(
                {
                    "fecha_busqueda": meta["search_date"],
                    "origen_solicitado": meta["origin"],
                    "destino_solicitado": meta["destination"],
                    "itinerario_id": offer_id,
                    "origen": origin_info.get("iataCode") or fallback_origin,
                    "destino": destination_info.get("iataCode") or fallback_destination,
                    "salida_programada": departure_iso,
                    "llegada_programada": arrival_iso,
                    "duracion_total_min": duration_total,
                    "numero_segmentos": segment_count,
                    "conexiones": connections,
                    "cabinas": brand_cabins,
                    "operadores_marketing": sorted(marketing_carriers),
                    "operadores_operating": sorted(operating_carriers),
                    "numeros_vuelo": flight_numbers,
                    "precio_total": price.get("amount"),
                    "precio_por_pasajero": price.get("amount"),
                    "moneda": price.get("currency"),
                    "marca_tarifa": brand.get("brandText") or brand.get("id"),
                    "detalle_segmentos": segments_json,
                    "raw_price": json.dumps(brand, ensure_ascii=True),
                    "fuente": "json-latam",
                }
            )

    return rows


def extract_rows_from_payload(payload, meta):
    """Dispatch a captured payload to the matching row extractor(s).

    A truthy "content" entry goes through the content-list extractor. The
    journey extractor runs when "journeyPriceResponses" is truthy or when
    there is no "content" at all. Non-dict payloads yield no rows.
    """
    if not isinstance(payload, dict):
        return []
    rows = []
    has_content = bool(payload.get("content"))
    if has_content:
        rows.extend(extract_rows_from_bff_payload(payload, meta))
    if payload.get("journeyPriceResponses") or not has_content:
        rows.extend(extract_rows_from_journey_payload(payload, meta))
    return rows



# Clock time such as "6:05 p. m.", "6:05 PM" or 24-hour "18:05". Gaps are
# matched with \s so no-break spaces (U+00A0 / U+202F) are accepted too.
# Group 3 is the meridiem letter ("a" / "p") or None for 24-hour times.
DOM_TIME_PATTERN = re.compile(r"(\d{1,2}):(\d{2})(?:\s*([ap])\.?\s*m\b\.?)?", re.IGNORECASE)
# Both components are optional, so this pattern can match the empty string;
# callers must skip matches where neither group participated.
DOM_DURATION_PATTERN = re.compile(r"(?:(\d+)\s*h)?\s*(?:(\d+)\s*(?:m|min))?", re.IGNORECASE)


def parse_dom_time_iso(date_iso, time_text):
    """Turn a displayed clock time into ``"{date_iso}THH:MM:00"``.

    Accepts 12-hour times with an "a. m." / "p. m." (or "am" / "pm") marker
    and plain 24-hour times. Returns ``None`` when ``time_text`` is empty,
    holds no time, or the resulting hour/minute is out of range.
    """
    if not time_text:
        return None
    match = DOM_TIME_PATTERN.search(time_text.strip().lower())
    if not match:
        return None
    hour = int(match.group(1))
    minute = int(match.group(2))
    meridiem = match.group(3)
    if meridiem == "p" and hour != 12:
        hour += 12
    elif meridiem == "a" and hour == 12:
        hour = 0
    if hour > 23 or minute > 59:
        # e.g. "13:00 p. m." or "25:99": not a valid time of day.
        return None
    return f"{date_iso}T{hour:02d}:{minute:02d}:00"


def parse_dom_duration_minutes(text):
    """Parse a displayed duration such as "1 h 5 min" into minutes.

    The duration may appear anywhere in ``text``. Returns ``None`` when
    ``text`` is empty, holds no duration, or the duration is zero.
    """
    if not text:
        return None
    for match in DOM_DURATION_PATTERN.finditer(text):
        hours_text, minutes_text = match.groups()
        if hours_text is None and minutes_text is None:
            # Empty / whitespace-only match produced by the all-optional pattern.
            continue
        total = int(hours_text or 0) * 60 + int(minutes_text or 0)
        return total or None
    return None


async def extract_rows_from_dom(page, meta):
    """Fallback: scrape the information visible in the DOM when there is no JSON payload.

    Waits for flight cards (``[data-testid="wrapperBoundCard"]``) to appear,
    then reads departure/arrival time, duration and flight number from each
    card. One row is produced per fare card inside a flight card, or a
    single price-less row when the card has no fare cards.

    ``meta`` supplies "search_date", "origin" and "destination". Returns an
    empty list when no cards show up within the wait timeout or the card
    count cannot be read.
    """
    selector = '[data-testid="wrapperBoundCard"]'
    # DOM_FALLBACK_WAIT_MS wins when truthy; otherwise fall back to the generic data wait.
    wait_timeout = DOM_FALLBACK_WAIT_MS or (WAIT_FOR_DATA_SECONDS * 1000)
    try:
        await page.wait_for_selector(selector, timeout=wait_timeout)
    except TimeoutError:
        print(f"{meta['origin']}->{meta['destination']} {meta['search_date']}: DOM sin tarjetas tras {wait_timeout/1000:.1f}s")
        return []
    if DOM_FALLBACK_EXTRA_DELAY_MS:
        # Extra pause before reading the DOM.
        await page.wait_for_timeout(DOM_FALLBACK_EXTRA_DELAY_MS)

    cards = page.locator(selector)
    try:
        count = await cards.count()
    except Exception:
        return []
    if count == 0:
        print(f"{meta['origin']}->{meta['destination']} {meta['search_date']}: DOM sin tarjetas visibles")
        return []

    results = []

    async def _safe_text(locator, *, timeout=1500):
        # Text of the first matching element, stripped; None on any failure or blank text.
        try:
            handle = locator.first
            text = await handle.text_content(timeout=timeout)
        except Exception:
            return None
        if text is None:
            return None
        return text.strip() or None

    for idx in range(count):
        card = cards.nth(idx)
        departure_text = await _safe_text(card.locator('[data-testid="bound-card-departure-time"]'))
        arrival_text = await _safe_text(card.locator('[data-testid="bound-card-arrival-time"]'))
        duration_text = await _safe_text(card.locator('[data-testid="bound-card-duration"]'))
        flight_number_text = await _safe_text(card.locator('[data-testid="bound-card-flight-number"]'))

        # NOTE(review): both timestamps use the search date, so an arrival after
        # midnight would be stamped with the departure day - confirm if that matters.
        departure_iso = parse_dom_time_iso(meta['search_date'], departure_text)
        arrival_iso = parse_dom_time_iso(meta['search_date'], arrival_text)
        duration_minutes = parse_dom_duration_minutes(duration_text)

        # Synthetic id: the DOM exposes no itinerary identifier here, so the card index is used.
        base_id = f"dom-{meta['search_date']}-{meta['origin']}-{meta['destination']}-{idx}"
        numeros_vuelo = []
        if flight_number_text:
            numeros_vuelo = [flight_number_text.replace('\n', ' ').strip()]

        # Raw texts are kept so the parsed values can be audited later.
        dom_segment = {
            'origen': meta['origin'],
            'destino': meta['destination'],
            'salida_texto': departure_text,
            'llegada_texto': arrival_text,
            'duracion_texto': duration_text,
            'numero_vuelo_texto': flight_number_text,
        }

        # Row template shared by every fare of this card; price fields are filled per fare.
        base_row = {
            'fecha_busqueda': meta['search_date'],
            'origen_solicitado': meta['origin'],
            'destino_solicitado': meta['destination'],
            'itinerario_id': base_id,
            'origen': meta['origin'],
            'destino': meta['destination'],
            'salida_programada': departure_iso,
            'llegada_programada': arrival_iso,
            'duracion_total_min': duration_minutes,
            # A card with flight-number text is recorded as one segment with no connections.
            'numero_segmentos': 1 if numeros_vuelo else None,
            'conexiones': 0 if numeros_vuelo else None,
            'cabinas': [],
            'operadores_marketing': [],
            'operadores_operating': [],
            'numeros_vuelo': numeros_vuelo,
            'precio_total': None,
            'precio_por_pasajero': None,
            'moneda': None,
            'marca_tarifa': None,
            'detalle_segmentos': json.dumps([dom_segment], ensure_ascii=True),
            'raw_price': None,
            'fuente': 'dom',
        }

        fares = card.locator('[data-testid="fare-card"]')
        try:
            fares_count = await fares.count()
        except Exception:
            fares_count = 0

        if fares_count == 0:
            # No fare cards: keep the flight itself, without price data.
            results.append(base_row)
            continue

        for fare_idx in range(fares_count):
            fare = fares.nth(fare_idx)
            # Shallow copy: list values are shared between the rows of one card.
            row = base_row.copy()
            row['itinerario_id'] = f"{base_id}-fare{fare_idx}"
            fare_name = await _safe_text(fare.locator('[data-testid="fare-card-fare-name"]'))
            if not fare_name:
                fare_name = await _safe_text(fare.locator('[data-testid="fare-card-title"]'))
            price_text = await _safe_text(fare.locator('[data-testid="price-text"]'))
            currency_text = await _safe_text(fare.locator('[data-testid="price-currency"]'))
            row['marca_tarifa'] = fare_name
            row['moneda'] = currency_text
            price_value = coerce_float(price_text) if price_text else None
            # The same parsed price fills both the total and the per-passenger column.
            row['precio_total'] = price_value
            row['precio_por_pasajero'] = price_value
            dom_price = {
                'fare_name': fare_name,
                'price_text': price_text,
                'currency_text': currency_text,
            }
            row['raw_price'] = json.dumps(dom_price, ensure_ascii=True)
            results.append(row)

    return results


async def goto_with_retry(page, url, *, wait_until="networkidle", timeout=REQUEST_TIMEOUT_MS, retries=NAVIGATION_RETRIES):
    """Navigate ``page`` to ``url``, retrying on Playwright errors.

    ``retries`` is the total number of attempts. At least one attempt is
    always made, so a value of 0 or less no longer skips navigation
    silently. Between failed attempts the page waits
    ``NAVIGATION_RETRY_DELAY_SECONDS``.

    Raises the error of the last attempt when every attempt fails.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            await page.goto(url, wait_until=wait_until, timeout=timeout)
            return
        except (TimeoutError, Error) as exc:
            print(f"[goto] intento {attempt}/{attempts} fallo: {exc}")
            if attempt == attempts:
                raise
            await page.wait_for_timeout(NAVIGATION_RETRY_DELAY_SECONDS * 1000)


async def warmup_session(page):
    """Load the home page and accept cookies to initialize cookies/tokens.

    Skipped entirely when ``PRE_LOAD_HOME`` is false. A failure to load the
    home page is logged and ends the warm-up. The cookie banner is dismissed
    by button name first, then by CSS selector; every click is best effort.
    """
    if not PRE_LOAD_HOME:
        return
    home_url = f"{BASE_URL}/{MARKET_PATH}"
    try:
        await goto_with_retry(page, home_url, wait_until="domcontentloaded")
    except Exception as exc:
        print(f"[warmup] No se pudo cargar home: {exc}")
        return
    if COOKIE_WAIT_SECONDS:
        await page.wait_for_timeout(COOKIE_WAIT_SECONDS * 1000)

    # Try to close the cookie banner, first by accessible button name.
    dismissed = False
    for pattern in COOKIE_BANNER_PATTERNS:
        try:
            button = page.get_by_role("button", name=re.compile(pattern, re.IGNORECASE))
            await button.click(timeout=2000)
            print(f"[warmup] Banner cookies cerrado con pattern '{pattern}'")
            dismissed = True
            break
        except Exception:
            continue

    # Selectors are only tried when no button name matched.
    if not dismissed:
        for selector in COOKIE_DISMISS_SELECTORS:
            try:
                await page.click(selector, timeout=2000)
                print(f"[warmup] Banner cookies cerrado con selector {selector}")
                break
            except Exception:
                continue

    if SESSION_WARMUP_DELAY_MS:
        await page.wait_for_timeout(SESSION_WARMUP_DELAY_MS)


async def scrape_latam_flights(origin=ORIGIN, destinos=DESTINOS, fecha_inicio=FECHA_INICIO, fecha_fin=FECHA_FIN, ida_vuelta=IDA_VUELTA):
    """Scrape LATAM offers for each (date, destination) pair and return a DataFrame.

    For every date yielded by ``daterange`` between ``fecha_inicio`` and
    ``fecha_fin`` (trimmed by ``RESUME_FROM_DATE`` when set) and every
    destination, the offers page is opened and three sources are tried:

    1. XHR/fetch responses whose URL contains one of
       ``INTERESTING_ENDPOINT_KEYWORDS``.
    2. A preloaded state object read from well-known ``window`` globals.
    3. ``extract_rows_from_dom`` as a fallback when 1 and 2 yield no rows.

    Pairs already present in the progress file with a positive count are
    skipped; pairs recorded with 0 are retried. Progress and a Parquet
    checkpoint are written periodically, after each date, and at the end.

    Args:
        origin: Origin airport code.
        destinos: Iterable of destination airport codes.
        fecha_inicio: First search date, ``YYYY-MM-DD``.
        fecha_fin: Last search date, ``YYYY-MM-DD``.
        ida_vuelta: When true, a return date ``RETORNO_OFFSET_DIAS`` days after
            the outbound date is passed to ``build_offers_url``.

    Returns:
        A ``pandas.DataFrame`` with the captured rows. When non-empty it is
        de-duplicated and gains ``salida_dt`` / ``llegada_dt`` columns.

    Raises:
        ValueError: If a date string does not match ``%Y-%m-%d``.
    """
    start_dt = datetime.strptime(fecha_inicio, "%Y-%m-%d").date()
    end_dt = datetime.strptime(fecha_fin, "%Y-%m-%d").date()
    dates = list(daterange(start_dt, end_dt))
    if RESUME_FROM_DATE:
        resume_dt = datetime.strptime(RESUME_FROM_DATE, "%Y-%m-%d").date()
        dates = [d for d in dates if d >= resume_dt]

    all_rows = []
    progress = load_progress()

    # Debug fetch dumps are written to the same directory, so it must exist
    # even when raw payload saving itself is disabled.
    if SAVE_RAW_PAYLOADS or (PRINT_DEBUG_FETCH and SAVE_DEBUG_FETCHES):
        RAW_PAYLOAD_DIR.mkdir(parents=True, exist_ok=True)

    async with async_playwright() as playwright:
        browser_launch_kwargs = {"headless": HEADLESS}
        if BROWSER_ARGS:
            browser_launch_kwargs["args"] = BROWSER_ARGS
        if BROWSER_CHANNEL:
            browser_launch_kwargs["channel"] = BROWSER_CHANNEL
        browser = await playwright.chromium.launch(**browser_launch_kwargs)
        try:
            context_kwargs = {
                "user_agent": USER_AGENT,
                "locale": BROWSER_LOCALE,
                "viewport": VIEWPORT,
            }
            # Reuse a previously saved session (cookies/local storage) if present.
            if STORAGE_STATE_PATH and STORAGE_STATE_PATH.exists():
                try:
                    context_kwargs["storage_state"] = json.loads(STORAGE_STATE_PATH.read_text(encoding="utf-8"))
                    print(f"Storage state cargado desde {STORAGE_STATE_PATH}")
                except Exception as exc:
                    print(f"No se pudo cargar storage state: {exc}")
            context = await browser.new_context(**context_kwargs)
            if EXTRA_HEADERS:
                await context.set_extra_http_headers(EXTRA_HEADERS)
            if INIT_JS_SCRIPTS:
                for script in INIT_JS_SCRIPTS:
                    try:
                        await context.add_init_script(script)
                    except Exception as exc:
                        print(f"No se pudo registrar init script: {exc}")
            try:
                page = await context.new_page()
                await warmup_session(page)
                search_counter = 0

                for current_date in dates:
                    date_iso = current_date.isoformat()
                    print(f"===== Fecha {date_iso} =====")

                    for dest in destinos:
                        already = progress.get(date_iso, {}).get(dest)
                        if already is not None and already > 0:
                            print(f"{origin}->{dest} {date_iso}: ya registrado ({already} itinerarios)")
                            continue
                        if already == 0:
                            print(f"{origin}->{dest} {date_iso}: registrado previamente con 0 itinerarios, reintentando")

                        return_date = None
                        if ida_vuelta:
                            return_date = current_date + timedelta(days=RETORNO_OFFSET_DIAS)

                        search_url = build_offers_url(origin, dest, current_date, return_date=return_date)

                        # Per-search capture state shared with the response listener.
                        payloads = []
                        response_event = asyncio.Event()

                        debug_count = 0

                        async def handle_response(response):
                            """Collect JSON bodies of interesting XHR/fetch responses."""
                            nonlocal debug_count
                            try:
                                resource_type = response.request.resource_type
                            except AttributeError:
                                return
                            if resource_type not in ("xhr", "fetch"):
                                return
                            url_lower = response.url.lower()
                            keyword_match = any(keyword in url_lower for keyword in INTERESTING_ENDPOINT_KEYWORDS)
                            # Log (and optionally dump) a limited number of
                            # non-matching calls to help discover new endpoints.
                            if PRINT_DEBUG_FETCH and not keyword_match and debug_count < DEBUG_FETCH_LIMIT:
                                debug_count += 1
                                print(f"[debug] {response.status} {response.url}")
                                if SAVE_DEBUG_FETCHES:
                                    try:
                                        text = await response.text()
                                    except Exception:
                                        text = None
                                    if text:
                                        dump_path = RAW_PAYLOAD_DIR / f"debug_{origin}_{dest}_{date_iso}_{debug_count}.txt"
                                        dump_path.write_text(text, encoding="utf-8", errors="ignore")
                                        print(f"[debug] respuesta guardada en {dump_path}")
                            if not keyword_match:
                                return
                            if response.status != 200:
                                return
                            data = None
                            try:
                                data = await response.json()
                            except Exception:
                                # Fall back to parsing the raw text body.
                                try:
                                    text = await response.text()
                                    data = json.loads(text)
                                except Exception:
                                    pass
                            if data is None:
                                return
                            # Keep only one payload per URL.
                            if not any(p.get("url") == response.url for p in payloads):
                                payloads.append({"url": response.url, "data": data})
                            if not response_event.is_set():
                                response_event.set()

                        page.on("response", handle_response)

                        html_dump_path = None
                        try:
                            await goto_with_retry(page, search_url, wait_until="domcontentloaded")
                            try:
                                await asyncio.wait_for(response_event.wait(), timeout=WAIT_FOR_DATA_SECONDS)
                            except asyncio.TimeoutError:
                                print(f"{origin}->{dest} {date_iso}: sin respuesta JSON util en {WAIT_FOR_DATA_SECONDS}s")
                            if not payloads:
                                # Second chance: wait explicitly for any OK
                                # response from an interesting endpoint.
                                awaited_response = None
                                try:
                                    awaited_response = await page.wait_for_response(
                                        lambda resp: resp.ok and any(keyword in resp.url.lower() for keyword in INTERESTING_ENDPOINT_KEYWORDS),
                                        timeout=max(WAIT_FOR_DATA_SECONDS * 1000, DOM_FALLBACK_WAIT_MS or 0),
                                    )
                                except Exception:
                                    awaited_response = None
                                if awaited_response:
                                    try:
                                        data = await awaited_response.json()
                                    except Exception:
                                        try:
                                            text = await awaited_response.text()
                                            data = json.loads(text)
                                        except Exception:
                                            data = None
                                    if data is not None:
                                        if not any(p.get("url") == awaited_response.url for p in payloads):
                                            payloads.append({"url": awaited_response.url, "data": data})
                                        if not response_event.is_set():
                                            response_event.set()
                            await page.wait_for_timeout(2000)
                            if SAVE_RAW_PAYLOADS and SAVE_PAGE_HTML:
                                try:
                                    html = await page.content()
                                    html_dump_path = RAW_PAYLOAD_DIR / f"page_{origin}_{dest}_{date_iso}_{uuid.uuid4().hex}.html"
                                    html_dump_path.write_text(html, encoding="utf-8", errors="ignore")
                                    print(f"[debug] HTML guardado en {html_dump_path}")
                                except Exception as exc:
                                    print(f"[debug] no se guardo HTML: {exc}")
                        except (TimeoutError, Error) as exc:
                            print(f"{origin}->{dest} {date_iso}: error de navegacion {exc}")
                            if SAVE_RAW_PAYLOADS and SAVE_PAGE_HTML and html_dump_path is None:
                                try:
                                    html = await page.content()
                                    html_dump_path = RAW_PAYLOAD_DIR / f"page_{origin}_{dest}_{date_iso}_{uuid.uuid4().hex}.html"
                                    html_dump_path.write_text(html, encoding="utf-8", errors="ignore")
                                    print(f"[debug] HTML (tras error) guardado en {html_dump_path}")
                                except Exception:
                                    pass
                        finally:
                            # Detach the listener so the next search starts clean.
                            remover = getattr(page, "off", None) or getattr(page, "remove_listener", None)
                            if remover:
                                remover("response", handle_response)

                        candidate_payloads = [p["data"] for p in payloads]

                        # Reading window globals can fail with a Playwright
                        # Error; treat that as "no preloaded state" rather
                        # than aborting the whole run.
                        try:
                            preloaded_state = await page.evaluate(
                                """() => {
                                if (typeof window === 'undefined') { return null; }
                                const candidates = [
                                  window.__LATAM_INITIAL_STATE__,
                                  window.__NUXT__,
                                  window.__INITIAL_STATE__,
                                  window.__PRELOADED_STATE__,
                                  window.__STATE__
                                ];
                                for (const item of candidates) {
                                  if (item) {
                                    try { return JSON.stringify(item); } catch (e) {}
                                  }
                                }
                                return null;
                            }"""
                            )
                        except Error as exc:
                            print(f"{origin}->{dest} {date_iso}: no se pudo leer estado precargado ({exc})")
                            preloaded_state = None
                        if preloaded_state:
                            if SAVE_RAW_PAYLOADS and SAVE_PRELOADED_STATE:
                                dump_path = RAW_PAYLOAD_DIR / f"preloaded_{origin}_{dest}_{date_iso}_{uuid.uuid4().hex}.json"
                                dump_path.write_text(preloaded_state, encoding="utf-8")
                                print(f"Estado precargado almacenado en {dump_path}")
                            try:
                                candidate_payloads.append(json.loads(preloaded_state))
                            except json.JSONDecodeError as exc:
                                print(f"{origin}->{dest} {date_iso}: preloaded state no parseable ({exc})")

                        meta = {
                            "origin": origin,
                            "destination": dest,
                            "search_date": date_iso,
                        }

                        rows = []
                        for payload in candidate_payloads:
                            try:
                                rows.extend(extract_rows_from_payload(payload, meta))
                            except Exception as exc:
                                print(f"{origin}->{dest} {date_iso}: error extrayendo datos: {exc}")

                        if rows:
                            for r in rows:
                                r.setdefault("fuente", "json")
                            all_rows.extend(rows)
                            progress.setdefault(date_iso, {})[dest] = len(rows)
                            print(f"{origin}->{dest} {date_iso}: {len(rows)} itinerarios capturados")
                        else:
                            try:
                                dom_rows = await extract_rows_from_dom(page, meta)
                            except Error as exc:
                                print(f"{origin}->{dest} {date_iso}: DOM fallback fallo ({exc})")
                                dom_rows = []
                            if dom_rows:
                                all_rows.extend(dom_rows)
                                progress.setdefault(date_iso, {})[dest] = len(dom_rows)
                                print(f"{origin}->{dest} {date_iso}: {len(dom_rows)} itinerarios capturados via DOM")
                            else:
                                # Record 0 so the pair is retried on the next run.
                                progress.setdefault(date_iso, {})[dest] = 0
                                print(f"{origin}->{dest} {date_iso}: sin itinerarios detectados")
                                if SAVE_RAW_PAYLOADS and payloads:
                                    dump_path = RAW_PAYLOAD_DIR / f"payload_{origin}_{dest}_{date_iso}_{uuid.uuid4().hex}.json"
                                    dump_path.write_text(json.dumps(payloads, indent=2, ensure_ascii=True), encoding="utf-8")
                                    print(f"Payload almacenado en {dump_path}")

                        search_counter += 1

                        if SAVE_PROGRESS_EVERY and search_counter % SAVE_PROGRESS_EVERY == 0:
                            save_progress(progress)
                            write_checkpoint_parquet(all_rows)

                        # Guarded like SAVE_PROGRESS_EVERY so that 0 disables
                        # the long pause instead of dividing by zero.
                        if LONG_PAUSE_EVERY and search_counter % LONG_PAUSE_EVERY == 0:
                            print(f"Pausa preventiva de {LONG_PAUSE_SECONDS}s tras {search_counter} busquedas...")
                            await page.wait_for_timeout(LONG_PAUSE_SECONDS * 1000)

                        # Randomized delay between destinations.
                        cooldown_ms = random.randint(MIN_DELAY_BETWEEN_DEST_MS, MAX_DELAY_BETWEEN_DEST_MS)
                        await page.wait_for_timeout(cooldown_ms)

                    if DATE_COOLDOWN_SECONDS:
                        print(f"Descanso post-fecha {date_iso}: {DATE_COOLDOWN_SECONDS}s")
                        await page.wait_for_timeout(DATE_COOLDOWN_SECONDS * 1000)

                    save_progress(progress)
                    write_checkpoint_parquet(all_rows)

            finally:
                # Persist the session even if the run was interrupted.
                if STORAGE_STATE_PATH and SAVE_STORAGE_STATE:
                    try:
                        state = await context.storage_state()
                        STORAGE_STATE_PATH.write_text(json.dumps(state), encoding="utf-8")
                        print(f"Storage state guardado en {STORAGE_STATE_PATH}")
                    except Exception as exc:
                        print(f"No se pudo guardar storage state: {exc}")
                await context.close()
        finally:
            await browser.close()

    save_progress(progress)
    write_checkpoint_parquet(all_rows)

    df = pd.DataFrame(all_rows)
    if not df.empty:
        df = df.drop_duplicates(
            subset=[
                "fecha_busqueda",
                "origen_solicitado",
                "destino_solicitado",
                "itinerario_id",
                "salida_programada",
            ],
            keep="last",
        )
        df["salida_dt"] = pd.to_datetime(df["salida_programada"], errors="coerce")
        df["llegada_dt"] = pd.to_datetime(df["llegada_programada"], errors="coerce")
    return df


In [9]:
# Run the scraper with the configured origin, destinations and date range,
# then report the row count and preview the first rows.
df_latam = await scrape_latam_flights(ORIGIN, DESTINOS, FECHA_INICIO, FECHA_FIN, ida_vuelta=IDA_VUELTA)
print('Itinerarios capturados:', len(df_latam))
df_latam.head()


Storage state cargado desde latam_storage_state.json
===== Fecha 2025-10-29 =====
BOG->MDE 2025-10-29: registrado previamente con 0 itinerarios, reintentando
[debug] 200 https://www.latamairlines.com/es-co/flights/public/locales/es/co.json
[debug] 200 https://www.latamairlines.com/es-co/flights/public/locales/es/common.json
[debug] 200 https://latam.absmartly.io/v1/context?application=website&environment=Prod
[debug] 200 https://latam.absmartly.io/v1/context?application=website&environment=Prod
[debug] 200 https://latam.absmartly.io/v1/context?application=website&environment=Prod
[debug] 200 https://www.google.com/ccm/collect?frm=0&tid=AW-1012797176&en=page_view&dl=https%3A%2F%2Fwww.latamairlines.com%2Fco%2Fes%2Fofertas-vuelos&scrsrc=www.googletagmanager.com&rnd=2098444868.1761835251&dt=Selecci%C3%B3n%20de%20vuelos%20%7C%20LATAM%20Airlines&auid=1064865889.1761777911&navt=n&npa=0&_tu=CA&gtm=45be5at0h2v873735880z8830161026za200zb830161026zd830161026xec&gcs=G111&gcd=13v3v3v3v5l1&dma=0&tag

In [11]:
# Inspect the local Parquet checkpoint, if one has been written yet.
if LOCAL_PARQUET_PATH.exists():
    df_parquet = pd.read_parquet(LOCAL_PARQUET_PATH)
    print('Datos en Parquet local:', len(df_parquet))
    # A bare expression nested inside an `if` block is not rendered by the
    # notebook, so the preview has to be printed explicitly.
    print(df_parquet.tail())
else:
    print('Parquet local aun no existe.')


Parquet local aun no existe.


In [12]:
async def scrape_latam_manual_once(origin=ORIGIN, destination="MDE", fecha="2025-10-29", *, ida_vuelta=False):
    """Run a single search, pausing so the page can be loaded by hand.

    The browser is opened on the offers URL and the function blocks on
    ``input()`` until the user confirms that the flights are visible; rows
    are then read with ``extract_rows_from_dom``.

    Args:
        origin: Origin airport code.
        destination: Destination airport code.
        fecha: Outbound date, ``YYYY-MM-DD``.
        ida_vuelta: When true, a return date ``RETORNO_OFFSET_DIAS`` days after
            ``fecha`` is passed to ``build_offers_url``.

    Returns:
        A ``pandas.DataFrame`` built from the rows extracted from the DOM.

    Raises:
        ValueError: If ``fecha`` does not match ``%Y-%m-%d``.
    """
    meta = {
        "origin": origin,
        "destination": destination,
        "search_date": fecha,
    }
    # Validate the date before launching a browser.
    date_obj = datetime.strptime(fecha, "%Y-%m-%d").date()
    return_date = date_obj + timedelta(days=RETORNO_OFFSET_DIAS) if ida_vuelta else None

    async with async_playwright() as playwright:
        browser_launch_kwargs = {"headless": HEADLESS}
        if BROWSER_ARGS:
            browser_launch_kwargs["args"] = BROWSER_ARGS
        if BROWSER_CHANNEL:
            browser_launch_kwargs["channel"] = BROWSER_CHANNEL
        browser = await playwright.chromium.launch(**browser_launch_kwargs)
        try:
            context_kwargs = {
                "user_agent": USER_AGENT,
                "locale": BROWSER_LOCALE,
                "viewport": VIEWPORT,
            }
            if STORAGE_STATE_PATH and STORAGE_STATE_PATH.exists():
                try:
                    state = json.loads(STORAGE_STATE_PATH.read_text(encoding="utf-8"))
                    context_kwargs["storage_state"] = state
                    print(f"Storage state cargado desde {STORAGE_STATE_PATH}")
                except Exception as exc:
                    print(f"No se pudo cargar storage state: {exc}")
            context = await browser.new_context(**context_kwargs)
            try:
                if EXTRA_HEADERS:
                    await context.set_extra_http_headers(EXTRA_HEADERS)
                if INIT_JS_SCRIPTS:
                    for script in INIT_JS_SCRIPTS:
                        # Best effort, matching the main scraper.
                        try:
                            await context.add_init_script(script)
                        except Exception as exc:
                            print(f"No se pudo registrar init script: {exc}")
                page = await context.new_page()
                await warmup_session(page)

                url = build_offers_url(origin, destination, date_obj, return_date=return_date)
                print(f"Abriendo {url}")
                # A failed or slow navigation must not close the browser:
                # the user can still load or reload the page manually.
                try:
                    await page.goto(url, wait_until="domcontentloaded", timeout=REQUEST_TIMEOUT_MS)
                except (TimeoutError, Error) as exc:
                    print(f"{origin}->{destination} {fecha}: error de navegacion {exc}")
                print("Cuando veas los vuelos en pantalla, regresa a la celda y presiona Enter para capturarlos…")
                input("Presiona Enter para continuar con la extraccion via DOM → ")
                dom_rows = await extract_rows_from_dom(page, meta)
                print(f"DOM rows capturados: {len(dom_rows)}")
                return pd.DataFrame(dom_rows)
            finally:
                await context.close()
        finally:
            await browser.close()


In [18]:
import json
import pandas as pd
from pathlib import Path
from playwright.async_api import async_playwright

async def extract_from_running_chrome(cdp_url="http://127.0.0.1:9222", storage_state_path="latam_storage_state.json"):
    """Attach to an already running Chrome over CDP and read flights from the DOM.

    Args:
        cdp_url: Chrome DevTools endpoint to connect to.
        storage_state_path: File where the context's storage state is saved
            for later reuse.

    Returns:
        A ``pandas.DataFrame`` built from the rows returned by
        ``extract_rows_from_dom`` for ``ORIGIN`` -> ``DESTINOS[0]`` on
        ``FECHA_INICIO``.

    Raises:
        RuntimeError: If the connected browser exposes no context or no page.
    """
    async with async_playwright() as pw:
        browser = await pw.chromium.connect_over_cdp(cdp_url)
        if not browser.contexts:
            raise RuntimeError(f"No browser context available at {cdp_url}")
        context = browser.contexts[0]
        if not context.pages:
            raise RuntimeError(f"No open page available at {cdp_url}")
        # Prefer a tab that is on the LATAM site; the first tab is not
        # necessarily the one showing the flights.
        page = next((p for p in context.pages if p.url.startswith(BASE_URL)), context.pages[0])
        print(f"Usando pestana {page.url}")

        # Optional: keep the storage state so later runs can reuse the session.
        storage_state = await context.storage_state()
        Path(storage_state_path).write_text(json.dumps(storage_state), encoding="utf-8")
        print(f"Storage state guardado en {storage_state_path}")

        meta = {
            "origin": ORIGIN,
            "destination": DESTINOS[0],
            "search_date": FECHA_INICIO,
        }
        dom_rows = await extract_rows_from_dom(page, meta)
        print(f"DOM rows capturados: {len(dom_rows)}")
        return pd.DataFrame(dom_rows)

# Capture rows from the already open Chrome session and preview them.
df_latam = await extract_from_running_chrome()
df_latam.head()


Storage state guardado en latam_storage_state.json
BOG->MDE 2025-10-29: DOM sin tarjetas tras 60.0s
DOM rows capturados: 0


In [19]:
import asyncio
from pathlib import Path
from playwright.async_api import async_playwright

async def dump_current_html(cdp_url="http://127.0.0.1:9222", output_path="latam_debug.html"):
    """Save the HTML of the first tab of a running Chrome (attached over CDP).

    Args:
        cdp_url: Chrome DevTools endpoint to connect to.
        output_path: File that receives the page HTML.

    Raises:
        RuntimeError: If the connected browser exposes no context or no page.
    """
    async with async_playwright() as pw:
        browser = await pw.chromium.connect_over_cdp(cdp_url)
        if not browser.contexts:
            raise RuntimeError(f"No browser context available at {cdp_url}")
        context = browser.contexts[0]
        if not context.pages:
            raise RuntimeError(f"No open page available at {cdp_url}")
        page = context.pages[0]
        html = await page.content()
        # errors="ignore" matches the other HTML dumps in this notebook.
        Path(output_path).write_text(html, encoding="utf-8", errors="ignore")
        print(f"HTML guardado en {output_path}")

# Dump the HTML of the currently open tab for offline inspection.
await dump_current_html()


HTML guardado en latam_debug.html


import asyncio
from pathlib import Path
from playwright.async_api import async_playwright

# NOTE: this cell redefines dump_current_html, which is already defined in an
# earlier cell; the later definition is the one in effect once it runs.
async def dump_current_html(cdp_url="http://127.0.0.1:9222", output_path="latam_debug.html"):
    """Save the HTML of the first tab of a running Chrome (attached over CDP).

    Args:
        cdp_url: Chrome DevTools endpoint to connect to.
        output_path: File that receives the page HTML.

    Raises:
        RuntimeError: If the connected browser exposes no context or no page.
    """
    async with async_playwright() as pw:
        browser = await pw.chromium.connect_over_cdp(cdp_url)
        if not browser.contexts:
            raise RuntimeError(f"No browser context available at {cdp_url}")
        context = browser.contexts[0]
        if not context.pages:
            raise RuntimeError(f"No open page available at {cdp_url}")
        page = context.pages[0]
        html = await page.content()
        # errors="ignore" matches the other HTML dumps in this notebook.
        Path(output_path).write_text(html, encoding="utf-8", errors="ignore")
        print(f"HTML guardado en {output_path}")

# Dump the HTML of the currently open tab for offline inspection.
await dump_current_html()


In [None]:
c