# DOM Inspector + Scraping Fallback (Requests → Playwright)

Este notebook sirve para:

- Pegar una **URL** y ver el **HTML/DOM** que recibes (lo que realmente puedes scrapear).
- Detectar si el sitio es **dinámico (JS)** y necesitas un fallback.
- Probar un **fallback con Playwright** (renderizado) y comparar.

> Úsalo con responsabilidad: respeta `robots.txt`, términos de uso, y limita el tráfico.


In [1]:
# ✅ Setup (instalaciones opcionales)
# Si no tienes las librerías, descomenta y ejecuta:
# !pip -q install requests beautifulsoup4 lxml pandas tldextract

import re
import json
import time
import textwrap
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Shared HTTP session used by every Requests call in this notebook.
requests_session = requests.Session()
# Browser-like User-Agent (adjust as needed).
requests_session.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)


In [2]:
# 🔧 Inputs
URL = "https://nexoinmobiliario.pe/inmobiliarias/edifica"
#URL = "https://example.com"   # <-- paste your URL here
TIMEOUT = 25  # passed as `timeout=` to the requests_session.get() calls
SLEEP_BETWEEN = 1.0  # pause passed to time.sleep() before fetching; be polite to the server
MAX_HTML_CHARS_PREVIEW = 40_000  # maximum number of HTML characters printed in previews


In [3]:
# 🤖 robots.txt (rápido chequeo informativo)
def get_robots_txt(url: str) -> str:
    """Fetch the site's ``robots.txt`` and return a printable report.

    The robots URL is built from the scheme and host of *url*. This is an
    informational check only: problems are reported in the returned text
    instead of being raised.

    Args:
        url: Absolute URL (scheme and host included) of a page on the site.

    Returns:
        A string with the robots.txt URL, the HTTP status code and the first
        4000 characters of the response body; or a message starting with
        ``"No se pudo obtener robots.txt"`` when *url* is not absolute or
        the request fails.
    """
    parsed = urlparse(url)
    # Without scheme and host the f-string below would yield ":///robots.txt";
    # report the real problem instead of attempting a malformed request.
    if not parsed.scheme or not parsed.netloc:
        return f"No se pudo obtener robots.txt: URL sin esquema o host: {url!r}"

    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    try:
        r = requests_session.get(robots_url, timeout=TIMEOUT)
        return f"robots.txt URL: {robots_url}\nStatus: {r.status_code}\n\n{r.text[:4000]}"
    except Exception as e:  # deliberate best effort: this check must not abort the run
        return f"No se pudo obtener robots.txt: {e}"

print(get_robots_txt(URL))


robots.txt URL: https://nexoinmobiliario.pe/robots.txt
Status: 200

User-Agent: *
Disallow: 

Sitemap: https://nexoinmobiliario.pe/sitemap.xml



In [4]:
# 🌐 Fetch HTML con Requests
time.sleep(SLEEP_BETWEEN)

def fetch_html(url: str) -> dict:
    """Download *url* with the shared session and summarize the response.

    The HTTP status is not checked here; it is returned so the caller can
    decide what to do with error responses.

    Args:
        url: Page to request. Redirects are followed.

    Returns:
        A dict with the keys ``final_url``, ``status_code``,
        ``content_type``, ``headers`` (plain dict copy), ``text`` (decoded
        body) and ``bytes`` (length of the raw body).
    """
    r = requests_session.get(url, timeout=TIMEOUT, allow_redirects=True)
    content_type = r.headers.get("Content-Type", "")

    # When a text/* response declares no charset in its header, requests
    # decodes `r.text` as ISO-8859-1, which garbles UTF-8 pages that only
    # declare their encoding in a <meta> tag. Use the detected encoding then.
    if "charset=" not in content_type.lower():
        r.encoding = r.apparent_encoding

    return {
        "final_url": r.url,
        "status_code": r.status_code,
        "content_type": content_type,
        "headers": dict(r.headers),
        "text": r.text,
        "bytes": len(r.content),
    }

# Fetch the page once and print the key response facts, one per line.
resp = fetch_html(URL)
for label, key in (
    ("Final URL:", "final_url"),
    ("Status:", "status_code"),
    ("Content-Type:", "content_type"),
    ("Bytes:", "bytes"),
):
    print(label, resp[key])


Final URL: https://nexoinmobiliario.pe/inmobiliarias/edifica
Status: 200
Content-Type: text/html; charset=UTF-8
Bytes: 118464


In [5]:
# 🧩 DOM preview (lo que realmente te llega)
# Decoded body of the Requests response; reused by the later cells.
html = resp["text"]

print("HTML length:", len(html))
# Despite the "HEAD" label, the slice below prints the first
# MAX_HTML_CHARS_PREVIEW characters of the whole document, not only <head>.
print("\n--- HEAD (preview) ---\n")
print(html[:MAX_HTML_CHARS_PREVIEW])


HTML length: 118362

--- HEAD (preview) ---

<!doctype html>
<html class="no-js" lang="es">
<head>
    <!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-PF3NRKW');</script>
<!-- End Google Tag Manager -->

<!-- Google Tag Manager -->
<!-- <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-NT3THRFP');</script> -->
<!-- End Google Tag Manager -->

      
    <meta charset="utf-8">
    <meta htt

In [6]:
# 🔎 Parseo con BeautifulSoup: resumen rápido
# Parse the fetched HTML once; `soup` is reused by the following cells.
soup = BeautifulSoup(html, "lxml")

title_tag = soup.title
title = title_tag.get_text(strip=True) if title_tag else None

h1_tag = soup.find("h1")
h1 = h1_tag.get_text(strip=True) if h1_tag else None

# Meta description is only kept when the tag exists and has non-empty content.
m = soup.find("meta", attrs={"name": "description"})
meta_desc = m["content"].strip() if m and m.get("content") else None

for label, value in (
    ("Title:", title),
    ("H1:", h1),
    ("Meta description:", meta_desc),
):
    print(label, value)

# Counts of elements that are useful when planning a scrape:
# (result key, tag name, attribute filter).
_count_queries = (
    ("a_links", "a", {}),
    ("images", "img", {}),
    ("scripts", "script", {}),
    ("json_ld", "script", {"type": "application/ld+json"}),
    ("tables", "table", {}),
    ("forms", "form", {}),
)
counts = {
    key: len(soup.find_all(tag_name, attrs=attr_filter))
    for key, tag_name, attr_filter in _count_queries
}
counts


Title: Edifica | 15 Proyectos en Venta en Nexo Inmobiliario
H1: EDIFICA
Meta description: Conoce los proyectos de Edifica. Precios desde S/. 419,136, con m√°s de 15 en venta.


{'a_links': 78,
 'images': 24,
 'scripts': 14,
 'json_ld': 1,
 'tables': 0,
 'forms': 6}

In [7]:
# 🧠 Helpers: encontrar 'pistas' para identificar targets de scraping
def find_candidates_by_text(soup: "BeautifulSoup", needle: str, max_matches=20):
    """Return the first tags whose text contains *needle* (case-insensitive).

    Each tag is tested against its full text, descendants included, so a
    container of a matching element matches as well. Tags are visited in the
    order ``soup.find_all(True)`` yields them.

    Args:
        soup: Parsed document to search.
        needle: Text to look for; compared in lower case.
        max_matches: Maximum number of tags to return.

    Returns:
        A list with at most *max_matches* tags. Empty when *needle* is empty
        or *max_matches* is less than 1.
    """
    # An empty needle is contained in every string and would just return the
    # first tags of the document; a limit below 1 must return nothing (the
    # append-then-check loop below would otherwise still return one match).
    if not needle or max_matches < 1:
        return []

    needle_low = needle.lower()
    matches = []
    for tag in soup.find_all(True):
        txt = tag.get_text(" ", strip=True)
        if txt and needle_low in txt.lower():
            matches.append(tag)
            if len(matches) >= max_matches:
                break
    return matches

def css_path(el):
    """Build an approximate CSS path from the document root down to *el*.

    Every ancestor contributes one segment: ``tag#id`` when it has an id,
    otherwise ``tag.class1.class2.class3`` (at most three classes), otherwise
    the bare tag name. The result is meant for manual inspection; it is not
    guaranteed to select a unique element.

    Args:
        el: Element exposing ``name``, ``get(attr)`` and ``parent``.

    Returns:
        The segments joined with ``" > "``, or ``""`` when *el* is falsy.
    """
    path = []
    while el and getattr(el, "name", None) and el.name != "[document]":
        name = el.name
        _id = el.get("id")
        _class = el.get("class")
        # `class` may arrive as a plain string instead of a list; slicing a
        # string would take characters ("p.r.i"), so split it into names.
        if isinstance(_class, str):
            _class = _class.split()
        if _id:
            seg = f"{name}#{_id}"
        elif _class:
            seg = f"{name}." + ".".join(_class[:3])
        else:
            seg = name
        path.append(seg)
        el = el.parent
    return " > ".join(reversed(path))

# EXAMPLE: search for a piece of text you can see on the page (edit it).
NEEDLE = "precio"  # <-- change to whatever you are looking for
cands = find_candidates_by_text(soup, NEEDLE)

print(f"Matches: {len(cands)}\n")
# Show only the first ten hits: tag name, approximate CSS path, text snippet.
for position, hit in enumerate(cands[:10], start=1):
    snippet = hit.get_text(" ", strip=True)
    print(f"[{position}] {hit.name} | css‚âà {css_path(hit)}")
    print("    text:", snippet[:160])


Matches: 0



In [8]:
# 🔗 Links y tablas (útil para descubrir endpoints o datos ya disponibles)
# Links
# One record per anchor with a non-empty href; relative hrefs are resolved
# against the final (post-redirect) URL of the Requests fetch.
links = [
    {
        "text": a.get_text(" ", strip=True)[:80],
        "href": href,
        "full_url": urljoin(resp["final_url"], href),
    }
    for a in soup.select("a[href]")
    if (href := a.get("href"))
]

# Keep the first occurrence of each absolute URL and show the first 30 rows.
df_links = pd.DataFrame(links)
df_links = df_links.drop_duplicates(subset=["full_url"]).head(30)
df_links


Unnamed: 0,text,href,full_url
0,,https://nexoinmobiliario.pe/,https://nexoinmobiliario.pe/
2,Sello CODIP,https://nexoinmobiliario.pe/sello-codip,https://nexoinmobiliario.pe/sello-codip
3,Inmobiliarias,https://nexoinmobiliario.pe/inmobiliarias,https://nexoinmobiliario.pe/inmobiliarias
4,Nexo-Bancos,https://nexoinmobiliario.pe/bancos,https://nexoinmobiliario.pe/bancos
5,Blog,https://blog.nexoinmobiliario.pe,https://blog.nexoinmobiliario.pe
6,,#,https://nexoinmobiliario.pe/inmobiliarias/edifica
7,Reg√≠strate,#tab-register-fav,https://nexoinmobiliario.pe/inmobiliarias/edif...
8,Iniciar Sesi√≥n,#tab-login-fav,https://nexoinmobiliario.pe/inmobiliarias/edif...
9,T√©rminos y Condiciones,https://nexoinmobiliario.pe/terminos-y-condici...,https://nexoinmobiliario.pe/terminos-y-condici...
10,Pol√≠tica de Privacidad,https://nexoinmobiliario.pe/politicas-privacidad,https://nexoinmobiliario.pe/politicas-privacidad


In [9]:
# 🧾 Extraer JSON-LD (a veces trae precios, direcciones, productos, etc.)
# Collect every JSON-LD payload found in <script type="application/ld+json">.
jsonlds = []
for script_tag in soup.find_all("script", attrs={"type": "application/ld+json"}):
    payload = script_tag.get_text(strip=True)
    if payload:
        try:
            jsonlds.append(json.loads(payload))
        except Exception:
            # Payloads are sometimes malformed; keep a truncated raw copy so
            # they can still be inspected.
            jsonlds.append({"_raw": payload[:2000]})

len(jsonlds), (jsonlds[0] if jsonlds else None)


(1,
 {'@context': 'https://schema.org',
  '@type': 'Organization',
  'name': 'EDIFICA',
  'description': 'Somos una empresa inmobiliaria que naci√≥ en el a√±o 2005. Hoy ya son m√°s de 3000 familias que han confiado en nosotros y disfrutan de una nueva vida en cada uno de nuestros proyectos entregados.',
  'url': 'https://nexoinmobiliario.pe/inmobiliarias/edifica',
  'logo': 'https://e.nexoinmobiliario.pe/customers/edifica/logo-47-20250912111830.jpg',
  'location': {'@type': 'Place',
   'address': {'@type': 'PostalAddress',
    'addressLocality': 'Lima',
    'addressCountry': 'PE'}},
  'department': {'@type': 'ItemList',
   'name': 'Proyectos de EDIFICA',
   'numberOfItems': 15,
   'itemListElement': [{'@type': 'Product',
     'name': 'Urban Heights',
     'category': 'Proyecto inmobiliario',
     'url': 'https://nexoinmobiliario.pe/proyecto/venta-de-departamento-3215-urban-heights-miraflores-lima-lima-edifica',
     'offers': {'@type': 'Offer',
      'priceCurrency': 'PEN',
      'pric

In [10]:
# ⚠️ Señales de sitio dinámico (JS)
# Heuristic hints that the page content is rendered client-side.
# Framework names are matched as whole words or through framework-specific
# markers; a bare substring search would also fire on unrelated words such
# as "reaction", "revue" or "rectangular".
signals = {
    "has_react": bool(re.search(r"\breact(?:js|dom)?\b|data-react|__NEXT_DATA__", html, re.I)),
    "has_nextjs_data": "__NEXT_DATA__" in html,
    "has_vue": bool(re.search(r"__NUXT__|\bvue(?:js)?\b|data-v-[0-9a-f]{8}", html, re.I)),
    "has_angular": bool(re.search(r"\bangular(?:js)?\b|\bng-version\b", html, re.I)),
    # Thresholds below are rough rules of thumb, not hard limits.
    "many_scripts": counts["scripts"] > 25,
    "few_text": len(soup.get_text(" ", strip=True)) < 800,
}
signals


{'has_react': False,
 'has_nextjs_data': False,
 'has_vue': False,
 'has_angular': False,
 'many_scripts': False,
 'few_text': False}

In [11]:
# 💾 Guardar snapshot del HTML para inspección offline
from pathlib import Path

# Snapshots go to ./dom_snapshots (created on first use).
out_dir = Path("dom_snapshots")
out_dir.mkdir(exist_ok=True)

# File name = host of the final URL with unsafe characters replaced by "_",
# followed by the current Unix timestamp.
host = urlparse(resp["final_url"]).netloc
safe_name = re.sub(r"[^a-zA-Z0-9_-]+", "_", host)
ts = int(time.time())
html_path = out_dir.joinpath(f"{safe_name}_{ts}.html")
html_path.write_text(html, encoding="utf-8")

print("Guardado:", html_path.resolve())


Guardado: C:\Users\User\Documents\support\notes\proyecto_ideal\property_scrappers_v2\notebooks\dom_snapshots\nexoinmobiliario_pe_1768742729.html


## 🧯 Fallback: Playwright (renderizado)

Si el HTML inicial **no trae datos** (porque se llenan con JS), usa Playwright para:

- Renderizar la página como navegador
- Esperar a que aparezcan elementos
- Capturar el DOM final y/o interceptar XHR (paso siguiente)

> Playwright puede requerir instalaci√≥n de navegador.


In [12]:
# 🧯 Playwright fallback (opcional)
# 1) Instala Playwright:
# !pip -q install playwright
# 2) Instala los navegadores (una vez):
# !playwright install chromium

# Luego ejecuta este bloque.
from pathlib import Path
import asyncio

async def fetch_rendered_dom(url: str, wait_ms: int = 2500, selector_wait: str | None = None):
    """Render *url* in headless Chromium and save the resulting DOM.

    The page is loaded until ``domcontentloaded``, optionally waits for
    *selector_wait*, then waits another *wait_ms* milliseconds before the
    DOM is captured. The HTML and a full-page screenshot are written to
    ``dom_snapshots/``.

    Args:
        url: Page to open.
        wait_ms: Extra wait, in milliseconds, before capturing the DOM.
        selector_wait: Optional selector to wait for (up to 20 s). A failed
            wait is reported with a printed notice and the capture continues
            with whatever has rendered.

    Returns:
        Tuple ``(final_url, content, html_path, png_path)``: the page URL
        after navigation, the rendered HTML, and the paths of the saved
        HTML and PNG files.

    Raises:
        ImportError: If Playwright is not installed.
        Exception: Errors from navigation, capture or file writing propagate
            to the caller.
    """
    # Imported lazily so the rest of the notebook works without Playwright.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until="domcontentloaded", timeout=60_000)

            # Optional wait for a specific element rendered by JS.
            if selector_wait:
                try:
                    await page.wait_for_selector(selector_wait, timeout=20_000)
                except Exception as exc:
                    # Best effort: keep going, but say so instead of hiding it.
                    print(f"Aviso: el selector {selector_wait!r} no estuvo disponible: {exc}")

            await page.wait_for_timeout(wait_ms)
            content = await page.content()
            final_url = page.url

            # Save HTML + screenshot, named after the sanitized host and a
            # Unix timestamp.
            out_dir = Path("dom_snapshots")
            out_dir.mkdir(exist_ok=True)
            safe = re.sub(r"[^a-zA-Z0-9_-]+", "_", urlparse(final_url).netloc)
            ts = int(time.time())
            html_path = out_dir / f"{safe}_{ts}_rendered.html"
            png_path = out_dir / f"{safe}_{ts}_rendered.png"
            html_path.write_text(content, encoding="utf-8")
            await page.screenshot(path=str(png_path), full_page=True)
        finally:
            # Close the browser even when navigation or capture fails.
            await browser.close()

        return final_url, content, html_path, png_path

# Configura si quieres esperar por un elemento específico (CSS selector)
WAIT_SELECTOR = None  # e.g. ".card-price"  or  "text=Precio"
WAIT_MS = 3000  # passed as wait_ms to fetch_rendered_dom

# Run the render. NOTE: this is a top-level `await`, which is only valid
# where the interpreter supports it (e.g. an IPython/Jupyter cell).
final_url_pw, html_pw, html_path_pw, png_path_pw = await fetch_rendered_dom(URL, wait_ms=WAIT_MS, selector_wait=WAIT_SELECTOR)

# Summary of what was rendered and where the snapshot files were written.
print("Final URL (Playwright):", final_url_pw)
print("HTML rendered length:", len(html_pw))
print("Guardado HTML:", html_path_pw.resolve())
print("Guardado PNG:", png_path_pw.resolve())


ModuleNotFoundError: No module named 'playwright'

In [None]:
# 🔁 Comparar: Requests vs Renderizado (solo si corriste Playwright)
# Si no lo corriste, ignora este bloque.
try:
    # A NameError here means the Playwright cell has not defined html_pw.
    rendered_chars = len(html_pw)
    requests_chars = len(html)
    # max(1, ...) guards against dividing by zero on an empty Requests body.
    ratio = rendered_chars / max(1, requests_chars)
    print("len(rendered)/len(requests) =", round(ratio, 2))
    print("\nPreview rendered HEAD:\n")
    print(html_pw[:MAX_HTML_CHARS_PREVIEW])
except NameError:
    print("Playwright no fue ejecutado a√∫n.")


## ✅ Qué identificar para scrapear (checklist rápido)

1) **Entidad objetivo**: ¿Qué quieres extraer? (precio, m², tipología, ubicación, disponibilidad, etc.)  
2) **Selector estable**: usa `id`, clases semánticas, o atributos (`data-*`) antes que rutas frágiles.  
3) **Modelo de datos**: define tu schema desde ya (campos fijos).  
4) **Fallback**:
   - Si Requests no trae el dato → Playwright renderiza.
   - Si el dato viene de XHR/JSON → conviene interceptar endpoint (más limpio que parsear HTML).  
5) **Anti-duplicados**: define llave única (por ejemplo `proyecto + tipología + precio + fecha_captura`).

Si quieres, pega:
- la URL real
- qué campos necesitas
y te armo los selectores + extracci√≥n + normalizaci√≥n + dedupe.
