In [1]:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Site root; site-relative hrefs scraped from listing pages are resolved
# against it with urljoin.
BASE = "https://nexoinmobiliario.pe"

# Matches site-relative project paths of the form
# /proyecto/venta-de-departamento-<digits>-...
PROJECT_RE = re.compile(r"^/proyecto/venta-de-departamento-\d+-")

def fetch_html(url: str, timeout=30) -> str:
    """Download *url* and return the response body as text.

    The request carries a desktop-browser ``User-Agent`` header and
    ``timeout`` is handed straight to ``requests.get``. A 4xx/5xx answer
    raises through ``Response.raise_for_status``.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    )
    response = requests.get(
        url,
        headers={"User-Agent": user_agent},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.text

def extract_project_links_from_inmobiliaria(html: str):
    """Return the sorted, de-duplicated project URLs found in *html*.

    The search is scoped to ``div.SearchResult-projects.scrollNormal`` when
    that container exists, otherwise to the whole document.

    Primary pass: anchors under ``div.SearchResult-project-data h2``.
    Site-relative hrefs must match ``PROJECT_RE`` and are made absolute
    against ``BASE``; any other href is kept verbatim when it contains
    ``nexoinmobiliario.pe/proyecto/``.

    Fallback pass (only when the primary pass found nothing): every anchor
    in scope whose href starts with ``/proyecto/``.
    """
    document = BeautifulSoup(html, "html.parser")
    results_box = document.select_one("div.SearchResult-projects.scrollNormal")
    scope = results_box or document

    found = set()
    for anchor in scope.select("div.SearchResult-project-data h2 a[href]"):
        target = anchor["href"].strip()
        if not target.startswith("/"):
            # Non-relative href: keep it untouched if it points at a project page.
            if "nexoinmobiliario.pe/proyecto/" in target:
                found.add(target)
            continue
        # Site-relative href: only the strict project pattern is accepted here.
        if PROJECT_RE.match(target):
            found.add(urljoin(BASE, target))

    if not found:
        # The card markup yielded nothing; accept any /proyecto/ link in scope.
        stripped = (a["href"].strip() for a in scope.select("a[href]"))
        found = {urljoin(BASE, h) for h in stripped if h.startswith("/proyecto/")}

    return sorted(found)

if __name__ == "__main__":
    # Manual check: fetch one developer ("inmobiliaria") page and list the
    # project links extracted from it.
    # NOTE(review): the recorded run below printed "N links: 0", i.e. neither
    # the card selector nor the fallback matched the fetched HTML — possibly
    # the listing is rendered client-side; confirm against the raw response.
    url = "https://nexoinmobiliario.pe/inmobiliarias/edifica"
    html = fetch_html(url)
    project_urls = extract_project_links_from_inmobiliaria(html)
    print("N links:", len(project_urls))
    # Show at most the first ten links.
    print("\n".join(project_urls[:10]))


N links: 0



# Extract the "other projects" links listed inside a single project page

In [3]:
import re
import time
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Site root; passed as base_url so relative carousel links can be resolved.
BASE = "https://nexoinmobiliario.pe"
# Captures the run of digits that follows a hyphen at the very end of a string
# (applied to URL paths to pull out the trailing project id).
ID_RE_END = re.compile(r"-(\d+)$")

# Browser-like request headers sent by fetch_html with every request.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

def fetch_html(url: str, timeout=30) -> str:
    """Fetch *url* with ``DEFAULT_HEADERS`` and return the body as text.

    ``timeout`` is forwarded to ``requests.get``; an error status raises
    through ``Response.raise_for_status``.
    """
    response = requests.get(url, timeout=timeout, headers=DEFAULT_HEADERS)
    response.raise_for_status()
    return response.text

def extract_otros_links(html: str, base_url: str = BASE):
    """
    Extract the "Otros Departamentos / Otros proyectos" carousel links.

    For every ``div.carousel-extra-section-otros-card`` found anywhere in
    *html*, the project URL is taken from:
    - ``a.carousel-extra-section-otros-btn[href]`` (preferred), or
    - ``div.carousel-extra-section-otros-share[data-url]`` (fallback).

    Both sources are resolved against *base_url*, so a relative ``data-url``
    is normalised the same way as a relative ``href``.

    Returns a list of dicts with keys ``url``, ``project_id``, ``title``,
    ``price_text`` and ``badge`` (values that cannot be found are ``None``),
    de-duplicated by ``url`` keeping first-seen order. Cards without a usable
    URL are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")

    items = []
    seen = set()
    # Cards are searched document-wide instead of inside one named section so
    # that a change in the wrapping section's class does not break extraction.
    for card in soup.select("div.carousel-extra-section-otros-card"):
        url = None

        # Main link (the "Ver proyecto" button).
        a = card.select_one("a.carousel-extra-section-otros-btn[href]")
        if a:
            href = a["href"].strip()
            if href:
                url = urljoin(base_url, href)

        # Fallback: the share widget's data-url attribute.
        if not url:
            share = card.select_one("div.carousel-extra-section-otros-share[data-url]")
            if share:
                data_url = share["data-url"].strip()
                if data_url:
                    # urljoin leaves absolute URLs untouched and fixes relative ones.
                    url = urljoin(base_url, data_url)

        # Skip cards without a URL and repeats of an already-collected URL.
        if not url or url in seen:
            continue
        seen.add(url)

        title_el = card.select_one("h3.carousel-extra-section-otros-title")
        price_el = card.select_one("div.carousel-extra-section-otros-price")
        badge_el = card.select_one("div.carousel-extra-section-otros-badge")

        # The id is the trailing "-<digits>" of the path; a trailing slash is
        # dropped first so ".../name-3530/" still yields "3530".
        m = ID_RE_END.search(urlparse(url).path.rstrip("/"))

        items.append(
            {
                "url": url,
                "project_id": m.group(1) if m else None,
                "title": title_el.get_text(strip=True) if title_el else None,
                "price_text": price_el.get_text(" ", strip=True) if price_el else None,
                "badge": badge_el.get_text(" ", strip=True) if badge_el else None,
            }
        )

    return items

def run(urls, sleep_s=1.0, out_csv="otros_proyectos.csv"):
    """Scrape the "other projects" carousel of each seed URL into a CSV.

    Args:
        urls: iterable of seed page URLs (any iterable, including generators).
        sleep_s: pause in seconds between consecutive requests.
        out_csv: path of the CSV file to write.

    Each seed URL is fetched with ``fetch_html`` and parsed with
    ``extract_otros_links``; a failure on one URL is reported on stdout and
    does not stop the run. The CSV is written only when at least one row was
    collected, so an entirely failed run never overwrites an earlier export.
    Returns ``None``.
    """
    # Materialise once: the progress line needs len(), which iterators lack.
    urls = list(urls)
    total = len(urls)

    fieldnames = [
        "seed_url",
        "other_url",
        "other_project_id",
        "other_title",
        "other_price_text",
        "other_badge",
    ]

    rows = []
    for i, seed_url in enumerate(urls, start=1):
        print(f"[{i}/{total}] Fetch: {seed_url}")
        try:
            html = fetch_html(seed_url)
            otros = extract_otros_links(html, base_url=BASE)

            print(f"  -> encontrados: {len(otros)}")
            rows.extend(
                {
                    "seed_url": seed_url,
                    "other_url": it["url"],
                    "other_project_id": it["project_id"],
                    "other_title": it["title"],
                    "other_price_text": it["price_text"],
                    "other_badge": it["badge"],
                }
                for it in otros
            )
        except Exception as e:
            # Deliberate best-effort boundary: report and move on to the next URL.
            print(f"  !! error: {e}")

        # Throttle between requests only; there is nothing to wait for after the last one.
        if i < total:
            time.sleep(sleep_s)

    if not rows:
        # Nothing collected: leave any existing file alone and say so, rather
        # than claiming an export happened.
        print(f"\nSin filas: no se escribió {out_csv}")
        return

    # export CSV
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows)

    print(f"\nOK. filas: {len(rows)} -> {out_csv}")

if __name__ == "__main__":
    # Seed project pages whose "other projects" carousel will be scraped.
    URLS = [
        "https://nexoinmobiliario.pe/departamentos/barranco/grau-10-3809",
        "https://nexoinmobiliario.pe/proyecto/venta-de-departamento-3209-tulipan-cercado-de-lima-lima-lima-abril-grupo-inmobiliario",
        "https://nexoinmobiliario.pe/proyecto/venta-de-departamento-2308-smart-a-santa-beatriz-cercado-de-lima-lima-lima-grupo-mg",
        # add your other two seed URLs here…
    ]
    # 1.2 s pause between requests; results are written to otros_proyectos.csv.
    run(URLS, sleep_s=1.2, out_csv="otros_proyectos.csv")


[1/3] Fetch: https://nexoinmobiliario.pe/departamentos/barranco/grau-10-3809
  -> encontrados: 17
[2/3] Fetch: https://nexoinmobiliario.pe/proyecto/venta-de-departamento-3209-tulipan-cercado-de-lima-lima-lima-abril-grupo-inmobiliario
  -> encontrados: 16
[3/3] Fetch: https://nexoinmobiliario.pe/proyecto/venta-de-departamento-2308-smart-a-santa-beatriz-cercado-de-lima-lima-lima-grupo-mg
  -> encontrados: 21

OK. filas: 54 -> otros_proyectos.csv


In [2]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

# Site root used to resolve relative links found in the carousel cards.
BASE = "https://nexoinmobiliario.pe"

def extract_otros_proyectos(html: str):
    """Extract the cards of the "Otros Departamentos en ..." section.

    Only cards inside ``div.section-final-departamentos-inmobiliaria-otros2``
    are considered; an empty list is returned when that section is absent.

    For each ``div.carousel-extra-section-otros-card`` the URL comes from the
    "Ver proyecto" button's ``href`` or, failing that, from the share
    widget's ``data-url``. Both are resolved against ``BASE``.

    Returns a list of dicts with keys ``url``, ``project_id``, ``title``,
    ``price_text`` and ``badge`` (values that cannot be found are ``None``),
    de-duplicated by ``url`` keeping first-seen order. Cards without a usable
    URL are skipped.
    """
    # Local import: this cell only imports urljoin from urllib.parse.
    from urllib.parse import urlparse

    soup = BeautifulSoup(html, "html.parser")

    # Section: "Otros Departamentos en ..."
    section = soup.select_one("div.section-final-departamentos-inmobiliaria-otros2")
    if not section:
        return []

    items = []
    seen = set()
    for card in section.select("div.carousel-extra-section-otros-card"):
        url = None

        # 1) Main link (the "Ver proyecto" button).
        a = card.select_one("a.carousel-extra-section-otros-btn[href]")
        if a:
            href = a["href"].strip()
            if href:
                url = urljoin(BASE, href)

        # 2) Fallback: the share widget's data-url attribute.
        if not url:
            share = card.select_one("div.carousel-extra-section-otros-share[data-url]")
            if share:
                data_url = share["data-url"].strip()
                if data_url:
                    # urljoin leaves absolute URLs untouched and fixes relative ones.
                    url = urljoin(BASE, data_url)

        # Skip cards without a URL and repeats of an already-collected URL.
        if not url or url in seen:
            continue
        seen.add(url)

        title_el = card.select_one("h3.carousel-extra-section-otros-title")
        price_el = card.select_one("div.carousel-extra-section-otros-price")
        badge_el = card.select_one("div.carousel-extra-section-otros-badge")

        # Trailing id (…-3530): matched on the path only, with any trailing
        # slash removed, so a query string or fragment cannot hide it.
        m = re.search(r"-(\d+)$", urlparse(url).path.rstrip("/"))

        items.append({
            "url": url,
            "project_id": m.group(1) if m else None,
            "title": title_el.get_text(strip=True) if title_el else None,
            "price_text": price_el.get_text(" ", strip=True) if price_el else None,
            "badge": badge_el.get_text(" ", strip=True) if badge_el else None,
        })

    return items
