In [41]:
import json
from http import HTTPStatus
from typing import List, Optional, Any
from pydantic import BaseModel

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from urllib.error import HTTPError

In [26]:

class Price(BaseModel):
    """Price block of a listing (type of the ``prices`` field of ``Element``).

    Every field is optional and defaults to ``None``.
    """

    mainValue: Optional[int] = None
    emptyValue: Optional[bool] = None
    belowPrice: Optional[bool] = None
    multiplePrices: Optional[bool] = None

class Address(BaseModel):
    """Address block of a listing (type of the ``address`` field of ``Element``).

    Every field is optional and defaults to ``None``.
    """

    city: Optional[str] = None
    stateAcronym: Optional[str] = None
    neighborhood: Optional[str] = None
    isApproximateLocation: Optional[bool] = None

class Image(BaseModel):
    """One entry of a listing's ``imageList``.

    Every field is optional and defaults to ``None``.
    """

    src: Optional[str] = None
    alt: Optional[str] = None
    isPriority: Optional[bool] = None

class Amenities(BaseModel):
    """Amenities block of a listing (type of the ``amenities`` field of ``Element``).

    The four counters are declared as strings, not integers; ``values`` is a
    list of strings. Every field is optional and defaults to ``None``.
    """

    usableAreas: Optional[str] = None
    bedrooms: Optional[str] = None
    bathrooms: Optional[str] = None
    parkingSpaces: Optional[str] = None
    values: Optional[List[str]] = None

class RealEstate(BaseModel):
    """Advertiser block of a listing (type of the ``realEstate`` field of ``Element``).

    Every field is optional and defaults to ``None``.
    """

    # Identification
    id: Optional[str] = None
    legacyId: Optional[int] = None
    name: Optional[str] = None
    advertiserUrl: Optional[str] = None
    tier: Optional[str] = None
    license: Optional[str] = None
    createdDate: Optional[str] = None

    # Contact
    phoneNumbers: Optional[List[str]] = None
    whatsAppNumber: Optional[str] = None
    defaultMessage: Optional[str] = None

    # Counters and flags
    totalCountByFilter: Optional[int] = None
    totalCountByAdvertiser: Optional[int] = None
    isVerified: Optional[bool] = None
    isPremium: Optional[bool] = None
    imageUrl: Optional[str] = None

class AdvertiserLogo(BaseModel):
    """Logo block of a listing (type of the ``advertiserLogo`` field of ``Element``).

    Both fields are optional and default to ``None``.
    """

    src: Optional[str] = None
    alt: Optional[str] = None

class Element(BaseModel):
    """One listing, as built by ``get_real_state_data`` from a raw dict.

    ``id`` and ``externalId`` are required; every other field is optional and
    defaults to ``None``.
    """

    # Required identifiers
    id: str
    externalId: str

    # Optional scalar attributes and nested blocks
    contractType: Optional[str] = None
    href: Optional[str] = None
    prices: Optional[Price] = None
    address: Optional[Address] = None
    business: Optional[str] = None
    highlight: Optional[str] = None
    imageList: Optional[List[Image]] = None
    amenities: Optional[Amenities] = None
    realEstate: Optional[RealEstate] = None
    visualized: Optional[bool] = None
    description: Optional[str] = None
    isNoWarrantorRent: Optional[bool] = None
    constructionStatus: Optional[str] = None
    expansionType: Optional[str] = None
    sourceId: Optional[str] = None
    stamps: Optional[List[str]] = None
    unitTypes: Optional[List[str]] = None
    displayAddressType: Optional[str] = None
    advertiserLogo: Optional[AdvertiserLogo] = None


In [11]:
# Browser-like User-Agent string; get_page sends it as the "User-Agent" request header.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"

# Search parameters. The cells below pass ACTION, TYPE and LOCALIZATION to
# get_page_html, which places them in the URL path as /{action}/{type}/{localization}/.
ACTION = "venda"
LOCALIZATION = "mg+pocos-de-caldas"
TYPE = "imoveis"
# NOTE(review): PAGE is not referenced by the visible cells, which pass page=1 literally.
PAGE = 2

In [88]:
import time
from dataclasses import dataclass

import backoff
from loguru import logger


@dataclass
class RequestedPageResponse:
    """Outcome of one page request, as returned by ``get_page``.

    Attributes:
        html: Payload of the request; ``get_page`` stores the object returned
            by ``urlopen`` on success and ``None`` on an HTTP error.
        code: Status code; defaults to 200.
        exception: Error caught while requesting, or ``None``.
    """

    html: Any
    code: int = 200
    exception: Optional[Exception] = None

def get_page(url: str, timeout: int = 20, verbose: int = 0):
    """Request ``url`` with the notebook's User-Agent and wrap the outcome.

    Args:
        url: Address to request.
        timeout: Value forwarded to ``urlopen`` as its ``timeout``.
        verbose: Kept for backward compatibility; not used in the body.

    Returns:
        RequestedPageResponse: on success ``html`` holds the object returned
        by ``urlopen`` and ``code`` keeps its default of 200. On ``HTTPError``
        ``html`` is ``None``, ``code`` is the error's status code and
        ``exception`` is the caught error.

    Only ``HTTPError`` is handled here; any other exception raised by
    ``urlopen`` propagates to the caller.
    """
    request = Request(url)
    request.add_header("User-Agent", USER_AGENT)

    try:
        # NOTE(review): the open response is handed to the caller unread, so
        # closing it is left to whoever consumes ``html``.
        return RequestedPageResponse(html=urlopen(request, timeout=timeout))
    except HTTPError as e:
        # loguru formats messages with str.format; without a "{}" placeholder
        # the exception argument was silently dropped from the log line.
        logger.error("[error] {}", e)
        return RequestedPageResponse(html=None, code=e.getcode(), exception=e)



def backoff_hdlr(details):
    """Pause for three seconds, then log a warning describing one retry.

    Registered as the ``on_backoff`` callback of ``get_page_html``.

    Args:
        details: Mapping that supplies the ``wait``, ``tries``, ``target``,
            ``args`` and ``kwargs`` values interpolated into the message.
    """
    template = (
        "Backing off {wait:0.1f} seconds after {tries} tries "
        "calling function {target} with args {args} and kwargs "
        "{kwargs}"
    )
    # Fixed pause before the warning is emitted.
    time.sleep(3)
    message = template.format(**details)
    logger.warning(message)
    
# NOTE(review): ``logger`` here is the loguru logger; confirm that backoff's
# own log lines render as intended when routed through it.
@backoff.on_exception(
    backoff.expo, HTTPError, max_tries=3, logger=logger, on_backoff=backoff_hdlr
)
def get_page_html(page, action, type, localization):
    """Fetch one search-results page, retrying on ``HTTPError``.

    Args:
        page: Value placed in the ``pagina`` query parameter.
        action: First URL path segment.
        type: Second URL path segment.
        localization: Third URL path segment.

    Returns:
        The ``html`` attribute of the response returned by ``get_page``.

    Raises:
        HTTPError: the exception stored by ``get_page`` when the status code
            is not 200; the decorator retries it up to three tries in total.
    """
    url = f"https://www.zapimoveis.com.br/{action}/{type}/{localization}/?pagina={page}"
    logger.debug(f"Requesting info from '{url}'")

    result = get_page(url)
    if result.code == HTTPStatus.OK:
        return result.html

    # Surface the stored error so the backoff decorator can see it.
    raise result.exception

def get_real_state_data(page_html):
    """Extract the listings embedded in a search-results page.

    The markup is parsed with BeautifulSoup, the JSON inside the
    ``<script id="__NEXT_DATA__">`` tag is decoded, and the entries found at
    ``props.pageProps.initialProps.data`` are turned into ``Element`` models.

    Args:
        page_html: Markup handed to ``BeautifulSoup``; in this notebook it is
            the value returned by ``get_page_html``.

    Returns:
        list[Element]: one model per listing. The list is empty when the
        script tag is absent or any level of the expected JSON path is
        missing or null.

    Raises:
        json.JSONDecodeError: if the script tag does not hold valid JSON.
        Exception: whatever ``Element(**entry)`` raises for an entry that
            does not match the model (for example a missing ``id``).
    """
    soup = BeautifulSoup(page_html, "html.parser")
    script = soup.find('script', id='__NEXT_DATA__')
    if script is None:
        # Previously this surfaced as an AttributeError on ``None.text``.
        logger.warning("No '__NEXT_DATA__' script tag found in the page; returning no listings")
        return []

    # Walk the nested path one level at a time so that a missing or null
    # level yields "no listings" instead of an AttributeError.
    node = json.loads(script.text)
    for key in ("props", "pageProps", "initialProps", "data"):
        if not isinstance(node, dict):
            return []
        node = node.get(key)

    return [Element(**d) for d in node or []]


In [None]:
# How the loop should look in the main function
for page in [1,2, 1000, 3]:
    try:
        # Request the page of the current iteration (the original always asked for page 1).
        page_html = get_page_html(page=page, action=ACTION, type=TYPE, localization=LOCALIZATION)
    except HTTPError:
        # Stop paging at the first page that still fails after get_page_html's retries.
        break

In [89]:
# Fetch the first results page, then parse its embedded listings into Element models.
page_html = get_page_html(page=1, action=ACTION, type=TYPE, localization=LOCALIZATION)

data = get_real_state_data(page_html)

2024-11-21 04:34:35.858 | DEBUG    | __main__:get_page_html:43 - Requesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=1'


In [125]:
# Show the unitTypes field of every parsed listing.
[d.unitTypes for d in data]

[['APARTMENT'],
 ['APARTMENT'],
 ['HOME'],
 ['SHED_DEPOSIT_WAREHOUSE'],
 ['APARTMENT'],
 ['APARTMENT'],
 ['APARTMENT'],
 ['COMMERCIAL_PROPERTY'],
 ['APARTMENT'],
 ['FLAT'],
 ['APARTMENT'],
 ['BUSINESS'],
 ['APARTMENT'],
 ['APARTMENT'],
 ['BUSINESS']]

In [134]:
# Pick the third parsed listing (index 2) for closer inspection in the next cells.
real_state_element = data[2]

In [135]:
# unitTypes field of the selected listing.
real_state_element.unitTypes

['HOME']

In [136]:
# mainValue of the selected listing's nested prices model.
real_state_element.prices.mainValue

480000

In [137]:
# neighborhood of the selected listing's nested address model.
real_state_element.address.neighborhood

'Monte Verde'

In [138]:
# Full amenities model of the selected listing.
real_state_element.amenities

Amenities(usableAreas='161', bedrooms='4', bathrooms='3', parkingSpaces='1', values=['DEPOSIT', 'BATHTUB', 'SERVICE_AREA'])

In [139]:
# Free-text description of the selected listing.
real_state_element.description

'CASA COM 04 DORMITÓRIOS, SENDO 01 SUÍTE, SALA, COZINHA, BANHEIRO SOCIAL, ÁREA DE SERVIÇO COM DORMITÓRIO, DESPENSA E GARAGEM.\n\nIMOVEL COM LIGAÇÃO DE AQUECEDOR SOLAR, CERCA ELÉTRICA, PORTÃO ELETRÔNICO. \n\nPARA SEU CONFORTO, POSSUÍ BANHEIRA DE HIDROMASSAGEM NA SUÍTE. NO QUINTAL, POSSUÍ UMA EDÍCULA COMPOSTA POR DORMITÓRIO E BANHEIRO.\n\nRUA DE FÁCIL ACESSO E PRÓXIMO À GRANDE VARIEDADES COMERCIAIS NA REGIÃO...\n\n- SUPERMERCADOS,\n- PADARIAS,\n- ESCOLAS,\n- FARMÁCIAS,\n- PET SHOPS,\n- RESTAURANTE,\n- PONTO DE ÔNIBUS...!!\n\nVENHA CONHECER ESTE LINDO IMÓVEL.. LIGUE E AGENDE UMA VISITA!!. ref. 58832 atualizado em 19/11.'

In [29]:
# First parsed listing as an Element model.
data[0]



Element(id='2696156001', externalId='V54803', contractType='REAL_ESTATE', href='https://www.zapimoveis.com.br/imovel/venda-terreno-lote-condominio-jardim-europa-pocos-de-caldas-mg-480m2-id-2696156001/', prices=Price(mainValue=575000, emptyValue=False, belowPrice=False, multiplePrices=False), address=Address(city='Poços de Caldas', stateAcronym='MG', neighborhood='Jardim Europa', isApproximateLocation=True), business='SALE', highlight='SUPER', imageList=[Image(src='https://resizedimgs.zapimoveis.com.br/crop/614x297/vr.images.sp/5daa172b250f9fb3cbf5028b3dfbe3c5.webp', alt='Imagem do imóvel', isPriority=True), Image(src='https://resizedimgs.zapimoveis.com.br/crop/614x297/vr.images.sp/f133ac4e9c94ad1d32d236db41923968.webp', alt='Imagem do imóvel', isPriority=False)], amenities=Amenities(usableAreas='480', bedrooms='0', bathrooms='0', parkingSpaces='0', values=[]), realEstate=RealEstate(id='281c0fe5-98c1-c9f9-d96c-a2e58397c49c', legacyId=3618805, name='BG & C IMOVEIS LTDA', advertiserUrl='/

In [30]:
# NOTE(review): in the visible cells `raw_data` is only assigned inside
# get_real_state_data, so this cell relies on a module-level `raw_data`
# left over from an earlier kernel state. Confirm it runs on a fresh kernel.
raw_data[0]

{'id': '2696156001',
 'externalId': 'V54803',
 'contractType': 'REAL_ESTATE',
 'href': 'https://www.zapimoveis.com.br/imovel/venda-terreno-lote-condominio-jardim-europa-pocos-de-caldas-mg-480m2-id-2696156001/',
 'prices': {'mainValue': 575000,
  'emptyValue': False,
  'belowPrice': False,
  'multiplePrices': False},
 'address': {'city': 'Poços de Caldas',
  'stateAcronym': 'MG',
  'neighborhood': 'Jardim Europa',
  'isApproximateLocation': True},
 'business': 'SALE',
 'highlight': 'SUPER',
 'imageList': [{'src': 'https://resizedimgs.zapimoveis.com.br/crop/614x297/vr.images.sp/5daa172b250f9fb3cbf5028b3dfbe3c5.webp',
   'alt': 'Imagem do imóvel',
   'isPriority': True},
  {'src': 'https://resizedimgs.zapimoveis.com.br/crop/614x297/vr.images.sp/f133ac4e9c94ad1d32d236db41923968.webp',
   'alt': 'Imagem do imóvel',
   'isPriority': False}],
 'amenities': {'usableAreas': '480',
  'bedrooms': '0',
  'bathrooms': '0',
  'parkingSpaces': '0',
  'values': []},
 'realEstate': {'id': '281c0fe5-98c