In [2]:
class Price(BaseModel):
    mainValue: int | None = None
    emptyValue: bool | None = None
    belowPrice: bool | None = None
    multiplePrices: bool | None = None

class Address(BaseModel):
    city: str | None = None
    stateAcronym: str | None = None
    neighborhood: str | None = None
    isApproximateLocation: bool | None = None

class Image(BaseModel):
    src: str | None = None
    alt: str | None = None
    isPriority: bool | None = None

class Amenities(BaseModel):
    usableAreas: str | None = None
    bedrooms: str | None = None
    bathrooms: str | None = None
    parkingSpaces: str | None = None
    values: List[str] | None = None

class RealEstate(BaseModel):
    id: str | None = None
    legacyId: int | None = None
    name: str | None = None
    advertiserUrl: str | None = None
    tier: str | None = None
    license: str | None = None
    createdDate: str | None = None
    phoneNumbers: List[str] | None = None
    whatsAppNumber: str | None = None
    defaultMessage: str | None = None
    totalCountByFilter: int | None = None
    totalCountByAdvertiser: int | None = None
    isVerified: bool | None = None
    isPremium: bool | None = None
    imageUrl: str | None = None

class AdvertiserLogo(BaseModel):
    src: str | None = None
    alt: str | None = None

class RealEstateElement(BaseModel):
    id: str
    externalId: str
    contractType: str | None = None
    href: str | None = None
    prices: Price | None = None
    address: Address | None = None
    business: str | None = None
    highlight: str | None = None
    imageList: List[Image] | None = None
    amenities: Amenities | None = None
    realEstate: RealEstate | None = None
    visualized: bool | None = None
    description: str | None = None
    isNoWarrantorRent: bool | None = None
    constructionStatus: str | None = None
    expansionType: str | None = None
    sourceId: str | None = None
    stamps: List[str] | None = None
    unitTypes: List[str] | None = None
    displayAddressType: str | None = None
    advertiserLogo: AdvertiserLogo | None = None


In [3]:
# -- Create Real Estate Table
# CREATE TABLE real_estate (
#     id VARCHAR PRIMARY KEY, 
#     estate_id VARCHAR,
#     search_date TIMESTAMP,
#     external_id VARCHAR,
#     contract_type VARCHAR,
#     href VARCHAR,
    
#     -- Campos do modelo Price (prefixados com 'price_')
#     price_main_value INTEGER,
#     price_empty_value BOOLEAN,
#     price_below_price BOOLEAN,
#     price_multiple_prices BOOLEAN,
    
#     -- Campos do modelo Address (prefixados com 'address_')
#     address_city VARCHAR,
#     address_state_acronym VARCHAR,
#     address_neighborhood VARCHAR,
#     address_is_approximate_location BOOLEAN,
    
#     business VARCHAR,
#     highlight VARCHAR,
    
#     -- Lista de imagens armazenada como JSONB
#     image_list JSONB,
    
#     -- Campos do modelo Amenities (prefixados com 'amenities_')
#     amenities_usable_areas VARCHAR,
#     amenities_bedrooms VARCHAR,
#     amenities_bathrooms VARCHAR,
#     amenities_parking_spaces VARCHAR,
#     amenities_values JSONB, -- Lista de valores armazenada como JSONB
    
#     -- Campos do modelo RealEstate (prefixados com 'real_estate_')
#     real_estate_id VARCHAR,
#     real_estate_legacy_id INTEGER,
#     real_estate_name VARCHAR,
#     real_estate_advertiser_url VARCHAR,
#     real_estate_tier VARCHAR,
#     real_estate_license VARCHAR,
#     real_estate_created_date TIMESTAMP,
#     real_estate_phone_numbers JSONB, -- Lista de números de telefone armazenada como JSONB
#     real_estate_whats_app_number VARCHAR,
#     real_estate_default_message TEXT,
#     real_estate_total_count_by_filter INTEGER,
#     real_estate_total_count_by_advertiser INTEGER,
#     real_estate_is_verified BOOLEAN,
#     real_estate_is_premium BOOLEAN,
#     real_estate_image_url VARCHAR,
    
#     visualized BOOLEAN,
#     description TEXT,
#     is_no_warrantor_rent BOOLEAN,
#     construction_status VARCHAR,
#     expansion_type VARCHAR,
#     source_id VARCHAR,
    
#     -- Listas armazenadas como JSONB
#     stamps JSONB,
#     unit_types JSONB,
    
#     display_address_type VARCHAR,
    
#     -- Campos do modelo AdvertiserLogo (prefixados com 'advertiser_logo_')
#     advertiser_logo_src VARCHAR,
#     advertiser_logo_alt VARCHAR
# );


def datetime_to_iso8601_z(dt):
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    else:
        dt = dt.astimezone(timezone.utc)
    return dt.strftime('%Y-%m-%dT%H:%M:%SZ')

def real_estate_element_to_dict(real_estate_element: RealEstateElement, search_date: datetime):
    formatted_search_date = datetime_to_iso8601_z(search_date)
    
    data = {
        'id': str(uuid.uuid4()),
        'estate_id': real_estate_element.id,
        'search_date': formatted_search_date,
        'external_id': real_estate_element.externalId,
        'contract_type': real_estate_element.contractType,
        'href': real_estate_element.href,

        # Campos do modelo Price
        'price_main_value': real_estate_element.prices.mainValue if real_estate_element.prices else None,
        'price_empty_value': real_estate_element.prices.emptyValue if real_estate_element.prices else None,
        'price_below_price': real_estate_element.prices.belowPrice if real_estate_element.prices else None,
        'price_multiple_prices': real_estate_element.prices.multiplePrices if real_estate_element.prices else None,

        # Campos do modelo Address
        'address_city': real_estate_element.address.city if real_estate_element.address else None,
        'address_state_acronym': real_estate_element.address.stateAcronym if real_estate_element.address else None,
        'address_neighborhood': real_estate_element.address.neighborhood if real_estate_element.address else None,
        'address_is_approximate_location': real_estate_element.address.isApproximateLocation if real_estate_element.address else None,
        'business': real_estate_element.business,
        'highlight': real_estate_element.highlight,

        # image_list como JSON
        'image_list': json.dumps([image.model_dump() for image in real_estate_element.imageList]) if real_estate_element.imageList else None,

        # Campos do modelo Amenities
        'amenities_usable_areas': real_estate_element.amenities.usableAreas if real_estate_element.amenities else None,
        'amenities_bedrooms': real_estate_element.amenities.bedrooms if real_estate_element.amenities else None,
        'amenities_bathrooms': real_estate_element.amenities.bathrooms if real_estate_element.amenities else None,
        'amenities_parking_spaces': real_estate_element.amenities.parkingSpaces if real_estate_element.amenities else None,
        'amenities_values': json.dumps(real_estate_element.amenities.values) if real_estate_element.amenities and real_estate_element.amenities.values else None,
        
        # Campos do modelo RealEstate
        'real_estate_id': real_estate_element.realEstate.id if real_estate_element.realEstate else None,
        'real_estate_legacy_id': real_estate_element.realEstate.legacyId if real_estate_element.realEstate else None,
        'real_estate_name': real_estate_element.realEstate.name if real_estate_element.realEstate else None,
        'real_estate_advertiser_url': real_estate_element.realEstate.advertiserUrl if real_estate_element.realEstate else None,
        'real_estate_tier': real_estate_element.realEstate.tier if real_estate_element.realEstate else None,
        'real_estate_license': real_estate_element.realEstate.license if real_estate_element.realEstate else None,
        'real_estate_created_date': real_estate_element.realEstate.createdDate if real_estate_element.realEstate else None,
        'real_estate_phone_numbers': json.dumps(real_estate_element.realEstate.phoneNumbers) if real_estate_element.realEstate and real_estate_element.realEstate.phoneNumbers else None,
        'real_estate_whats_app_number': real_estate_element.realEstate.whatsAppNumber if real_estate_element.realEstate else None,
        'real_estate_default_message': real_estate_element.realEstate.defaultMessage if real_estate_element.realEstate else None,
        'real_estate_total_count_by_filter': real_estate_element.realEstate.totalCountByFilter if real_estate_element.realEstate else None,
        'real_estate_total_count_by_advertiser': real_estate_element.realEstate.totalCountByAdvertiser if real_estate_element.realEstate else None,
        'real_estate_is_verified': real_estate_element.realEstate.isVerified if real_estate_element.realEstate else None,
        'real_estate_is_premium': real_estate_element.realEstate.isPremium if real_estate_element.realEstate else None,
        'real_estate_image_url': real_estate_element.realEstate.imageUrl if real_estate_element.realEstate else None,
        
        'visualized': real_estate_element.visualized,
        'description': real_estate_element.description,
        'is_no_warrantor_rent': real_estate_element.isNoWarrantorRent,
        'construction_status': real_estate_element.constructionStatus,
        'expansion_type': real_estate_element.expansionType,
        'source_id': real_estate_element.sourceId,
        'stamps': json.dumps(real_estate_element.stamps) if real_estate_element.stamps else None,
        'unit_types': json.dumps(real_estate_element.unitTypes) if real_estate_element.unitTypes else None,
        'display_address_type': real_estate_element.displayAddressType,

        # Campos do modelo AdvertiserLogo
        'advertiser_logo_src': real_estate_element.advertiserLogo.src if real_estate_element.advertiserLogo else None,
        'advertiser_logo_alt': real_estate_element.advertiserLogo.alt if real_estate_element.advertiserLogo else None,
    }

    # Converter 'real_estate_created_date' para datetime, se necessário
    if data['real_estate_created_date']:
        try:
            data['real_estate_created_date'] = datetime.fromisoformat(data['real_estate_created_date'].replace('Z', '+00:00'))
        except ValueError:
            data['real_estate_created_date'] = None

    return data

In [4]:
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"


@dataclass
class RequestedPageResponse:
    html: Any
    code: int = 200
    exception: Exception | None = None

def get_page(url: str, timeout: int = 20, verbose: int = 0):

    request = Request(url)

    request.add_header("User-Agent", USER_AGENT)

    try:
        return RequestedPageResponse(html=urlopen(request, timeout=timeout))
    except HTTPError as e:
        logger.error("[error]", e)
        return RequestedPageResponse(html=None, code=e.getcode(), exception=e)



def backoff_hdlr(details):
    time.sleep(3)
    logger.warning("Backing off {wait:0.1f} seconds after {tries} tries "
           "calling function {target} with args {args} and kwargs "
           "{kwargs}".format(**details))
    
@backoff.on_exception(
    backoff.expo,
    HTTPError,
    max_tries=3,
    logger=logger,
    on_backoff=backoff_hdlr,
)
def get_page_html(page, action, type, localization):
    url = f"https://www.zapimoveis.com.br/{action}/{type}/{localization}/?pagina={page}"
    logger.debug(f"Requesting info from '{url}'")

    response = get_page(url)

    if response.code != HTTPStatus.OK:
        raise response.exception

    return response.html

def get_page_properties(page_html):
    soup = BeautifulSoup(page_html, "html.parser")
    script = soup.find('script', id='__NEXT_DATA__')
    listings = json.loads(script.text)
    return (
        listings
        .get("props", {})
        .get("pageProps", {})
        .get("initialProps", {})
    )

def get_total_listings(page_properties):
    return (
        page_properties
        .get("pagination", {})
        .get("totalListings", None)
    )

def get_real_state_data(page_properties):
    raw_data = (
        page_properties
        .get("data", {})
    )
    return [RealEstateElement(**d) for d in raw_data]


In [5]:
ACTION = "venda"
LOCALIZATION = "mg+pocos-de-caldas"
TYPE = "imoveis"

In [6]:
# # Como o loop deve ficar na funcao principal
# for page in [1,2, 1000, 3]:
#     try:
#         page_html = get_page_html(page=page, action=ACTION, type=TYPE, localization=LOCALIZATION)
#     except HTTPError:
#         logger.error(f"Error on scrapping page {page}")

In [15]:
import time
from sqlalchemy import create_engine
import os
import pandas as pd
from urllib.parse import quote_plus

db_params = dict(
    user=os.getenv("PSQL_USERNAME"),
    password=quote_plus(os.getenv("PSQL_PASSWORD")),
    host=os.getenv("PSQL_HOST"),
    port=os.getenv("PSQL_PORT"),
    dbname=os.getenv("PSQL_NAME"),
)

db_url = f"postgresql://{db_params['user']}:{db_params['password']}@{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
engine = create_engine(db_url)


MAX_LISTINGS = 2700
TABLE_NAME = "real_estate"

total_listings = None

page_count = 1
listings_count = 0
while (listings_count < (total_listings or 1e8) ) and (listings_count < MAX_LISTINGS):
    print(f"page_count = {page_count}")
    print(f"listings_count = {listings_count}\n\n\n")
    try:
        search_date = datetime.now()
        page_html = get_page_html(page=page_count, action=ACTION, type=TYPE, localization=LOCALIZATION)
        page_properties = get_page_properties(page_html)
        total_listings = get_total_listings(page_properties)
        real_state_data = get_real_state_data(page_properties)
        new_df = pd.DataFrame([real_estate_element_to_dict(x, search_date) for x in real_state_data])
        new_df.to_sql(TABLE_NAME, engine, if_exists='append', index=False)
        listings_count += len(new_df)
        if len(len(new_df)) == 0:
            break
    except HTTPError as err:
        logger.error(f"Error on scrapping page {page}")
        break

    page_count += 1;
    time.sleep(3)

[32m2024-11-22 03:50:51.550[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=1'[0m


page_count = 1
listings_count = 0





[32m2024-11-22 03:50:56.340[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=2'[0m


page_count = 2
listings_count = 15





[32m2024-11-22 03:51:00.782[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=3'[0m


page_count = 3
listings_count = 30





[32m2024-11-22 03:51:05.122[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=4'[0m


page_count = 4
listings_count = 45





[32m2024-11-22 03:51:09.444[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=5'[0m


page_count = 5
listings_count = 60





[32m2024-11-22 03:51:13.805[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=6'[0m


page_count = 6
listings_count = 75





[32m2024-11-22 03:51:18.119[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=7'[0m


page_count = 7
listings_count = 90





[32m2024-11-22 03:51:22.217[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=8'[0m


page_count = 8
listings_count = 105





[32m2024-11-22 03:51:26.417[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=9'[0m


page_count = 9
listings_count = 120





[32m2024-11-22 03:51:30.696[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=10'[0m


page_count = 10
listings_count = 135





[32m2024-11-22 03:51:34.694[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=11'[0m


page_count = 11
listings_count = 150





[32m2024-11-22 03:51:38.884[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=12'[0m


page_count = 12
listings_count = 165





[32m2024-11-22 03:51:43.090[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=13'[0m


page_count = 13
listings_count = 180





[32m2024-11-22 03:51:47.345[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=14'[0m


page_count = 14
listings_count = 195





[32m2024-11-22 03:51:51.962[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=15'[0m


page_count = 15
listings_count = 210





[32m2024-11-22 03:51:56.369[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=16'[0m


page_count = 16
listings_count = 225





[32m2024-11-22 03:52:00.904[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=17'[0m


page_count = 17
listings_count = 240





[32m2024-11-22 03:52:05.893[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=18'[0m


page_count = 18
listings_count = 255





[32m2024-11-22 03:52:10.045[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=19'[0m


page_count = 19
listings_count = 270





[32m2024-11-22 03:52:14.328[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=20'[0m


page_count = 20
listings_count = 285





[32m2024-11-22 03:52:18.584[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=21'[0m


page_count = 21
listings_count = 300





[32m2024-11-22 03:52:22.933[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=22'[0m


page_count = 22
listings_count = 315





[32m2024-11-22 03:52:27.995[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=23'[0m


page_count = 23
listings_count = 330





[32m2024-11-22 03:52:32.137[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=24'[0m


page_count = 24
listings_count = 345





[32m2024-11-22 03:52:36.428[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=25'[0m


page_count = 25
listings_count = 360





[32m2024-11-22 03:52:40.416[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=26'[0m


page_count = 26
listings_count = 360





[32m2024-11-22 03:52:45.376[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=27'[0m


page_count = 27
listings_count = 360





[32m2024-11-22 03:52:49.027[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=28'[0m


page_count = 28
listings_count = 360





[32m2024-11-22 03:52:53.466[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=29'[0m


page_count = 29
listings_count = 360





[32m2024-11-22 03:52:57.177[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=30'[0m


page_count = 30
listings_count = 360





[32m2024-11-22 03:53:01.349[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=31'[0m


page_count = 31
listings_count = 360





[32m2024-11-22 03:53:05.196[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=32'[0m


page_count = 32
listings_count = 360





[32m2024-11-22 03:53:08.959[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=33'[0m


page_count = 33
listings_count = 360





[32m2024-11-22 03:53:13.078[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=34'[0m


page_count = 34
listings_count = 360





[32m2024-11-22 03:53:17.429[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=35'[0m


page_count = 35
listings_count = 360





[32m2024-11-22 03:53:21.724[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=36'[0m


page_count = 36
listings_count = 360





[32m2024-11-22 03:53:25.578[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=37'[0m


page_count = 37
listings_count = 360





KeyboardInterrupt: 

In [17]:
len(new_df)



0

In [16]:
get_page_html(page=37, action=ACTION, type=TYPE, localization=LOCALIZATION)

[32m2024-11-22 03:53:46.005[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=37'[0m


<http.client.HTTPResponse at 0x7633d7db8670>

In [7]:
page_count = 1
page_html = get_page_html(page=page_count, action=ACTION, type=TYPE, localization=LOCALIZATION)
page_properties = get_page_properties(page_html)
total_listings = get_total_listings(page_properties)
real_state_data = get_real_state_data(page_properties)

[32m2024-11-22 03:45:37.477[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m39[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=1'[0m


In [None]:
!pip install fake_useragent

[0m

In [9]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

from fake_useragent import UserAgent
ua = UserAgent()
headers = {"User-Agent": ua.random}

ModuleNotFoundError: No module named 'fake_useragent'

In [93]:
MAX_LISTINGS = 10
count = 0
while (count < total_listings) and (count < MAX_LISTINGS):
    print(count)
    count += 1;

0
1
2
3
4
5
6
7
8
9


In [66]:
listings["props"]["pageProps"].keys()

dict_keys(['campaigns', 'initialProps', 'trackingData', 'initialCms', 'pageCategory', 'contingency', 'experiments', 'featureToggle', 'gtmId', 'seasonalCampaigns'])

In [67]:
listings["props"]["pageProps"]["initialProps"]["pagination"]

{'totalListings': 2378, 'range': 3, 'total': 24, 'current': 4}

In [51]:
listings["props"]["pageProps"]["initialProps"].keys()

dict_keys(['isCampaignPage', 'isPublisherPage', 'isDeduplication', 'filters', 'data', 'pagination', 'levels', 'metaContent', 'searchText', 'schema', 'widgets', 'amenities'])

In [18]:
data = get_real_state_data(page_html)

In [89]:
total_listings

2377