In [1]:
%config IPCompleter.use_jedi=False

In [2]:
import os
from loguru import logger
from urllib.parse import quote_plus
from playwright._impl._errors import TimeoutError

from pc_zap_scrapper.v2.scrape import get_estates_from_page, get_html_page, get_number_of_pages, get_number_of_real_estates
from pc_zap_scrapper.v2.database import DatabaseHandler, TableRealEstateInfo


import nest_asyncio
nest_asyncio.apply()

In [None]:
ACTION="venda"
TYPE="imoveis"
LOCALIZATION="mg+pocos-de-caldas"

db_params = dict(
    user=os.getenv("PSQL_USERNAME"),
    password=quote_plus(os.getenv("PSQL_PASSWORD")),
    host=os.getenv("PSQL_HOST"),
    port=os.getenv("PSQL_PORT"),
    dbname=os.getenv("PSQL_NAME"),
)

db_handler = DatabaseHandler(db_params, table=TableRealEstateInfo, echo=False)


soup = get_html_page(f"https://www.zapimoveis.com.br/{ACTION}/{TYPE}/{LOCALIZATION}")
number_of_real_estates = get_number_of_real_estates(soup)
number_of_pages = get_number_of_pages(number_of_real_estates)
logger.info(f"number_of_real_estates = {number_of_real_estates}")
logger.info(f"number_of_pages = {number_of_pages}")

N_EXPECTED_PAGES = 25

for page in range(1, 1+number_of_pages):

    if page > N_EXPECTED_PAGES:
        break

    logger.info(f"scrapping page {page}")
    try:
        estates = await get_estates_from_page(
            action=ACTION,
            type=TYPE,
            localization=LOCALIZATION,
            page=page,
        )
        await db_handler.create_table()
        await db_handler.insert_data(estates)
    except TimeoutError:
        continue

    except:
        logger.error("Não foi possível persistir. Valores duplicados")


await db_handler.close()

[32m2024-11-29 02:59:46.208[0m | [34m[1mDEBUG   [0m | [36mpc_zap_scrapper.v2.scrape[0m:[36mget_html_page[0m:[36m413[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas'[0m
[32m2024-11-29 02:59:50.365[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mnumber_of_real_estates = 2377[0m
[32m2024-11-29 02:59:50.367[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [1mnumber_of_pages = 23[0m
[32m2024-11-29 02:59:50.367[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mscrapping page 1[0m
Loop 8 | 110 elementos: 100%|██████████| 110/110 [01:19<00:00,  1.38it/s]
[32m2024-11-29 03:01:19.699[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mscrapping page 2[0m
Loop 7 | 105 elementos: 100%|██████████| 105/105 [01:09<00:00,  1.51it/s]
[32m2024-11-29 03:02:39.792[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0

In [38]:

import pandas as pd
import sqlalchemy
from sqlalchemy import insert
from sqlalchemy.orm import declarative_base
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import sessionmaker
from sqlalchemy import (
   Column,
   UUID,
   Text,
   TIMESTAMP,
   Double,
   Integer,
   Numeric,
   ARRAY,
)


Base = declarative_base()


class DatabaseHandler:
    """Async helper around a SQLAlchemy engine bound to one mapped table.

    The engine uses the ``postgresql+asyncpg`` driver. Call `close` when done
    to dispose of the engine.
    """

    def __init__(self, db_params: dict, table: sqlalchemy.orm.decl_api.DeclarativeMeta, echo: bool = False):
        """Build the engine and the session factory.

        Args:
            db_params: Mapping with the keys ``user``, ``password``, ``host``,
                ``port`` and ``dbname``. The values are interpolated verbatim
                into the connection URL, so the caller must URL-quote them.
            table: Declarative model class targeted by `insert_data` and
                `create_table`.
            echo: Forwarded to ``create_async_engine``.
        """
        db_url = (
            f"postgresql+asyncpg://{db_params['user']}:{db_params['password']}@"
            f"{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
        )
        self.table = table
        self.engine = create_async_engine(db_url, echo=echo)
        self.async_session = sessionmaker(self.engine, class_=AsyncSession, expire_on_commit=False)

    async def create_table(self):
        """Create the tables registered on the metadata of ``self.table``.

        The metadata is taken from the mapped class rather than from the
        module-level ``Base``: a model declared on a different declarative base
        is not registered on ``Base``, so ``Base.metadata.create_all`` would
        create nothing for it.
        """
        async with self.engine.begin() as conn:
            await conn.run_sync(self.table.metadata.create_all)

    async def insert_data(self, df: pd.DataFrame):
        """Insert every row of ``df`` into ``self.table`` in one transaction.

        Args:
            df: Frame whose rows are converted with ``to_dict(orient="records")``
                and passed to a single multi-row INSERT.
        """
        records = df.to_dict(orient="records")
        if not records:
            # Nothing to persist; avoid issuing an INSERT with an empty VALUES list.
            return

        async with self.async_session() as session:
            # `session.begin()` commits on a clean exit and rolls back on error,
            # so no explicit commit is needed inside the block.
            async with session.begin():
                await session.execute(insert(self.table).values(records))

    async def query(self, sql_query: str) -> pd.DataFrame:
        """Run ``sql_query`` and return all rows as a DataFrame.

        The string is executed as-is through ``sqlalchemy.text`` with no bound
        parameters, so it must not be built from untrusted input.

        Args:
            sql_query: SQL statement to execute.

        Returns:
            A DataFrame with one row per result row and the result's column names.
        """
        async with self.engine.connect() as conn:
            result = await conn.execute(sqlalchemy.text(sql_query))
            return pd.DataFrame(result.fetchall(), columns=result.keys())

    async def close(self):
        """Dispose of the engine."""
        await self.engine.dispose()


In [39]:
# Connection settings are read from the environment (same variables as the
# scraping cell); the password is URL-quoted.
db_params = {
    "user": os.getenv("PSQL_USERNAME"),
    "password": quote_plus(os.getenv("PSQL_PASSWORD")),
    "host": os.getenv("PSQL_HOST"),
    "port": os.getenv("PSQL_PORT"),
    "dbname": os.getenv("PSQL_NAME"),
}

# Fresh handler for the read-only query cells that follow.
db_handler = DatabaseHandler(db_params, table=TableRealEstateInfo, echo=False)


In [40]:
sql_query = """
SELECT * FROM real_estate_info
"""

data = await db_handler.query(sql_query)

await db_handler.close()

In [41]:
data

Unnamed: 0,id,estate_id,action,search_date,post_type,link,type,image_list,snippet,street,neighbor,city,state,latitude,longitude,floor_size,number_of_rooms,number_of_bathrooms,number_of_parking_spaces,amenities_list,price,condominium,iptu
0,fc495ad4-19d8-46a5-a6a8-065e5258d52e,2635911061,venda,2024-11-27 05:21:57.499350,SUPER PREMIUM,https://www.zapimoveis.com.br/imovel/venda-apa...,Apartamento,[https://resizedimgs.zapimoveis.com.br/crop/61...,Apartamento com 3 quartos à venda,,Jardim dos Estados,Poços de Caldas,MG,-21.783975,-46.552999,235.0,3.0,1.0,2.0,"[Cozinha, Elevador, Interfone, Área de serviço]",800000.00,700.00,0.00
1,ff0af6b2-ef1b-4070-ae2f-0a6312c47157,2753629515,venda,2024-11-27 05:21:57.499350,SUPER PREMIUM,https://www.zapimoveis.com.br/imovel/venda-cas...,Casa,[https://resizedimgs.zapimoveis.com.br/crop/61...,Casa com 4 quartos à venda,,Boa Esperança,Poços de Caldas,MG,-21.826302,-46.557761,219.0,4.0,4.0,3.0,,450000.00,,
2,02007133-c21e-4dc9-9e2f-34e9d95e92d6,2756849948,venda,2024-11-27 05:21:57.499350,SUPER PREMIUM,https://www.zapimoveis.com.br/imovel/venda-cas...,Casa,[https://resizedimgs.zapimoveis.com.br/crop/61...,Casa com 3 quartos à venda,,Jardim Carolina,Poços de Caldas,MG,-21.793090,-46.601182,131.0,3.0,4.0,3.0,"[Circuito de segurança, Portaria 24h, Espaço g...",820000.00,,
3,476271bb-6912-4e8f-acdb-f9389ab87124,2635633461,venda,2024-11-27 05:21:57.499350,PREMIUM,https://www.zapimoveis.com.br/imovel/venda-cas...,Casa,[https://resizedimgs.zapimoveis.com.br/crop/61...,Casa com quarto à venda,,Jardim Ipê,Poços de Caldas,MG,-21.802998,-46.543752,63.0,1.0,1.0,,,250000.00,,
4,2277df07-4a44-4526-b8a7-bce42e1b5298,2731635351,venda,2024-11-27 05:21:57.499350,STANDARD,https://www.zapimoveis.com.br/imovel/venda-apa...,Apartamento,[https://resizedimgs.zapimoveis.com.br/crop/61...,Apartamento com 2 quartos à venda,,Jardim Bandeirantes,Poços de Caldas,MG,-21.809666,-46.586492,50.0,2.0,1.0,1.0,"[Cozinha, Interfone, Varanda, Área de serviço,...",200000.00,180.00,660.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2430,df186c48-0c19-4435-86a5-2dbd6a0eec82,2742901654,venda,2024-11-27 06:16:06.315730,STANDARD,https://www.zapimoveis.com.br/imovel/venda-sob...,Casa,[https://resizedimgs.zapimoveis.com.br/crop/61...,Casa com 3 quartos à venda,,Parque Vivaldi Leite Ribeiro,Poços de Caldas,MG,-21.798239,-46.557005,279.0,3.0,3.0,6.0,"[Churrasqueira, Interfone, Lavanderia, Cozinha...",900000.00,,
2431,ecfd5ee6-d3b3-46a3-9b5a-b1a8b4edb63a,2654686087,venda,2024-11-27 06:16:06.315730,STANDARD,https://www.zapimoveis.com.br/imovel/venda-apa...,Apartamento,[https://resizedimgs.zapimoveis.com.br/crop/61...,Apartamento com 4 quartos à venda,,Centro,Poços de Caldas,MG,-21.784097,-46.570472,333.0,4.0,1.0,4.0,"[Varanda, Churrasqueira, Condomínio fechado, S...",3849850.00,1.00,1.00
2432,fcdcd1e3-0429-4228-ba7a-858982af3752,2759299622,venda,2024-11-27 06:16:06.315730,PREMIUM,https://www.zapimoveis.com.br/imovel/venda-apa...,Apartamento,[https://resizedimgs.zapimoveis.com.br/crop/61...,Apartamento com 2 quartos à venda,,Centro,Poços de Caldas,MG,-21.784097,-46.570472,75.0,2.0,1.0,,"[Elevador, Aceita animais]",340000.00,500.00,0.00
2433,f2cc3484-b3ed-4947-b966-2692096f7604,2724651059,venda,2024-11-27 06:16:06.315730,PREMIUM,https://www.zapimoveis.com.br/imovel/venda-cas...,Casa,[https://resizedimgs.zapimoveis.com.br/crop/61...,Casa com 2 quartos à venda,,Bianucci,Poços de Caldas,MG,-21.796872,-46.522370,400.0,2.0,5.0,6.0,"[Churrasqueira, Cozinha, Lareira, Piscina, Sis...",1900000.00,0.00,1.00


In [37]:
await db_handler.close()

# Sandbox

In [3]:
import pandas as pd

In [4]:
pd.options.display.max_columns = None

In [5]:
ACTION="venda"
TYPE="imoveis"
LOCALIZATION="mg+pocos-de-caldas"


In [None]:

# Sandbox: scrape only the first results page, using the constants defined in
# the cell above, so the parsed result can be inspected interactively.
page = 1

estates = await get_estates_from_page(
    action=ACTION,
    type=TYPE,
    localization=LOCALIZATION,
    page=page,
)

In [None]:
estates

In [8]:
from pc_zap_scrapper.v2.scrape import scrape_estate_divs_from_page, _get_snippet

In [9]:
# Fetch the listing elements of page 1 without parsing them, so the individual
# `_get_*` helpers can be tried against a single element in the cells below.
divs = await scrape_estate_divs_from_page(
    action=ACTION,
    type=TYPE,
    localization=LOCALIZATION,
    page=1,
)

Loop 8 | 113 elementos: 100%|██████████| 113/113 [01:11<00:00,  1.57it/s]


In [10]:
div = divs[0]

In [11]:
from pc_zap_scrapper.v2.scrape import _get_type

In [14]:
def _get_type(div):
    """Classify a listing card as "Apartamento", "Casa", "Flat" or "Rural".

    The snippet returned by `_get_snippet` is tried first, through
    `_get_type_from_snippet`. If that gives nothing, the lower-cased markup of
    the whole card is scanned for keywords.

    Args:
        div: Listing card element; it is passed to `_get_snippet` and
            stringified for the keyword scan.

    Returns:
        The type label, or None when neither the snippet nor the markup
        contains a known keyword.
    """
    # Initialise explicitly: when the card has no snippet the name would
    # otherwise be unbound and the check below would raise UnboundLocalError.
    estate_type = None
    if snippet := _get_snippet(div):
        estate_type = _get_type_from_snippet(snippet)

    if estate_type:
        return estate_type

    # Stringify and lower-case the card once instead of once per keyword.
    markup = str(div).lower()

    # Order matters: the first keyword found decides the label.
    plain_keywords = (
        ("apartamento", "Apartamento"),
        ("casa", "Casa"),
        ("cobertura", "Apartamento"),
        ("flat", "Flat"),
    )
    for keyword, label in plain_keywords:
        if keyword in markup:
            return label

    # The rural keywords are matched against the `unidecode`-processed markup,
    # as the original per-keyword checks did.
    unaccented_markup = unidecode(markup)
    if any(keyword in unaccented_markup for keyword in ("chacara", "sitio", "fazenda")):
        return "Rural"

    return None


def _get_type_from_snippet(snippet):
    if not snippet:
        logger.warning("No snippet found")
        return None
    snippet = str(snippet)
    is_rural = ("fazenda" in snippet.lower()) or ("sítio" in snippet.lower()) or ("chácara" in snippet.lower())
    is_comercial = ("ponto comercial" in snippet.lower()) or ("loja" in snippet.lower()) or ("box" in snippet.lower())
    is_lote = ("lote" in snippet.lower()) or ("terreno" in snippet.lower())

    if "apartamento" in snippet.lower():
        return "Apartamento"

    if is_rural:
        return "Rural"

    if "casa" in snippet.lower():
        return "Casa"

    if "flat" in snippet.lower():
        return "Flat"

    if "cobertura" in snippet.lower():
        return "Apartamento"

    if is_comercial:
        return "Comercial"

    if is_lote:
        return "Lote"

In [15]:
snippet = _get_snippet(div)
type = _get_type(div)

type

'Casa'

In [19]:
snippet

'Centro, Poços de Caldas'

In [13]:

def _format_neighbor_name(text: str) -> str:
    if text:
        return unidecode(text.strip().lower())
    return None

# Coordinates lookup: a parquet file shipped inside the `pc_zap_scrapper`
# package, indexed by its "neighborhood" column and turned into a
# {neighborhood: {column: value}} dict.
# NOTE(review): `resources` is not imported in any visible cell; presumably
# `importlib.resources`. Confirm.
pc_neighbors_latlong = (
    pd.read_parquet(resources.files("pc_zap_scrapper").joinpath("datasets/external/neighbor_latlong.parquet"))
    .set_index("neighborhood")
    .to_dict(orient="index")
)

# Re-key the lookup with names normalised by `_format_neighbor_name`; entries
# with a falsy original key are dropped.
pc_neighbors_latlong = {_format_neighbor_name(k): v for k, v in pc_neighbors_latlong.items() if k}

# NOTE(review): the statements below read like a function body pasted into a
# cell. `search_date`, `datetime`, `_get_prices` and `_get_location` are not
# defined or imported in any visible cell, so this depends on state from
# elsewhere. Confirm it survives Restart & Run All.
search_date = search_date or datetime.now()
prices = _get_prices(div)
snippet = _get_snippet(div)
location = _get_location(div)
# NOTE(review): the `_get_type` defined in this notebook takes the card element
# and derives the snippet itself; here it is given the snippet. Verify which
# signature is intended.
type = _get_type(snippet)
neighbor = location.get("neighbor")
formatted_neighbor_name = _format_neighbor_name(neighbor)
# An unknown or missing neighbourhood gives an empty dict, so both coordinates
# below become None.
latlong = pc_neighbors_latlong.get(formatted_neighbor_name, {}) if formatted_neighbor_name else {}
latitude = latlong.get("latitude", None)
longitude = latlong.get("longitude", None)


In [14]:
snippet

'Jardim Quisisana, Poços de Caldas'

In [None]:
# @suppress_errors_and_log
# def _get_location(div):
#     if subtitle := div.find("h2", attrs={"data-testid": "card-header"}):
#         if spans := subtitle.findAll("span"):
#             texts = [x.text for x in spans]
#             *street, neighbor, city, state = [x.strip() for x in texts[1].replace("-", ",").split(",")]
#             return {
#                 "street": " - ".join(street),
#                 "neighbor": neighbor,
#                 "city": city,
#                 "state": state,
#             }

#     if subtitle := div.find("h2"):
#         elements = str(subtitle.text).split(",")
#         if len(elements) == 2:
#             return {
#                 "street": None,
#                 "neighbor": elements[0],
#                 "city": elements[1],
#                 "state": None,
#             }
#         logger.warning(f"It was not possible to get location info from snippet '{subtitle}'")
#     return {
#         "street": None,
#         "neighbor": None,
#         "city": None,
#         "state": None,
#     }



In [12]:
div

<div data-position="1" data-type="SUPER PREMIUM"><div class="ListingCard_result-card__Pumtx"><a class="ListingCard_result-card__Pumtx ListingCard_result-card__highlight__UgAvU ListingCard_result-card__highlight--super__8WmjB" data-id="2759584336" href="https://www.zapimoveis.com.br/imovel/venda-terreno-lote-condominio-jardins-de-florenca-pocos-de-caldas-mg-276m2-id-2759584336/" itemprop="url" itemscope="" itemtype="https://schema.org/House" target="_blank"><div class="BaseCard_card__wrapper__AKWue"><div class="BaseCard_card__Ci4Ny BaseCard_card--horizontal__GgDlY" data-testid="card"><div class="BaseCard_card-carousel__VFcJU"><div class="l-carousel-image" data-cy="rp-cardProperty-image-img"><div class="l-carousel-image__container"><ul class="l-carousel-image__list" style="transform: translateX(0px);"><li aria-hidden="false"><img alt="Imagem do imóvel" class="l-image" fetchpriority="low" itemprop="image" loading="lazy" src="https://resizedimgs.zapimoveis.com.br/crop/614x297/vr.images.sp/

In [20]:
def _get_snippet(div):
    if subtitle := div.find("h2", attrs={"data-testid": "card-header"}):
        if spans := subtitle.findAll("span"):
            texts = [x.text for x in spans]
            return texts[0].split(",")[0]

    if subtitle := div.find("h2"):
        return subtitle.text

In [21]:
div.find("h2")

<h2 class="l-text l-u-color-neutral-28 l-text--variant-heading-small l-text--weight-medium truncate" data-cy="rp-cardProperty-location-txt" title="Jardins de Florença, Poços de Caldas">Jardins de Florença, Poços de Caldas</h2>

In [22]:
_get_snippet(div)

'Jardins de Florença, Poços de Caldas'

In [15]:
div.find("h2")

<h2 class="l-text l-u-color-neutral-28 l-text--variant-heading-small l-text--weight-medium truncate" data-cy="rp-cardProperty-location-txt" title="Jardins de Florença, Poços de Caldas">Jardins de Florença, Poços de Caldas</h2>