In [9]:
!pip install selenium
!pip install seleniumbase
!pip install random-user-agent -qqq
!pip install webdriver-manager -qqq

# Teste com Selenium

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
import time
import re
import requests
from random_user_agent.user_agent import UserAgent
import pandas as pd
import datetime
import pytz
from sqlalchemy import create_engine
from pandas import json_normalize
import os
from selenium.webdriver.chrome.service import Service
import concurrent.futures
import pyarrow as pa
import pyarrow.parquet as pq
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager

import time
from dataclasses import dataclass

import backoff
from loguru import logger

In [2]:
import json
from http import HTTPStatus
from typing import List, Optional, Any
from pydantic import BaseModel

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from urllib.error import HTTPError

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"


@dataclass
class RequestedPageResponse:
    html: Any
    code: int = 200
    exception: Exception | None = None

def get_page(url: str, timeout: int = 20, verbose: int = 0):

    request = Request(url)
    user_agent = UserAgent().get_random_user_agent()

    request.add_header("User-Agent", user_agent)

    try:
        return RequestedPageResponse(html=urlopen(request, timeout=timeout))
    except HTTPError as e:
        logger.error("[error]", e)
        return RequestedPageResponse(html=None, code=e.getcode(), exception=e)

def backoff_hdlr(details):
    time.sleep(3)
    logger.warning("Backing off {wait:0.1f} seconds after {tries} tries "
           "calling function {target} with args {args} and kwargs "
           "{kwargs}".format(**details))
    
@backoff.on_exception(
    backoff.expo,
    HTTPError,
    max_tries=3,
    logger=logger,
    on_backoff=backoff_hdlr,
)
def get_page_html(page, action, type, localization):
    url = f"https://www.zapimoveis.com.br/{action}/{type}/{localization}/?pagina={page}"
    logger.debug(f"Requesting info from '{url}'")

    response = get_page(url)

    if response.code != HTTPStatus.OK:
        raise response.exception

    return response.html

def get_number_of_real_estates(soup_object: BeautifulSoup):
    title_element = soup_object.find('div', {"class":"result-wrapper__title"})
    return int(re.sub('[^0-9]','',title_element.text))

def get_number_of_pages(number_of_real_estates: int):
    return number_of_real_estates//100 if number_of_real_estates//100 > 1 else 1
    

In [3]:
BASE_URL = 'https://www.zapimoveis.com.br'

TRANSACTION = "venda"
LOCALIZATION = "mg+pocos-de-caldas"
TYPE = "imoveis"
PAGE = 1

In [4]:
url = f'{BASE_URL}/{TRANSACTION}/{TYPE}/{LOCALIZATION}/?transacao={TRANSACTION}&pagina={PAGE}'

html_page = get_page_html(page=PAGE, action=TRANSACTION, type=TYPE, localization=LOCALIZATION)

soup = BeautifulSoup(html_page, "html.parser")

[32m2024-11-23 20:38:46.589[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_page_html[0m:[36m47[0m - [34m[1mRequesting info from 'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?pagina=1'[0m


In [6]:
n_estates = get_number_of_real_estates(soup)
n_pages = get_number_of_pages(n_estates)

In [7]:
pagina = n_pages + 2 if n_pages < 100 else 101
pagina

26

In [9]:
from selenium import webdriverf
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Configurações opcionais para o Chrome
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument('--remote-debugging-port=9222')

# Cria o serviço do ChromeDriver
service = Service(ChromeDriverManager().install())

# Inicializa o navegador com o serviço e as opções especificadas
browser = webdriver.Chrome(service=service, options=options)

url = f'{BASE_URL}/{TRANSACTION}/{TYPE}/{LOCALIZATION}/?transacao={TRANSACTION}&pagina={PAGE}'

browser.get(url)
time.sleep(2)

total_height = int(browser.execute_script("return document.body.scrollHeight"))


n = 1
while n < total_height:
    browser.execute_script(f"window.scrollTo(0, {n});")
    n += 90
    total_height = int(browser.execute_script("return document.body.scrollHeight"))

time.sleep(2)

resultado = browser.find_element(By.XPATH, '//*')
source_code = resultado.get_attribute("innerHTML")

browser.quit()

In [17]:
source_code

'<!--<![endif]--><head>\n<title>Attention Required! | Cloudflare</title>\n<meta charset="UTF-8">\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta http-equiv="X-UA-Compatible" content="IE=Edge">\n<meta name="robots" content="noindex, nofollow">\n<meta name="viewport" content="width=device-width,initial-scale=1">\n<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/cf.errors.css">\n<!--[if lt IE 9]><link rel="stylesheet" id=\'cf_styles-ie-css\' href="/cdn-cgi/styles/cf.errors.ie.css" /><![endif]-->\n<style>body{margin:0;padding:0}</style>\n\n\n<!--[if gte IE 10]><!-->\n<script>\n  if (!navigator.cookieEnabled) {\n    window.addEventListener(\'DOMContentLoaded\', function () {\n      var cookieEl = document.getElementById(\'cookie-alert\');\n      cookieEl.style.display = \'block\';\n    })\n  }\n</script>\n<!--<![endif]-->\n\n\n</head>\n<body>\n  <div id="cf-wrapper">\n    <div class="cf-alert cf-alert-error cf-cookie-error" id="cookie-alert" data-t

In [None]:

# Seu código para interagir com o navegador aqui

# Não se esqueça de fechar o navegador após o uso
browser.quit()


In [None]:
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager

# # Configurações opcionais para o Chrome
# options = webdriver.ChromeOptions()

# # Cria o serviço do ChromeDriver
# service = Service(ChromeDriverManager().install())

# # Uso do Context Manager com o WebDriver
# with webdriver.Chrome(service=service, options=options) as browser:
#     # Seu código para interagir com o navegador aqui
#     browser.get('https://www.example.com')
#     # ... outras operações ...
# # O navegador será fechado automaticamente ao sair do bloco 'with'


In [57]:
browser = webdriver.Chrome(ChromeDriverManager().install())

AttributeError: 'str' object has no attribute 'capabilities'

In [53]:
n_pagesa

24

In [42]:
res = soup.find('div', {"class":"result-wrapper__title"}).text

In [49]:
type(soup)

bs4.BeautifulSoup

In [44]:
imoveis = int(re.sub('[^0-9]','',res))
imoveis_pagina = imoveis//100 if imoveis//100 > 1 else 1

In [46]:
imoveis

2408

In [48]:
imoveis//100

24

In [35]:
ua = UserAgent()
user_agents = USER_AGENT#ua.get_random_user_agent()

headers = {'user-agent': user_agents.strip(), 'encoding':'utf-8'}

r = requests.get(url, headers = headers)

r.status_code

403

In [33]:
get_page(url)

RequestedPageResponse(html=<http.client.HTTPResponse object at 0x76ff6dbbb640>, code=200, exception=None)

In [36]:
UserAgent().get_random_user_agent()

'Mozilla/5.0 (Linux; Android 7.0; Mi-4c Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.42 Mobile Safari/537.36'

403

In [29]:
dir(r)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 '_next',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'next',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [24]:
r.reason

'Forbidden'

In [20]:
url

'https://www.zapimoveis.com.br/venda/imoveis/mg+pocos-de-caldas/?transacao=venda&pagina=1'