## Busca de Imóveis - ImovelWeb

Utilizar sites de corretoras para criar um dataset de imóveis à venda em Curitiba.

* Programação Concorrente

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import csv
from rateL import RateLimiter
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from unidecode import unidecode
import cloudscraper

# Shared limiter built from the local `rateL` module; presumably the argument
# caps the request rate — TODO confirm its unit against `rateL.RateLimiter`.
# NOTE(review): its only visible use, `next(RATE_LIMITER)` in get_page, is
# commented out, so no rate limiting is applied at the moment.
RATE_LIMITER = RateLimiter(10)
# Single cloudscraper instance reused for every request made by get_page.
SCRAPER = cloudscraper.create_scraper()

# Listing URL template; `{pagina}` is replaced with the results page number.
SITE = "https://www.imovelweb.com.br/casas-venda-curitiba-pr-pagina-{pagina}.html"

#### Bairros

In [2]:
with open('bairros.txt', 'r', encoding='utf-8') as f:
    # One neighbourhood name per line; a name's 0-based line number is its id.
    bs = f.read().splitlines()

    # Blank lines are left out: an empty name is a substring of every string,
    # so it would make define_bairro match any location. Filtering inside the
    # enumeration (instead of before it) keeps the ids of all other lines
    # exactly as they were.
    bairros = {nome: indice for indice, nome in enumerate(bs) if nome.strip()}

#### Funções

In [3]:
def list_to_csv(list,file):
    """Append rows to a CSV file.

    The file is opened in append mode with UTF-8 encoding, so rows written by
    earlier calls are kept. No header row is written.

    Args:
        list: Iterable of rows, each row being an iterable of cell values.
        file: Path of the CSV file to append to (created if missing).
    """
    # Rows built by get_page use the column order:
    # local, preco, metragem, quartos, garagens, banheiros, bairro
    rows = list
    with open(file, mode='a', encoding='utf-8', newline='') as handle:
        csv.writer(handle).writerows(rows)

In [4]:
def define_bairro(local):
    """Map a listing's location text to a neighbourhood id.

    Both the location and every known neighbourhood name are transliterated
    with ``unidecode`` before comparison, so accents do not affect matching.
    The comparison is a case-sensitive substring test.

    When several names occur in the location, the longest one wins, so that a
    short name contained in a longer one (e.g. "Centro" inside
    "Centro Civico") cannot shadow the more specific neighbourhood. Names of
    equal length keep the order of the ``bairros`` mapping.

    Args:
        local: Location text taken from a listing card.

    Returns:
        The id stored in the module-level ``bairros`` mapping for the matched
        neighbourhood, or -1 when no name occurs in ``local``.
    """
    # Transliterate the location once instead of once per candidate name.
    normalized_local = unidecode(local)

    best_index = -1
    best_length = 0
    for bairro, index in bairros.items():
        name = unidecode(bairro)
        # `len(name) > best_length` also rejects empty names, which would
        # otherwise be a substring of every location.
        if len(name) > best_length and name in normalized_local:
            best_index = index
            best_length = len(name)
    return best_index

In [5]:
# NOTE(review): presumably meant to strip HTML comments (`<!-- ... -->`), but as
# written it removes everything from any `<!` up to the next `->` on the same
# line. Kept verbatim so the parsed output does not change — confirm intent.
_MARKUP_DECLARATION_RE = re.compile(r'<!.*?->')

# One entry per feature column, in output order (metragem, quartos, garagens,
# banheiros): (position in the feature list, marker that must be present,
# texts to strip — longer variants first so plurals leave no stray letters).
_FEATURE_SPECS = (
    (1, 'm²', ('m²',)),
    (2, 'quarto', ('quartos', 'quarto')),
    (4, 'vaga', ('vagas', 'vaga')),
    (3, 'ban', ('banheiros', 'banheiro', 'ban')),
)


def _extract_feature(dados, position, marker, suffixes):
    """Return the cleaned feature value at ``position`` of ``dados``, or ''.

    The value is '' when the position does not exist or when the token there
    does not contain ``marker``. Otherwise every text in ``suffixes`` and all
    spaces are removed from the token.
    """
    try:
        token = dados[position]
    except IndexError:
        return ''
    if marker not in token:
        return ''
    for suffix in suffixes:
        token = token.replace(suffix, '')
    return token.replace(' ', '')


def get_page(s,p):
    """Download one results page and parse its listing cards.

    Args:
        s: URL template containing a ``{pagina}`` placeholder.
        p: Page number substituted into the template.

    Returns:
        A list with one row per parsed card, each row being
        ``[local, preco, metragem, quartos, garagens, banheiros, bairro]``.
        ``preco`` has ``R$``, spaces and dots removed; the four feature values
        are '' when absent; ``bairro`` is the value of ``define_bairro(local)``.
        Cards lacking the location, price or features element are skipped.

    Raises:
        Whatever ``SCRAPER.get`` raises when the request fails.
    """
    #next(RATE_LIMITER)

    link = s.format(pagina=p)
    content = SCRAPER.get(link).text

    result = _MARKUP_DECLARATION_RE.sub('', content).replace('\n', '')
    soup = BeautifulSoup(result, "html.parser")

    imoveis = []
    # NOTE(review): the class string looks auto-generated by the site's CSS
    # tooling; if the site changes it, this returns no cards — verify.
    for card in soup.find_all('div', class_='sc-i1odl-1 cDJsOw'):
        local_tag = card.find(attrs={'data-qa': 'POSTING_CARD_LOCATION'})
        preco_tag = card.find(attrs={'data-qa': 'POSTING_CARD_PRICE'})
        features = card.find(attrs={'data-qa': 'POSTING_CARD_FEATURES'})

        # `find` returns None for a missing element; skip the incomplete card
        # instead of letting an AttributeError discard the whole page.
        if local_tag is None or preco_tag is None or features is None:
            continue

        local = local_tag.text
        preco = preco_tag.text.replace('R$', '').replace(' ', '').replace('.', '')

        # Features arrive as one string with double-space separators.
        dados = features.text.split('  ')

        imovel = [local, preco]
        imovel.extend(_extract_feature(dados, *spec) for spec in _FEATURE_SPECS)
        imovel.append(define_bairro(local))

        imoveis.append(imovel)

    return imoveis

In [6]:
def thread_executor(pages, max_workers=15):
    """Scrape several result pages concurrently and merge their rows.

    Each page number is submitted to ``get_page`` together with ``SITE``. A
    page whose task raises (or returns ``False``) is recorded and skipped, so
    one failing request no longer discards the rows of every other page.

    Args:
        pages: Iterable of page numbers to fetch.
        max_workers: Size of the thread pool; defaults to 15.

    Returns:
        A flat list with the rows of every successfully parsed page, in task
        completion order (not page order).
    """
    imoveis = []
    erros = []

    with ThreadPoolExecutor(max_workers) as executor:
        # Remember which page each future belongs to so failures can be named.
        futures = {executor.submit(get_page, SITE, p): p for p in pages}

        for task in tqdm(as_completed(futures), total=len(futures)):
            pagina = futures[task]
            try:
                # result() re-raises any exception raised inside the worker.
                resultado = task.result()
            except Exception as exc:
                erros.append((pagina, exc))
                continue

            # Kept for a get_page that signals failure by returning False.
            if resultado is False:
                erros.append((pagina, None))
            else:
                imoveis += resultado

    if erros:
        falhas = ', '.join(str(pagina) for pagina, _ in erros)
        # tqdm.write prints without breaking the progress bar rendering.
        tqdm.write(f"{len(erros)} page(s) failed and were skipped: {falhas}")

    return imoveis

#### Execução

In [10]:
# Scrape result pages 701 through 799 (the end of `range` is exclusive).
imoveis = thread_executor(range(701,800))

# Append the scraped rows to the CSV. list_to_csv opens the file in append
# mode, so running this again adds the same rows a second time.
list_to_csv(imoveis,'imoveis_imovelweb.csv')

100%|██████████| 99/99 [00:34<00:00,  2.89it/s]
