## Busca de Imóveis

Utilizar sites de corretoras para criar um dataset de imóveis de Curitiba

* Programação Concorrente

In [18]:
from bs4 import BeautifulSoup
import requests
import re
import csv
from rateL import RateLimiter
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from unidecode import unidecode

# Shared throttle: get_page advances it with next() before every HTTP request.
# NOTE(review): the meaning of the argument 2 is defined by rateL.RateLimiter
# (presumably a requests-per-second cap) - confirm against that package.
RATE_LIMITER = RateLimiter(2)
# Search-results URL template for properties for sale in Curitiba; the
# `{pagina}` placeholder is filled with the page number via str.format.
SITE = "https://www.zapimoveis.com.br/venda/imoveis/pr+curitiba/?onde=,Paran%C3%A1,Curitiba,,,,,city,BR%3EParana%3ENULL%3ECuritiba,-25.437238,-49.269973,&pagina={pagina}&tipo=Im%C3%B3vel%20usado&transacao=Venda"

#### Bairros

In [23]:
with open('bairros.txt', 'r', encoding='utf-8') as f:
    # One neighborhood name per line; the zero-based line number is its id.
    bs = f.read().splitlines()

# Map each neighborhood name to its line index in bairros.txt.
# Blank lines are skipped (they still consume an index, so the ids of the
# remaining names are unchanged): an empty name is a substring of every
# address, so keeping it would make the substring lookup in define_bairro
# match it for every listing.
bairros = {name: idx for idx, name in enumerate(bs) if name.strip()}

#### Funções

In [21]:
def list_to_csv(list,file):
    """Write listing rows to a UTF-8 CSV file, preceded by a fixed header row.

    Args:
        list: iterable of rows; each row is a sequence of the seven values
            named in the header below.
        file: path of the CSV file to create (an existing file is overwritten).
    """
    header = ['local','preco','metragem','quartos','garagens','banheiros','bairro']

    # newline='' lets the csv module control the line terminators itself.
    with open(file, mode='w', encoding='utf-8', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        for row in list:
            out.writerow(row)

In [3]:
def _feature_text(tag, *drop):
    """Return the text of a parsed tag with `drop` tokens and spaces removed.

    Returns '' when `tag` is None, i.e. when the listing card does not
    contain the requested element.
    """
    if tag is None:
        return ''
    text = tag.text
    # Remove the caller's tokens first, then every space (same order as the
    # chained replace() calls this helper stands in for).
    for token in drop + (' ',):
        text = text.replace(token, '')
    return text


def get_page(s,p):
    """Download one search-results page and extract its listing rows.

    Args:
        s: URL template containing a `{pagina}` placeholder.
        p: page number substituted into the template.

    Returns:
        A list of rows `[local, preco, metragem, quartos, garagens,
        banheiros, bairro]`, one per listing card found on the page.
        Optional features missing from a card are stored as ''.
        Returns False when the request fails (network error, timeout or
        a non-OK HTTP status).

    Raises:
        AttributeError: if a card lacks the `<h2>` address or the
            `<p><strong>` price element, which are treated as mandatory.
    """
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })

    # Advance the shared limiter before each request.
    # NOTE(review): presumably this blocks until a request slot is free -
    # confirm against rateL.RateLimiter.
    next(RATE_LIMITER)

    try:
        # Without a timeout a stalled connection would block this worker
        # thread forever.
        link = requests.get(s.format(pagina=p), headers=headers, timeout=30)
    except requests.RequestException:
        # Report network failures through the same False result used for
        # bad HTTP statuses, so one bad page does not abort the whole run.
        return False

    if not link.ok:
        return False

    # Drop `<!...->` spans and newlines before handing the markup to the parser.
    result = re.sub(r'<!.*?->','', link.content.decode("utf-8"))
    result = result.replace('\n','')

    soup = BeautifulSoup(result,"html.parser")

    imoveis = []
    for card in soup.find_all('div', class_='simple-card__box'):
        # Address line, with the indentation runs left over from the markup removed.
        local = card.find('h2').text
        local = local.replace('            ','').replace('          ','')

        preco = card.find('p').find('strong').text
        preco = preco.replace('R$','').replace(' ','').replace('.','')

        metragem = _feature_text(card.find("span", itemprop="floorSize"), 'm²')
        quartos = _feature_text(card.find("span", itemprop="numberOfRooms"))

        # The parking count sits in a class-less <span> inside its <li>.
        garagem_item = card.find("li", class_="feature__item text-small js-parking-spaces")
        garagem_span = None
        if garagem_item is not None:
            garagem_span = garagem_item.find('span', class_="")
        garagens = _feature_text(garagem_span)

        banheiros = _feature_text(card.find("span", itemprop="numberOfBathroomsTotal"))

        bairro = define_bairro(local)

        imoveis.append([local, preco, metragem, quartos, garagens, banheiros, bairro])

    return imoveis

In [19]:
def define_bairro(local):
    """Return the id of the first known neighborhood named inside `local`.

    Both the neighborhood names and `local` are transliterated with
    unidecode before comparison, so the match ignores accents. The
    comparison is a plain case-sensitive substring test.

    Args:
        local: address text of a listing.

    Returns:
        The value stored in the module-level `bairros` mapping for the first
        matching name (in the mapping's iteration order), or -1 when no name
        matches.
    """
    # NOTE(review): because this is a substring test, a name contained in
    # another name can win over the longer one depending on order - verify
    # against the contents of bairros.txt.
    # Transliterate the address once instead of once per neighborhood.
    local_ascii = unidecode(local)
    for bairro, bairro_id in bairros.items():
        if unidecode(bairro) in local_ascii:
            return bairro_id
    return -1

In [7]:
def thread_executor(pages):
    """Fetch several result pages concurrently and merge their listing rows.

    Each page is downloaded by get_page on a pool of 15 worker threads, with
    a tqdm progress bar advancing as pages complete.

    Args:
        pages: iterable of page numbers to fetch from SITE.

    Returns:
        A single list with the rows of every page that was fetched
        successfully, in completion order (not page order). Pages for which
        get_page returned False are left out and listed in a message printed
        after the progress bar finishes.

    Raises:
        Any exception raised inside get_page is re-raised here when that
        page's result is collected.
    """
    imoveis = []
    failed_pages = []

    with ThreadPoolExecutor(15) as executor:
        # Remember which page each future belongs to so failures can be named.
        futures = {executor.submit(get_page, SITE, p): p for p in pages}

        for task in tqdm(as_completed(futures), total=len(futures)):
            result = task.result()
            if result is False:
                failed_pages.append(futures[task])
            else:
                imoveis += result

    if failed_pages:
        # Surface dropped pages instead of silently producing a smaller dataset.
        tqdm.write(f"Pages that could not be fetched: {failed_pages}")

    return imoveis

#### Execução

In [8]:
# Scrape result pages 1 through 19 (the range end is exclusive) and gather
# the listing rows from all of them.
imoveis = thread_executor(range(1,20))

# Write the collected rows, with a header line, to imoveis.csv.
list_to_csv(imoveis,'imoveis.csv')

 11%|█         | 2/19 [00:05<00:47,  2.79s/it]
