In [1]:
import json
import textwrap
import urllib
import urllib.parse
from datetime import datetime
from decimal import Decimal

import requests
from bs4 import BeautifulSoup

In [2]:
# Percent-encoded search filters, in the form they take inside a results URL.
query = 'search%5Bfilter_float_m%3Afrom%5D=100&search%5Bcreated_since%5D=3&search%5Bregion_id%5D=11&search%5Bsubregion_id%5D=153'

# Decode the string and break it into one [key, value] pair per parameter.
decoded_query = urllib.parse.unquote(query)
query_arr = [param.split('=') for param in decoded_query.split('&')]
query_dict = {pair[0]: pair[1] for pair in query_arr}

# Hand-written filter set. This assignment replaces the mapping parsed above,
# so its key order is the one that ends up in the request URL.
query_dict = {
    'search[created_since]': '3',
    'search[filter_float_m:from]': '100',
    'search[region_id]': '11',
    'search[subregion_id]': '153',
}

def json_to_dict(dict_json):
    """Serialize a mapping of query parameters into a URL query string.

    Despite its name, this function turns a dict into a string of the form
    ``key1=value1&key2=value2``.

    Keys and values are percent-encoded with ``urllib.parse.quote`` and
    ``'/'`` is left unescaped, so string inputs produce the same text as
    quoting each item by hand. Non-string values (e.g. an integer page
    number) are converted with ``str()`` before encoding instead of raising
    ``TypeError``.

    Args:
        dict_json: Mapping of parameter names to values.

    Returns:
        The encoded query string, without a leading ``?``. An empty mapping
        yields an empty string.
    """
    # quote_via=quote keeps spaces as %20 (urlencode's default, quote_plus,
    # would emit '+'); safe='/' matches quote()'s own default.
    return urllib.parse.urlencode(dict_json, safe='/', quote_via=urllib.parse.quote)

# Re-encode the filter mapping into the percent-encoded query string that is
# appended to the request URL.
query = json_to_dict(query_dict)

In [3]:
host = 'https://www.imovirtual.com'
path = 'comprar/apartamento/lisboa'
url = '{}/{}/?{}'.format(host, path, query)

# Crawl the paginated result list: fetch a page, collect its <article> tags,
# then follow the "next" pager link until there is none left.
total_pages = 0
articles = []
while url:
    print('Request URL: {}'.format(url))
    total_pages += 1
    # A timeout keeps a stalled connection from hanging the notebook forever.
    req = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page as results.
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')
    articles.extend(soup.find_all('article'))

    # find() returns None when the element is absent, so guard both lookups:
    # the pager <li> may be missing altogether, or present without a link.
    pagination = soup.find('li', 'pager-next')
    next_page_tag = pagination.find('a') if pagination is not None else None
    url = next_page_tag.get('href') if next_page_tag is not None else None

print('Total pages: {}\nTotal articles: {}'.format(total_pages, len(articles)))

Request URL: https://www.imovirtual.com/comprar/apartamento/lisboa/?search%5Bcreated_since%5D=3&search%5Bfilter_float_m%3Afrom%5D=100&search%5Bregion_id%5D=11&search%5Bsubregion_id%5D=153
Request URL: https://www.imovirtual.com/comprar/apartamento/lisboa/?search%5Bcreated_since%5D=3&search%5Bfilter_float_m%3Afrom%5D=100&search%5Bregion_id%5D=11&search%5Bsubregion_id%5D=153&page=2
Request URL: https://www.imovirtual.com/comprar/apartamento/lisboa/?search%5Bcreated_since%5D=3&search%5Bfilter_float_m%3Afrom%5D=100&search%5Bregion_id%5D=11&search%5Bsubregion_id%5D=153&page=3
Request URL: https://www.imovirtual.com/comprar/apartamento/lisboa/?search%5Bcreated_since%5D=3&search%5Bfilter_float_m%3Afrom%5D=100&search%5Bregion_id%5D=11&search%5Bsubregion_id%5D=153&page=4
Total pages: 4
Total articles: 87


In [4]:
from re import sub

def _parse_pt_number(text):
    """Extract a Decimal from display text such as '785 000 €' or '110,5 m²'.

    Everything except digits, '.' and ',' is discarded first. The remaining
    separators are then interpreted as follows:

    * a comma is present: the last comma is the decimal mark; dots and any
      earlier commas are grouping separators ('1.234,5' -> 1234.5);
    * no comma, several dots: the dots are grouping separators
      ('1.200.000' -> 1200000);
    * no comma, a single dot: the dot is kept as the decimal mark. This is
      ambiguous ('450.000' could mean 450000) but is left unchanged so that
      previously accepted inputs give the same result.

    Text without any digit yields Decimal('0').
    """
    cleaned = sub(r'[^\d.,]', '', text)
    if not cleaned.strip('.,'):
        # Only separators (or nothing) left, e.g. a textual price label.
        return Decimal('0')
    if ',' in cleaned:
        whole, _, fraction = cleaned.replace('.', '').rpartition(',')
        cleaned = '{}.{}'.format(whole.replace(',', ''), fraction)
    elif cleaned.count('.') > 1:
        cleaned = cleaned.replace('.', '')
    return Decimal(cleaned)


def imovirtual_article_to_json(art):
    """Convert one listing <article> element into a flat offer dict.

    Args:
        art: Parsed <article> element. It must contain a
            ``div.offer-item-details`` block, a ``header.offer-item-header``
            with a ``span.offer-item-title``, and a ``<p>`` whose first
            ``<span>`` is followed by the address text.

    Returns:
        dict with the keys ``title``, ``price``, ``url``, ``item-id``,
        ``tracking-id``, ``size``, ``address``, ``price_note`` and
        ``created``. ``price`` and ``size`` are ``Decimal`` values and are 0
        when the corresponding element is missing or holds no digits.
        ``price_note`` carries the raw price text when the price is 0 and is
        ``None`` otherwise. ``created`` is the local time of the call,
        formatted as ``dd-mm-YYYY HH:MM:SS``.

    Raises:
        AttributeError: if the details block, the header/title, or the
            address paragraph/span is missing.
    """
    offer_details = art.find('div', 'offer-item-details')

    # The price/area items are optional: treat a missing one as empty text.
    price_el = offer_details.find('li', 'offer-item-price')
    size_el = offer_details.find('li', 'offer-item-area')
    price_text = price_el.get_text() if price_el is not None else ''
    size_text = size_el.get_text() if size_el is not None else ''
    price = _parse_pt_number(price_text)
    size = _parse_pt_number(size_text)

    header = art.find('header', 'offer-item-header')
    title = header.find('span', 'offer-item-title').get_text().strip()

    # The address is the node that follows the first <span> in the paragraph.
    # Copy it into a plain, stripped str so the result does not keep pointing
    # at a node of the parsed document; non-string siblings (or None) are
    # passed through unchanged.
    address = offer_details.find('p').find('span').next_sibling
    if isinstance(address, str):
        address = str(address).strip()

    offer_dict = {
        'title': title,
        'price': price,
        'url': art.get('data-url'),
        'item-id': art.get('data-item-id'),
        'tracking-id': art.get('data-tracking-id'),
        'size': size,
        'address': address,
        'price_note': None if price != 0 else price_text.strip(),
        'created': datetime.now().strftime('%d-%m-%Y %H:%M:%S')
    }

    return offer_dict

# Turn every scraped <article> element into an offer dict.
items = list()
for art in articles:
    offer = imovirtual_article_to_json(art)
    items.append(offer)

# Number of offers produced; the last expression is shown as the cell output.
len(items)

87

In [5]:
# Display the complete list of parsed offer dicts for manual inspection.
items

[{'title': 'Fabuloso T3 no Lux Garden em Lisboa',
  'price': Decimal('785000'),
  'url': 'https://www.imovirtual.com/anuncio/fabuloso-t3-no-lux-garden-em-lisboa-ID13zNY.html#22dd3644e3',
  'item-id': '13zNY',
  'tracking-id': '15628958',
  'size': Decimal('110'),
  'address': 'São Domingos de Benfica, Lisboa',
  'price_note': None,
  'created': '05-08-2020 11:01:06'},
 {'title': 'Apartamento T3 com varandas, Parque das Nações',
  'price': Decimal('575000'),
  'url': 'https://www.imovirtual.com/anuncio/apartamento-t3-com-varandas-parque-das-nacoes-ID13zGT.html#22dd3644e3',
  'item-id': '13zGT',
  'tracking-id': '15628519',
  'size': Decimal('101'),
  'address': 'Parque das Nações, Lisboa',
  'price_note': None,
  'created': '05-08-2020 11:01:06'},
 {'title': 'T2 Duplex com Vista Rio',
  'price': Decimal('685000'),
  'url': 'https://www.imovirtual.com/anuncio/t2-duplex-com-vista-rio-ID13zDJ.html#22dd3644e3',
  'item-id': '13zDJ',
  'tracking-id': '15628323',
  'size': Decimal('114'),
  '