In [1]:
import requests
from bs4 import BeautifulSoup
import textwrap
from datetime import datetime
import json
import urllib


In [2]:
# Read the scraper configuration from scrap.json in the working directory.
# Later cells read its 'uid' and 'cookie' entries.
with open("scrap.json") as json_file:
    raw_config = json_file.read()
scrap_json = json.loads(raw_config)
print('Loaded scrap json:\n\tUID: {}'.format(scrap_json['uid']))

Loaded scrap json:
	UID: 55c5fd9dadafwmbu5lzwmd2vx5d2agxggm894hqnht8


## Ad scraping

In [3]:
def json_to_cookie(cookie_json):
    """Serialise a mapping into a single ``name=value; name=value`` string.

    Args:
        cookie_json: mapping whose keys and values are formatted with
            ``str.format`` in iteration order.

    Returns:
        The pairs joined with ``'; '``; an empty string for an empty mapping.
    """
    pairs = []
    for name in cookie_json:
        pairs.append('{}={}'.format(name, cookie_json[name]))
    return '; '.join(pairs)
#print(cookie_str, new_cookie_str)

#print compare
def compare_print(a, b):
    """Print two sequences side by side as a two-column table.

    A header row and a separator line are printed first, followed by one row
    per index. When one sequence is shorter than the other, its missing cells
    are printed as empty strings.

    Args:
        a: indexable sequence shown in the left column.
        b: indexable sequence shown in the right column.
    """
    print('{:>55} | {:<55}'.format('A','B'))
    print('-'*113)
    # Iterate up to the LONGER sequence: looping over len(a) only would silently
    # drop the trailing elements of b whenever b has more entries than a.
    for i in range(max(len(a), len(b))):
        a_el = a[i] if i < len(a) else ''
        b_el = b[i] if i < len(b) else ''
        print('{:<55} | {:<55}'.format(a_el,b_el))

In [4]:
# Header values handed to the requests calls in later cells.
headers = {}
headers['user-agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
headers['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
# The cookie string is built from the 'cookie' mapping of the loaded scrap.json.
headers['cookie'] = json_to_cookie(scrap_json['cookie'])

In [5]:
# Every <article> tag found across all result pages is accumulated here.
articles = []

host = 'https://www.idealista.pt'
path = 'en/comprar-casas'
region = 'lisboa'
params = ['com-tamanho-min_100','publicado_ultimas-48-horas']
# Relative URL of the first result page; the filters are comma-joined into
# the last path segment.
url = '/' + '/'.join([path,region,','.join(params)])

i = 0
while url:
    i += 1
    url = host + url
    print('{}) Request URL: {}'.format(i, url))

    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so a positional dict would be appended to the
    # query string instead of being sent as HTTP headers.
    req = requests.get(url, headers=headers)
    if req.status_code != 200:
        print('{}) HTTP Code {}\n\tBody: {}'.format(i, req.status_code,req.text))
        break

    soup = BeautifulSoup(req.content, 'html.parser')

    articles += soup.find_all('article')

    # Running total over all pages fetched so far, not the count for this page.
    print('\tArticles/Ads found: {}'.format(len(articles)))

    # Follow the "next page" arrow; stop when either the pagination block or
    # the arrow is missing (a page without pagination would otherwise raise
    # AttributeError on None).
    pagination = soup.find('div', 'pagination')
    next_page_tag = pagination.find('a', 'icon-arrow-right-after') if pagination else None
    url = next_page_tag.get('href') if next_page_tag else None

1) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas
	Articles/Ads found: 32
2) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-2
	Articles/Ads found: 64
3) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-3
	Articles/Ads found: 96
4) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-4
	Articles/Ads found: 128
5) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-5
	Articles/Ads found: 160
6) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-6
	Articles/Ads found: 180


In [6]:
#print(articles[0].prettify())

# One dict per real ad: price, address, id, size and absolute URL.
items = []
for ad in articles:
    # Skip <article> tags carrying the 'adv' class. Using .get() with a default
    # avoids a KeyError for an article that has no class attribute at all.
    if 'adv' in ad.get('class', []):
        continue
    item_price = ad.find('span', class_='item-price')
    item_detail = ad.find_all('span', class_='item-detail')
    item_link = ad.find('a', class_='item-link')
    item_title = item_link.get('title')
    item_id = ad.get('data-adid')

    # Drops the last character and the thousands separators before parsing;
    # presumably the last character is the currency symbol — TODO confirm.
    price = int(item_price.get_text()[:-1].replace(',',''))

    # The address is the URL-quoted text after the first ' in ' of the title;
    # titles without ' in ' are kept whole and unquoted.
    in_pos = item_title.find(' in ')
    if in_pos != -1:
        address = urllib.parse.quote(item_title[in_pos + 4:])
    else:
        address = item_title

    # The second 'item-detail' span holds the area; its last three characters
    # are dropped before parsing (presumably a unit suffix — TODO confirm).
    if len(item_detail) > 1:
        size = int(item_detail[1].get_text()[:-3].replace(',',''))
    else:
        # Set None explicitly instead of relying on print()'s return value.
        print('{} Failed to fetch area!'.format(item_id))
        size = None

    item_url = '{}{}'.format(host,item_link.get('href'))

    items.append({
        'price': price,
        'address': address,
        'id': item_id,
        'size': size,
        'url': item_url
    })
print('Ads parsed: {}'.format(len(items)))

Ads parsed: 169


In [7]:
# Dump the parsed ads to a timestamped JSON file under data_path.
time_format = '%d-%m-%Y_%H:%M:%S_scrap'
data_path = './data_scrap'

now = datetime.now()
s2 = now.strftime(time_format)

# The context manager closes the file even if serialisation or the write
# raises; the manual open()/close() pair leaked the handle in that case.
# open() does not create directories, so data_path must already exist.
with open(data_path + '/' + s2 + ".json","w") as f:
    f.write(json.dumps(items))

## Ad GeoLocation (scraped from Idealista)

In [8]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that urllib.parse is loaded.
import urllib.parse
# TODO go around the map taking the places
# Query-string parameters for the map listing endpoint.
params_dict = dict(
    locationUri='lisboa',
    typology='1',
    operation='1',
    zoom='5',
    northEast='54.17960751741163,16.15620517730715',
    southWest='19.08984486221779,-34.46879482269287',
    uid=scrap_json['uid'],
    adfilter_pricemin='default',
    adfilter_price='700000',
    adfilter_area='100',
    adfilter_areamax='default',
    adfilter_published='default'
)
# Added separately because the hyphen makes it an invalid keyword argument.
params_dict['solo-favoritos'] = 'false'
params = urllib.parse.urlencode(params_dict, doseq=True)

host = 'https://www.idealista.pt'
path = '/ajax/listingcontroller/listingmapajax.ajax'

new_url = '{}{}?{}'.format(host,path,params)


# `headers` must be passed by keyword: the second positional parameter of
# requests.get is `params`, so a positional dict would be appended to the
# query string instead of being sent as HTTP headers.
req = requests.get(new_url, headers=headers)
print(req)
ajax_listing_json = req.json()
json_resp = ajax_listing_json['jsonResponse']

print(json_resp.keys(), len(json_resp['map']['items']))

# Number of map items returned versus the total reported by the endpoint.
print(len(json_resp['map']['items']), json_resp['listingTotalResults'])
for geo in json_resp['map']['items']:
    print(geo['adId'])

<Response [200]>
dict_keys(['valueH1', 'description', 'total', 'result', 'errorMessage', 'searchFeaturesPhrase', 'searchPhrase', 'howmany', 'search', 'breadcrumbUrlValue', 'breadcrumbUrlAllMunicipalitiesLevel', 'mapSearchUrl', 'listingSearchUrl', 'rentListingSearchUrl', 'saleListingSearchUrlMobile', 'newDevelopmentUrlMobile', 'newDevelopmentUrl', 'saleListingSearchUrl', 'rentListingSearchUrlMobile', 'pagetarget', 'searchTotalsUrl', 'searchWithoutFilters', 'listingTotalResults', 'listingPriceByArea', 'map', 'existAlert']) 1300
1300 4.134
30574563
30474960
30454160
30500116
30442424
30441597
30255778
30513024
30601215
30544767
30593991
30448223
29928128
30189798
30105276
30190564
30190559
30430613
30247975
30519376
30188038
30300785
30212191
30212192
30509477
30212208
29663193
29125722
30531506
30324143
30483168
30307170
30307138
30497990
29733615
29953067
30195468
29760218
30279701
30171998
30171994
30172453
30172475
30171995
30171989
30184103
30177544
30411086
30578969
30579033
3032317

## Join Ad and GeoLocation

In [9]:
# Index the parsed ads by their id so they can be matched against map items.
items_dict = {it['id']: it for it in items}

# Index the map items by adId and attach coordinates to every ad that has a
# map entry. The adId is converted with str() before the lookup in items_dict.
geo_dict = {}
for geo in json_resp['map']['items']:
    geo_dict[geo['adId']] = geo

    matched_ad = items_dict.get(str(geo['adId']))
    if matched_ad is not None:
        matched_ad['geo'] = { 'latitude': geo['latitude'], 'longitude': geo['longitude'] }
print(json.dumps(items_dict, indent=4, sort_keys=True))

{
    "29847229": {
        "address": "Castelo%20-%20Mouraria%2C%20Santa%20Maria%20Maior",
        "id": "29847229",
        "price": 505000,
        "size": 100,
        "url": "https://www.idealista.pt/en/imovel/29847229/"
    },
    "30077386": {
        "address": "Alto%20de%20S%C3%A3o%20Jo%C3%A3o%20-%20Alto%20do%20Varej%C3%A3o%2C%20Penha%20de%20Fran%C3%A7a",
        "id": "30077386",
        "price": 760000,
        "size": 179,
        "url": "https://www.idealista.pt/en/imovel/30077386/"
    },
    "30229122": {
        "address": "praceta%20Fernando%20Valle%2C%201%2C%20Santa%20Clara%2C%20Lisboa",
        "id": "30229122",
        "price": 620000,
        "size": 250,
        "url": "https://www.idealista.pt/en/imovel/30229122/"
    },
    "30360827": {
        "address": "S%C3%A3o%20Sebasti%C3%A3o%20da%20Pedreira%2C%20Avenidas%20Novas",
        "id": "30360827",
        "price": 450000,
        "size": 126,
        "url": "https://www.idealista.pt/en/imovel/30360827/"
    },
 

## Real Estate Agency

In [82]:
# Collect the raw contact-info response for every parsed ad (one request each).
agency = []
for it in items_dict.values():
    url = 'https://www.idealista.pt/pt/ajax/listingController/adContactInfoForListing.ajax?adId={}'.format(it['id'])
    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so a positional dict would be appended to the
    # query string instead of being sent as HTTP headers.
    req = requests.get(url, headers=headers)
    agency.append(req)

In [83]:
# Decode the JSON body of each collected contact-info response for display.
[response.json() for response in agency]

[{'message': None,
  'result': 'OK',
  'errorCode': None,
  'data': {'isInvoiceAd': False,
   'isAdProfessional': True,
   'agencyIsABank': False,
   'shortDescription': ' Casa ou moradia à venda  em Alvalade',
   'showAdvertiserName': True,
   'showEmailContactMethod': True,
   'showPhoneContactMethod': True,
   'contactMessage': None,
   'contactDate': None,
   'contactType': 2,
   'phone2': '217920620',
   'phoneRedirection': None,
   'formattedContactPhone1': '915093625',
   'showFormattedContactPhone2': True,
   'formattedContactPhone2': '217920620',
   'adTypologyName': None,
   'newDevelopmentDirectory': False,
   'adId': 30640186,
   'showProfessionalName': True,
   'firstName': 'Pedro Oliveira',
   'lastName': None,
   'commercialName': 'KW Ábaco',
   'referenceMessage': 'Ref.: 1196-2857',
   'isMicrosite': True,
   'mustShowLogo': True,
   'micrositeCityName': 'areeiro',
   'commercialNamePhraseBlock': 'KW Ábaco',
   'advertiserNameLink': '/pro/kwabaco/',
   'advertiserLogoUr