In [120]:
import json
import os
import textwrap
import urllib
import urllib.parse
from datetime import datetime

import requests
from bs4 import BeautifulSoup


In [143]:
# Read the scraper settings stored in scrap.json and echo the 'uid' entry as a
# quick check that the expected file was loaded.
with open("scrap.json") as settings_file:
    scrap_json = json.loads(settings_file.read())
print('Loaded scrap json:\n\tUID: {}'.format(scrap_json['uid']))

Loaded scrap json:
	UID: 55c5fd9dadafwmbu5lzwmd2vx5d2agxggm894hqnht8


## Ad scraping

In [41]:
def json_to_cookie(cookie_json):
    """Render a mapping of cookie names to values as one cookie string.

    Each entry becomes ``name=value`` and the entries are joined with ``'; '``
    in the mapping's iteration order. An empty mapping yields ``''``.
    """
    pairs = []
    for name, value in cookie_json.items():
        pairs.append('{}={}'.format(name, value))
    return '; '.join(pairs)
#print(cookie_str, new_cookie_str)

def compare_print(a, b):
    """Print two sequences side by side as a two-column table.

    A header row ('A' | 'B') and a separator line are printed first, then one
    row per index. The sequences may have different lengths: rows run up to
    the longer of the two and the shorter side is padded with empty cells.

    Args:
        a: Indexable sequence shown in the left column.
        b: Indexable sequence shown in the right column.
    """
    print('{:>55} | {:<55}'.format('A','B'))
    print('-'*113)
    # Iterate over the longer sequence so trailing elements of `b` are not
    # silently dropped when `b` has more entries than `a`.
    for i in range(max(len(a), len(b))):
        a_el = a[i] if i < len(a) else ''
        b_el = b[i] if i < len(b) else ''
        print('{:<55} | {:<55}'.format(a_el,b_el))

In [42]:
# Cookie string rebuilt from the cookie mapping stored in scrap.json.
session_cookie = json_to_cookie(scrap_json['cookie'])

# Browser-like header mapping handed to the requests.get() calls in later cells.
headers = dict([
    ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'),
    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
    ('cookie', session_cookie),
])

In [43]:
# Crawl every result page of the search and accumulate the raw <article> tags
# in `articles`, following the "next page" arrow until there is none.
articles = []

host = 'https://www.idealista.pt'
path = 'en/comprar-casas'
region = 'lisboa'
params = ['com-tamanho-min_100','publicado_ultimas-48-horas']
# First page path: /<path>/<region>/<comma-separated filters>
url = '/' + '/'.join([path,region,','.join(params)])

i = 0
while url:
    i += 1
    url = host + url
    print('{}) Request URL: {}'.format(i, url))

    # `headers` has to be passed by keyword: the second positional parameter
    # of requests.get() is `params`, which would append the user-agent and
    # cookie to the query string instead of sending them as HTTP headers.
    req = requests.get(url, headers=headers)
    if(req.status_code != 200):
        print('{}) HTTP Code {}\n\tBody: {}'.format(i, req.status_code,req.text))
        break

    soup = BeautifulSoup(req.content, 'html.parser')

    articles += soup.find_all('article')

    # The count is cumulative across all pages fetched so far.
    print('\tArticles/Ads found: {}'.format(len(articles)))
    pagination = soup.find('div', 'pagination')

    # A page without a pagination block is treated as the last page instead
    # of failing with AttributeError on `None.find`.
    next_page_tag = pagination.find('a', 'icon-arrow-right-after') if pagination is not None else None
    url = next_page_tag.get('href') if next_page_tag else None

1) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas
	Articles/Ads found: 32
2) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-2
	Articles/Ads found: 64
3) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-3
	Articles/Ads found: 96
4) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-4
	Articles/Ads found: 128
5) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-5
	Articles/Ads found: 135


In [140]:
# Convert every scraped <article> into a plain dict with the fields
# price / address / id / size / url, collected in `items`.
items = []
for ad in articles:
    # Skip entries carrying the 'adv' class. `.get` with a default avoids a
    # KeyError for <article> tags that have no class attribute at all.
    if 'adv' in ad.get('class', []):
        continue
    item_price = ad.find('span', class_='item-price')
    item_detail = ad.find_all('span', class_='item-detail')
    item_link = ad.find('a', class_='item-link')
    item_title = item_link.get('title')
    item_id = ad.get('data-adid')

    # Drop the last character (presumably the currency symbol - confirm) and
    # the thousands separators before converting to int.
    price = int(item_price.get_text()[:-1].replace(',',''))

    # When the title contains ' in ', keep only the text after it, URL-quoted;
    # otherwise the whole title is stored unchanged (and unquoted).
    in_pos = item_title.find(' in ')
    address = urllib.parse.quote(item_title[in_pos+4:]) if in_pos != -1 else item_title

    # The second 'item-detail' span is read as the area; its last three
    # characters (presumably the unit suffix - confirm) are stripped.
    if len(item_detail) > 1:
        size = int(item_detail[1].get_text()[:-3].replace(',',''))
    else:
        # No area available: record None explicitly and report the ad id.
        size = None
        print('{} Failed to fetch area!'.format(item_id))
    item_url = '{}{}'.format(host,item_link.get('href'))

    items.append({
        'price': price,
        'address': address,
        'id': item_id,
        'size': size,
        'url': item_url
    })
print('Ads parsed: {}'.format(len(items)))

Ads parsed: 126


In [141]:
# Persist the parsed ads as JSON in a file named after the current timestamp.
# NOTE(review): the ':' characters produced by this format are not valid in
# Windows file names - change the format if the notebook must run there.
time_format = '%d-%m-%Y_%H:%M:%S_scrap'
data_path = './data_scrap'

now = datetime.now()
s2 = now.strftime(time_format)

# Create the output directory on first use so a fresh checkout does not fail.
os.makedirs(data_path, exist_ok=True)
# The context manager closes the file even if serialization raises.
with open(data_path + '/' + s2 + ".json","w") as f:
    json.dump(items, f)

## Ad GeoLocation (scraped from Idealista)

In [40]:
# Query the map-listing AJAX endpoint and list the ad ids of the returned map
# items. `urllib` is already imported in the first cell of the notebook.
# TODO: go around the map collecting the places
params_dict = dict(
    locationUri='lisboa',
    typology='1',
    operation='1',
    zoom='5',
    northEast='54.17960751741163,16.15620517730715',
    southWest='19.08984486221779,-34.46879482269287',
    uid=scrap_json['uid'],
    adfilter_pricemin='default',
    adfilter_price='700000',
    adfilter_area='100',
    adfilter_areamax='default',
    adfilter_published='default'
)
# 'solo-favoritos' is not a valid Python identifier, so it cannot be passed as
# a dict() keyword above and is added separately.
params_dict['solo-favoritos'] = 'false'
params = urllib.parse.urlencode(params_dict, doseq=True)

host = 'https://www.idealista.pt'
path = '/ajax/listingcontroller/listingmapajax.ajax'

new_url = '{}{}?{}'.format(host,path,params)


# `headers` has to be passed by keyword: the second positional parameter of
# requests.get() is `params`, which would append the user-agent and cookie to
# the query string instead of sending them as HTTP headers.
req = requests.get(new_url, headers=headers)
print(req)
# Fail with an explicit HTTP error rather than an obscure JSON/KeyError below.
req.raise_for_status()
ajax_listing_json = req.json()
json_resp = ajax_listing_json['jsonResponse']

print(json_resp.keys(), len(json_resp['map']['items']))

print(len(json_resp['map']['items']), json_resp['listingTotalResults'])
for geo in json_resp['map']['items']:
    print(geo['adId'])

<Response [200]>
dict_keys(['valueH1', 'description', 'total', 'result', 'errorMessage', 'searchFeaturesPhrase', 'searchPhrase', 'howmany', 'search', 'breadcrumbUrlValue', 'breadcrumbUrlAllMunicipalitiesLevel', 'mapSearchUrl', 'listingSearchUrl', 'rentListingSearchUrl', 'saleListingSearchUrlMobile', 'newDevelopmentUrlMobile', 'newDevelopmentUrl', 'saleListingSearchUrl', 'rentListingSearchUrlMobile', 'pagetarget', 'searchTotalsUrl', 'searchWithoutFilters', 'listingTotalResults', 'listingPriceByArea', 'map', 'existAlert']) 1300
1300 4.110
30579297
30588445
30574563
30449665
30555916
30502116
29973505
30410973
30578514
30500469
30320596
30466689
30495942
30598207
30421636
29162771
29805147
30236328
30502176
30201596
30300785
29839441
30041316
30454549
30581620
30531663
28722785
30346263
30272272
30360956
30360952
30336240
30515559
30524931
29130767
30089300
29743272
30215749
30566607
30450654
28913068
30393162
30411086
30396586
30396805
30391883
30391850
30391873
30341029
30331774
3032317

## Join Ad and GeoLocation

In [139]:
# Index the parsed ads by their 'id' value.
items_dict = dict((ad_item['id'], ad_item) for ad_item in items)

# Index the map items by 'adId' and, where an ad with the same id (compared as
# a string) was parsed, attach its coordinates under a 'geo' key.
geo_dict = {}
for geo in json_resp['map']['items']:
    ad_id = geo['adId']
    geo_dict[ad_id] = geo

    ad_key = str(ad_id)
    if ad_key not in items_dict:
        continue
    coordinates = { 'latitude': geo['latitude'], 'longitude': geo['longitude'] }
    items_dict[ad_key]['geo'] = coordinates

print(json.dumps(items_dict, indent=4, sort_keys=True))

{
    "30114049": {
        "address": "Parque%20das%20Na%C3%A7%C3%B5es%2C%20Lisboa",
        "id": "30114049",
        "price": 1050000,
        "size": 172,
        "url": "https://www.idealista.pt/en/imovel/30114049/"
    },
    "30231156": {
        "address": "Saldanha%20-%20Picoas%2C%20S%C3%A3o%20Jorge%20de%20Arroios%2C%20Arroios",
        "id": "30231156",
        "price": 1398000,
        "size": 222,
        "url": "https://www.idealista.pt/en/imovel/30231156/"
    },
    "30247992": {
        "address": "Parque%20das%20Na%C3%A7%C3%B5es%2C%20Lisboa",
        "id": "30247992",
        "price": 2400000,
        "size": 763,
        "url": "https://www.idealista.pt/en/imovel/30247992/"
    },
    "30365401": {
        "address": "avenida%20da%20Rep%C3%BAblica%2C%20Campo%20Pequeno%20-%20Nossa%20Senhora%20de%20F%C3%A1tima%2C%20Avenidas%20Novas",
        "id": "30365401",
        "price": 350000,
        "size": 127,
        "url": "https://www.idealista.pt/en/imovel/30365401/"
    