In [1]:
import boto3
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

In [2]:
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, 'lambda/scrap_idealista')

import scrapper

items = scrapper.scrap()

Loaded scrap json:
	UID: 55c5fd9dadafwmbu5lzwmd2vx5d2agxggm894hqnht8
1) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas
	Articles/Ads found: 32
2) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-2
	Articles/Ads found: 64
3) Request URL: https://www.idealista.pt/en/comprar-casas/lisboa/com-tamanho-min_100,publicado_ultimas-48-horas/pagina-3
	Articles/Ads found: 93
Ads parsed: 87


In [3]:
def add_updated_columns(items):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('scrapped_ads')

    response = table.scan(
        AttributesToGet=['id','price', 'created','price_update','date_update'],
        ScanFilter={'id': {'AttributeValueList': [it['id'] for it in items],'ComparisonOperator': 'IN'}}
    )
    scanned_items = response['Items']

    scanned_items[0]['price'] -= 5
    scanned_items[20]['price'] -= 5
    #array to hash to be faster to look
    old_items_map = { }
    for it in scanned_items:
        if 'price_update' not in it['id']:
            it['price_update'] = [it['price']]
        if 'date_update' not in it['id']:
            it['date_update'] = [it['created']]
        old_items_map[it['id']] = it

    # rows with new price get and updated array
    for it in items:
        if it['id'] in old_items_map and it['price'] != old_items_map[it['id']]['price']:
            old_items_map[it['id']]['price_update'].append(it['price'])
            old_items_map[it['id']]['date_update'].append(it['created'])
            it['price_update'] = old_items_map[it['id']]['price_update']
            it['date_update'] = old_items_map[it['id']]['date_update']
            print('Price update!\n\t{}\n\t{}'.format(it, old_items_map[it['id']]))
add_updated_columns(items)

Price update!
	{'price': 485000, 'address': 'Centro%20-%20Nova%20Campolide%2C%20Campolide', 'id': '30630305', 'size': 100, 'url': 'https://www.idealista.pt/en/imovel/30630305/', 'created': '03-08-2020 17:43:39', 'price_update': [Decimal('484995'), 485000], 'date_update': ['03-08-2020 00:32:13', '03-08-2020 17:43:39']}
	{'created': '03-08-2020 00:32:13', 'id': '30630305', 'price': Decimal('484995'), 'price_update': [Decimal('484995'), 485000], 'date_update': ['03-08-2020 00:32:13', '03-08-2020 17:43:39']}
Price update!
	{'price': 1150000, 'address': 'Pedrou%C3%A7os%2C%20Bel%C3%A9m', 'id': '30629264', 'size': 239, 'url': 'https://www.idealista.pt/en/imovel/30629264/', 'created': '03-08-2020 17:43:39', 'price_update': [Decimal('1149995'), 1150000], 'date_update': ['03-08-2020 00:32:13', '03-08-2020 17:43:39']}
	{'created': '03-08-2020 00:32:13', 'id': '30629264', 'price': Decimal('1149995'), 'price_update': [Decimal('1149995'), 1150000], 'date_update': ['03-08-2020 00:32:13', '03-08-2020 

In [4]:
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('scrapped_ads')


In [5]:
response = table.scan()
response

{'Items': [{'size': Decimal('107'),
   'geo': [{'continent': 'Europe',
     'country': 'Portugal',
     'latitude': Decimal('38.71394'),
     'confidence': Decimal('1'),
     'county': None,
     'locality': 'Mercês',
     'administrative_area': None,
     'label': 'Mercês, Portugal',
     'type': 'locality',
     'number': None,
     'country_code': 'PRT',
     'street': None,
     'neighbourhood': None,
     'name': 'Mercês',
     'postal_code': None,
     'region': 'Lisboa',
     'longitude': Decimal('-9.15097'),
     'region_code': 'LI'},
    {'continent': 'Europe',
     'country': 'Portugal',
     'latitude': Decimal('38.793858'),
     'confidence': Decimal('1'),
     'county': None,
     'locality': None,
     'administrative_area': None,
     'label': 'Mercês, Portugal',
     'type': 'venue',
     'number': None,
     'country_code': 'PRT',
     'street': None,
     'neighbourhood': None,
     'name': 'Mercês',
     'postal_code': None,
     'region': 'Lisboa',
     'longitude':

In [6]:
len(response['Items'])

# filter ads with no geo
ad_with_geo = list(filter(lambda x: (('geo' in x) and (len(x['geo']) > 0)), response['Items']))

print('Items scanned:{:>5}\nItems with geo:{:>4}'.format(len(response['Items']), len(ad_with_geo)))

Items scanned:  234
Items with geo: 160


In [64]:
ad_with_geo[0]['geo'][0]

for ad in ad_with_geo:
    geo = ad['geo'][0]
    ad['latitude'] = geo['latitude']
    ad['longitude'] = geo['longitude']
    ad['locality'] = geo['locality']
    ad['neighbourhood'] = geo['neighbourhood']
    ad['geo_name'] = geo['name']

In [65]:
df = pd.DataFrame.from_dict(ad_with_geo)
df.describe()

Unnamed: 0,size,geo,created,address,price,id,url,latitude,longitude,locality,geo_name,neighbourhood
count,160,160,160,160,160,160,160,160.0,160.0,44,160,89
unique,96,69,8,73,111,160,160,66.0,66.0,16,66,25
top,140,"[{'continent': 'Europe', 'country': 'Portugal'...",01-08-2020 23:00:53,Parque%20das%20Na%C3%A7%C3%B5es%2C%20Lisboa,549900,30627624,https://www.idealista.pt/en/imovel/30624466/,38.793547,-9.097822,São Domingos de Benfica,Parque das Nações Norte,Lumiar
freq,7,12,50,12,8,1,1,12.0,12.0,10,12,25


In [72]:
df_group_locality = df[df.locality.notnull()].groupby('locality')

data = []
for name, group in df_group_locality:
    subplot_id = "mapbox" + name
    trace = go.Scattermapbox(
        name=name,
        showlegend=True,
        lat=df[df.locality == name].latitude,
        lon=df[df.locality == name].longitude,
        mode="markers",
        marker=dict(size=14),
        text=name,
        subplot="mapbox",
    )
    # fig.add_trace(trace)
    data.append(trace)


In [73]:
# ---------------------------
# Plot Color ad per municipality
# ---------------------------

layout = go.Layout(
    autosize=True,
    height=700,
    title="Color ad by area",
    # grid={"rows": rows, "columns": cols, "xgap": 0.1, "ygap": 0.2},
)


map_center = go.layout.mapbox.Center(
    lat=df["latitude"].mean(), lon=df["longitude"].mean()
)
layout["mapbox"] = dict(style="carto-positron", center=map_center, zoom=11,)

fig = go.Figure(data=data, layout=layout)
fig

In [70]:
import urllib
#df.address = df.address.apply(urllib.parse.unquote)
#df[df.address.str.contains('Flat|flat')][['address','geo']]
df.head()

Unnamed: 0,size,geo,created,address,price,id,url,latitude,longitude,locality,geo_name,neighbourhood
0,107,"[{'continent': 'Europe', 'country': 'Portugal'...",01-08-2020 15:22:25,Merc%C3%AAs%2C%20Miseric%C3%B3rdia,625000,30623780,https://www.idealista.pt/en/imovel/30623780/,38.71394,-9.15097,Mercês,Mercês,
1,239,"[{'continent': 'Europe', 'country': 'Portugal'...",03-08-2020 00:32:13,Pedrou%C3%A7os%2C%20Bel%C3%A9m,1150000,30629264,https://www.idealista.pt/en/imovel/30629264/,38.694261,-9.202439,,Restaurante O Pedroucos,Belém
2,138,"[{'continent': 'Europe', 'country': 'Portugal'...",01-08-2020 23:00:53,Anjos%2C%20Arroios,430000,30586059,https://www.idealista.pt/en/imovel/30586059/,38.72365,-9.13411,Anjos,Anjos,
3,170,"[{'continent': 'Europe', 'country': 'Portugal'...",03-08-2020 00:32:13,avenida%20do%20%C3%8Dndico%2C%2C%20Parque%20da...,1350000,30629099,https://www.idealista.pt/en/imovel/30629099/,38.768614,-9.095832,,Avenida do Índico,Olivais Sul
4,331,"[{'continent': 'Europe', 'country': 'Portugal'...",03-08-2020 00:32:14,Alvalade%2C%20Alvalade,899900,30629195,https://www.idealista.pt/en/imovel/30629195/,38.75328,-9.14397,Alvalade,Alvalade,
