In [1]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from geopy.geocoders import Nominatim
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import Mail
import logging
import sys
import yaml

# initialize Nominatim API
geolocator = Nominatim(user_agent='house_listings')

# set logger
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# constants
YMERE_URL = "https://aanbod.ymere.nl/portal/publication/frontend/getallobjects/format/json"
YMERE_PAYLOAD = {
    'accept': 'application/json',
    'origin': 'https://aanbod.ymere.nl',
    'referer': 'https://aanbod.ymere.nl/aanbod/huurwoningen/',
    'dwellingTypeCategory': 'woning'
}
YMERE_LISTINGS = "./ymere_listings.csv"

# load sendgrid api key from yaml config file
try:
    with open("config.yml", 'r') as stream:
        SENDGRID_API_KEY = yaml.safe_load(stream)['api-keys']['sendgrid']
except Exception as e:
    print(f"{e}: SendGrid API KEY is not available.")


In [3]:
def fetch_attr(attr):
    """
    Fetch an attribute and return an empty string "" if None or does not exists.

    Args:
        attr:       attribute value

    Return:
        attribute value (default type) or None
    """
    
    return attr if not None else ""

def getLocation(lat, long):
    """
    Get details of a location using lat & long values obtained from the listing.
    The details contain:
        - house number
        - road
        - city
        - postcode

    Args:
        lat:       (string) latitude of house.
        long:      (string) longitude of house.

    Return:
        (dict) accurate attributes about the location of the house.
    """

    try:
        # get exact address from geolocator
        location = geolocator.reverse(f"{lat}, {long}").raw['address']
        # keep specific keys only
        location = {x:location[x] for x in ['house_number', 'road', 'city', 'postcode']}

    except Exception as e:
        print("Locator unavailable\n", e)
        return None

    return location

def filter_listings(current_listings, old_listings):
    """
    Filter new listings by matching ids in the loaded csv and the scrapred listings ids.

    Args:
        current_listings:   pandas.DateFrame() containing scraped data which might contain new listings.     
        old_listings:       pandas.DateFrame() containing previously saved listings of which they have yet to expire.
    
    Return:
        new_listings:       pandas.DataFrame() containing the filtered listings of which are new.
        updates_listings:   pandas.DataFrame() containing the updated listings including the new ones.
    """
    # filter listings ids which have not been discovered yet
    new_listings_ids = [idx for idx in current_listings.index.to_list() if idx not in old_listings.index.to_list()]
    # select listings by their ids
    new_listings = current_listings.loc[new_listings_ids]
    # add new listings to csv
    updated_listings = pd.concat([old_listings, new_listings], join="inner")

    return new_listings, updated_listings

def clean_up(old_listings):
    """
    Clean up the old listings by iterating through the dataframe and removing and listings of which `closingDate` has passed.
    i.e.: current date is greater than closing date.

    Args:
        old_listings:       pandas.DateFrame() containing previously saved listings of which they have yet to expire.

    Return:
        pandas.DateFrame() containing previously saved listings which still have yet to expire.
    """

    # get current date (yyyy-mm-dd)
    current_date = int(np.floor(datetime.now().timestamp()))

    drop_rows = []
    # get closing date for every listing
    for k, v in old_listings.iterrows():
        date = datetime.strptime(v['closingDate'], "%Y-%m-%d")
        date = int(np.floor(datetime.timestamp(date)))

        # if closing date has passed, delete listing
        if current_date > date:
            drop_rows.append(k)

    # drop listings that expired
    return old_listings.drop(drop_rows)

def extract_listings_ymere(listings):
    """
    Extract housing listings from scraped data. The data is in Dutch but most attributes are numerals or dates.
    This function is hard-coded to extract the data with the following conditions:
        - The house is for rent ("huur").
        - The house is NOT for temporary students ("Tijdelijke verhuur studenten").
        - The house is located in the city of "Amsterdam".
        - The house has a rent which is affordable, meaning that the rent per month is lower than x times my gross income (where x is sensitive data).

    Args:
        listings:       (list) of scraped Ymere data containing all the housing listings, including social houses and houses for sale.

    Return:
        (dict) containing the desirable filtered listings from the scraped data.
    """
    
    dt_string = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")

    houses = []
    for house in listings:
        house_dict = {}
        
        # conditions
        to_rent = house['dwellings'][0]['rentBuy'] == 'Huur'
        action_label = house['actionLabel'][0]['label'] != 'Tijdelijke verhuur studenten'
        amsterdam = 'amsterdam' in house['city'][0]['name'].lower()
        affordable = house['totalRent'][0] <= 1250

        if to_rent and action_label and amsterdam and affordable:
            ### extract and parse attributes ###
            house_dict['city'] = house['city'][0]['name']
            house_dict['id'] = house['id']
            house_dict['totalRent'] = fetch_attr(house['totalRent'][0])
            house_dict['actionLabel'] = fetch_attr(house['actionLabel'][0]['label'])
            house_dict['floor'] = fetch_attr(house['floor'][0]['name'][:2])
            house_dict['neighborhood'] = fetch_attr(house['neighborhood'][0]['name'])
            # improve accuracy of location
            location = getLocation(house['latitude'][0], house['longitude'][0])
            if location:
                house_dict.update(location)
            house_dict['publicationDate'] = datetime.strptime(house['publicationDate'], '%Y-%m-%d %H:%M:%S').date()
            house_dict['closingDate'] = datetime.strptime(house['closingDate'], '%Y-%m-%d %H:%M:%S').date()
            house_dict['dateAdded'] = dt_string
            houses.append(house_dict)
    
    return houses

def send_mail(new_listings, to_email="egalea.11@gmail.com"):
    """
    Send email to address with only the new listings founds.

    Args:
        new_listings:   (pandas.DataFrame) containing the new listings found.
        to_email:       (string) containing the email address to be sent to.

    Return:
        status of email response.
    """

    message = Mail(
        from_email="pitirross.life@gmail.com",
        to_emails=to_email,
        subject=f"[Ymere listing] {len(new_listings)} found!",
        html_content=f"""
        <html>
            <head>
                <style> 
                table, th, td {{font-size:10pt; border:1px solid black; border-collapse:collapse; text-align:left;}}
                th, td {{padding: 5px;}}
                </style>
            </head>
            <body>
                <p>
                    Log in and react to ymere listings 
                    <a href="https://aanbod.ymere.nl/mijn-omgeving/inloggen/">
                        <b>here</b>
                    </a>
                </p>
                <br>
                {new_listings.to_html(index=False)}
            </body>
        </html>
        """
        )
    try:
        sg = SendGridAPIClient(SENDGRID_API_KEY)
        response = sg.send(message)
        logging.info(response.status_code)
        logging.debug(response.body)
        logging.debug(response.headers)
    except Exception as e:
        logging.debug(e.message)

    return response.status_code

In [8]:
r = requests.post(YMERE_URL, data=YMERE_PAYLOAD)
logging.info(f'<Status code: {r.status_code}>')
data = r.json()['result']

dt_string = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")

houses = []
for house in data:
    house_dict = {}
    
    # conditions
    to_rent = house['dwellings'][0]['rentBuy'] == 'Huur'
    action_label = house['actionLabel'][0]['label'] != 'Tijdelijke verhuur studenten'
    amsterdam = 'amsterdam' in house['city'][0]['name'].lower()
    affordable = False
    # if house rent exists and is affordable set to True, otherwise skip iteration
    if house['totalRent']:
        affordable = house['totalRent'][0] <= 1250
    else:
        continue

    if to_rent and action_label and amsterdam and affordable:
        ### extract and parse attributes ###
        house_dict['city'] = house['city'][0]['name']
        house_dict['id'] = house['id']
        house_dict['totalRent'] = fetch_attr(house['totalRent'][0])
        house_dict['actionLabel'] = fetch_attr(house['actionLabel'][0]['label'])
        house_dict['floor'] = fetch_attr(house['floor'][0]['name'][:2])
        house_dict['neighborhood'] = fetch_attr(house['neighborhood'][0]['name'])
        # improve accuracy of location
        location = getLocation(house['latitude'][0], house['longitude'][0])
        if location:
            house_dict.update(location)
        house_dict['publicationDate'] = datetime.strptime(house['publicationDate'], '%Y-%m-%d %H:%M:%S').date()
        house_dict['closingDate'] = datetime.strptime(house['closingDate'], '%Y-%m-%d %H:%M:%S').date()
        house_dict['dateAdded'] = dt_string
        houses.append(house_dict)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): aanbod.ymere.nl:443
DEBUG:urllib3.connectionpool:https://aanbod.ymere.nl:443 "POST /portal/publication/frontend/getallobjects/format/json HTTP/1.1" 200 None
INFO:root:<Status code: 200>


IndexError: list index out of range

In [12]:
if house['totalRent']:
    house['totalRent'][0]
else:
    print('no')

no


In [5]:
if __name__ == "__main__":
    # scrape a JSON object of currently available Ymere houses
    r = requests.post(YMERE_URL, data=YMERE_PAYLOAD)
    logging.info(f'<Status code: {r.status_code}>')
    data = r.json()['result']

    # extract ymere listings from obtained json object
    listings = extract_listings_ymere(data)

    if listings:
        logging.info("Houses available found...")
        listings = pd.DataFrame(listings).set_index('id').sort_values('id')

        try:
            # read df from file
            old_listings = pd.read_csv(YMERE_LISTINGS, index_col="id")

            # clean up old listings
            old_listings = clean_up(old_listings)

            # filter
            new_listings, updated_listings = filter_listings(listings, old_listings)

            # write new df to file
            updated_listings.to_csv(YMERE_LISTINGS)

            # send email notification if new listings found (not empty)
            if not new_listings.empty:
                print(new_listings)
                send_mail(new_listings)
            else:
                logging.debug("No new houses found.")

        except pd.errors.EmptyDataError as e:
            logging.info("No old listings found. Writing new listings to file.")
            listings.to_csv(YMERE_LISTINGS)

    else:
        logging.info("No houses found...")


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): aanbod.ymere.nl:443
DEBUG:urllib3.connectionpool:https://aanbod.ymere.nl:443 "POST /portal/publication/frontend/getallobjects/format/json HTTP/1.1" 200 None
INFO:root:<Status code: 200>


IndexError: list index out of range

In [13]:
old_listings = pd.read_csv(YMERE_LISTINGS, index_col="id")
old_listings


Unnamed: 0_level_0,city,totalRent,actionLabel,floor,neighborhood,house_number,road,postcode,publicationDate,closingDate,dateAdded
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5617,Amsterdam,1934.42,,2e,,452,Erich Salomonstraat,1087 JA,2023-06-26,2023-06-30,2023_06_30__11_10_57


In [20]:
# filter listings ids which have not been discovered yet
new_listings_ids = [idx for idx in listings.index.to_list() if idx not in old_listings.index.to_list()]
# select listings by their ids
print(old_listings)
# add new listings to csv
updated_listings = pd.concat([old_listings, new_listings], join="inner")


[]
           city  totalRent  actionLabel floor  neighborhood  house_number  \
id                                                                          
5617  Amsterdam    1934.42          NaN    2e           NaN           452   

                     road postcode publicationDate closingDate  \
id                                                               
5617  Erich Salomonstraat  1087 JA      2023-06-26  2023-06-30   

                 dateAdded  
id                          
5617  2023_06_30__11_10_57  


In [41]:
listing_keys = [
    'id',
    'title',
    'street',
    'city',
    'neighborhood',
    'municipality',
    'houseNumber',
    'houseNumberAddition',
    'floor',
    'publicationDate',
    'closingDate',
    'netRent',
    'totalRent',
    'rentDuration',
    'sleepingRoom',
    'vatInclusive',
]


houses = []
for key in listing_keys:
    

[{'__className': 'stdClass',
  'title': 'Binnenhof 135',
  'publicationDate': '2022-07-19 00:00:00',
  'closingDate': '2023-07-19 00:00:00',
  'remainingTimeUntilClosingDate': '5 maanden en 22 dagen',
  'pictures': [{'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-23.png',
    'type': 'png'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-2.jpg',
    'type': 'jpg'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-3.jpg',
    'type': 'jpg'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-4.jpg',
    'type': 'jpg'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-5.jpg',
    'type': 'jpg'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-6.jpg',
    'type': 'jpg'},
   {'__className':