In [1]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from geopy.geocoders import Nominatim
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import Mail
import logging
import sys
import yaml
from funda_scraper import FundaScraper

# initialize Nominatim API
geolocator = Nominatim(user_agent='house_listings')

# set logger
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# constants
# YMERE_URL = "https://aanbod.ymere.nl/portal/publication/frontend/getallobjects/format/json"
# YMERE_PAYLOAD = {
#     'accept': 'application/json',
#     'origin': 'https://aanbod.ymere.nl',
#     'referer': 'https://aanbod.ymere.nl/aanbod/huurwoningen/',
#     'dwellingTypeCategory': 'woning'
# }
# YMERE_LISTINGS = "./ymere_listings.csv"

# load sendgrid api key from yaml config file
try:
    with open("src/.config.yml", 'r') as stream:
        SENDGRID_API_KEY = yaml.safe_load(stream)['api-keys']['sendgrid']
except Exception as e:
    print(f"{e}: SendGrid API KEY is not available.")


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
scraper = FundaScraper(area="breda", want_to="rent", find_past=False, page_start=1, n_pages=1)
df_clean = scraper.run(raw_data=False)

[38;20m2024-03-14 10:14:04,402 - INFO - *** Phase 1: Fetch all the available links from all pages ***  (scrape.py:122)[0m
INFO:funda_scraper:*** Phase 1: Fetch all the available links from all pages *** 
  0%|          | 0/1 [00:00<?, ?it/s]DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.funda.nl:443
DEBUG:urllib3.connectionpool:https://www.funda.nl:443 "GET /en/zoeken/huur?selected_area=%22breda%22&search_result=1 HTTP/1.1" 200 None
100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
[38;20m2024-03-14 10:14:04,970 - INFO - *** Got all the urls. 15 houses found from 1 to 1 *** (scrape.py:138)[0m
INFO:funda_scraper:*** Got all the urls. 15 houses found from 1 to 1 ***
[38;20m2024-03-14 10:14:04,971 - INFO - *** Phase 2: Start scraping from individual links *** (scrape.py:239)[0m
INFO:funda_scraper:*** Phase 2: Start scraping from individual links ***
100%|██████████| 15/15 [00:03<00:00,  4.82it/s]
[38;20m2024-03-14 10:14:08,332 - INFO - *** All scraping done: 15 res

AttributeError: 'Series' object has no attribute 'url'

In [26]:
columns = ['url', 'price', 'address', 'zip_code', 'size', 'year', 'living_area', 'kind_of_house', 'building_type', 'num_of_rooms', 'energy_label', 'parking', 'listed_since']
df[columns]

Unnamed: 0,url,price,address,zip_code,size,year,living_area,kind_of_house,building_type,num_of_rooms,energy_label,parking,listed_since
0,https://www.funda.nl/huur/breda/appartement-43...,€ 2.600 /mnd,Ceresstraat 24 A*,4811 CC Breda,185 m²,1922,185 m²,Bovenwoning (dubbel bovenhuis),Bestaande bouw,7 kamers (4 slaapkamers),C,na,na
1,https://www.funda.nl/huur/breda/appartement-43...,€ 2.250 /mnd,Wilhelminastraat 50 A,4818 SH Breda,145 m²,1900,145 m²,Bovenwoning (appartement),Bestaande bouw,5 kamers (3 slaapkamers),F,na,na
2,https://www.funda.nl/huur/breda/huis-43477238-...,€ 2.500 /mnd,Cimburgalaan 85,4819 BB Breda,160 m²,1955,160 m²,"Eengezinswoning, hoekwoning",Bestaande bouw,4 kamers (3 slaapkamers),na,na,470 m³
3,https://www.funda.nl/huur/breda/appartement-43...,€ 2.250 /mnd,Wilhelminapark 33,4818 SL Breda,133 m²,1994,133 m²,Portiekflat (appartement),Bestaande bouw,3 kamers (2 slaapkamers),A,Soort parkeergelegenheidOp eigen terrein en pa...,na
4,https://www.funda.nl/huur/breda/parkeergelegen...,€ 135 /mnd,Middellaan 60,4811 VM Breda,na,na,Elektrische deur,Parkeerkelder,Bestaande bouw,na,na,na,na
5,https://www.funda.nl/huur/breda/huis-42374641-...,€ 2.500 /mnd,Valkenierslaan 356,4834 CP Breda,195 m²,2012,195 m²,"Herenhuis, tussenwoning",Bestaande bouw,6 kamers (5 slaapkamers),A++,Soort parkeergelegenheidOpenbaar parkeren,na
6,https://www.funda.nl/huur/breda/appartement-42...,€ 3.500 /mnd,Concordiaplein 42,4811 NZ Breda,231 m²,1999,231 m²,Portiekflat (appartement),Bestaande bouw,6 kamers (4 slaapkamers),A+,Inschrijving KvKJaJaarlijkse vergaderingJaPeri...,Onbepaalde tijd
7,https://www.funda.nl/huur/breda/parkeergelegen...,€ 169 /mnd,Goeseelsstraat 35 A 31-84,4817 MV Breda,na,na,Elektra,Garage,Nieuwbouw,na,na,na,Onbepaalde tijd
8,https://www.funda.nl/huur/breda/huis-43472987-...,€ 1.850 /mnd,Fazantstraat 7,4815 GD Breda,129 m²,1935,129 m²,"Eengezinswoning, tussenwoning",Bestaande bouw,5 kamers (4 slaapkamers),na,na,na
9,https://www.funda.nl/huur/breda/appartement-42...,€ 1.170 /mnd,Luciastraat 83,4813 CX Breda,83 m²,2017,83 m²,Galerijflat (appartement),Bestaande bouw,3 kamers (2 slaapkamers),A,na,na


In [16]:
df['layout'].iloc[0]

'Aantal kamers4 kamers (3 slaapkamers)Aantal badkamers1 badkamer en 1 apart toiletBadkamervoorzieningenLigbad en toiletAantal woonlagen1 woonlaagGelegen op31e woonlaagVoorzieningenBalansventilatie en lift'

In [6]:
def fetch_attr(attr):
    """
    Fetch an attribute and return an empty string "" if None or does not exists.

    Args:
        attr:       attribute value

    Return:
        attribute value (default type) or None
    """
    
    return attr if not None else ""

def getLocation(lat, long):
    """
    Get details of a location using lat & long values obtained from the listing.
    The details contain:
        - house number
        - road
        - city
        - postcode

    Args:
        lat:       (string) latitude of house.
        long:      (string) longitude of house.

    Return:
        (dict) accurate attributes about the location of the house.
    """

    try:
        # get exact address from geolocator
        location = geolocator.reverse(f"{lat}, {long}").raw['address']
        # keep specific keys only
        location = {x:location[x] for x in ['house_number', 'road', 'city', 'postcode']}

    except Exception as e:
        print("Locator unavailable\n", e)
        return None

    return location

def filter_listings(current_listings, old_listings):
    """
    Filter new listings by matching ids in the loaded csv and the scrapred listings ids.

    Args:
        current_listings:   pandas.DateFrame() containing scraped data which might contain new listings.     
        old_listings:       pandas.DateFrame() containing previously saved listings of which they have yet to expire.
    
    Return:
        new_listings:       pandas.DataFrame() containing the filtered listings of which are new.
        updates_listings:   pandas.DataFrame() containing the updated listings including the new ones.
    """
    # filter listings ids which have not been discovered yet
    new_listings_ids = [idx for idx in current_listings.index.to_list() if idx not in old_listings.index.to_list()]
    # select listings by their ids
    new_listings = current_listings.loc[new_listings_ids]
    # add new listings to csv
    updated_listings = pd.concat([old_listings, new_listings], join="inner")

    return new_listings, updated_listings

def clean_up(old_listings):
    """
    Clean up the old listings by iterating through the dataframe and removing and listings of which `closingDate` has passed.
    i.e.: current date is greater than closing date.

    Args:
        old_listings:       pandas.DateFrame() containing previously saved listings of which they have yet to expire.

    Return:
        pandas.DateFrame() containing previously saved listings which still have yet to expire.
    """

    # get current date (yyyy-mm-dd)
    current_date = int(np.floor(datetime.now().timestamp()))

    drop_rows = []
    # get closing date for every listing
    for k, v in old_listings.iterrows():
        date = datetime.strptime(v['closingDate'], "%Y-%m-%d")
        date = int(np.floor(datetime.timestamp(date)))

        # if closing date has passed, delete listing
        if current_date > date:
            drop_rows.append(k)

    # drop listings that expired
    return old_listings.drop(drop_rows)

def extract_listings_ymere(listings):
    """
    Extract housing listings from scraped data. The data is in Dutch but most attributes are numerals or dates.
    This function is hard-coded to extract the data with the following conditions:
        - The house is for rent ("huur").
        - The house is NOT for temporary students ("Tijdelijke verhuur studenten").
        - The house is located in the city of "Amsterdam".
        - The house has a rent which is affordable, meaning that the rent per month is lower than x times my gross income (where x is sensitive data).

    Args:
        listings:       (list) of scraped Ymere data containing all the housing listings, including social houses and houses for sale.

    Return:
        (dict) containing the desirable filtered listings from the scraped data.
    """
    
    dt_string = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")

    houses = []
    for house in listings:
        house_dict = {}
        
        # conditions
        to_rent = house['dwellings'][0]['rentBuy'] == 'Huur'
        action_label = house['actionLabel'][0]['label'] != 'Tijdelijke verhuur studenten'
        amsterdam = 'amsterdam' in house['city'][0]['name'].lower()
        affordable = house['totalRent'][0] <= 1250

        if to_rent and action_label and amsterdam and affordable:
            ### extract and parse attributes ###
            house_dict['city'] = house['city'][0]['name']
            house_dict['id'] = house['id']
            house_dict['totalRent'] = fetch_attr(house['totalRent'][0])
            house_dict['actionLabel'] = fetch_attr(house['actionLabel'][0]['label'])
            house_dict['floor'] = fetch_attr(house['floor'][0]['name'][:2])
            house_dict['neighborhood'] = fetch_attr(house['neighborhood'][0]['name'])
            # improve accuracy of location
            location = getLocation(house['latitude'][0], house['longitude'][0])
            if location:
                house_dict.update(location)
            house_dict['publicationDate'] = datetime.strptime(house['publicationDate'], '%Y-%m-%d %H:%M:%S').date()
            house_dict['closingDate'] = datetime.strptime(house['closingDate'], '%Y-%m-%d %H:%M:%S').date()
            house_dict['dateAdded'] = dt_string
            houses.append(house_dict)
    
    return houses

def send_mail(new_listings, to_email="egalea.11@gmail.com"):
    """
    Send email to address with only the new listings founds.

    Args:
        new_listings:   (pandas.DataFrame) containing the new listings found.
        to_email:       (string) containing the email address to be sent to.

    Return:
        status of email response.
    """

    message = Mail(
        from_email="pitirross.life@gmail.com",
        to_emails=to_email,
        subject=f"[Ymere listing] {len(new_listings)} found!",
        html_content=f"""
        <html>
            <head>
                <style> 
                table, th, td {{font-size:10pt; border:1px solid black; border-collapse:collapse; text-align:left;}}
                th, td {{padding: 5px;}}
                </style>
            </head>
            <body>
                <p>
                    Log in and react to ymere listings 
                    <a href="https://aanbod.ymere.nl/mijn-omgeving/inloggen/">
                        <b>here</b>
                    </a>
                </p>
                <br>
                {new_listings.to_html(index=False)}
            </body>
        </html>
        """
        )
    try:
        sg = SendGridAPIClient(SENDGRID_API_KEY)
        response = sg.send(message)
        logging.info(response.status_code)
        logging.debug(response.body)
        logging.debug(response.headers)
    except Exception as e:
        logging.debug(e.message)

    return response.status_code

In [7]:
r = requests.post(YMERE_URL, data=YMERE_PAYLOAD)
logging.info(f'<Status code: {r.status_code}>')
data = r.json()['result']

dt_string = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")

houses = []
for house in data:
    house_dict = {}
    
    # conditions
    to_rent = house['dwellings'][0]['rentBuy'] == 'Huur'
    action_label = house['actionLabel'][0]['label'] != 'Tijdelijke verhuur studenten'
    amsterdam = 'amsterdam' in house['city'][0]['name'].lower()
    affordable = False
    # if house rent exists and is affordable set to True, otherwise skip iteration
    if house['totalRent']:
        affordable = house['totalRent'][0] <= 1250
    else:
        continue

    if to_rent and action_label and amsterdam and affordable:
        ### extract and parse attributes ###
        house_dict['city'] = house['city'][0]['name']
        house_dict['id'] = house['id']
        house_dict['totalRent'] = fetch_attr(house['totalRent'][0])
        house_dict['actionLabel'] = fetch_attr(house['actionLabel'][0]['label'])
        house_dict['floor'] = fetch_attr(house['floor'][0]['name'][:2])
        house_dict['neighborhood'] = fetch_attr(house['neighborhood'][0]['name'])
        # improve accuracy of location
        location = getLocation(house['latitude'][0], house['longitude'][0])
        if location:
            house_dict.update(location)
        house_dict['publicationDate'] = datetime.strptime(house['publicationDate'], '%Y-%m-%d %H:%M:%S').date()
        house_dict['closingDate'] = datetime.strptime(house['closingDate'], '%Y-%m-%d %H:%M:%S').date()
        house_dict['dateAdded'] = dt_string
        houses.append(house_dict)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): aanbod.ymere.nl:443
DEBUG:urllib3.connectionpool:https://aanbod.ymere.nl:443 "POST /portal/publication/frontend/getallobjects/format/json HTTP/1.1" 404 12028
INFO:root:<Status code: 404>


JSONDecodeError: Expecting value: line 2 column 1 (char 2)

In [12]:
if house['totalRent']:
    house['totalRent'][0]
else:
    print('no')

no


In [5]:
if __name__ == "__main__":
    # scrape a JSON object of currently available Ymere houses
    r = requests.post(YMERE_URL, data=YMERE_PAYLOAD)
    logging.info(f'<Status code: {r.status_code}>')
    data = r.json()['result']

    # extract ymere listings from obtained json object
    listings = extract_listings_ymere(data)

    if listings:
        logging.info("Houses available found...")
        listings = pd.DataFrame(listings).set_index('id').sort_values('id')

        try:
            # read df from file
            old_listings = pd.read_csv(YMERE_LISTINGS, index_col="id")

            # clean up old listings
            old_listings = clean_up(old_listings)

            # filter
            new_listings, updated_listings = filter_listings(listings, old_listings)

            # write new df to file
            updated_listings.to_csv(YMERE_LISTINGS)

            # send email notification if new listings found (not empty)
            if not new_listings.empty:
                print(new_listings)
                send_mail(new_listings)
            else:
                logging.debug("No new houses found.")

        except pd.errors.EmptyDataError as e:
            logging.info("No old listings found. Writing new listings to file.")
            listings.to_csv(YMERE_LISTINGS)

    else:
        logging.info("No houses found...")


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): aanbod.ymere.nl:443
DEBUG:urllib3.connectionpool:https://aanbod.ymere.nl:443 "POST /portal/publication/frontend/getallobjects/format/json HTTP/1.1" 200 None
INFO:root:<Status code: 200>


IndexError: list index out of range

In [13]:
old_listings = pd.read_csv(YMERE_LISTINGS, index_col="id")
old_listings


Unnamed: 0_level_0,city,totalRent,actionLabel,floor,neighborhood,house_number,road,postcode,publicationDate,closingDate,dateAdded
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5617,Amsterdam,1934.42,,2e,,452,Erich Salomonstraat,1087 JA,2023-06-26,2023-06-30,2023_06_30__11_10_57


In [20]:
# filter listings ids which have not been discovered yet
new_listings_ids = [idx for idx in listings.index.to_list() if idx not in old_listings.index.to_list()]
# select listings by their ids
print(old_listings)
# add new listings to csv
updated_listings = pd.concat([old_listings, new_listings], join="inner")


[]
           city  totalRent  actionLabel floor  neighborhood  house_number  \
id                                                                          
5617  Amsterdam    1934.42          NaN    2e           NaN           452   

                     road postcode publicationDate closingDate  \
id                                                               
5617  Erich Salomonstraat  1087 JA      2023-06-26  2023-06-30   

                 dateAdded  
id                          
5617  2023_06_30__11_10_57  


In [41]:
listing_keys = [
    'id',
    'title',
    'street',
    'city',
    'neighborhood',
    'municipality',
    'houseNumber',
    'houseNumberAddition',
    'floor',
    'publicationDate',
    'closingDate',
    'netRent',
    'totalRent',
    'rentDuration',
    'sleepingRoom',
    'vatInclusive',
]


houses = []
for key in listing_keys:
    

[{'__className': 'stdClass',
  'title': 'Binnenhof 135',
  'publicationDate': '2022-07-19 00:00:00',
  'closingDate': '2023-07-19 00:00:00',
  'remainingTimeUntilClosingDate': '5 maanden en 22 dagen',
  'pictures': [{'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-23.png',
    'type': 'png'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-2.jpg',
    'type': 'jpg'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-3.jpg',
    'type': 'jpg'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-4.jpg',
    'type': 'jpg'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-5.jpg',
    'type': 'jpg'},
   {'__className': 'stdClass',
    'label': '',
    'uri': '/portal/uploads/dwelling/pictures/11857-6.jpg',
    'type': 'jpg'},
   {'__className':

In [12]:
from pushbullet import Pushbullet

pb = Pushbullet('o.RDkdCwFFKJv6EITRaWlpeC610Zoi3BhA')

dev = pb.devices[0]
response = dev.push_note(title, body)

response

NameError: name 'title' is not defined

In [None]:
{
    content: {
        {
            "plain_text": "indeed",
            "v1": {
                "children": [
                {
                    "type": "text",
                    "align": "left",
                    "content": {
                    "children": [
                        {
                        "type": "text_span",
                        "text": "indeed",
                        "text_size": 16
                        }
                    ]
                    }
                }
                ]
            }
        },
    content_excerpt: "Cause you're worth it",
    cover_image_url: "https://s3.amazonaws.com/campus-cloud-image-use/bkw6yc2x9616176pr1ktm4x861rmuzg4m0m7cv1s3es4hdlsgb.png",
    host_id: 350200,
    host_type: "store",
    school_id: 157,
    status: 1,
    title: "L'oreal"
}

In [64]:
import pandas as pd
from funda_scraper import FundaScraper, preprocess
from translate import Translator
import datetime

translator = Translator(from_lang="nl", to_lang="en")

def extract_listings_funda(area:str, want_to:str='rent', n_pages:int=1, raw_listings:bool=True) -> pd.DataFrame:
    """
    Extract listings from Funda scraping library.

    Args:
        area:               (string)    area that you want to search in
        want_to:            (string)    'rent' or 'buy'
        n_pages:            (int)       number of listings to show
        raw_listings:       (bool)      return raw listings scraped or clean them

    Return:
        (pandas.DataFrame) containing the desirable filtered listings from the scraped data.
    """

    scraper = FundaScraper(area="breda", want_to="rent", find_past=False, page_start=1, n_pages=1)
    houses = scraper.run(raw_data=raw_listings)

    return houses


new_listings = extract_listings_funda(area='breda', raw_listings=True)
new_listings

[38;20m2024-03-17 12:07:06,710 - INFO - *** Phase 1: Fetch all the available links from all pages ***  (scrape.py:122)[0m
100%|██████████| 1/1 [00:00<00:00,  1.69it/s]
[38;20m2024-03-17 12:07:07,311 - INFO - *** Got all the urls. 15 houses found from 1 to 1 *** (scrape.py:138)[0m
[38;20m2024-03-17 12:07:07,311 - INFO - *** Phase 2: Start scraping from individual links *** (scrape.py:239)[0m
100%|██████████| 15/15 [00:03<00:00,  4.58it/s]
[38;20m2024-03-17 12:07:10,786 - INFO - *** All scraping done: 15 results *** (scrape.py:254)[0m
[38;20m2024-03-17 12:07:10,786 - INFO - *** Done! *** (scrape.py:292)[0m


Unnamed: 0,url,price,address,descrip,listed_since,zip_code,size,year,living_area,kind_of_house,...,ownership,exteriors,parking,neighborhood_name,date_list,last_ask_price,last_ask_price_m2,photo,city,log_id
0,https://www.funda.nl/huur/breda/appartement-43...,€ 2.600 /mnd,Ceresstraat 24 A*,Te Huur: Betreed de wereld van ongeëvenaard...,na,4811 CC Breda,185 m²,1922,185 m²,Bovenwoning (dubbel bovenhuis),...,na,na,na,na,€ 2.600 per maand (geen servicekosten),€ 2.600 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/185/150...,breda,202403-1712-0710
1,https://www.funda.nl/huur/breda/appartement-43...,€ 1.145 /mnd,Menno van Coehoornstraat 51,Wonen in het centrum van gezellig Breda nab...,na,4811 AV Breda,88 m²,2007,88 m²,Galerijflat (appartement),...,Balkon aanwezig,na,Inschrijving KvKNeeJaarlijkse vergaderingNeePe...,na,€ 1.145 per maand (exclusief servicekosten à €...,€ 1.145 per maand (exclusief servicekosten à €...,na,https://cloud.funda.nl/valentina_media/187/324...,breda,202403-1712-0710
2,https://www.funda.nl/huur/breda/appartement-43...,€ 2.250 /mnd,Wilhelminastraat 50 A,"In een karakteristiek pand, boven de exclus...",na,4818 SH Breda,145 m²,1900,145 m²,Bovenwoning (appartement),...,na,na,na,na,€ 2.250 per maand (geen servicekosten),€ 2.250 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/185/785...,breda,202403-1712-0710
3,https://www.funda.nl/koop/breda/appartement-42...,€ 815.000 k.k.,Laurenspark 40,"Op een prachtige woonlocatie in Breda-Zuid, ...",na,4835 GX Breda Ginneken,164 m²,1997,164 m²,Portiekflat (appartement),...,na,"LiggingAan bosrand, aan park, aan rustige weg,...",Soort garageParkeerkelder,Ginneken,€ 815.000 kosten koper,€ 815.000 kosten koper,€ 4.970,https://cloud.funda.nl/valentina_media/182/388...,breda,202403-1712-0710
4,https://www.funda.nl/huur/breda/appartement-43...,€ 2.250 /mnd,Wilhelminapark 33,Zeer royaal en heerlijk licht 3-kamer hoek-...,na,4818 SL Breda,133 m²,1994,133 m²,Portiekflat (appartement),...,Balkon aanwezig,na,Soort parkeergelegenheidOp eigen terrein en pa...,na,€ 2.250 per maand (geen servicekosten),€ 2.250 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/186/830...,breda,202403-1712-0710
5,https://www.funda.nl/huur/breda/appartement-42...,€ 3.500 /mnd,Concordiaplein 42,Droomt u ook van een eigen woonverdieping m...,Onbepaalde tijd,4811 NZ Breda,231 m²,1999,231 m²,Portiekflat (appartement),...,na,na,Inschrijving KvKJaJaarlijkse vergaderingJaPeri...,na,€ 3.500 per maand (geen servicekosten),€ 3.500 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/175/950...,breda,202403-1712-0710
6,https://www.funda.nl/huur/breda/appartement-43...,€ 2.195 /mnd,van Coothplein 18 a,"Zeer luxe, ruime en deels gemeubileerde app...",na,4811 NE Breda,120 m²,2022,120 m²,Tussenverdieping (appartement),...,Balkon aanwezig,na,na,na,€ 2.195 per maand (geen servicekosten),€ 2.195 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/187/457...,breda,202403-1712-0710
7,https://www.funda.nl/huur/breda/huis-43477238-...,€ 2.500 /mnd,Cimburgalaan 85,Op zeer gewilde locatie in de kindvriendeli...,470 m³,4819 BB Breda,160 m²,1955,160 m²,"Eengezinswoning, hoekwoning",...,Achtertuin,na,na,na,€ 2.500 per maand (exclusief servicekosten à €...,€ 2.500 per maand (exclusief servicekosten à €...,na,https://cloud.funda.nl/valentina_media/185/774...,breda,202403-1712-0710
8,https://www.funda.nl/huur/breda/appartement-43...,€ 1.060 /mnd,Meerten Verhoffstraat 10 B4,Geweldig 3 Kamer Appartement in Complex De ...,na,4811 AT Breda,80 m²,2007,80 m²,Galerijflat (appartement),...,na,na,Soort parkeergelegenheidParkeergarage,na,€ 1.060 per maand (exclusief servicekosten à €...,€ 1.060 per maand (exclusief servicekosten à €...,na,https://cloud.funda.nl/valentina_media/187/437...,breda,202403-1712-0710
9,https://www.funda.nl/huur/breda/appartement-43...,€ 1.130 /mnd,Stationslaan 3 C12,Gelegen in Breda ligt dit karakteristieke e...,na,4815 GW Breda,96 m²,2016,96 m²,Portiekflat (appartement),...,Balkon aanwezig,na,Inschrijving KvKNeeJaarlijkse vergaderingNeePe...,na,€ 1.130 per maand (servicekosten onbekend),€ 1.130 per maand (servicekosten onbekend),na,https://cloud.funda.nl/valentina_media/186/348...,breda,202403-1712-0710


In [78]:
new_listings

Unnamed: 0,url,price,address,descrip,listed_since,zip_code,size,year,living_area,kind_of_house,...,ownership,exteriors,parking,neighborhood_name,date_list,last_ask_price,last_ask_price_m2,photo,city,log_id
0,https://www.funda.nl/huur/breda/appartement-43...,€ 2.600 /mnd,Ceresstraat 24 A*,Te Huur: Betreed de wereld van ongeëvenaard...,na,4811 CC Breda,185 m²,1922,185 m²,Bovenwoning (dubbel bovenhuis),...,na,na,na,na,€ 2.600 per maand (geen servicekosten),€ 2.600 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/185/150...,breda,202403-1712-0710
1,https://www.funda.nl/huur/breda/appartement-43...,€ 1.145 /mnd,Menno van Coehoornstraat 51,Wonen in het centrum van gezellig Breda nab...,na,4811 AV Breda,88 m²,2007,88 m²,Galerijflat (appartement),...,Balkon aanwezig,na,Inschrijving KvKNeeJaarlijkse vergaderingNeePe...,na,€ 1.145 per maand (exclusief servicekosten à €...,€ 1.145 per maand (exclusief servicekosten à €...,na,https://cloud.funda.nl/valentina_media/187/324...,breda,202403-1712-0710
2,https://www.funda.nl/huur/breda/appartement-43...,€ 2.250 /mnd,Wilhelminastraat 50 A,"In een karakteristiek pand, boven de exclus...",na,4818 SH Breda,145 m²,1900,145 m²,Bovenwoning (appartement),...,na,na,na,na,€ 2.250 per maand (geen servicekosten),€ 2.250 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/185/785...,breda,202403-1712-0710
3,https://www.funda.nl/koop/breda/appartement-42...,€ 815.000 k.k.,Laurenspark 40,"Op een prachtige woonlocatie in Breda-Zuid, ...",na,4835 GX Breda Ginneken,164 m²,1997,164 m²,Portiekflat (appartement),...,na,"LiggingAan bosrand, aan park, aan rustige weg,...",Soort garageParkeerkelder,Ginneken,€ 815.000 kosten koper,€ 815.000 kosten koper,€ 4.970,https://cloud.funda.nl/valentina_media/182/388...,breda,202403-1712-0710
4,https://www.funda.nl/huur/breda/appartement-43...,€ 2.250 /mnd,Wilhelminapark 33,Zeer royaal en heerlijk licht 3-kamer hoek-...,na,4818 SL Breda,133 m²,1994,133 m²,Portiekflat (appartement),...,Balkon aanwezig,na,Soort parkeergelegenheidOp eigen terrein en pa...,na,€ 2.250 per maand (geen servicekosten),€ 2.250 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/186/830...,breda,202403-1712-0710
5,https://www.funda.nl/huur/breda/appartement-42...,€ 3.500 /mnd,Concordiaplein 42,Droomt u ook van een eigen woonverdieping m...,Onbepaalde tijd,4811 NZ Breda,231 m²,1999,231 m²,Portiekflat (appartement),...,na,na,Inschrijving KvKJaJaarlijkse vergaderingJaPeri...,na,€ 3.500 per maand (geen servicekosten),€ 3.500 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/175/950...,breda,202403-1712-0710
6,https://www.funda.nl/huur/breda/appartement-43...,€ 2.195 /mnd,van Coothplein 18 a,"Zeer luxe, ruime en deels gemeubileerde app...",na,4811 NE Breda,120 m²,2022,120 m²,Tussenverdieping (appartement),...,Balkon aanwezig,na,na,na,€ 2.195 per maand (geen servicekosten),€ 2.195 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/187/457...,breda,202403-1712-0710
7,https://www.funda.nl/huur/breda/huis-43477238-...,€ 2.500 /mnd,Cimburgalaan 85,Op zeer gewilde locatie in de kindvriendeli...,470 m³,4819 BB Breda,160 m²,1955,160 m²,"Eengezinswoning, hoekwoning",...,Achtertuin,na,na,na,€ 2.500 per maand (exclusief servicekosten à €...,€ 2.500 per maand (exclusief servicekosten à €...,na,https://cloud.funda.nl/valentina_media/185/774...,breda,202403-1712-0710
8,https://www.funda.nl/huur/breda/appartement-43...,€ 1.060 /mnd,Meerten Verhoffstraat 10 B4,Geweldig 3 Kamer Appartement in Complex De ...,na,4811 AT Breda,80 m²,2007,80 m²,Galerijflat (appartement),...,na,na,Soort parkeergelegenheidParkeergarage,na,€ 1.060 per maand (exclusief servicekosten à €...,€ 1.060 per maand (exclusief servicekosten à €...,na,https://cloud.funda.nl/valentina_media/187/437...,breda,202403-1712-0710
9,https://www.funda.nl/huur/breda/appartement-43...,€ 1.130 /mnd,Stationslaan 3 C12,Gelegen in Breda ligt dit karakteristieke e...,na,4815 GW Breda,96 m²,2016,96 m²,Portiekflat (appartement),...,Balkon aanwezig,na,Inschrijving KvKNeeJaarlijkse vergaderingNeePe...,na,€ 1.130 per maand (servicekosten onbekend),€ 1.130 per maand (servicekosten onbekend),na,https://cloud.funda.nl/valentina_media/186/348...,breda,202403-1712-0710


In [89]:
def extract_log_datetime(log_id:str) -> str:
    """
    Extract log datetime as string from log_id
    """

    log = new_listings.log_id[0]
    log_datetime = datetime.datetime.strptime(log, '%Y%m-%d%H-%M%S')
    return log_datetime.strftime('%d-%m-%Y %H:%M:%S')


def clean_listings(df:pd.DataFrame):
    """
    Manually clean listings to extract desired fields.

    Args:
        df: (Pandas.DataFrame)  the scraped house listings
    """

    clean_df = pd.DataFrame()

    # info
    clean_df["house_id"] = df["url"].apply(lambda x: int(x.split("/")[-2].split("-")[1]))
    clean_df["house_type"] = df["url"].apply(lambda x: x.split("/")[-2].split("-")[0])

    # price
    clean_df["price"] = df["price"].apply(preprocess.clean_price)
    clean_df = clean_df[clean_df["price"] != 0]

    # house layout
    clean_df["room"] = df["num_of_rooms"].apply(preprocess.find_n_room)
    clean_df["bedroom"] = df["num_of_rooms"].apply(preprocess.find_n_bedroom)
    clean_df["bathroom"] = df["num_of_bathrooms"].apply(preprocess.find_n_bathroom)
    clean_df["energy_label"] = df["energy_label"].apply(preprocess.clean_energy_label)

    # translations
    # clean_parking = df[df["parking"] != "na"]
    # clean_df["parking"] = clean_parking.apply(translator.translate)

    clean_df["address"] = df["address"]
    clean_df["zip_code"] = df["zip_code"]
    clean_df["year_built"] = df["year"]
    clean_df["url"] = df["url"]

    # time
    clean_df["log_id"] = df["log_id"].apply(extract_log_datetime)
    clean_df = clean_df.set_index('house_id').sort_index(ascending=False)

    return clean_df

clean_listings(new_listings)

Unnamed: 0_level_0,house_type,price,room,bedroom,bathroom,energy_label,address,zip_code,year_built,url,log_id
house_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
43495729,appartement,1145,3,2,1,B,Menno van Coehoornstraat 51,4811 AV Breda,2007,https://www.funda.nl/huur/breda/appartement-43...,17-03-2024 12:07:10
43493919,appartement,2250,3,2,2,A,Wilhelminapark 33,4818 SL Breda,1994,https://www.funda.nl/huur/breda/appartement-43...,17-03-2024 12:07:10
43481296,appartement,1130,4,3,1,A,Stationslaan 3 C12,4815 GW Breda,2016,https://www.funda.nl/huur/breda/appartement-43...,17-03-2024 12:07:10
43478517,appartement,2250,5,3,1,F,Wilhelminastraat 50 A,4818 SH Breda,1900,https://www.funda.nl/huur/breda/appartement-43...,17-03-2024 12:07:10
43477238,huis,2500,4,3,1,na,Cimburgalaan 85,4819 BB Breda,1955,https://www.funda.nl/huur/breda/huis-43477238-...,17-03-2024 12:07:10
43452897,appartement,2600,7,4,2,C,Ceresstraat 24 A*,4811 CC Breda,1922,https://www.funda.nl/huur/breda/appartement-43...,17-03-2024 12:07:10
43419556,appartement,2195,4,3,2,A,van Coothplein 18 a,4811 NE Breda,2022,https://www.funda.nl/huur/breda/appartement-43...,17-03-2024 12:07:10
43418036,appartement,1060,3,2,1,A,Meerten Verhoffstraat 10 B4,4811 AT Breda,2007,https://www.funda.nl/huur/breda/appartement-43...,17-03-2024 12:07:10
43410021,huis,2750,6,5,1,>A+,Barend Cohenstraat 14,4817 CS Breda,2024,https://www.funda.nl/huur/breda/huis-43410021-...,17-03-2024 12:07:10
43404122,appartement,1350,4,3,1,E,Valkeniersplein 8 A,4835 RB Breda,1963,https://www.funda.nl/huur/breda/appartement-43...,17-03-2024 12:07:10


In [84]:
new_listings

Unnamed: 0,url,price,address,descrip,listed_since,zip_code,size,year,living_area,kind_of_house,...,ownership,exteriors,parking,neighborhood_name,date_list,last_ask_price,last_ask_price_m2,photo,city,log_id
0,https://www.funda.nl/huur/breda/appartement-43...,€ 2.600 /mnd,Ceresstraat 24 A*,Te Huur: Betreed de wereld van ongeëvenaard...,na,4811 CC Breda,185 m²,1922,185 m²,Bovenwoning (dubbel bovenhuis),...,na,na,na,na,€ 2.600 per maand (geen servicekosten),€ 2.600 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/185/150...,breda,202403-1712-0710
1,https://www.funda.nl/huur/breda/appartement-43...,€ 1.145 /mnd,Menno van Coehoornstraat 51,Wonen in het centrum van gezellig Breda nab...,na,4811 AV Breda,88 m²,2007,88 m²,Galerijflat (appartement),...,Balkon aanwezig,na,Inschrijving KvKNeeJaarlijkse vergaderingNeePe...,na,€ 1.145 per maand (exclusief servicekosten à €...,€ 1.145 per maand (exclusief servicekosten à €...,na,https://cloud.funda.nl/valentina_media/187/324...,breda,202403-1712-0710
2,https://www.funda.nl/huur/breda/appartement-43...,€ 2.250 /mnd,Wilhelminastraat 50 A,"In een karakteristiek pand, boven de exclus...",na,4818 SH Breda,145 m²,1900,145 m²,Bovenwoning (appartement),...,na,na,na,na,€ 2.250 per maand (geen servicekosten),€ 2.250 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/185/785...,breda,202403-1712-0710
3,https://www.funda.nl/koop/breda/appartement-42...,€ 815.000 k.k.,Laurenspark 40,"Op een prachtige woonlocatie in Breda-Zuid, ...",na,4835 GX Breda Ginneken,164 m²,1997,164 m²,Portiekflat (appartement),...,na,"LiggingAan bosrand, aan park, aan rustige weg,...",Soort garageParkeerkelder,Ginneken,€ 815.000 kosten koper,€ 815.000 kosten koper,€ 4.970,https://cloud.funda.nl/valentina_media/182/388...,breda,202403-1712-0710
4,https://www.funda.nl/huur/breda/appartement-43...,€ 2.250 /mnd,Wilhelminapark 33,Zeer royaal en heerlijk licht 3-kamer hoek-...,na,4818 SL Breda,133 m²,1994,133 m²,Portiekflat (appartement),...,Balkon aanwezig,na,Soort parkeergelegenheidOp eigen terrein en pa...,na,€ 2.250 per maand (geen servicekosten),€ 2.250 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/186/830...,breda,202403-1712-0710
5,https://www.funda.nl/huur/breda/appartement-42...,€ 3.500 /mnd,Concordiaplein 42,Droomt u ook van een eigen woonverdieping m...,Onbepaalde tijd,4811 NZ Breda,231 m²,1999,231 m²,Portiekflat (appartement),...,na,na,Inschrijving KvKJaJaarlijkse vergaderingJaPeri...,na,€ 3.500 per maand (geen servicekosten),€ 3.500 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/175/950...,breda,202403-1712-0710
6,https://www.funda.nl/huur/breda/appartement-43...,€ 2.195 /mnd,van Coothplein 18 a,"Zeer luxe, ruime en deels gemeubileerde app...",na,4811 NE Breda,120 m²,2022,120 m²,Tussenverdieping (appartement),...,Balkon aanwezig,na,na,na,€ 2.195 per maand (geen servicekosten),€ 2.195 per maand (geen servicekosten),na,https://cloud.funda.nl/valentina_media/187/457...,breda,202403-1712-0710
7,https://www.funda.nl/huur/breda/huis-43477238-...,€ 2.500 /mnd,Cimburgalaan 85,Op zeer gewilde locatie in de kindvriendeli...,470 m³,4819 BB Breda,160 m²,1955,160 m²,"Eengezinswoning, hoekwoning",...,Achtertuin,na,na,na,€ 2.500 per maand (exclusief servicekosten à €...,€ 2.500 per maand (exclusief servicekosten à €...,na,https://cloud.funda.nl/valentina_media/185/774...,breda,202403-1712-0710
8,https://www.funda.nl/huur/breda/appartement-43...,€ 1.060 /mnd,Meerten Verhoffstraat 10 B4,Geweldig 3 Kamer Appartement in Complex De ...,na,4811 AT Breda,80 m²,2007,80 m²,Galerijflat (appartement),...,na,na,Soort parkeergelegenheidParkeergarage,na,€ 1.060 per maand (exclusief servicekosten à €...,€ 1.060 per maand (exclusief servicekosten à €...,na,https://cloud.funda.nl/valentina_media/187/437...,breda,202403-1712-0710
9,https://www.funda.nl/huur/breda/appartement-43...,€ 1.130 /mnd,Stationslaan 3 C12,Gelegen in Breda ligt dit karakteristieke e...,na,4815 GW Breda,96 m²,2016,96 m²,Portiekflat (appartement),...,Balkon aanwezig,na,Inschrijving KvKNeeJaarlijkse vergaderingNeePe...,na,€ 1.130 per maand (servicekosten onbekend),€ 1.130 per maand (servicekosten onbekend),na,https://cloud.funda.nl/valentina_media/186/348...,breda,202403-1712-0710


In [35]:

translation = translator.translate("Soort parkeergelegenheidOpenbaar parkeren")
translation

'Type of parkingPublic parking'