In [194]:
from bs4 import BeautifulSoup
import re
from datetime import datetime
import os
import sys

# Put the directory above the notebook's working directory on the import path,
# presumably so project modules that live there (e.g. the `utils` module used
# in the last cell) can be imported.
CURR_DIR = os.getcwd()
# os.path.dirname understands the platform's separator; splitting on "/" by
# hand yields an empty string on Windows or when the cwd is a top-level folder.
PARENT_DIR = os.path.dirname(CURR_DIR)
# Guarded so re-running this cell does not keep appending the same entry.
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from math import ceil
from time import sleep

In [2]:
# Rental-listing sites collected as candidates for scraping.
target_domains = [
    "https://www.room.nl/en/",
    "https://kamernet.nl/en",
    "https://www.funda.nl/en/",
    "https://www.pararius.com/english",
]

# The search URLs used further down point at Funda, so that entry is the one
# recorded as the source domain for every scraped listing.
domain = next(site for site in target_domains if "funda" in site)

In [10]:
# Listings per search page; used to turn the total hit count into a page count.
# NOTE(review): presumably matches Funda's page size -- confirm against the site.
n_results = 15

# Unpaginated search for rentals in Leiden (the area filter is URL-encoded JSON).
init_search_url = (
    "https://www.funda.nl/en/zoeken/huur"
    "?selected_area=%5B%22leiden%22%5D"
)

# Same search as a %-format template: literal percent signs are doubled and
# the single placeholder receives the page number.
search_url = (
    "https://www.funda.nl/en/zoeken/huur"
    "?selected_area=%%5B%%22leiden%%22%%5D"
    "&search_result=%.0f"
)

In [4]:
def selenium_soup_get(driver, url, test_xpath, load_delay: int = 10):
    """Load ``url`` in the browser, wait for an element, and parse the page.

    Parameters
    ----------
    driver
        Selenium WebDriver instance used to fetch the page.
    url
        Address handed to ``driver.get``.
    test_xpath
        XPath of an element that must be present in the DOM before the page
        source is read; used as the "page has rendered" signal.
    load_delay
        Maximum number of seconds to wait for ``test_xpath`` to appear.

    Returns
    -------
    BeautifulSoup
        Parsed copy of ``driver.page_source`` taken after the wait succeeded.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        Raised by ``WebDriverWait.until`` when the element does not appear
        within ``load_delay`` seconds.
    """
    driver.get(url)

    # Honour the caller's timeout (it used to be overwritten with a fixed 10).
    test_element = EC.presence_of_element_located((By.XPATH, test_xpath))
    WebDriverWait(driver, load_delay).until(test_element)

    resp_source = driver.page_source
    # No parser is named here, so BeautifulSoup picks the best one installed.
    soup = BeautifulSoup(resp_source)

    return soup

In [5]:
# Start the Firefox session that the scraping cells below reuse via `driver`.
opts = webdriver.FirefoxOptions()
# NOTE(review): presumably intended to make sure JavaScript is on; confirm that
# Firefox actually recognises this command-line flag.
opts.add_argument('--enable-javascript')
# Uncomment to run the browser without a visible window.
# opts.add_argument('--headless')
driver = webdriver.Firefox(options = opts)

In [6]:
# Fetch the unpaginated search page once; the wait target is the first
# search-result card, so the returned soup contains rendered results.
soup = selenium_soup_get(
    driver,
    init_search_url,
    test_xpath="//div[@data-test-id='search-result-item']",
)

In [7]:
# Read the total number of hits from the search page header.
# NOTE(review): relies on the exact utility-class string and on the number
# sitting on the second line of the element's text -- verify if the site changes.
rental_count_obj = soup.find_all(
    "div",
    attrs={"class": "overflow-hidden text-ellipsis whitespace-nowrap font-semibold"},
)[0]
rental_count_text = rental_count_obj.text

# Drop spaces, then split on newlines; index 1 is the numeric part.
rental_count_str_split = rental_count_text.replace(" ", "").split("\n")
rental_count_num = int(rental_count_str_split[1])

# Number of result pages needed to cover every hit.
page_count = ceil(rental_count_num / n_results)

In [12]:
# Walk every results page and collect the detail-page link of each listing.
result_links = []

for page_num in range(1, page_count + 1):
    page_search_url = search_url % page_num
    page_soup = selenium_soup_get(
        driver,
        page_search_url,
        "//div[@data-test-id='search-result-item']",
    )
    result_divs = page_soup.find_all("div", attrs={"data-test-id": "search-result-item"})

    # The first anchor with the title styling inside each card carries the href.
    result_links.extend(
        card.find_all(
            "a", attrs={"class": "text-blue-2 visited:text-purple-1 cursor-pointer"}
        )[0]["href"]
        for card in result_divs
    )

    # Pause between page loads.
    sleep(5)

In [192]:
# page parse
_FUNDA_ROOT = "https://www.funda.nl/"
_FUNDA_EN_ROOT = "https://www.funda.nl/en/"


def _parse_price(price_text):
    """Return the first amount found in ``price_text`` as an int.

    Accepts plain digits ("950", "1250") as well as "," or "." thousands
    separators ("1,250", "1.250"). Raises ``ValueError`` when the text holds
    no digits at all.
    """
    match = re.search(r"\d{1,3}(?:[.,]\d{3})+|\d+", price_text)
    if match is None:
        raise ValueError("no numeric price found in %r" % price_text)
    return int(re.sub(r"[.,]", "", match.group()))


def _split_address(title_text):
    """Split a listing title into ``(street, house_number, house_addition)``.

    The house number is the first token after the leading one that starts
    with a digit, so multi-word street names stay intact. If no such token
    exists the second token is used. ``house_addition`` is the token that
    follows the house number, or ``None``. Raises ``ValueError`` when the
    title has fewer than two tokens.
    """
    tokens = title_text.split()
    if len(tokens) < 2:
        raise ValueError(
            "cannot split address %r into street and house number" % title_text
        )
    number_idx = next(
        (i for i, token in enumerate(tokens) if i > 0 and token[0].isdigit()),
        1,
    )
    street = " ".join(tokens[:number_idx])
    house_number = tokens[number_idx]
    house_addition = tokens[number_idx + 1] if len(tokens) > number_idx + 1 else None
    return street, house_number, house_addition


def _definition_list_to_dict(table):
    """Map the ``<dt>`` texts of a ``<dl>`` element to its ``<dd>`` texts.

    ``<dd>`` elements that themselves contain a ``<dd>`` are skipped so only
    the innermost values are paired with the keys.
    """
    keys = [dt.text for dt in table.find_all("dt")]
    values = [dd.text for dd in table.find_all("dd") if dd.find("dd") is None]
    return dict(zip(keys, values))


data_results = []

for result_link in result_links:

    listing_link = result_link

    # Open the English version of the page; leave links that already carry
    # the /en/ prefix alone so it is not inserted twice.
    if listing_link.startswith(_FUNDA_EN_ROOT):
        en_swapped_listing_link = listing_link
    else:
        en_swapped_listing_link = listing_link.replace(_FUNDA_ROOT, _FUNDA_EN_ROOT)
    # Called for navigation and the wait only; the page is re-parsed below
    # once the language has been switched.
    selenium_soup_get(driver, en_swapped_listing_link, "//span[@class='object-header__title']")

    # Select taal == english
    language_selector = Select(driver.find_element(By.XPATH, "//select[@id='langSwitch']"))
    language_selector.select_by_visible_text("English")

    # Give the page a moment to re-render, then parse the English text.
    sleep(1)
    result_soup = BeautifulSoup(driver.page_source)

    # Everything after the first "leiden" in the URL; maxsplit keeps the rest
    # of the slug intact when it contains "leiden" again (e.g. a street name).
    url_append = listing_link.split("leiden", 1)[1]
    # NOTE(review): assumes the listing id is the second "-"-separated piece
    # of the slug -- confirm against current Funda URLs.
    domain_id = url_append.split("-")[1]

    # Street/house number/addition
    house_info_text = result_soup.find_all("span", "object-header__title")[0].text
    street, house_number, house_addition = _split_address(house_info_text)

    # Neighbourhood/postal code
    postal_code_text = result_soup.find_all("span", "object-header__subtitle fd-color-dark-3")[0].text
    # First two whitespace-separated tokens joined together form the code.
    postal_code = "".join(postal_code_text.split()[:2])
    neighbourhood_obj = result_soup.find_all("div", "w-full pl-3 pr-16")[0]
    locale = neighbourhood_obj.find("span").text.replace(" ", "")

    # Ownership details
    # If table structure is not with header/body, assume singular table with different class tags
    table_headers_text = [
        header.text for header in result_soup.find_all("h3", "object-kenmerken-list-header")
    ]
    tables = result_soup.find_all("dl", "object-kenmerken-list")
    singular_table = len(table_headers_text) == 0

    # Area dwelling
    area_objects = result_soup.find_all("span", "kenmerken-highlighted__value fd-text--nowrap")
    if area_objects:
        area_dwelling = int(area_objects[0].text.replace(" m²", ""))
    else:
        area_dwelling = 0

    if singular_table:
        # Single feature table: treated as an immediately available rental.
        rent_buy = "Rent"
        available_date = datetime.now()
        price_table_dict = _definition_list_to_dict(tables[0])
    else:
        table_dict = dict(zip(table_headers_text, tables))
        price_table_dict = _definition_list_to_dict(table_dict['Transfer of ownership'])

        rent_buy = "Rent" if "Rental agreement" in price_table_dict else "Buy"

        if price_table_dict['Status'].replace("\n", "") == "Available":
            available_date = datetime.now()
        else:
            available_date = None

    # The key's trailing space is part of the page text and must be kept.
    rent_total = _parse_price(price_table_dict['Rental price '])

    listing_dict = {
        'url_append': url_append,
        'domain': domain,
        'domain_id': domain_id,
        'postal_code': postal_code,
        'street': street,
        'house_number': house_number,
        'house_addition': house_addition,
        'locale': locale,
        'rent_buy': rent_buy,
        'available_date': available_date,
        'area_dwelling': area_dwelling,
        'rent_total': rent_total,
    }

    data_results.append(listing_dict)

    # Pause between listing pages.
    sleep(10)


In [195]:
# `utils` is not imported by any other cell, so import it here to keep this
# cell runnable on a fresh kernel.
# NOTE(review): assumed to be a top-level `utils` module in PARENT_DIR (which
# the first cell puts on sys.path) -- confirm the module location.
import utils

# Hand the scraped listing dicts to the project's persistence helper.
utils.push_to_db(data_results)