In [60]:
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta
import utils
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import random

In [62]:
# Landing pages of the rental sites under consideration, in a fixed order.
target_domains = [
    "https://www.room.nl/en/",
    "https://kamernet.nl/en",
    "https://www.funda.nl/en/",
    "https://www.pararius.com/english",
]

# This notebook scrapes the last entry (Pararius); the value is stored with every record.
domain = target_domains[-1]

In [63]:
# Site root; listing links scraped from the search pages are relative to it.
root_url = "https://www.pararius.com"
# First page of the Leiden search results.
init_search_url = root_url + "/apartments/leiden"
# Template for a numbered results page; fill in with `search_url % page_number`.
search_url = init_search_url + "/page-%.0f"
n_listing_per_page = 30

In [64]:
# Start the Firefox session that every page fetch in this notebook reuses.
opts = webdriver.FirefoxOptions()
opts.add_argument("--enable-javascript")
driver = webdriver.Firefox(options=opts)

In [65]:
def selenium_soup_get(driver, url, test_element_class, load_delay: int = 10):
    """Load ``url`` in ``driver`` and return the rendered page as BeautifulSoup.

    Args:
        driver: Selenium WebDriver used to fetch the page.
        url: Address to navigate to.
        test_element_class: CSS class name of an element whose presence in the
            DOM signals that the page has finished rendering.
        load_delay: Maximum number of seconds to wait for that element.

    Returns:
        A BeautifulSoup object built from the driver's page source.

    Raises:
        selenium.common.exceptions.TimeoutException: if the element does not
            appear within ``load_delay`` seconds.
    """
    driver.get(url)

    # Honour the caller's timeout (it used to be overwritten with a fixed 10).
    test_element = EC.presence_of_element_located((By.CLASS_NAME, test_element_class))
    WebDriverWait(driver, load_delay).until(test_element)

    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # consider passing one explicitly to make parsing reproducible.
    soup = BeautifulSoup(driver.page_source)

    return soup

In [67]:
# Fetch the first results page; the total listing count is read from it below.
soup = selenium_soup_get(driver, init_search_url, test_element_class="search-list")

In [68]:
def get_single_page_listings(soup):
    """Collect the detail-page links of every listing on one results page.

    Args:
        soup: Parsed search-results page.

    Returns:
        A list with the ``href`` of the first anchor inside each listing's
        title heading, in page order.
    """
    item_class = "search-list__item search-list__item--listing"
    heading_class = "listing-search-item__title"

    hrefs = []
    for item in soup.find_all("li", attrs={"class": item_class}):
        heading = item.find("h2", attrs={"class": heading_class})
        anchor = heading.find("a")
        hrefs.append(anchor.get('href'))
    return hrefs


def get_total_listings_count(soup):
    """Read the total number of listings advertised in the results header.

    Only the digits of the header text are used, so grouped numbers such as
    "1,234" or "1.234" and surrounding whitespace are accepted.

    Args:
        soup: Parsed search-results page.

    Returns:
        The listing count as an int.

    Raises:
        ValueError: if the count element is missing or contains no digits.
    """
    listing_total_attrs = {"class": "search-list-header__count"}
    title_obj = soup.find("span", attrs=listing_total_attrs)
    if title_obj is None:
        raise ValueError("listing count element 'search-list-header__count' not found")

    digits = "".join(re.findall(r"\d+", title_obj.text))
    if not digits:
        raise ValueError("no listing count found in %r" % title_obj.text)
    return int(digits)


def get_all_page_listings(soup, driver):
    """Walk the paginated search results and collect every listing link.

    Pages are requested one after another (with a 2 second pause between them)
    until as many links as the advertised total have been gathered, or until
    a page yields no listings at all.

    Args:
        soup: Parsed first results page; only used to read the total count.
        driver: Selenium WebDriver used to fetch each results page.

    Returns:
        The de-duplicated listing links, in the order they were first seen.
    """
    total_listing_n = get_total_listings_count(soup)

    listing_count = 0
    page_i = 1

    all_listing_urls = []

    while listing_count < total_listing_n:
        fmted_search_url = search_url % page_i
        page_soup = selenium_soup_get(driver, fmted_search_url, "search-list")
        listing_urls = get_single_page_listings(page_soup)

        if not listing_urls:
            # The advertised total was never reached (e.g. results changed
            # while scraping); stop instead of requesting empty pages forever.
            break

        all_listing_urls += listing_urls

        sleep(2)

        listing_count += len(listing_urls)
        page_i += 1

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic across runs.
    return list(dict.fromkeys(all_listing_urls))

In [69]:
# Gather the listing links from every results page, then drop any repeats.
all_listings = get_all_page_listings(soup=soup, driver=driver)
all_listings_unique = list(set(all_listings))

In [70]:
def _parse_posted_date(posted_value, now):
    """Turn the "posted" text of a listing into a datetime.

    Accepts a relative age containing "week(s)" or "month(s)" with a number
    anywhere in the text (e.g. "3 weeks", "4+ months"), otherwise expects an
    absolute "dd-mm-YYYY" date. Raises ValueError if neither form matches.
    """
    count_match = re.search(r"\d+", posted_value)
    if count_match and "week" in posted_value:
        return now - timedelta(weeks=int(count_match.group()))
    if count_match and "month" in posted_value:
        # timedelta has no month unit; a month is approximated as 4.33 weeks.
        weeks_constant = 4.33
        return now - timedelta(weeks=int(count_match.group()) * weeks_constant)
    return datetime.strptime(posted_value.strip(), "%d-%m-%Y")


def _parse_available_date(available_text, now):
    """Turn the availability text of a listing into a datetime, or None.

    "From dd-mm-YYYY" gives that date; "Immediately" and "In consultation"
    both map to ``now``; any other text yields None.
    """
    available_text = available_text.strip()
    if "From" in available_text:
        return datetime.strptime(available_text.split(" ")[1], "%d-%m-%Y")
    if available_text in ("Immediately", "In consultation"):
        return now
    return None


all_listing_dicts = []

# Positions of each field among the main-description spans of the features
# list. Assumes the page always renders them in this order -- TODO confirm.
price_idx = 0
posted_idx = 1
rent_buy_idx = 2
available_idx = 3
furnished_idx = 4

for listing_url_append in all_listings_unique:

    url_portions = listing_url_append.split("/")
    # Project entries are skipped; checking before the fetch avoids spending
    # a page load (and a rate-limit slot) on them.
    if url_portions[2] == "project":
        continue

    listing_full_url = root_url + listing_url_append
    listing_soup = selenium_soup_get(driver, listing_full_url, "listing-detail-summary__title")

    listing_dict = {}

    # Get street/city (taken from the URL segments, not from the page body)
    listing_dict['url_append'] = listing_url_append
    listing_dict['domain_id'] = url_portions[3]
    listing_dict['street'] = url_portions[-1].capitalize()
    listing_dict['locale'] = url_portions[2].capitalize()

    # Get postal code: the text before " (" with its spaces removed.
    # NOTE(review): the part in parentheses looks like a district name; it was
    # parsed but never stored, so it is not extracted here -- confirm whether
    # the database should receive it.
    postal_code_attrs = {"class": "listing-detail-summary__location"}
    postal_code_raw_str = listing_soup.find("div", attrs=postal_code_attrs).text
    listing_dict['postal_code'] = postal_code_raw_str.split(" (")[0].replace(" ", "")

    # Get transfer info
    transfer_info_attrs = {"class": "listing-features__list"}
    transfer_info_obj = listing_soup.find("dl", attrs=transfer_info_attrs)
    transfer_lineitem_attrs = {"class": "listing-features__main-description"}
    transfer_lineitem_objs = transfer_info_obj.find_all("span", attrs=transfer_lineitem_attrs)
    transfer_lineitem_raw_texts = [x.text for x in transfer_lineitem_objs]

    # Price sub-description: stored as a 1/0 flag for whether it says "Includes".
    price_obj_attrs = {"class": "listing-features__description listing-features__description--for_rent_price"}
    price_obj = transfer_info_obj.find("dd", attrs=price_obj_attrs)
    sub_price_obj_attrs = {"class": "listing-features__sub-description"}
    sub_price_obj = None if price_obj is None else price_obj.find("ul", attrs=sub_price_obj_attrs)
    if sub_price_obj is not None:
        sub_price_text = sub_price_obj.find("li").text
        listing_dict['additional_costs'] = 1 if "Includes" in sub_price_text else 0

    # An explicit service-costs amount, when present, replaces the flag above.
    # NOTE(review): flag and amount share one key -- confirm the intended meaning.
    service_costs_attrs = {"class": "listing-features__description listing-features__description--service_costs"}
    service_costs_obj = transfer_info_obj.find("dd", attrs=service_costs_attrs)
    if service_costs_obj is not None:
        service_costs_raw = service_costs_obj.find("span").text
        service_costs_str = "".join(re.findall(r"\d+", service_costs_raw))
        listing_dict['additional_costs'] = int(service_costs_str)

    # Rent: every digit in the price text, concatenated.
    price_value = transfer_lineitem_raw_texts[price_idx]
    listing_dict['rent_total'] = int("".join(re.findall(r"\d+", price_value)))

    now = datetime.now()

    posted_value = transfer_lineitem_raw_texts[posted_idx]
    listing_dict['publish_date'] = _parse_posted_date(posted_value, now)

    # Unrecognised availability text now yields None rather than raising a
    # NameError or silently reusing the previous listing's date.
    available_date = transfer_lineitem_raw_texts[available_idx]
    listing_dict['available_date'] = _parse_available_date(available_date, now)

    furnished_info = transfer_lineitem_raw_texts[furnished_idx]
    listing_dict['additional_info'] = [furnished_info]

    # Get dimensions info: the first space-separated token of the surface area text.
    surface_area_attrs = {"class": "listing-features__description listing-features__description--surface_area"}
    surface_area_obj = listing_soup.find("dd", attrs=surface_area_attrs)
    surface_area_text = surface_area_obj.find("span").text.split(" ")[0]
    listing_dict['area_dwelling'] = int(surface_area_text)

    # Dwelling type check (optional on the page)
    dwelling_type_attrs = {"class": "listing-features__description listing-features__description--dwelling_type"}
    dwelling_obj = listing_soup.find("dd", attrs=dwelling_type_attrs)
    if dwelling_obj is not None:
        listing_dict['dwelling_type'] = dwelling_obj.find("span").text

    # Property type check (optional on the page)
    property_type_attrs = {"class": "listing-features__description listing-features__description--property_types"}
    property_obj = listing_soup.find("dd", attrs=property_type_attrs)
    if property_obj is not None:
        listing_dict['building_Type'] = property_obj.find("span").text

    # Only default the key when nothing was parsed; assigning None
    # unconditionally used to throw away the values gathered above.
    listing_dict.setdefault("additional_costs", None)
    listing_dict["additional_info"] = ";".join(listing_dict["additional_info"])
    listing_dict["domain"] = domain
    all_listing_dicts.append(listing_dict)

    # Random pause between listing pages to avoid hammering the site.
    sleep(random.randint(2, 10))

# TODO: also scrape each listing's full long description (?)

In [10]:
# Hand the collected listing records to the project's `utils.push_to_db` helper
# (presumably writes them to the database -- verify against utils).
utils.push_to_db(all_listing_dicts)