In [1]:
import json
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import utils

In [2]:
# Rental sites listed for this project. This notebook scrapes Kamernet only,
# so its URL is bound to a name instead of being picked out of the list by a
# positional index that would silently change if the list were reordered.
KAMERNET_DOMAIN = "https://kamernet.nl/en"

target_domains = [
    "https://www.room.nl/en/",
    KAMERNET_DOMAIN,
    "https://www.funda.nl/en/",
    "https://www.pararius.com/english"
]

# Domain value attached to every listing record built later in the notebook.
domain = KAMERNET_DOMAIN

In [3]:
# Kamernet search-results URL requested by the scraping cell below.
# Adjacent string literals are concatenated at compile time, so the value is
# one single URL; it is only split here so the query parameters are readable.
init_search_url = (
    "https://kamernet.nl/en/for-rent/rooms-leiden"
    "?radius=5&minSize=2&maxRent=0&searchview=1"
    "&typeAndCity=rooms-leiden&pageNo=1"
)

In [4]:
# Download the search-results page. A timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops here on an
# HTTP error instead of parsing an error page into zero listing cards.
r = requests.get(init_search_url, timeout=30)
r.raise_for_status()
resp_text = r.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(resp_text, "html.parser")

# Each search result is an <a> element whose class attribute matches this pattern.
listing_cards = soup.find_all('a', attrs = {'class': re.compile("^MuiTypography-root.*ListingCard.*")})

In [41]:
# Search payload sent to the findlistings endpoint, one filter per line.
# Values are reproduced as captured; note that "pageNo" is fixed at 2.
findlistings_payload = {
    "location": {"name": "Leiden", "cityName": "Leiden"},
    "radiusId": 4,
    "listingTypeIds": [],
    "maxRentalPriceId": 33,
    "surfaceMinimumId": 2,
    "listingSortOptionId": 1,
    "pageNo": 2,
    "suitableForGenderIds": [],
    "furnishings": [],
    "availabilityPeriods": [],
    "availableFromDate": None,
    "isBathroomPrivate": None,
    "isToiletPrivate": None,
    "isKitchenPrivate": None,
    "hasInternet": None,
    "suitableForNumberOfPersonsId": None,
    "candidateAge": None,
    "suitableForStatusIds": [],
    "isSmokingInsideAllowed": None,
    "isPetsInsideAllowed": None,
    "roommateMaxNumberId": None,
    "roommateGenderIds": [],
    "ownerTypeIds": [],
    "variant": None,
    "searchview": 1,
    "rowsPerPage": 18,
    "OpResponse": {"Code": 1000, "Message": "Operation successful.", "HttpStatusCode": 200},
    "LogEntryId": None,
}

# The timeout avoids an indefinitely hanging cell; raise_for_status() surfaces
# HTTP errors here rather than as a JSON decoding failure in a later cell.
post_http_resp = requests.post(
    "https://kamernet.nl/services/api/listing/findlistings",
    json=findlistings_payload,
    timeout=30,
)
post_http_resp.raise_for_status()

# Raw response body as text; it is decoded with json.loads in a later cell.
post_resp = post_http_resp.text

In [14]:
def month_enum(text):
    """Replace the English three-letter month abbreviation in ``text`` with its number.

    Example: ``"12 Jan 2024"`` becomes ``"12 1 2024"``.

    Args:
        text: String containing one of ``Jan`` .. ``Dec``. Matching is a
            case-sensitive substring test.

    Returns:
        ``text`` with every occurrence of the matched abbreviation replaced by
        the month number (not zero-padded).

    Raises:
        ValueError: If ``text`` contains none of the abbreviations. This is the
            same exception type ``datetime.strptime`` raises for unparseable
            dates, so callers see one error type for bad date text.
    """
    month_map = {
        "Jan": 1, "Feb": 2, "Mar": 3,
        "Apr": 4, "May": 5, "Jun": 6,
        "Jul": 7, "Aug": 8, "Sep": 9,
        "Oct": 10, "Nov": 11, "Dec": 12
    }
    # If several abbreviations occur, the earliest one in calendar order is
    # used (dict insertion order), not the one that appears first in the text.
    found_key = next((abbr for abbr in month_map if abbr in text), None)
    if found_key is None:
        raise ValueError("no month abbreviation (Jan..Dec) found in %r" % (text,))
    return text.replace(found_key, str(month_map[found_key]))


def month_convert(date_text):
    """Parse a ``"<day> <Mon> <year>"`` string into a ``datetime``.

    The month abbreviation is first swapped for its number by ``month_enum``,
    then the result is parsed with the ``"%d %m %Y"`` format.
    """
    return datetime.strptime(month_enum(date_text), "%d %m %Y")


def card_ids_get(listing_card):
    """Extract the listing id and the URL path from a listing card link.

    Args:
        listing_card: Object whose ``get('href')`` returns the card's link
            (the caller passes the ``<a>`` element of a listing card).

    Returns:
        dict with ``listingId`` (the run of digits at the very end of the
        URL, or ``''`` when the URL does not end in a digit) and
        ``url_append`` (the href with spaces replaced by hyphens).
    """
    card_url = listing_card.get('href').replace(" ", "-")
    # Raw string: "\d" in a plain literal is an invalid escape sequence, which
    # recent Python versions warn about and plan to reject.
    listing_id = re.search(r"\d*$", card_url).group()

    return {'listingId': listing_id, 'url_append': card_url}


def card_availability_range(card_info_div):
    """Extract the availability start and end dates shown on a listing card.

    The availability text is read from the second ``<p>`` inside
    ``card_info_div`` whose class matches
    ``MuiTypography-root MuiTypography-body2 mui-style-*``. Handled shapes:

    * text containing ``"From"`` and a full date -> start date only
    * two full dates (``"1 Jan 2024 - 1 Jun 2024"``) -> start and end
    * one full date plus ``"-"`` (``"15 Jan - 20 Feb 2024"``) -> the text
      before the full date is taken as the start day/month; it borrows the end
      date's year, or the previous year when the range wraps over New Year
      (``"15 Dec - 20 Jan 2024"``)
    * anything else, including a missing paragraph or a text without any full
      date -> both values ``None``

    Args:
        card_info_div: Element exposing ``find_all`` (the caller passes the
            card-content ``div`` of a listing card).

    Returns:
        dict with keys ``availabilityStartDate`` and ``availabilityEndDate``;
        each value is the result of ``month_convert`` or ``None``.

    Raises:
        Whatever ``month_convert`` raises when the start day/month text of the
        third shape cannot be parsed.
    """
    # Raw string avoids the invalid "\d" / "\s" escape warnings of a plain literal.
    full_date_extract_regex = r"(?:\d{1,2})\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s(?:\d{4})"
    body_paragraphs = card_info_div.find_all('p', attrs = {'class': re.compile("^MuiTypography-root MuiTypography-body2 mui-style-.*")})

    avail_start_date = None
    avail_end_date = None

    # Without a second matching paragraph there is nothing to parse; fall back
    # to the same "unknown" result used for unrecognised text.
    if len(body_paragraphs) > 1:
        availability_text = body_paragraphs[1].text
        full_dates = re.findall(full_date_extract_regex, availability_text)

        if "From" in availability_text:
            # Open-ended availability: only a start date is given.
            if full_dates:
                avail_start_date = month_convert(full_dates[0])
        elif len(full_dates) > 1:
            avail_start_date = month_convert(full_dates[0])
            avail_end_date = month_convert(full_dates[1])
        elif full_dates and "-" in availability_text:
            # Only the end date carries a year; rebuild the start date from the
            # day/month text that precedes it.
            avail_end_date_raw = full_dates[0]
            avail_end_date = month_convert(avail_end_date_raw)
            end_date_str_idx = availability_text.index(avail_end_date_raw)
            start_date_md = availability_text[:end_date_str_idx].strip("- ")
            avail_start_date = month_convert("%s %s" % (start_date_md, avail_end_date.year))
            if avail_start_date > avail_end_date:
                # A start after the end means the range crosses New Year, so
                # the start belongs to the previous year.
                avail_start_date = month_convert("%s %s" % (start_date_md, avail_end_date.year - 1))

    return {"availabilityStartDate": avail_start_date, "availabilityEndDate": avail_end_date}


def card_street_city(card_rows, street_city_idx: int = 0):
    """Read the street and city from one row of a listing card.

    The row at ``street_city_idx`` is searched for ``<h6>`` elements; the
    first is taken as the street and the second as the city. Leading and
    trailing commas are stripped from both texts.
    """
    headings = card_rows[street_city_idx].find_all('h6')
    street_heading, city_heading = headings[0], headings[1]
    return {
        "street": street_heading.text.strip(","),
        "city": city_heading.text.strip(","),
    }


def card_listing_details(card_rows, listing_details_idx: int = 1):
    """Read the detail paragraphs from one row of a listing card.

    The texts of the ``<p>`` elements in the row at ``listing_details_idx``
    are paired, in order, with the keys ``surfaceArea``, ``furnishingId`` and
    ``listingType``. The `` m²`` unit suffix is removed from every text.
    Paragraphs beyond the third are ignored; with fewer than three, the
    trailing keys are absent from the result.
    """
    detail_keys = ("surfaceArea", "furnishingId", "listingType")
    cleaned_values = []
    for paragraph in card_rows[listing_details_idx].find_all('p'):
        cleaned_values.append(paragraph.text.replace(" m²", ""))
    return dict(zip(detail_keys, cleaned_values))


def card_rent_value(card_rows, price_idx: int = 2):
    """Read the rent and the utilities flag from the price row of a listing card.

    Args:
        card_rows: Sequence of card row elements; the one at ``price_idx``
            must expose its text via ``.text``.
        price_idx: Position of the price row in ``card_rows``.

    Returns:
        dict with ``totalRentalPrice`` (int) and ``utilitiesIncluded`` (True
        when the text contains ``"incl. utilities"``).

    Raises:
        ValueError: If the price text contains no digits at all.
    """
    price_value = card_rows[price_idx].text
    util_incl = "incl. utilities" in price_value

    # Every run of digits in the text is concatenated, which drops thousands
    # separators ("1.250" -> 1250). Any other digits in the text (for example
    # decimals) would be concatenated as well.
    # Raw string avoids the invalid "\d" escape warning of a plain literal.
    digit_runs = re.findall(r"\d+", price_value)
    if not digit_runs:
        raise ValueError("no rental price digits found in %r" % (price_value,))
    price_num = int("".join(digit_runs))
    return {"totalRentalPrice": price_num, "utilitiesIncluded": util_incl}


def card_new_chip_check(listing_card):
    """Report whether a listing card carries exactly one chip label.

    The card is searched for ``<span>`` elements with the chip-label class
    string below; ``isNewAdvert`` is True only when exactly one is found.
    """
    chip_labels = listing_card.find_all(
        "span",
        attrs={"class": "MuiChip-label MuiChip-labelMedium mui-style-9iedg7"},
    )
    return {'isNewAdvert': len(chip_labels) == 1}

In [15]:
# Turn every listing card into one flat dict by merging the partial dicts
# returned by the card_* helpers (later helpers win on duplicate keys).
skinny_listing_data = []
for listing_card in listing_cards:
    # The content div holds the availability paragraph; its row divs hold
    # address, details and price.
    card_info_div = listing_card.find('div', attrs = {'class': re.compile("^MuiCardContent-root ListingCard_cardContent__.*")})
    card_rows = card_info_div.find_all('div', attrs = {'class': re.compile("^ListingCard_listingRo.*")})

    result_dict = {
        **card_ids_get(listing_card),
        **card_availability_range(card_info_div),
        **card_street_city(card_rows),
        **card_listing_details(card_rows),
        **card_rent_value(card_rows),
        **card_new_chip_check(listing_card),
    }
    skinny_listing_data.append(result_dict)

In [42]:
# Decode the JSON text held in post_resp into a Python dict.
post_resp_dict = json.loads(post_resp)
# Last expression of the cell: displays the top-level keys of the decoded response.
post_resp_dict.keys()

dict_keys(['listings', 'topAdListings', 'nearbyListings', 'total', 'OpResponse', 'LogEntryId'])

In [15]:
# Load the mapping from scraped field names to database column names.
# The with-block closes the file handle, which a bare open() never did.
with open("db_mappings/kamernet_mapping.json", encoding="utf-8") as mapping_f:
    db_mapping_dict = json.load(mapping_f)

key_converted_listing_data = []

for skinny_listing in skinny_listing_data:
    # Rename every scraped key through the mapping; a key that is missing
    # from the mapping file raises KeyError.
    key_swapped_listing = {db_mapping_dict[k]: v for k, v in skinny_listing.items()}
    # Record which site the listing was scraped from.
    key_swapped_listing['domain'] = domain
    key_converted_listing_data.append(key_swapped_listing)

In [17]:
# Hand the key-converted listings to the project-local utils.push_to_db helper.
# NOTE(review): presumably this writes the records to the database; its
# implementation is not shown in this notebook, so confirm before re-running.
utils.push_to_db(key_converted_listing_data)