In [31]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.webdriver import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import numpy as np

In [32]:
# BAG viewer pages to scrape, one entry per building block
url_list = [
    "https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012181099&theme=BRT+Achtergrond&geometry.x=122518.23449999999&geometry.y=486193.337&zoomlevel=15&bijbehorendeAdressen=Pand",
    "https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012181081&theme=BRT+Achtergrond&geometry.x=122485.011&geometry.y=486135.5375&zoomlevel=15&bijbehorendeAdressen=Pand",
    "https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012253901&theme=BRT+Achtergrond&geometry.x=122536.796&geometry.y=486176.6535&zoomlevel=15&bijbehorendeAdressen=Pand",
]

# Accumulator for the results: starts empty and receives one sub-list per scraped address
all_scraped_addresses = list()

In [33]:
# Scrape the address labels shown in the BAG viewer for every URL in url_list.
# Each address is appended to all_scraped_addresses as a single-element list.
# NOTE: this cell only appends, so re-running it without re-creating
# all_scraped_addresses first will add every address a second time.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    for url in url_list:
        # Navigation is deliberately outside the per-URL handler: a failing
        # driver.get still aborts the run, but the outer `finally` makes sure
        # the browser is closed in that case too.
        driver.get(url)

        try:
            # Fetching the sidebar where all address information is present
            sidebar = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//app-root//app-sidebar"))
            )

            # The second <h2> in the sidebar starts with the number of addresses in the block
            headings = sidebar.find_elements(By.XPATH, ".//h2")
            if len(headings) < 2:
                # Without this guard the count from the previous URL would be reused
                # (or a NameError raised on the very first URL).
                raise ValueError(f"could not find the address count in the sidebar for {url}")
            number_addresses = int(headings[1].text.split()[0])

            if number_addresses <= 5:
                # Up to 5 addresses are read straight from the sidebar overview.
                # An XPath starting with "//" is evaluated against the whole document,
                # even when it is called on an element.
                address_elements = sidebar.find_elements(
                    By.XPATH, "//app-root//app-sidebar//sidebar-overzicht//ul//li//a//span"
                )
            else:
                address_xpath = "//app-root//app-sidebar-left//table//tbody//tr//ul//li"

                # Locating the text connected to the button showing all buildings in the block
                link_labels = sidebar.find_elements(By.XPATH, ".//a//span")

                if len(link_labels) >= 9:
                    # NOTE(review): index 8 is assumed to be the "show all" link - confirm
                    # against the page, this breaks if the sidebar layout changes.
                    link_labels[8].find_element(By.XPATH, "./parent::a").click()

                    # Wait for the list to be rendered instead of reading it right after the click
                    address_elements = WebDriverWait(driver, 10).until(
                        EC.presence_of_all_elements_located((By.XPATH, address_xpath))
                    )
                else:
                    address_elements = sidebar.find_elements(By.XPATH, address_xpath)

            # Adding the addresses to the list as separate sub-lists
            for element in address_elements:
                all_scraped_addresses.append([element.text])

        except Exception as e:
            print(f"Error {e}")
        finally:
            # Pause between pages, also after a failure, so as not to overwhelm the website
            time.sleep(1)
finally:
    # Always close the browser, even when the loop is interrupted
    driver.quit()

In [34]:
# Show everything collected so far; each row is appended by the scraping cell as a single-element list [address]
print(all_scraped_addresses)

[['Nieuwe Achtergracht 142 A1, Amsterdam'], ['Nieuwe Achtergracht 142 A2, Amsterdam'], ['Nieuwe Achtergracht 142 A3, Amsterdam'], ['Nieuwe Achtergracht 142 A4, Amsterdam'], ['Nieuwe Achtergracht 142 B1, Amsterdam'], ['Nieuwe Achtergracht 142 B2, Amsterdam'], ['Nieuwe Achtergracht 142 B3, Amsterdam'], ['Nieuwe Achtergracht 142 B4, Amsterdam'], ['Nieuwe Achtergracht 142 C1, Amsterdam'], ['Nieuwe Achtergracht 142 C2, Amsterdam'], ['Nieuwe Achtergracht 142 C3, Amsterdam'], ['Nieuwe Achtergracht 142 C4, Amsterdam'], ['Nieuwe Achtergracht 142 D1, Amsterdam'], ['Nieuwe Achtergracht 142 D2, Amsterdam'], ['Nieuwe Achtergracht 142 D3, Amsterdam'], ['Nieuwe Achtergracht 142 D4, Amsterdam'], ['Roetersstraat 170, Amsterdam'], ['Valckenierstraat 29 A, Amsterdam'], ['Valckenierstraat 29 B, Amsterdam'], ['Valckenierstraat 29 C, Amsterdam'], ['Valckenierstraat 29 D, Amsterdam'], ['Valckenierstraat 29 E, Amsterdam'], ['Valckenierstraat 29 F, Amsterdam'], ['Valckenierstraat 29 G, Amsterdam'], ['Valckenie

In [35]:
!pip install geopy

Defaulting to user installation because normal site-packages is not writeable


In [36]:
# Test for using geopy on the dataset.
# `geolocator` is created here and reused by the geocoding loop further down.
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="scraping_addresses.ipynb")
location = geolocator.geocode("Roetersstraat 170\n1018 WE  AMSTERDAM")

# geocode() gives back None when nothing matches, so guard before reading attributes
if location is None:
    print("No geocoding result for the test address")
else:
    print(location.address)
    print((location.latitude, location.longitude))
    print(location.raw)

Kriterion, 170, Roetersstraat, Weesperbuurt, Centrum, Amsterdam, Noord-Holland, Nederland, 1018 WE, Nederland
(52.362493, 4.9106652)
{'place_id': 144946830, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'node', 'osm_id': 253130481, 'lat': '52.362493', 'lon': '4.9106652', 'class': 'amenity', 'type': 'cinema', 'place_rank': 30, 'importance': 0.2522941968270874, 'addresstype': 'amenity', 'name': 'Kriterion', 'display_name': 'Kriterion, 170, Roetersstraat, Weesperbuurt, Centrum, Amsterdam, Noord-Holland, Nederland, 1018 WE, Nederland', 'boundingbox': ['52.3624430', '52.3625430', '4.9106152', '4.9107152']}


In [37]:
# Looping over all scraped addresses and storing latitude, longitude, OSM "type" and "name"
# behind the address, so every row ends up as [address, latitude, longitude, type, name].
for sublist in all_scraped_addresses:
    address = sublist[0]

    # Values used when the address is not found or the lookup fails,
    # so that every row keeps the same length
    geocoded = [None, None, None, None]

    try:
        location = geolocator.geocode(address)
        if location:
            geocoded = [
                location.latitude,
                location.longitude,
                # .get() avoids a half-filled row when a key is absent from the raw result
                location.raw.get("type"),
                # An empty or missing name is stored as None
                location.raw.get("name") or None,
            ]
    except Exception as e:
        print(f"Error occurred for address {address}: {e}")
    finally:
        # Adding a sleep time to comply with usage policies, also after a failed request
        time.sleep(1)

    # Replace (not append) everything after the address, so re-running this cell
    # refreshes the columns instead of adding a second set
    sublist[1:] = geocoded
        

In [38]:
# Show the rows after geocoding; geocoded rows hold address, latitude, longitude, type and name
print(all_scraped_addresses)

[['Nieuwe Achtergracht 142 A1, Amsterdam', 52.3627638, 4.9102335, 'house', None], ['Nieuwe Achtergracht 142 A2, Amsterdam', 52.3627637, 4.9102337, 'house', None], ['Nieuwe Achtergracht 142 A3, Amsterdam', 52.3627636, 4.9102338, 'house', None], ['Nieuwe Achtergracht 142 A4, Amsterdam', 52.3627634, 4.910234, 'house', None], ['Nieuwe Achtergracht 142 B1, Amsterdam', 52.3627459, 4.9102483, 'house', None], ['Nieuwe Achtergracht 142 B2, Amsterdam', 52.3627458, 4.9102485, 'house', None], ['Nieuwe Achtergracht 142 B3, Amsterdam', 52.3627457, 4.9102486, 'house', None], ['Nieuwe Achtergracht 142 B4, Amsterdam', 52.3627456, 4.9102488, 'house', None], ['Nieuwe Achtergracht 142 C1, Amsterdam', 52.3627279, 4.9102485, 'house', None], ['Nieuwe Achtergracht 142 C2, Amsterdam', 52.3627278, 4.9102487, 'house', None], ['Nieuwe Achtergracht 142 C3, Amsterdam', 52.3627277, 4.9102488, 'house', None], ['Nieuwe Achtergracht 142 C4, Amsterdam', 52.3627276, 4.910249, 'house', None], ['Nieuwe Achtergracht 142 D1,