In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.webdriver import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

## Reading the used geojson-file's coordinates into Python

In [2]:
!pip install geojson

Defaulting to user installation because normal site-packages is not writeable


In [3]:
# Source for code: https://stackoverflow.com/questions/42753745/how-can-i-parse-geojson-with-python
import geojson
# Change this directory after with open to your relevant directory
with open("C:/Users/viggo/Downloads/GitHub_repos/roetersstraat/abstreet/geojson.json") as square:
    model_boundaries = geojson.load(square)
# Saving the features of the geojson file to a variable
features = model_boundaries['features']

# Only saving the coordinates of the geojson
coordinates_geojson = features[0]["geometry"]["coordinates"][0]
coordinates_geojson

[[4.901983, 52.369094],
 [4.901983, 52.359875],
 [4.923408, 52.359875],
 [4.923408, 52.369094],
 [4.901983, 52.369094]]

In [4]:
# Due to some coordinates appearing twice in the geojson coordinates variable, only unique coordinates will be saved
geojson_coordinates = []
for coordinates in coordinates_geojson:
    if coordinates not in geojson_coordinates:
        geojson_coordinates.append(coordinates)

geojson_coordinates

[[4.901983, 52.369094],
 [4.901983, 52.359875],
 [4.923408, 52.359875],
 [4.923408, 52.369094]]

## Getting exact locations from the Kadasterregister map corresponding to coordinates from geojson

Following a process of pinpointing the geojson coordinates on a Google My Maps-file and creating the square there as well, these coordinates where pinpointed as accurately as humanly possible on the map on the Kadasterregister website (available here: https://bagviewer.kadaster.nl/lvbag/bag-viewer/?zoomlevel=1). This website does not utilise coordinates at all and uses a seperate x-y-axis system. THe following values got out of this system after pinpointing the locations:

In [5]:
# Getting the Kadasterregister x and y-values that are relevant for the geojson-file
bag_coordinates = [[121946.73, 485884.65], [123429.29, 485895.45], [123414.96, 486899.86], [121942.62, 486918.21]]
bag_coordinate = [[121942.00, 485885.00], [123430.00, 486919.00]]

In [6]:
# Setting x and y-values for beginning and ending values
x_begin = 121942.00
x_end = 123430.00
y_begin = 485885.00
y_end = 486919.00

# Setting x_now and y_now
x_now = x_begin
y_now = y_begin

In [7]:
# Looping over all values in between x_begin and x_end and y_begin and y_end, where it loops over all y-values in steps of 1.00 for every x-value
while x_now <= x_end:
    while y_now <= y_end:
        print([x_now, y_now])

        y_now += 3.5

    y_now = y_begin
    x_now += 3.5

[121942.0, 485885.0]
[121942.0, 485888.5]
[121942.0, 485892.0]
[121942.0, 485895.5]
[121942.0, 485899.0]
[121942.0, 485902.5]
[121942.0, 485906.0]
[121942.0, 485909.5]
[121942.0, 485913.0]
[121942.0, 485916.5]
[121942.0, 485920.0]
[121942.0, 485923.5]
[121942.0, 485927.0]
[121942.0, 485930.5]
[121942.0, 485934.0]
[121942.0, 485937.5]
[121942.0, 485941.0]
[121942.0, 485944.5]
[121942.0, 485948.0]
[121942.0, 485951.5]
[121942.0, 485955.0]
[121942.0, 485958.5]
[121942.0, 485962.0]
[121942.0, 485965.5]
[121942.0, 485969.0]
[121942.0, 485972.5]
[121942.0, 485976.0]
[121942.0, 485979.5]
[121942.0, 485983.0]
[121942.0, 485986.5]
[121942.0, 485990.0]
[121942.0, 485993.5]
[121942.0, 485997.0]
[121942.0, 486000.5]
[121942.0, 486004.0]
[121942.0, 486007.5]
[121942.0, 486011.0]
[121942.0, 486014.5]
[121942.0, 486018.0]
[121942.0, 486021.5]
[121942.0, 486025.0]
[121942.0, 486028.5]
[121942.0, 486032.0]
[121942.0, 486035.5]
[121942.0, 486039.0]
[121942.0, 486042.5]
[121942.0, 486046.0]
[121942.0, 48

## Getting relevant addresses to scrape from Kadasterregister

I will run a Selenium webbrowser and click every building that fits inside our geojson square. Every new building being clicked, equals a new URL being loaded. All these URLs will be saved into a list and later a CSV-file, so these can be used later wehn scraping all these building blocks or seperate buildings individually.

I have looked into opportunities to scrape this, but this was made extremely difficult by the relevant website and, when finally figured out, requires brute-forcing over 120000 seperate coordinate combos, which each will take at least 7 consecutive days to complete (which is not feasible).

In [8]:
# Setting up a list to track the visited URLs and a set with all object_ids seen so far
visited_urls = []
unique_object_ids = set()

In [9]:
# Thanks ChatGPT for helping me out with this code
driver_manual = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver_manual.get("https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012171532&theme=BRT+Achtergrond&geometry.x=122598.07182599524&geometry.y=486194.1180860803&zoomlevel=11.03314400030912&bijbehorendeAdressen=Pand")

try:
    # Making sure everything still works the way it should
    while True:
        # Getting the current URL
        current_url = driver_manual.current_url

        # Extracting the objectId using string operations when "objectId=" is present in the URL
        if "objectId=" in current_url:
            # Getting the start index of the objectID
            start_idx = current_url.index("objectId=") + len("objectId=")
            # Finding the end index of the objectID
            end_idx = current_url.find("&", start_idx)
            # Handling the case where objectId is the last parameter
            if end_idx == -1: 
                end_idx = len(current_url)
            
            # Getting the current objectID by using the previously found start index and end index of the objectID
            current_object_id = current_url[start_idx:end_idx]

            # Adding the current objectID the set of unique objectIDs if it is a unique one and if it was actually found
            if current_object_id and current_object_id not in unique_object_ids:
                unique_object_ids.add(current_object_id)

                # Adding the URL to the visited URLs list
                visited_urls.append(current_url)

        # Sleeping for a short time to avoid overwhelming the browser
        time.sleep(1)

except KeyboardInterrupt:
    # Stopping the loop gracefully when interrupted
    print("Stopped monitoring.")
    print(f"Visited URLs: {visited_urls}")

finally:
    driver_manual.quit()

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=130.0.6723.117)
Stacktrace:
	GetHandleVerifier [0x006FEBD3+24307]
	(No symbol) [0x00688D74]
	(No symbol) [0x0056C323]
	(No symbol) [0x0054E00B]
	(No symbol) [0x005D5F6F]
	(No symbol) [0x005E8EC9]
	(No symbol) [0x005CFC26]
	(No symbol) [0x005A219C]
	(No symbol) [0x005A311D]
	GetHandleVerifier [0x009A8D93+2818227]
	GetHandleVerifier [0x00A0542E+3196750]
	GetHandleVerifier [0x009FD9D2+3165426]
	GetHandleVerifier [0x0079DA70+675216]
	(No symbol) [0x00691B3D]
	(No symbol) [0x0068EA18]
	(No symbol) [0x0068EBB5]
	(No symbol) [0x00681640]
	BaseThreadInitThunk [0x766E7BA9+25]
	RtlInitializeExceptionChain [0x7724C0CB+107]
	RtlClearBits [0x7724C04F+191]


In [10]:
print(visited_urls)

['https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012171532&theme=BRT%20Achtergrond&geometry.x=122598.07182599524&geometry.y=486194.1180860803&zoomlevel=11.03314400030912&bijbehorendeAdressen=Pand', 'https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100100091821&theme=BRT+Achtergrond&geometry.x=122451.49133918143&geometry.y=486232.7990251022&zoomlevel=14.984798429615658&bijbehorendeAdressen=Pand', 'https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363010000641658&theme=BRT+Achtergrond&geometry.x=122476.17445631293&geometry.y=486277.4703121955&zoomlevel=14.974878963647985', 'https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012247432&theme=BRT+Achtergrond&geometry.x=122499.95163649452&geometry.y=486282.70107565844&zoomlevel=12.990125267327986&bijbehorendeAdressen=Pand', 'https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012170419&theme=BRT+Achtergrond&geometry.x=122469.9885&geometry.y=486307.163&zoomlevel=13&bijbehorendeAdre

In [None]:
# Exporting the list to a csv-file, so the result will be saved locally and will not be lost
df_urls = pd.DataFrame(visited_urls)
urls_csv = df_urls.to_csv("visited_urls.csv", index=False)

In [None]:
# Importing the csv-file and reloading it into a list if necesserry
df_urls_d = pd.read_csv("visited_urls.csv")
urls_list = df_urls_d["0"].values.tolist()
urls_list

## Scraping addresses from the Kadasterregister

In [11]:
# Creating a list with URLs to scrape addresses from
# url_list = ["https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012181099&theme=BRT+Achtergrond&geometry.x=122518.23449999999&geometry.y=486193.337&zoomlevel=15&bijbehorendeAdressen=Pand", "https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012181081&theme=BRT+Achtergrond&geometry.x=122485.011&geometry.y=486135.5375&zoomlevel=15&bijbehorendeAdressen=Pand", "https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100012253901&theme=BRT+Achtergrond&geometry.x=122536.796&geometry.y=486176.6535&zoomlevel=15&bijbehorendeAdressen=Pand", "https://bagviewer.kadaster.nl/lvbag/bag-viewer/?searchQuery=roetersstraat&objectId=0363010000849823&theme=BRT+Achtergrond&geometry.x=122532.66&geometry.y=486132.4435&zoomlevel=15"]
url_list = visited_urls
# Creating an empty list to store the addresses and latitudes and longitudes in
all_scraped_addresses = []

In [None]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

for i in url_list:
    # Getting url from url_list
    driver.get(i)
    
    try:
        # Fetching the sidebar where all address information is present
        sidebar = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//app-root//app-sidebar"))
        )

        # Fetching the elements where the name of the single building or building block is located 
        elements_h1 = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, "//app-root//app-sidebar//h1"))
        )

        # Running a loop to see if this concerns a building block or seperate building
        # For this part of the loop, it concerns a building block, due to every building block starting with "Pand ID"
        if elements_h1[0].text.strip().startswith("Pand ID"):

            # Find the element mentioning the amount of buildings in that block
            amount_buildings = sidebar.find_elements(By.XPATH, ".//h2")
        
            # Create a variable containing only a number for the amount of buildings in the block
            if len(amount_buildings) >= 2:
                amount_addresses = amount_buildings[1].text
                number_addresses = int(amount_addresses.split()[0])

            # Fetching the addresses if the number of addresses is less than or equal to 5    
            if number_addresses <= 5:
                # Locating the addresses in the sidebar
                buttons_houses = sidebar.find_elements(By.XPATH, "//app-root//app-sidebar//sidebar-overzicht//ul//li//a//span")

                # Adding the addresses to the list into seperate sublists
                for button in buttons_houses:
                    address_text = button.text
                    all_scraped_addresses.append([address_text])
    
            # Fetching the addresses if the number of addresses is bigger than 5
            elif number_addresses > 5:
                # Locating the text connected to the button showing all buildings in the block
                button_more_houses = sidebar.find_elements(By.XPATH, ".//a//span")

                if len(button_more_houses) >= 9:
                    # Clicking the button connected to the previously mentioned text
                    parent_button = button_more_houses[8].find_element(By.XPATH, "./parent::a")
                    parent_button.click()
        
                # Locating the addresses within the sidebar
                houses_buttons = sidebar.find_elements(By.XPATH, "//app-root//app-sidebar-left//table//tbody//tr//ul//li")
        
                # Adding the addresses into to the list into seperate sublists
                for button in houses_buttons:
                    address_text = button.text
                    all_scraped_addresses.append([address_text])

        # If the elements_h1[0] part does not start with "Pand ID", then it concerns a seperate building, where just the title of the building is enough
        else:
            all_scraped_addresses.append([elements_h1[0].text])
        
        # Pause the search for a bit, so to not overwhelm the website
        time.sleep(1)

    # If no addresses were found or another error occured, the following code will be printed, so to not interrupt the other URLs
    # This will happen quite often if there are buildings which have no address attached to them, such as with: https://bagviewer.kadaster.nl/lvbag/bag-viewer/?objectId=0363100100091821&theme=BRT+Achtergrond&geometry.x=122451.49133918143&geometry.y=486232.7990251022&zoomlevel=14.984798429615658&bijbehorendeAdressen=Pand 
    except Exception as e:
        print(f"Error {e}")

driver.quit()

Error invalid literal for int() with base 10: 'Pand'
Error invalid literal for int() with base 10: 'Pand'


In [13]:
print(all_scraped_addresses)

[['Sarphatistraat 4, Amsterdam'], ['Sarphatistraat 6, Amsterdam'], ['Korte Lepelstraat 81, Amsterdam'], ['Nieuwe Prinsengracht 88-1, Amsterdam'], ['Nieuwe Prinsengracht 88-2, Amsterdam'], ['Nieuwe Prinsengracht 88-3, Amsterdam'], ['Nieuwe Prinsengracht 88 H, Amsterdam']]


## Using geopy to fill the list with further info needed for each building 

In [None]:
!pip install geopy

In [None]:
# Test for using geopy on the dataset
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="scraping_addresses.ipynb")
location = geolocator.geocode("Roetersstraat 170, Amsterdam")
print(location.address)
print((location.latitude, location.longitude))
print(location.raw)

In [None]:
# Looping over all previously added addresses and getting their relevant latitudes, longitudes and function
# Source for code: https://stackoverflow.com/questions/5807195/how-to-get-coordinates-of-address-from-python
for sublist in all_scraped_addresses:
    address = sublist[0]
    try:
        location = geolocator.geocode(address)
        if location:
            # Add latitude, longitude and function to the sublist
            sublist.append(location.latitude)
            sublist.append(location.longitude)
            sublist.append(location.raw["type"])
            # Adding the name of the building if it is present, otherwise add None to the sublist
            if location.raw["name"] == "":
                sublist.append(None)
            else:
                sublist.append(location.raw["name"])
        else:
            # Appending None if the location is not found
            sublist.append(None)
            sublist.append(None)
            sublist.append(None)
            sublist.append(None)
        
        # Adding a sleep time to comply towards usage policies
        time.sleep(1)


    except Exception as e:
        print(f"Error occurred for address {address}: {e}")
        

In [None]:
print(all_scraped_addresses)