In [1]:
# Import necessary libraries
import os
import requests as req
from bs4 import BeautifulSoup as bs
import re
import rasterio as rs
import numpy as np
import pandas as pd
import timeit

In [2]:
# Geopunt Flanders download pages for the 1 meter resolution elevation rasters,
# keyed by dataset type: DSM (surface model) and DTM (terrain model)
dataset_url = {
    "DSM": (
        "https://www.geopunt.be/download?container=dhm-vlaanderen-ii-dsm-raster-1m"
        "&title=Digitaal%20Hoogtemodel%20Vlaanderen%20II,%20DSM,%20raster,%201m"
    ),
    "DTM": (
        "https://www.geopunt.be/download?container=dhm-vlaanderen-ii-dtm-raster-1m"
        "&title=Digitaal%20Hoogtemodel%20Vlaanderen%20II,%20DTM,%20raster,%201m"
    ),
}

In [3]:
# File paths for the raw and processed metadata csv files
metadata_folder_path = os.path.join(os.getcwd(), 'data', 'metadata')
# Create the folder (and any missing parents). exist_ok=True makes this a no-op when the
# folder is already there and avoids the race of a separate exists() check before creating.
os.makedirs(metadata_folder_path, exist_ok=True)
# Initialize csv file paths, keyed by dataset type
metadata_raw_file_path = {}
metadata_processed_file_path = {}
for data_type in dataset_url.keys():
    # Raw files
    metadata_raw_file_path[data_type] = os.path.join(
        metadata_folder_path, data_type + '_GeoTIFF_1m_metadata_raw.csv')
    # Processed files
    metadata_processed_file_path[data_type] = os.path.join(
        metadata_folder_path, data_type + '_GeoTIFF_1m_metadata_processed.csv')

# Processed and merged csv file path
metadata_processed_merged_file_path = os.path.join(metadata_folder_path,
                                                   'GeoTIFF_1m_metadata_processed.csv')

In [4]:
# Links to the zip files that make up the datasets
dataset_zip_file_links = {}
for data_type in dataset_url.keys():  # Get the links from both datasets
    # Initialize the list for the links
    dataset_zip_file_links[data_type] = []
    try:
        # Request the download page; the timeout keeps a stalled connection
        # from blocking the notebook indefinitely
        r = req.get(dataset_url[data_type], timeout=60)
        # Fail on HTTP error statuses instead of parsing an error page
        r.raise_for_status()
        soup = bs(r.content, "html")
        # Find the element that contains the downloadable elements
        # (raises IndexError when the page has no such block)
        download_elem = soup.find_all("div", attrs={"class": "downloadfileblock"})[0]
        # Elements for the links to the zip files; href=True skips anchors
        # without a hypertext reference so no None ends up in the list
        for link_elem in download_elem.find_all("a", href=True):
            # Get the hypertext reference of the element for a single zip file link
            dataset_zip_file_links[data_type].append(link_elem.get("href"))
    except (req.exceptions.RequestException, IndexError) as err:
        # Only the expected failures (network/HTTP problems, missing download block)
        # are handled; anything else, including KeyboardInterrupt, propagates.
        print('Please enter the correct URLs for the datasets.')
        # Report which dataset failed and why, so the cause is not hidden
        print(f'{data_type}: {err!r}')

In [5]:
# Collect metadata from online GeoTIFF files in a Pandas DataFrame or load existing DataFrame
create_metadata_conditional = True
# Column layout of the raw metadata table; fixing it here keeps the csv header
# present even when no file could be accessed
metadata_columns = ['file_number', 'file_name', 'file_link', 'coord_system',
                    'bottom', 'top', 'left', 'right', 'height', 'width']
# Raw metadata DataFrames keyed by dataset type, filled by either branch below
metadata_df_raw = {}
# Start the timer before branching so the final summary works in both branches
start_time = timeit.default_timer()

if create_metadata_conditional:  # if True, create csv files with metadata
    for data_type in dataset_url.keys():  # for both datasets
        # Collect one dict per file and build the DataFrame once at the end
        # (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop)
        metadata_rows = []

        # Access each online GeoTIFF file
        for file_no, link in enumerate(dataset_zip_file_links[data_type]):
            # Get the zip file name from the link; raw string avoids invalid-escape
            # warnings and the escaped dot matches only a literal ".zip" suffix
            m = re.search(r"/(?P<zip_file_name>\w+)\.zip$", link)
            if m is None:
                # Skip links that do not point to a zip file instead of crashing
                print(f"Skipping {data_type} link with unexpected format: {link}")
                continue
            zip_file_name = m.group("zip_file_name")

            # Create the link for the online TIFF file
            tif_request = "zip+" + link + "!/GeoTIFF/" + zip_file_name + ".tif"

            # Access the file to get metadata; read everything needed while the
            # dataset is still open rather than touching it after the with block
            with rs.open(tif_request) as src:
                profile = src.profile
                bounds = src.bounds

            # Collect metadata (file_number is 1-based)
            metadata_rows.append({
                'file_number': file_no + 1,
                'file_name': zip_file_name + ".zip",
                'file_link': tif_request,
                'coord_system': str(profile['crs']),
                'bottom': bounds.bottom,
                'top': bounds.top,
                'left': bounds.left,
                'right': bounds.right,
                'height': profile['height'],
                'width': profile['width'],
            })

            # Display execution time
            stop_time = timeit.default_timer()  # stop the timer
            print(f"{data_type} file {file_no} accessed. Execution time: {stop_time-start_time:.2f}")

        # Build the DataFrame in one step and keep it available per dataset
        metadata_df = pd.DataFrame(metadata_rows, columns=metadata_columns)
        metadata_df_raw[data_type] = metadata_df

        # Save raw metadata to a csv file
        metadata_df.to_csv(metadata_raw_file_path[data_type], sep=',', index=False)

else:
    # load existing DataFrame
    for data_type in dataset_url.keys():  # for both datasets
        metadata_df_raw[data_type] = pd.read_csv(metadata_raw_file_path[data_type], sep=',', header=0)

# Total execution time
stop_time = timeit.default_timer()
print(f"All files accessed. Execution time: {stop_time-start_time:.2f}")

DSM file 0 accessed. Execution time: 20.89
DSM file 1 accessed. Execution time: 21.22
DSM file 2 accessed. Execution time: 39.60
DSM file 3 accessed. Execution time: 39.99
DSM file 4 accessed. Execution time: 113.66
DSM file 5 accessed. Execution time: 141.27
DSM file 6 accessed. Execution time: 141.64
DSM file 7 accessed. Execution time: 142.04
DSM file 8 accessed. Execution time: 142.44
DSM file 9 accessed. Execution time: 142.86
DSM file 10 accessed. Execution time: 168.55
DSM file 11 accessed. Execution time: 342.61
DSM file 12 accessed. Execution time: 343.04
DSM file 13 accessed. Execution time: 343.47
DSM file 14 accessed. Execution time: 343.85
DSM file 15 accessed. Execution time: 344.38
DSM file 16 accessed. Execution time: 344.76
DSM file 17 accessed. Execution time: 345.13
DSM file 18 accessed. Execution time: 345.52
DSM file 19 accessed. Execution time: 345.88
DSM file 20 accessed. Execution time: 346.23
DSM file 21 accessed. Execution time: 346.66
DSM file 22 accessed. Ex

In [6]:
# Read the raw metadata csv of every dataset back into a DataFrame, keyed by dataset type
metadata_df_raw_loaded = {
    data_type: pd.read_csv(metadata_raw_file_path[data_type], sep=',', header=0)
    for data_type in dataset_url.keys()
}

In [9]:
# Process raw metadata
def clean_metadata(df_in):
    """
    Cleans the GeoTIFF metadata DataFrame

    Casts file_number, height and width to unsigned integers, moves file_number
    into the index and puts the remaining columns in a fixed order. The input
    DataFrame is not modified.

    :param df_in: A Pandas DataFrame with metadata; must contain the columns
                  file_number, file_name, file_link, coord_system, height, width,
                  bottom, top, left and right
    :return: A cleaned DataFrame indexed by file_number
    :raises KeyError: if one of the required columns is missing
    """
    uint8_max = 255
    # Change data types for the file_number, height and width to unsigned integers
    # (astype returns a new DataFrame, so df_in stays untouched)
    df_out = df_in.astype({'file_number': 'uint32', 'height': 'uint32', 'width': 'uint32'})
    # Narrow file_number to uint8 only when every value fits; casting larger values
    # straight to uint8 would silently wrap around (e.g. 300 -> 44).
    # "not (max > limit)" also keeps uint8 for an empty frame, where max() is NaN.
    if not (df_out['file_number'].max() > uint8_max):
        df_out = df_out.astype({'file_number': 'uint8'})
    # Set file_number as the index
    df_out = df_out.set_index('file_number', drop=True)
    # Change the order of the columns
    columns = ['file_name', 'file_link', 'coord_system', 'height', 'width',
               'bottom', 'top', 'left', 'right']
    return df_out[columns]

# Clean the raw metadata of every dataset, keyed by dataset type
metadata_df_processed = {}
for data_type in dataset_url.keys():  # for both datasets
    metadata_df_processed[data_type] = clean_metadata(metadata_df_raw_loaded[data_type])

# Save processed metadata to a csv file
# clean_metadata moves file_number into the index, so the index has to be written;
# with index=False the file_number column was silently dropped from the csv
for data_type in dataset_url.keys():  # for both datasets
    metadata_df_processed[data_type].to_csv(metadata_processed_file_path[data_type], sep=',', index=True)

In [10]:
# Compare the contents of metadata for DSM and DTM files
df_DSM = metadata_df_processed['DSM']
df_DTM = metadata_df_processed['DTM']
# Check if columns other than file_name and file_link are equal
datasets_match_conditional = df_DSM.iloc[:, 2:].equals(df_DTM.iloc[:, 2:])
if not datasets_match_conditional:
    # The merged table takes these shared columns from the DSM frame only,
    # so a mismatch must not go unnoticed
    print("Warning: DSM and DTM metadata differ; the merged table uses the DSM values.")
# Rename columns file_name and file_link so both datasets can share one table
df_DSM = df_DSM.rename(columns={'file_name': 'dsm_file_name', 'file_link': 'dsm_file_link'})
df_DTM = df_DTM.rename(columns={'file_name': 'dtm_file_name', 'file_link': 'dtm_file_link'})
# Merge the DataFrames on their index: all DSM columns plus the two DTM file columns.
# The full table is kept here; a previous .head() truncated it to five rows,
# which then also truncated the csv written from df_merged.
df_merged = pd.concat([df_DSM, df_DTM[['dtm_file_name', 'dtm_file_link']]], axis=1, join="inner")
# Change column order: file columns of both datasets first, then the shared metadata
columns = ['dsm_file_name', 'dsm_file_link', 'dtm_file_name', 'dtm_file_link',
           'coord_system', 'height', 'width', 'bottom', 'top', 'left', 'right']
df_merged = df_merged[columns]
# Preview only; df_merged itself stays complete
df_merged.head()

Unnamed: 0_level_0,dsm_file_name,dsm_file_link,dtm_file_name,dtm_file_link,coord_system,height,width,bottom,top,left,right
file_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,DHMVIIDSMRAS1m_k01.zip,zip+https://downloadagiv.blob.core.windows.net...,DHMVIIDTMRAS1m_k01.zip,zip+https://downloadagiv.blob.core.windows.net...,EPSG:31370,9000,17000,238000.0,247000.0,145000.0,162000.0
2,DHMVIIDSMRAS1m_k02.zip,zip+https://downloadagiv.blob.core.windows.net...,DHMVIIDTMRAS1m_k02.zip,zip+https://downloadagiv.blob.core.windows.net...,EPSG:31370,12000,32000,238000.0,250000.0,162000.0,194000.0
3,DHMVIIDSMRAS1m_k03.zip,zip+https://downloadagiv.blob.core.windows.net...,DHMVIIDTMRAS1m_k03.zip,zip+https://downloadagiv.blob.core.windows.net...,EPSG:31370,10000,12000,238000.0,248000.0,194000.0,206000.0
4,DHMVIIDSMRAS1m_k04.zip,zip+https://downloadagiv.blob.core.windows.net...,DHMVIIDTMRAS1m_k04.zip,zip+https://downloadagiv.blob.core.windows.net...,EPSG:31370,7500,13000,218000.0,225500.0,53000.0,66000.0
5,DHMVIIDSMRAS1m_k05.zip,zip+https://downloadagiv.blob.core.windows.net...,DHMVIIDTMRAS1m_k05.zip,zip+https://downloadagiv.blob.core.windows.net...,EPSG:31370,14000,32000,218000.0,232000.0,66000.0,98000.0


In [11]:
# Save processed and merged metadata to a csv file
# df_merged is indexed by file_number (see the preview above), so the index is written
# as well; index=False would drop the file numbers from the csv
df_merged.to_csv(metadata_processed_merged_file_path, sep=',', index=True)