# Ingesting and Filtering OSM Data to GeoParquet format

This notebook automates the process of extracting specific features (buildings, restaurants, bus stops) for multiple cities (Pinerolo, Milan, Rome) from a large OpenStreetMap PBF file.

It uses `quackosm` for efficient filtering and saves the output as optimized `.geoparquet` files, ready for the benchmark scripts.

**Required libraries:** `quackosm`, `osmnx`, `pathlib`.

In [1]:
import os
import urllib.request
import quackosm
import osmnx as ox
from pathlib import Path

In [2]:
# Define Paths and URLs
# NOTE: PROJECT_ROOT is defined to display relative paths in the output, protecting user privacy.
WORKING_ROOT = Path('.').resolve().parent
PROJECT_ROOT = WORKING_ROOT.parent
RAW_DATA_DIR = WORKING_ROOT / 'data' / 'raw'
PROCESSED_DATA_DIR = WORKING_ROOT / 'data' / 'processed'

# The PBF file will be downloaded from this URL
PBF_URL = "https://download.geofabrik.de/europe/italy-latest.osm.pbf"
PBF_FILENAME = PBF_URL.split('/')[-1]
PBF_FILEPATH = RAW_DATA_DIR / PBF_FILENAME

# Ensure the required directories exist
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Download the PBF file only if it's missing
if not PBF_FILEPATH.exists():
    print(f"PBF file not found. Downloading from {PBF_URL}.")
    urllib.request.urlretrieve(PBF_URL, PBF_FILEPATH)
    print(f"Download complete. File saved to: {PBF_FILEPATH.relative_to(PROJECT_ROOT)}")
else:
    print(f"PBF file already found at: {PBF_FILEPATH.relative_to(PROJECT_ROOT)}. Skipping download.")

# A list of all datasets to be generated.
# Each dictionary defines a single extraction operation for a city and feature type.
extraction_tasks = [
    # Pinerolo
    {'city': 'Pinerolo, Italy', 'feature_type': 'buildings',   'tags': {'building': True}},
    {'city': 'Pinerolo, Italy', 'feature_type': 'restaurants', 'tags': {'amenity': 'restaurant'}},
    {'city': 'Pinerolo, Italy', 'feature_type': 'bus_stops',   'tags': {'highway': 'bus_stop'}},

    # Milano
    {'city': 'Milan, Italy',    'feature_type': 'buildings',   'tags': {'building': True}},
    {'city': 'Milan, Italy',    'feature_type': 'restaurants', 'tags': {'amenity': 'restaurant'}},
    {'city': 'Milan, Italy',    'feature_type': 'bus_stops',   'tags': {'highway': 'bus_stop'}},

    # Roma
    {'city': 'Rome, Italy',     'feature_type': 'buildings',   'tags': {'building': True}},
    {'city': 'Rome, Italy',     'feature_type': 'restaurants', 'tags': {'amenity': 'restaurant'}},
    {'city': 'Rome, Italy',     'feature_type': 'bus_stops',   'tags': {'highway': 'bus_stop'}},
]

print("\nStarting data extraction process.")
tasks_completed = 0
tasks_skipped = 0

# Iterate over each task and generate the corresponding file
for task in extraction_tasks:
    city_name_query = task['city']
    # Use a clean name for the output file (e.g., "pinerolo")
    city_name_file = city_name_query.split(',')[0].lower()
    feature_type = task['feature_type']
    tags_filter = task['tags']

    output_filename = f"{city_name_file}_{feature_type}.geoparquet"
    output_filepath = PROCESSED_DATA_DIR / output_filename

    print(f"\nStarting the extraction of '{feature_type}' for '{city_name_query}'.")

    # This check is crucial to avoid re-running lengthy extractions
    if output_filepath.exists():
        print(f"Result file '{output_filename}' already exists. Skipping task.")
        tasks_skipped += 1
        continue

    try:
        # Get the geographic boundary for the city
        print(f"Fetching boundary for {city_name_query}.")
        boundary_gdf = ox.geocode_to_gdf(city_name_query)
        print("Boundary fetched successfully.")

        # Configure the PBF reader with geographic and tag filters
        pbf_reader = quackosm.PbfFileReader(
            geometry_filter=boundary_gdf.geometry.iloc[0],
            tags_filter=tags_filter,
        )

        # Execute the conversion from PBF to GeoDataFrame
        print(f"Starting extraction from {PBF_FILENAME}.")
        features_gdf = pbf_reader.convert_pbf_to_geodataframe(PBF_FILEPATH)
        print(f"Extraction complete. Found {len(features_gdf)} features.")

        # Save the result to a GeoParquet file
        print(f"Saving data to {output_filename}.")
        features_gdf.to_parquet(output_filepath)
        print(f"Successfully saved file to {output_filepath.relative_to(PROJECT_ROOT)}.")
        tasks_completed += 1

    except Exception as e:
        print(f"ERROR while processing task for {city_name_query}: {e}.")

print("\nData preparation process finished.")
print(f"Tasks completed successfully: {tasks_completed}")
print(f"Tasks skipped (already done): {tasks_skipped}")

Output()

PBF file already found at: UseCasesManagement\data\raw\italy-latest.osm.pbf. Skipping download.

Starting data extraction process.

Starting the extraction of 'buildings' for 'Pinerolo, Italy'.
Fetching boundary for Pinerolo, Italy.
Boundary fetched successfully.
Starting extraction from italy-latest.osm.pbf.


Output()

Extraction complete. Found 4499 features.
Saving data to pinerolo_buildings.geoparquet.
Successfully saved file to UseCasesManagement\data\processed\pinerolo_buildings.geoparquet.

Starting the extraction of 'restaurants' for 'Pinerolo, Italy'.
Fetching boundary for Pinerolo, Italy.
Boundary fetched successfully.
Starting extraction from italy-latest.osm.pbf.


Output()

Extraction complete. Found 17 features.
Saving data to pinerolo_restaurants.geoparquet.
Successfully saved file to UseCasesManagement\data\processed\pinerolo_restaurants.geoparquet.

Starting the extraction of 'bus_stops' for 'Pinerolo, Italy'.
Fetching boundary for Pinerolo, Italy.
Boundary fetched successfully.
Starting extraction from italy-latest.osm.pbf.


Extraction complete. Found 21 features.
Saving data to pinerolo_bus_stops.geoparquet.
Successfully saved file to UseCasesManagement\data\processed\pinerolo_bus_stops.geoparquet.

Starting the extraction of 'buildings' for 'Milan, Italy'.
Fetching boundary for Milan, Italy.
Boundary fetched successfully.
Starting extraction from italy-latest.osm.pbf.


Extraction complete. Found 62133 features.
Saving data to milan_buildings.geoparquet.


Output()

Successfully saved file to UseCasesManagement\data\processed\milan_buildings.geoparquet.

Starting the extraction of 'restaurants' for 'Milan, Italy'.
Fetching boundary for Milan, Italy.
Boundary fetched successfully.
Starting extraction from italy-latest.osm.pbf.


Output()

Extraction complete. Found 2672 features.
Saving data to milan_restaurants.geoparquet.
Successfully saved file to UseCasesManagement\data\processed\milan_restaurants.geoparquet.

Starting the extraction of 'bus_stops' for 'Milan, Italy'.
Fetching boundary for Milan, Italy.
Boundary fetched successfully.
Starting extraction from italy-latest.osm.pbf.


Extraction complete. Found 2809 features.
Saving data to milan_bus_stops.geoparquet.
Successfully saved file to UseCasesManagement\data\processed\milan_bus_stops.geoparquet.

Starting the extraction of 'buildings' for 'Rome, Italy'.
Fetching boundary for Rome, Italy.
Boundary fetched successfully.
Starting extraction from italy-latest.osm.pbf.


Output()

Extraction complete. Found 200942 features.
Saving data to rome_buildings.geoparquet.


Output()

Successfully saved file to UseCasesManagement\data\processed\rome_buildings.geoparquet.

Starting the extraction of 'restaurants' for 'Rome, Italy'.
Fetching boundary for Rome, Italy.
Boundary fetched successfully.
Starting extraction from italy-latest.osm.pbf.


Extraction complete. Found 3069 features.
Saving data to rome_restaurants.geoparquet.
Successfully saved file to UseCasesManagement\data\processed\rome_restaurants.geoparquet.

Starting the extraction of 'bus_stops' for 'Rome, Italy'.
Fetching boundary for Rome, Italy.
Boundary fetched successfully.
Starting extraction from italy-latest.osm.pbf.


Output()

Extraction complete. Found 8448 features.
Saving data to rome_bus_stops.geoparquet.
Successfully saved file to UseCasesManagement\data\processed\rome_bus_stops.geoparquet.
Data preparation process finished.
Tasks completed successfully: 9
Tasks skipped (already done): 0
