### Jupyter notebook 01: Retrieving data from OpenStreetMap using OHSOME API and homogeneous grid cells

***Paper: Collaborative Toponyms in OpenStreetMap: an open-source framework to investigate the relationship with intrinsic quality parameters***

**Aims**

- To conduct a quantitative assessment of elements within OpenStreetMap (OSM) that have the 'name' attribute filled for potential categories of the Brazilian Authoritative Topographic Map; and

- To investigate the most significant intrinsic quality parameters that contribute to the reliability of toponyms in OSM.


**Brief Overview of the Proposed Methodology**

- Preliminary survey of potential OpenStreetMap (OSM) tags to provide relevant toponym information to categories of interest related to Brazilian Topographic Mapping;

- Execution of a quantitative analysis on collaboratively entered toponyms, utilizing homogeneous grid-based approaches; and

- Assessment of intrinsic quality parameters as indicators of the reliability of toponyms in a scientific context.

<img src="../utils/flowchart_paper_v2.png" width="600">

---

### Install the necessary libraries to the project

In [None]:
# In case of using colab, install these necessary libraries
%pip install requests -q
%pip install geopandas -q
%pip install folium -q
%pip install shapely -q

### Import the libraries

In [None]:
# Import library and some pre-installed modules
import os
import requests
import json
import pandas as pd
import geopandas as gpd
import time
from ipywidgets import widgets

### Connect to Google Drive

In [None]:
# When running on Colab, connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

### Homogeneous Grid Cells
 - Statistical Grid (cell size of 200 x 200m) produced by Instituto Brasileiro de Geografia e Estatística (Brazilian Institute of Geography and Statistics)

  - https://geoftp.ibge.gov.br/recortes_para_fins_estatisticos/

#### Import Homogeneous Grid Cells from Google Drive

In [None]:
# Import the statistics grid in GeoJSON format

# @title Import the homogeneous grid cells (GeoJSON) used as input for the OHSOME API queries
grid = None

# Function for selecting and loading the GeoJSON file
def select_file(change):
    """Load the GeoJSON file chosen in the dropdown into the global ``grid``.

    Meant to be used as a widget observer callback: ``change['new']`` holds
    the newly selected dropdown entry. The prompt entry is ignored. The file
    is looked up inside ``../data/input_code1/``.

    ``grid`` is only replaced when the file was read and parsed successfully;
    a missing or malformed file is reported with a message and the previous
    value of ``grid`` is kept.
    """
    global grid
    selected_file = change['new']

    # The first dropdown entry is only a prompt, not a file name
    if selected_file == "Select the GeoJSON file with grid cells:":
        return

    file_path = os.path.join('../data/input_code1/', selected_file)
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding
        with open(file_path, 'r', encoding='utf-8') as file:
            loaded = json.load(file)
    except FileNotFoundError:
        print("File not found:", selected_file)
    except (json.JSONDecodeError, UnicodeDecodeError) as error:
        # A broken file must not raise inside the widget callback
        print("Invalid GeoJSON file:", selected_file, "-", error)
    else:
        grid = loaded
        print("File selected with success:", selected_file)
        print("File path:", file_path)

# Collect the GeoJSON files available in the input folder
file_list = list(
    filter(lambda name: name.endswith('.geojson'), os.listdir('../data/input_code1/'))
)

# The first entry is a prompt; the file names follow it
options = ["Select the GeoJSON file with grid cells:", *file_list]

# Dropdown that calls select_file whenever its value changes
dropdown = widgets.Dropdown(options=options)
dropdown.observe(select_file, names='value')

# Show the dropdown in the notebook
display(dropdown)

In [None]:
# Preview grid cells
grid

In [None]:
# Count the total number of grid cells in GeoJSON
total_cells = len(grid['features'])
print(f"Total grid cells in GeoJSON: {total_cells}")

In [None]:
# Split the loaded grid into batches of at most `subset_size` cells

# Number of cells per batch
subset_size = 4

# Slice the feature list into consecutive chunks of `subset_size` items
all_features = grid['features']
subsets = [
    all_features[start:start + subset_size]
    for start in range(0, len(all_features), subset_size)
]

# Wrap every chunk in its own FeatureCollection, tagged with a 1-based
# batch ID ("lote_id") and carrying the CRS entry of the original grid
grid_subsets = [
    {
        'type': 'FeatureCollection',
        'features': batch,
        'lote_id': f"lote{number}",
        'crs': grid['crs'],
    }
    for number, batch in enumerate(subsets, start=1)
]

In [None]:
# Compute and print the total number of subsets created
total_subsets = len(grid_subsets)
print(f"Total de subsets criados: {total_subsets}")

In [None]:
# Check the subsets
grid_subsets

#### Visualize the spatial distribution of the homogeneous grid cell

In [None]:
import folium
import ipywidgets as widgets
from IPython.display import display

# Function to calculate the centroid of a polygon (original grid)
def calculate_centroid(coordinates):
    """Return the average vertex position of a coordinate sequence as [y, x].

    Every item of ``coordinates`` is read as ``point[0]`` (x) and
    ``point[1]`` (y). The result is the plain arithmetic mean of the listed
    vertices (not an area-weighted centroid), returned with the axes swapped
    to ``[mean_y, mean_x]``.
    """
    vertex_count = len(coordinates)
    mean_x = sum(point[0] for point in coordinates) / vertex_count
    mean_y = sum(point[1] for point in coordinates) / vertex_count
    return [mean_y, mean_x]

# Calculate the map centre from the first cell of the original grid
# NOTE(review): only the first ring of the first feature is used, so the map
# is centred on that single cell rather than on the whole grid. The [0][0]
# indexing presumably expects MultiPolygon geometries - confirm against the
# input file.
first_polygon = grid['features'][0]['geometry']['coordinates'][0][0]
centroid_coords = calculate_centroid(first_polygon)

# Function to plot a subset
def plot_subset(subset_index):
    """Display one batch of grid cells on a Folium map.

    ``subset_index`` is the 0-based position of the batch in
    ``grid_subsets``. The map is centred on ``centroid_coords`` and the
    tooltip shows the ``id`` and ``POP10`` properties of each cell.
    """
    # One shared style dictionary, applied to every feature of the layer
    cell_style = {'fillColor': '#8C8989', 'color': '#e31a1c', 'weight': 2}

    # Base map centred on the centroid computed from the original grid
    batch_map = folium.Map(location=centroid_coords, tiles='OpenStreetMap', zoom_start=14)

    # Layer with the cells of the requested batch
    batch_layer = folium.GeoJson(
        grid_subsets[subset_index],
        name=f'Grade Estatística 200m - Lote {subset_index+1}',
        tooltip=folium.GeoJsonTooltip(fields=['id', 'POP10']),
        style_function=lambda _feature: cell_style,
    )
    batch_layer.add_to(batch_map)

    # Render the map in the notebook output
    display(batch_map)

# One (label, index) option per batch; labels are 1-based, values 0-based
batch_options = [(f'Lote {i+1}', i) for i, _ in enumerate(grid_subsets)]

# Drop-down list used to choose the batch to plot
dropdown = widgets.Dropdown(
    options=batch_options,
    description='Select a Batch:',
    disabled=False,
)

# Redraw the map every time another batch is selected
widgets.interactive(plot_subset, subset_index=dropdown)

### **OHSOME API**

 - Access to features, attributes and OSM history edits using the OHSOME API (*OpenStreetMap History Data Analytics Platform*)

> - https://docs.ohsome.org/ohsome-api/v1/


In [None]:
# URL of the OHSOME API metadata endpoint
URL = 'https://api.ohsome.org/v1/metadata'

# Request the metadata. Without a timeout, requests waits indefinitely on an
# unresponsive server and the cell would never finish.
response = requests.get(URL, timeout=60)

# Parse the JSON body and show it as the cell output
response_json = response.json()
response_json

### Retrieving data from OpenStreetMap using OHSOME API and homogeneous grid cells


#### Step 1 (*API Endpoint: Elements Aggregation*): count the number of OSM features (elements) and calculate the proportion of features with the attribute "name" filled in by contributors, for each grid cell:


 - Determine the total number of OSM features for interest tags, grouped by grid cell;

 - Quantify the total number of features with attribute "name" filled in; and

 - Calculate the proportion of features with attribute "name" filled in for each grid cell.

 - Period of data retrieved: 2007-10-08 to 2024-03-10;

In [None]:
# Approach for processing batches of 4 cells from the original grid

# Step 1 (API Endpoint: Elements Aggregation): count the number of OSM features
# (elements) and calculate the proportion of features with the "name" attribute
# filled in by contributors, for each grid cell.
# Aggregation method: count
# POST /elements/(aggregation)/groupBy/boundary/groupBy/tag

# Start the time counter
start_time = time.time()

# Work on the previously created grid_subsets.
# NOTE: list.copy() is shallow, so the feature dictionaries are shared with
# grid_subsets and the properties added below are visible through both names.
grid_subset2 = grid_subsets.copy()

# OHSOME API endpoint url
url_tag = "https://api.ohsome.org/v1/elements/count/groupBy/boundary/groupBy/tag"

# OSM tags of interest
tags_de_interesse = {
    'leisure': '*',
    'building': '*',
    'amenity': '*'
}

# Configuring basic parameters
params_base = {
    'time': '2007-10-08/2024-03-10'
}

# Dictionary (cell id -> cell properties) to store the final results
final_results = {}


def _count_elements(params, cell_id):
    """Post one aggregation query and return the summed count.

    Returns ``None`` when the request did not answer with HTTP 200, so the
    caller can tell a failed query apart from a genuine count of zero.
    """
    response = requests.post(url_tag, data=params)
    if response.status_code != 200:
        print(f"Cell query error {cell_id}: {response.text}")
        return None

    groups = response.json().get('groupByResult', [])
    if not groups:
        # No group reported: nothing to count (avoids IndexError on [0])
        return 0
    # Only the first group of the reply is read
    return sum(res.get('value', 0) for res in groups[0].get('result', []))


# Process each batch of grid_subsets (one request pair per cell and tag)
for subset in grid_subset2:
    for feature in subset['features']:
        cell_geojson = json.dumps({"type": "FeatureCollection", "features": [feature]})
        cell_id = feature['properties']['id']

        for tag, value in tags_de_interesse.items():
            params = params_base.copy()
            params.update({
                'bpolys': cell_geojson,
                'filter': f'{tag}={value}',
                'groupByKey': tag,
                'groupByValues': value
            })

            # 1st: Aggregate the objects of each interest tag by grid cell
            total_count = _count_elements(params, cell_id)
            if total_count is not None:
                feature['properties'][f'{tag}_total_count'] = total_count

            # 2nd: Count the features with the attribute 'name' filled in
            params['filter'] = f'{tag}={value} and name=*'
            name_count = _count_elements(params, cell_id)
            if name_count is not None:
                feature['properties'][f'{tag}_name_count'] = name_count

                # The ratio needs the total of this same tag and cell. When
                # the first query failed it is skipped instead of reusing an
                # undefined or stale total from a previous iteration.
                if total_count is not None:
                    name_ratio_perc = (name_count / total_count) * 100 if total_count > 0 else 0
                    feature['properties'][f'{tag}_name_ratio'] = name_ratio_perc

        # Add cell results to final_results
        final_results[cell_id] = feature['properties']

    print(f"{subset['lote_id']} successfully processed!")

# Stop the time counter
end_time = time.time()

# Calculate and display the total execution time
total_time_seconds = end_time - start_time
print(f"Total execution time: {total_time_seconds // 60} minutes and {total_time_seconds % 60} seconds")

In [None]:
# Check the grid cells results of step 1
grid_subset2

#### Step 2 (*API Endpoint: Contributions Aggregation*): count the total number of contributions for features with and without the attribute "name" filled in:

- Count the **total number of contributions** to the *interest tags* for the total features in the grid cells, with and without the attribute "name" filled in.

- Period of data retrieved: 2007-10-08 to 2024-03-10.

In [None]:
# Approach for processing batches of 4 cells from the original grid

# Step 2 (API Endpoint: Contributions Aggregation): count the total number of
# contributions for features with and without a name attribute filled in.
# Aggregation method: count
# POST /contributions/count/groupBy/boundary

# Start the time counter
start_time = time.time()

# OHSOME API endpoint url
url_contributions = "https://api.ohsome.org/v1/contributions/count/groupBy/boundary"

# OSM tags of interest (only the keys are used by the loop of this step)
tags_de_interesse = {
    'leisure': '*',
    'building': '*',
    'amenity': '*'
}

# Basic parameters shared by every request of this step
params_contributions_base = {
    'time': '2007-10-08/2024-03-10'
}

# Function to process the response
def process_response(response, cell_id):
    """Sum the values of the first ``groupByResult`` entry of a reply.

    ``response`` must expose ``status_code``, ``json()`` and ``text``;
    ``cell_id`` is only used in the error message.

    Returns 0 when the status is not 200 (after printing the reply text) and
    when the reply carries no ``groupByResult`` entry, so a failed query is
    indistinguishable from a zero count in the returned value.
    """
    if response.status_code != 200:
        print(f"Cell query error {cell_id}: {response.text}")
        return 0

    groups = response.json().get('groupByResult', [])
    if not groups:
        # Empty or missing group list: nothing to sum (avoids IndexError on [0])
        return 0
    return sum(result.get('value', 0) for result in groups[0].get('result', []))

# Walk through every batch and query each of its cells individually
for lote_id, subset in enumerate(grid_subset2, start=1):
    for feature in subset['features']:
        cell_geojson = json.dumps({"type": "FeatureCollection", "features": [feature]})
        cell_id = feature['properties']['id']

        for tag in tags_de_interesse:
            # Two queries per tag: first every feature carrying the tag,
            # then only those that also have 'name' filled in
            filters_by_property = {
                f'{tag}_total_contributions': f'{tag}=*',
                f'{tag}_name_contributions': f'{tag}=* and name=*',
            }

            counts = {}
            for property_name, osm_filter in filters_by_property.items():
                query = {**params_contributions_base, 'bpolys': cell_geojson, 'filter': osm_filter}
                reply = requests.post(url_contributions, data=query)
                counts[property_name] = process_response(reply, cell_id)

            # Store both counts once the two queries have finished
            feature['properties'].update(counts)

    print(f"{subset['lote_id']} successfully processed!")

# Stop the time counter
end_time = time.time()

# Calculate and display the total execution time
total_time_seconds = end_time - start_time
print(f"Total execution time: {total_time_seconds // 60} minutos and {total_time_seconds % 60} seconds")

In [None]:
# Check the grid cells results of step 2
grid_subset2

#### Step 3 (*API Endpoint: Contributions Aggregation*): Count the number of contributions in the past five years for features with the attribute "name" filled in:

 - Count the number of contributions in the past five years for tags of interest, aggregated by grid cells, with the attribute "name" filled in;

 - Period of data retrieved: 2019-03-09 to 2024-03-10

In [None]:
# Approach for processing batches of 4 cells from the original grid

# Step 3 (API Endpoint: Contributions Aggregation): Count the number of contributions
# in the past five years for features with a filled-in name
# Aggregation method: count
# POST /contributions/latest/count

# Start the time counter
start_time = time.time()

# OHSOME API endpoint url
url_latest_contributions = "https://api.ohsome.org/v1/contributions/latest/count"

# OSM tags of interest (only the keys are used by the loop of this step)
tags_de_interesse = {
    'leisure': '*',
    'building': '*',
    'amenity': '*'
}

# Basic parameters shared by every request of this step
# (the time window is limited to 2019-03-09 .. 2024-03-10)
params_contributions_base = {
    'time': '2019-03-09/2024-03-10'
}

# Function to process the response
def process_response(response, cell_id):
    """Return the value of the last ``result`` entry of a reply.

    ``response`` must expose ``status_code``, ``json()`` and ``text``;
    ``cell_id`` is only used in the error message. Returns 0 when the status
    is not 200 (after printing the reply text), when ``result`` is missing or
    empty, or when the last entry has no ``value``.
    """
    if response.status_code != 200:
        print(f"Cell query error {cell_id}: {response.text}")
        return 0

    entries = response.json().get('result', [])
    if not entries:
        return 0
    return entries[-1].get('value', 0)

# Walk through every batch and query each of its cells individually
for lote_id, subset in enumerate(grid_subset2, start=1):
    for feature in subset['features']:
        cell_geojson = json.dumps({"type": "FeatureCollection", "features": [feature]})
        cell_id = feature['properties']['id']

        for tag in tags_de_interesse:
            # Only features of this tag that also have 'name' filled in
            query = {
                **params_contributions_base,
                'bpolys': cell_geojson,
                'filter': f'{tag}=* and name=*',
            }
            reply = requests.post(url_latest_contributions, data=query)

            # Store the count in the properties of the cell
            feature['properties'][f'{tag}_latest5_name_contributions'] = process_response(reply, cell_id)

    print(f"{subset['lote_id']} successfully processed!")

# Stop the time counter
end_time = time.time()

# Calculate and display the total execution time
total_time_seconds = end_time - start_time
print(f"Total execution time: {total_time_seconds // 60} minutes e {total_time_seconds % 60} seconds")

In [None]:
# Check the grid cells results of step 3
grid_subset2

#### Step 4 (*API Endpoint: Contributions Aggregation*): Count the total number of contributions to features with a filled-in name where a tagChange occurred:

- Count the total number of contributions to the tags of interest, aggregated by grid cell, with the attribute name filled in, considering the type of contribution (contributionType) tag change ('tagChange').

  - *contributionType available: ‘creation’, ‘deletion’, ‘tagChange’, ‘geometryChange’ or a combination of these*

- Period of data retrieved: 2007-10-08 to 2024-03-10.

In [None]:
# Approach for processing batches of 4 cells from the original grid

# Step 4 (API Endpoint: Contributions Aggregation): Count the total number of
# contributions to features with a filled-in name where a tagChange occurred
# Aggregation method: count
# POST /contributions/count/groupBy/boundary

# Start the time counter
start_time = time.time()

# OHSOME API endpoint url
url_contributions = "https://api.ohsome.org/v1/contributions/count/groupBy/boundary"

# OSM tags of interest (only the keys are used by the loop of this step)
tags_de_interesse = {
    'leisure': '*',
    'building': '*',
    'amenity': '*'
}

# Basic parameters shared by every request of this step;
# 'contributionType' is set to 'tagChange' for all of them
params_contributions_base = {
    'time': '2007-10-08/2024-03-10',
    'contributionType': 'tagChange'
}

# Function to process the response
def process_response(response, cell_id):
    """Sum the values of the first ``groupByResult`` entry of a reply.

    ``response`` must expose ``status_code``, ``json()`` and ``text``;
    ``cell_id`` is only used in the error message.

    Returns 0 when the status is not 200 (after printing the reply text) and
    when the reply carries no ``groupByResult`` entry, so a failed query is
    indistinguishable from a zero count in the returned value.
    """
    if response.status_code != 200:
        print(f"Erro na consulta da célula {cell_id}: {response.text}")
        return 0

    groups = response.json().get('groupByResult', [])
    if not groups:
        # Empty or missing group list: nothing to sum (avoids IndexError on [0])
        return 0
    return sum(result.get('value', 0) for result in groups[0].get('result', []))

# Walk through every batch and query each of its cells individually
for lote_id, subset in enumerate(grid_subset2, start=1):
    for feature in subset['features']:
        cell_geojson = json.dumps({"type": "FeatureCollection", "features": [feature]})
        cell_id = feature['properties']['id']

        for tag in tags_de_interesse:
            # Features of this tag with 'name' filled in; the contribution
            # type restriction comes from the shared base parameters
            query = {
                **params_contributions_base,
                'bpolys': cell_geojson,
                'filter': f'{tag}=* and name=*',
            }
            reply = requests.post(url_contributions, data=query)

            # Store the count in the properties of the cell
            feature['properties'][f'{tag}_name_tagChange_contributions'] = process_response(reply, cell_id)

    print(f"{subset['lote_id']} successfully processed!")

# Stop the time counter
end_time = time.time()

# Calculate and display the total execution time
total_time_seconds = end_time - start_time
print(f"Total execution time: {total_time_seconds // 60} minutes and {total_time_seconds % 60} seconds")

In [None]:
# Check the grid cells results of step 4
grid_subset2

#### Step 5 (API Endpoint: Users Aggregation): Count the number of users (contributors) who edited features with attribute name filled in:

- Count the number of users who edited features of the OSM tags of interest with the attribute "name" filled in, aggregated by grid cells.

- Period of data retrieved: 2007-10-08 to 2024-03-10.


In [None]:
# Approach for processing batches of 4 cells from the original grid

# Step 5 (API Endpoint: Users Aggregation): Count the number of users (contributors)
# who edited features with the attribute name filled in.
# Aggregation method: count
# POST /users/count/groupBy/boundary

# Start the time counter
start_time = time.time()

# OHSOME API endpoint url
url_users_count = "https://api.ohsome.org/v1/users/count/groupBy/boundary"

# OSM tags of interest (only the keys are used by the loop of this step)
tags_de_interesse = {
    'leisure': '*',
    'building': '*',
    'amenity': '*'
}

# Basic parameters shared by every request of this step
params_users_count_base = {
    'time': '2007-10-08/2024-03-10'
}

# Function to process the response
def process_user_response(response, cell_id):
    """Return the first reported value of the group that belongs to a cell.

    The ``groupByResult`` entries of the reply are scanned for the one whose
    ``groupByObject`` equals ``cell_id``; the ``value`` of its first
    ``result`` item is returned.

    Returns 0 when the status is not 200 (after printing the reply text, like
    the other response handlers of this notebook), when no group matches, or
    when the matching group has no result items.

    NOTE(review): the comparison is type-sensitive; if the API reports the
    boundary id with a different type than the one stored in the cell
    properties (e.g. str vs int) no group matches - confirm against a reply.
    """
    if response.status_code != 200:
        print(f"Cell query error {cell_id}: {response.text}")
        return 0

    for group in response.json().get('groupByResult', []):
        if group.get('groupByObject') == cell_id:
            values = group.get('result', [])
            # A matching group without result items counts as zero users
            return values[0]['value'] if values else 0
    return 0

# Walk through every batch and query each of its cells individually
for lote_id, subset in enumerate(grid_subset2, start=1):
    for feature in subset['features']:
        cell_geojson = json.dumps({"type": "FeatureCollection", "features": [feature]})
        cell_id = feature['properties']['id']

        for tag in tags_de_interesse:
            # Only features of this tag that also have 'name' filled in
            query = {
                **params_users_count_base,
                'bpolys': cell_geojson,
                'filter': f'{tag}=* and name=*',
            }
            reply = requests.post(url_users_count, data=query)

            # Store the user count in the properties of the cell
            feature['properties'][f'{tag}_users_count_name'] = process_user_response(reply, cell_id)

    print(f"{subset['lote_id']} successfully processed!")

# Stop the time counter
end_time = time.time()

# Calculate and display the total execution time
total_time_seconds = end_time - start_time
print(f"Total execution time: {total_time_seconds // 60} minutes and {total_time_seconds % 60} seconds")

In [None]:
# Check the grid cells results of step 5
grid_subset2

#### Save the updated grid cells with the information Extracted using the OHSOME API endpoints

In [None]:
output_filename = "../data/output_code1/grade_id77_passare_results.geojson"

# Create a new FeatureCollection to combine all the subsets; the CRS entry is
# taken from the first subset
grid_subset2_results = {
    'type': 'FeatureCollection',
    'crs': grid_subset2[0]['crs'],
    'features': []
}

# Iterate over each subset and add its features to the combined FeatureCollection
for subset in grid_subset2:
    grid_subset2_results['features'].extend(subset['features'])

# Make sure the output folder exists so that hours of API results are not
# lost to a FileNotFoundError at the very last step
os.makedirs(os.path.dirname(output_filename), exist_ok=True)

# Save the combined grid cells in a GeoJSON file (explicit UTF-8, independent
# of the platform default encoding)
with open(output_filename, 'w', encoding='utf-8') as file:
    json.dump(grid_subset2_results, file)

---