## Jupyter notebook 03: This notebook retrieves TagInfo data and generates a TreeMap

***Paper: Collaborative Toponyms in OpenStreetMap: an open-source framework to investigate the relationship with intrinsic quality parameters***

**Aims**

- This notebook retrieves TagInfo data for Brazil and generates a TreeMap


### Set the root directory

In [1]:
# Set the project root as the working directory by moving one level up from
# the directory the kernel started in.
# The guard makes this cell idempotent: without it, every re-execution would
# climb one directory higher and break the relative "results/..." paths used
# by the cells below. A kernel restart clears the guard and resets the cwd.
import os

if "_PROJECT_ROOT" not in globals():
    os.chdir('..')
    _PROJECT_ROOT = os.getcwd()

In [None]:
# Display the current working directory so the reader can confirm that the
# relative paths used below (e.g. "results/0_taginfo") resolve from the
# project root.
os.getcwd()

### Request data from the TagInfo API, then process and save it

- This script is responsible for collecting key combinations from the Taginfo API for Brazil, with incremental updates and checkpointing to handle large datasets efficiently.

[taginfo API](https://taginfo.geofabrik.de/south-america:brazil/taginfo/apidoc)

In [None]:
# Collect the key combinations for one key from the Taginfo API (Brazil
# extract), page by page. Every page is appended to a partial CSV so that an
# interrupted run can resume where it stopped; once all pages are present the
# partial CSV is consolidated into a final CSV named after the data date.

# Import necessary libraries
from pathlib import Path
import pandas as pd
import time
import requests
from tqdm.notebook import tqdm

# === GENERAL SETTINGS ===
output_dir = Path("results/0_taginfo")
output_dir.mkdir(parents=True, exist_ok=True)
key = "name"
base_url = "https://taginfo.geofabrik.de/south-america:brazil/api/4/key/combinations"
per_page = 100
sortname = "together_count"
sortorder = "desc"
request_timeout = 30  # seconds, applied to every request so a stalled server cannot hang the cell

# Auxiliary files
partial_csv_path = output_dir / f"combinacoes_{key}_brasil_partial.csv"
# The literal "{date_until}" placeholder is filled in with str.format() at consolidation time.
final_csv_template = output_dir / f"combinacoes_{key}_brasil_taginfo_{'{date_until}'}.csv"
ultimo_lote_path = output_dir / f"combinacoes_{key}_brasil_lastpage.txt"

# === CHECK CHECKPOINT ===
# The resume point is the highest value in the 'page' column of the partial CSV.
last_page = 0
if partial_csv_path.exists():
    try:
        df_partial = pd.read_csv(partial_csv_path)
    except pd.errors.EmptyDataError:
        # Zero-byte file left behind by an aborted write.
        df_partial = pd.DataFrame()
    if "page" in df_partial.columns and df_partial["page"].notna().any():
        last_page = int(df_partial["page"].max())
    else:
        # No usable page information: appending to this file would add a second
        # header row in the middle of the data, so discard it and start over.
        partial_csv_path.unlink()

if last_page > 0:
    print(f"[Checkpoint] Retomando da página {last_page + 1}")
else:
    print("[Checkpoint] Nenhum progresso anterior encontrado, começando do início.")

# === FETCH TOTAL RECORDS/PAGES AND DATA DATE ===
params_inicial = {
    "key": key,
    "page": 1,
    "rp": per_page,
    "sortname": sortname,
    "sortorder": sortorder
}
resp = requests.get(base_url, params=params_inicial, timeout=request_timeout)
if resp.status_code != 200:
    raise Exception("Erro ao acessar a primeira página da API.")

result = resp.json()
total_registros = result.get("total", 0)
# `or` also covers a key that is present but null; ':' is replaced because the
# value ends up in a file name.
data_until = (result.get("data_until") or "data_desconhecida").replace(":", "-")
total_paginas = (total_registros + per_page - 1) // per_page  # ceiling division
print(f"Total estimado de registros: {total_registros}, páginas: {total_paginas}, data_taginfo: {data_until}")

# NOTE(review): a partial CSV left over from an earlier Taginfo snapshot is
# resumed (or consolidated) as if it belonged to the current `data_until`.
# Delete the partial CSV manually when a fresh snapshot is wanted.

# === INCREMENTAL COLLECTION WITH CHECKPOINT ===
coleta_completa = True  # flipped when the loop stops because of an error
for page in tqdm(range(last_page + 1, total_paginas + 1), desc="Coletando páginas"):
    params = {**params_inicial, "page": page}
    try:
        resp = requests.get(base_url, params=params, timeout=request_timeout)
    except requests.RequestException as e:
        print(f"Erro de conexão na página {page}: {e}")
        coleta_completa = False
        break
    if resp.status_code != 200:
        print(f"Erro na página {page}: {resp.status_code}")
        coleta_completa = False
        break
    result = resp.json()
    data = result.get("data", [])
    if not data:
        # The API returned no more rows: nothing left to collect.
        break
    # Keep the page number with every row; it drives the resume logic above.
    df_page = pd.DataFrame([{
        'other_key': item.get('other_key'),
        'together_count': item.get('together_count'),
        'to_fraction': item.get('to_fraction'),
        'from_fraction': item.get('from_fraction'),
        'page': page
    } for item in data])
    # Write the header only when the file is being created.
    header = not partial_csv_path.exists()
    df_page.to_csv(partial_csv_path, mode='a', header=header, index=False)
    # Record the last page written. This file is informational only: the
    # resume logic reads the 'page' column of the partial CSV, not this file.
    with open(ultimo_lote_path, 'w') as f:
        f.write(str(page))
    time.sleep(0.2)  # small pause between requests

# === FINAL CONSOLIDATION ===
if not partial_csv_path.exists():
    print("Não foi possível salvar o arquivo incremental.")
elif not coleta_completa:
    # Do not stamp a truncated dataset as the final, dated file.
    print(
        f"[Checkpoint] Coleta interrompida; progresso parcial salvo em {partial_csv_path}. "
        "Execute a célula novamente para retomar antes de gerar o arquivo final."
    )
else:
    df = (
        pd.read_csv(partial_csv_path)
        .drop(columns=["page"])
        .drop_duplicates()
        .sort_values(by="together_count", ascending=False)
    )
    final_csv_path = str(final_csv_template).format(date_until=data_until)
    df.to_csv(final_csv_path, index=False)
    print(f"Arquivo CSV FINAL salvo em {final_csv_path} ({len(df)} linhas).")
    display(df.head(10))


#### v1
 - This first version retrieves data from the TagInfo API in a single pass, without saving incrementally

In [3]:
# Single-pass collector: downloads every page of key combinations for `key`
# from the Taginfo API (Brazil extract), keeps the rows in memory and writes
# one CSV at the end. Nothing is checkpointed.
from pathlib import Path
import pandas as pd
import time
import requests
from tqdm.notebook import tqdm  # notebook-friendly progress bar

# Output directory
output_dir = Path("results/0_taginfo")
output_dir.mkdir(parents=True, exist_ok=True)

# Fixed parameters
key = "name"
base_url = "https://taginfo.geofabrik.de/south-america:brazil/api/4/key/combinations"
per_page = 100
sortname = "together_count"
sortorder = "desc"
request_timeout = 30  # seconds; without a timeout a stalled server hangs the cell forever
columns = ["other_key", "together_count", "to_fraction", "from_fraction"]

# First request: find out how many records (and therefore pages) exist
params_inicial = {
    "key": key,
    "page": 1,
    "rp": per_page,
    "sortname": sortname,
    "sortorder": sortorder
}
resp = requests.get(base_url, params=params_inicial, timeout=request_timeout)
if resp.status_code != 200:
    raise Exception("Erro ao acessar a primeira página da API.")

result = resp.json()
total_registros = result.get("total", 0)
total_paginas = (total_registros + per_page - 1) // per_page  # ceiling division
print(f"Total estimado de registros: {total_registros}, páginas: {total_paginas}")

# Paginated collection with a progress bar
all_rows = []
coleta_completa = True  # flipped when the loop stops on an HTTP error
for page in tqdm(range(1, total_paginas + 1), desc="Coletando páginas"):
    params = {**params_inicial, "page": page}
    resp = requests.get(base_url, params=params, timeout=request_timeout)
    if resp.status_code != 200:
        print(f"Erro na página {page}: {resp.status_code}")
        coleta_completa = False
        break
    result = resp.json()
    data = result.get("data", [])
    if not data:
        # The API returned no more rows: nothing left to collect.
        break
    all_rows.extend({name: item.get(name) for name in columns} for item in data)
    time.sleep(0.2)  # small pause between requests

if not coleta_completa:
    # Make it explicit that the file written below is truncated.
    print(f"Aviso: coleta interrompida; o CSV contém apenas {len(all_rows)} de {total_registros} registros.")

# Build and save the DataFrame. Passing `columns` keeps the expected schema
# even when no rows were collected, so the sort below cannot raise KeyError.
df = pd.DataFrame(all_rows, columns=columns)
df = df.sort_values(by="together_count", ascending=False)
csv_path = output_dir / "combinacoes_name_brasil_taginfo.csv"
df.to_csv(csv_path, index=False)
print(f"Arquivo CSV salvo em {csv_path} ({len(df)} linhas).")
display(df.head(10))

Total estimado de registros: 5200, páginas: 52


Coletando páginas:   0%|          | 0/52 [00:00<?, ?it/s]

Arquivo CSV salvo em results/0_taginfo/combinacoes_name_brasil_taginfo.csv (5200 linhas).


Unnamed: 0,other_key,together_count,to_fraction,from_fraction
0,highway,2711498,0.6601,0.3496
1,surface,1153187,0.2807,0.4187
2,oneway,822382,0.2002,0.6556
3,source,742834,0.1808,0.1185
4,lanes,398113,0.0969,0.7127
5,maxspeed,353815,0.0861,0.7226
6,alt_name,321363,0.0782,0.9925
7,amenity,279801,0.0681,0.6332
8,addr:street,267073,0.065,0.1619
9,waterway,267013,0.065,0.1783


### Generate TreeMap
 - This step generates TreeMap plots

#### TreeMap using Plotly

In [None]:
# == 1st VERSION using Plotly ===
# Two-level TreeMap (main key -> full key) of the keys most often used
# together with `name`.
import pandas as pd
from pathlib import Path
import plotly.express as px

# This cell uses the `df` left in memory by a collection cell above.
# To start from the saved file instead:
# df = pd.read_csv(csv_path)

# Number of combinations to plot
top_n = 30

# Build the frame in one chain so the new column is added to a fresh copy;
# assigning a column on a boolean-filtered slice raises SettingWithCopyWarning.
df = (
    # Drop combinations with a low to_fraction (adjust the threshold as needed)
    df.loc[df["to_fraction"] > 0.01]
    # Main key = everything before the first ':'
    .assign(main_key=lambda d: d["other_key"].str.split(":").str[0])
    # Largest first
    .sort_values(by="together_count", ascending=False)
)

# Keep only the top-N rows
df_plot = df.head(top_n)

# TreeMap
# NOTE(review): the date in the title is hard-coded; update it when the data is refreshed.
fig = px.treemap(
    df_plot,
    path=["main_key", "other_key"],  # group by main key, then full key
    values="together_count",
    hover_data={"together_count": True, "to_fraction": True},
    color="main_key",
    title=f"TreeMap: Top {top_n} das combinações 'name' + outras tags no OSM Brasil (data: 2025-06-23)",
)

fig.update_traces(
    texttemplate="%{label}<br>%{value:,}",
    marker=dict(
        line=dict(width=0.5, color='white')
    )
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))

fig.show()

In [None]:
# == 2nd VERSION using Plotly ===
# Generate a TreeMap with only the main key using Plotly Express

import plotly.express as px
import pandas as pd

# Number of combinations kept before grouping
top_n = 30

# Load the CSV written by the v1 collection cell.
# NOTE(review): `csv_path` is only defined by that cell; run it first.
df = pd.read_csv(csv_path)

# Build the frame in one chain so the new column is added to a fresh copy;
# assigning a column on a boolean-filtered slice raises SettingWithCopyWarning.
df = (
    # Drop combinations with a low to_fraction (adjust the threshold as needed)
    df.loc[df["to_fraction"] > 0.01]
    # Main key = everything before the first ':'
    .assign(main_key=lambda d: d["other_key"].str.split(":").str[0])
    # Largest first
    .sort_values(by="together_count", ascending=False)
)

# Keep only the top-N rows
df_plot = df.head(top_n)

# Group by main key only.
# NOTE(review): grouping happens after the top-N cut, so the figure can show
# fewer than `top_n` boxes even though the title says "Top 30 keys".
df_plot = df_plot.groupby("main_key", as_index=False)["together_count"].sum()

# TreeMap; colours are left to Plotly's default discrete palette
fig = px.treemap(
    df_plot,
    path=["main_key"],  # group by main key only
    values="together_count",
    color="main_key",
    title=f"TreeMap: Top {top_n} keys com 'name' no OSM Brasil (data: 2025-06-23)",
)

# Key in bold
fig.update_traces(
    texttemplate="<b>%{label}</b><br>%{value:,}",
    textfont=dict(size=12),
    marker=dict(line=dict(width=0.2, color='white'))
)

# Hide axes for a cleaner chart
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)

# Adjusted margins
fig.update_layout(margin=dict(t=60, l=25, r=25, b=40))

fig.show()

In [None]:
# == 3rd VERSION using Plotly ===
# Improved version to generate a TreeMap with only the main key using Plotly Express and random colors

from pathlib import Path
import plotly.express as px
import random
import pandas as pd

# Number of combinations kept before grouping
top_n = 30

# Load the CSV written by the v1 collection cell
df = pd.read_csv('results/0_taginfo/combinacoes_name_brasil_taginfo.csv')

# Build the frame in one chain so the new column is added to a fresh copy;
# assigning a column on a boolean-filtered slice raises SettingWithCopyWarning.
df = (
    # Drop combinations with a low to_fraction (adjust the threshold as needed)
    df.loc[df["to_fraction"] > 0.01]
    # Main key = everything before the first ':'
    .assign(main_key=lambda d: d["other_key"].str.split(":").str[0])
    # Largest first
    .sort_values(by="together_count", ascending=False)
)

# Keep only the top-N rows
df_plot = df.head(top_n)

# Group by main key only (done after the top-N cut, so fewer than `top_n`
# groups may remain)
df_plot = df_plot.groupby("main_key", as_index=False)["together_count"].sum()

# Random hex colour generator. Colours are NOT guaranteed to be unique;
# callers that need distinct colours must check for repeats themselves.
def random_color(rng=None):
    """Return a random colour as a lowercase ``#rrggbb`` hex string.

    Parameters
    ----------
    rng : random.Random, optional
        Random number generator to draw from. Pass a seeded
        ``random.Random`` instance to get a reproducible sequence.
        When omitted, the module-level ``random`` generator is used.

    Returns
    -------
    str
        A 7-character string such as ``"#0a3fc2"``.
    """
    source = random if rng is None else rng
    return "#{:06x}".format(source.randint(0, 0xFFFFFF))

unique_keys = df_plot["main_key"].unique()

# Seed the generator so the figure (and the PNG exported from it) gets the
# same colours on every run.
random.seed(42)

# Draw one distinct colour per key; a repeated draw is simply discarded.
colors = []
while len(colors) < len(unique_keys):
    candidate = random_color()
    if candidate not in colors:
        colors.append(candidate)
color_map = dict(zip(unique_keys, colors))

# TreeMap with the random colour map
# NOTE(review): the date in the title is hard-coded; update it when the data is refreshed.
fig = px.treemap(
    df_plot,
    path=["main_key"],
    values="together_count",
    color="main_key",
    color_discrete_map=color_map,
    title=f"TreeMap: Top {top_n} das combinações 'name' + outras tags no OSM Brasil (data: 2025-06-23)",
)

# Key in bold
fig.update_traces(
    texttemplate="<b>%{label}</b><br>%{value:,}",
    textfont=dict(size=11),
    marker=dict(line=dict(width=0.1, color='white'))
)

# Hide axes
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)

# Adjusted margins
fig.update_layout(margin=dict(t=60, l=25, r=25, b=40))

fig.show()

In [70]:
# Export the Plotly figure held in `fig` as a high-resolution PNG.
# Requires "kaleido" to be installed: pip install -U kaleido
png_path = os.path.join("results", "0_taginfo", "treemap_top30_keys_brasil_2025-06-23.png")
fig.write_image(png_path, format="png", scale=3)  # scale=3: PNG at approx. 300 dpi

#### TreeMap using Matplotlib and Squarify

In [None]:
# == 1st VERSION using Matplotlib and Squarify ===
# Generate a TreeMap with only the main key using Squarify

import pandas as pd
import squarify
import matplotlib.pyplot as plt

# This cell uses the `df` left in memory by an earlier cell.
# To start from a saved file instead:
# df = pd.read_csv('path.csv')

# Filter and derive the main key in one chain so the new column is added to a
# fresh copy; assigning on a boolean-filtered slice raises SettingWithCopyWarning.
df = (
    df.loc[df["to_fraction"] > 0.01]
    .assign(main_key=lambda d: d["other_key"].str.split(":").str[0])
)

# Unlike the Plotly cells, all rows are grouped first and the 30 largest
# groups are taken afterwards.
df_grouped = df.groupby("main_key", as_index=False)["together_count"].sum()
df_grouped = df_grouped.sort_values("together_count", ascending=False).head(30)

# 'tab20' has only 20 entries and integer indices past the end all clip to
# the last colour, so cycle through the palette instead.
cmap = plt.get_cmap('tab20')
colors = [cmap(i % cmap.N) for i in range(len(df_grouped))]

# TreeMap
fig, ax = plt.subplots(figsize=(14, 8))
squarify.plot(
    sizes=df_grouped["together_count"],
    label=[f"{k}\n{v:,}" for k, v in zip(df_grouped["main_key"], df_grouped["together_count"])],
    color=colors,
    alpha=.65,
    text_kwargs={'fontsize': 12, 'weight': 'bold'},
    ax=ax,
)
ax.axis('off')
ax.set_title("TreeMap: Top 30 keys com 'name' no OSM Brasil (data: 2025-06-23)", fontsize=20, weight='bold')
fig.tight_layout()

# To save at 300 dpi (relative to the project root):
# fig.savefig("results/0_taginfo/treemap_brasil_2025-06-23.png", dpi=300)