# Jupyter notebook 05: this notebook uses the **Mapillary API** to download images by metadata given a set of OSM points after CLIP matching

## Install necessary libraries

In [None]:
# Install necessary libraries
%pip install fiona -q

[OpenAI multimodal model / CLIP ](https://github.com/openai/CLIP)

In [None]:
# If running on Colab, install CLIP into the environment from OpenAI's original repository
# This way neither an account nor a token is needed
!git clone https://github.com/openai/CLIP.git
%cd CLIP
%pip install ftfy regex -q
%pip install git+https://github.com/openai/CLIP.git -q

## Import libraries and modules

In [1]:
# Import library and some pre-installed modules for the pipeline
import os, sys, json, time, gc, math, glob, logging
from pathlib import Path
import numpy as np
import geopandas as gpd
import pandas as pd
import fiona
import torch
import tensorflow as tf
import clip  # import do repositório oficial da OpenAI (instalar o CLIP previamente)
import requests
from time import sleep
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from PIL import Image
from io import BytesIO
from datetime import datetime
# from tqdm import tqdm # ideal para scripts .py
from tqdm.notebook import tqdm # Se estiver usando tqdm no notebook:
from sklearn.neighbors import BallTree
from shapely.geometry import Point, box
from geopy.distance import geodesic
# Importando o módulo de visualização do IPython
from IPython.display import display,Markdown, Image as IPImage

In [2]:
# Set the project root as the working directory.
# The root is recognised by its `src/` package, which the following cells import.
# The guard keeps this cell idempotent: re-running it no longer climbs one extra
# directory level each time.
if not os.path.isdir("src"):
    os.chdir('..')

In [3]:
# Get current working directory
os.getcwd()

'c:\\DEV\\PhD_Thesis_Step3_OSM_Toponyms'

In [4]:
# Import and Reload the modules to ensure any changes are reflected
import importlib
import src.mapillary_metadata_enricher as mapillary_metadata_enricher

importlib.reload(mapillary_metadata_enricher)

<module 'src.mapillary_metadata_enricher' from 'c:\\DEV\\PhD_Thesis_Step3_OSM_Toponyms\\src\\mapillary_metadata_enricher.py'>

In [5]:
# Helper that reads the Mapillary token
from src.mapillary_metadata_enricher import (
    ler_token_mapillary,
)

# Value returned by the project helper; where the token is stored is defined in src/ (not shown here)
TOKEN = ler_token_mapillary()

## Setup GPU

In [6]:
# Configure TensorFlow to use a GPU when one is available.
# This only affects TensorFlow; the CLIP model is a PyTorch model and its device is chosen separately.
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        # Restrict TensorFlow to the first GPU listed and enable on-demand memory growth for it
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print("Utilizando GPU:", gpus[0])
    except RuntimeError as e:
        # GPU configuration failed; show the error and carry on
        print(e)
else:
    print("Nenhuma GPU disponível, utilizando CPU.")

Utilizando GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [7]:
# Confirm whether TensorFlow is using a GPU and show which device it is
device_name = tf.test.gpu_device_name()

print(f'A GPU ativa é: {device_name}' if device_name else 'Nenhuma GPU está ativa.')

A GPU ativa é: /device:GPU:0


In [8]:
# The nvidia-smi command-line tool lets you verify that the correct GPU is being used and
# monitor its performance and resource usage
!nvidia-smi

Mon Aug 25 10:41:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.14                 Driver Version: 566.14         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090      WDDM  |   00000000:01:00.0  On |                  Off |
|  0%   36C    P2             55W /  450W |    1148MiB /  24564MiB |      3%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Load input data
 - Points of interest (POIs) from OpenStreetMap (OSM) with toponyms to validate.
 - Points from Mapillary Coverage Tiles with metadata.

In [9]:
# POIs with toponyms from OSM to validate
# File path
osm_path = "results/2_toponyms_retrieval/step7_latest_name_ohsome"
osm_fp = os.path.join(osm_path, "step7_consolidado_ohsome_filtrado.gpkg")

# Single read of the whole file into a GeoDataFrame
gdf_osm = gpd.read_file(osm_fp)
print(f"Arquivo carregado: {len(gdf_osm)} features.")

# Iteration with tqdm
# NOTE(review): the loop body is only `pass`, so the loop does nothing besides drawing a
# progress bar (and leaving `row` bound to the last row); it looks like a placeholder.
for _, row in tqdm(gdf_osm.iterrows(), total=len(gdf_osm), desc="Iterando pelos POIs do OSM"):
    # Placeholder for per-POI processing
    pass

Arquivo carregado: 1627 features.


Iterando pelos POIs do OSM:   0%|          | 0/1627 [00:00<?, ?it/s]

In [None]:
display(gdf_osm.head())

In [11]:
len(gdf_osm)

1627

In [None]:
# column names of the "gdf_osm" GeoDataFrame
gdf_osm.columns.tolist()

In [None]:
# get CRS gdf_osm
gdf_osm.crs

In [None]:
# Load "gdf_mapillary_candidatos_atualizado.gpkg" (candidate points with updated metadata and URLs)

out_path = "results/3_mapillary_coverage/mapillary_candidatos_subset"
fp_mapillary_candidatos = os.path.join(out_path, "20250812_mpl_coverage_bh_meta_candidatos_atualizados.gpkg")

chunk_size = 10000
dfs = []

# Open the file once: the same handle gives the feature count, the CRS and the features.
with fiona.open(fp_mapillary_candidatos) as src:
    total_features = len(src)
    print(f"Total de feições: {total_features}")
    src_crs = src.crs

    features = []
    # Progress bar over the features
    for feat in tqdm(src, total=total_features, desc="Carregando pontos"):
        features.append(feat)
        if len(features) >= chunk_size:
            dfs.append(gpd.GeoDataFrame.from_features(features, crs=src_crs))
            # Drop the raw features of this chunk and force a garbage collection
            features = []
            gc.collect()

    # Flush the last (partial) chunk by what is actually left in the buffer, so it is
    # kept even if the reported feature count differs from the number iterated.
    if features:
        dfs.append(gpd.GeoDataFrame.from_features(features, crs=src_crs))
        features = []

# An empty file would otherwise fail later with an unrelated-looking error
if not dfs:
    raise ValueError(f"Nenhuma feição encontrada em: {fp_mapillary_candidatos}")

# Concatenate all chunks into a single GeoDataFrame
gdf_mapillary_candidatos_atualizado = gpd.GeoDataFrame(pd.concat(dfs, ignore_index=True), crs=src_crs)
print("GeoDataFrame final:", gdf_mapillary_candidatos_atualizado.shape)

# Release the memory held by the intermediate chunks
del dfs
gc.collect();

In [None]:
display(gdf_mapillary_candidatos_atualizado.head())

In [None]:
# size of image points from Mapillary coverage tiles
len(gdf_mapillary_candidatos_atualizado)

In [None]:
# lista todas colunas gdf_mapillary
gdf_mapillary_candidatos_atualizado.columns.tolist()

In [None]:
gdf_mapillary_candidatos_atualizado.crs

## Pipeline CLIP + Download Mapillary images

**CLIP Multimodal tool applied to select the best candidates from the Mapillary images**

 * Aplicação do modelo multimodal CLIP (OpenAI) para seleção iterativa das melhores imagens do Mapillary com potencial para validar os topônimos do OSM
 * CLIP (Contrastive Language–Image Pretraining) is a neural network model developed by OpenAI that learns to associate images with corresponding text descriptions [[CLIP 2021](https://openai.com/index/clip/)].

**Lógica do Pipeline**

1. **Definição dos descritores textuais multimodais**
  - Construção de um conjunto de descrições em inglês com foco em aspectos visuais que indicam sinalização
  ou nomes em fachadas de edifícios públicos (escolas, hospitais, museus, etc.).
  - As descrições são agrupadas por categoria/classe (educação, saúde, lazer, etc.) e refletem linguagem visual interpretável pelo modelo CLIP.
  - As descrições são tokenizadas previamente e carregadas para GPU quando disponível.

2. **Avaliação visual com CLIP (modelo da OpenAI)**

   - Processar apenas os thumbnails dos pontos Mapillary selecionados (`thumb_256_url` ou `thumb_1024_url`).
   - Tokenizar previamente os descritores textuais (em inglês) e manter na GPU (se disponível).
   - Avaliar cada imagem contra os descritores usando o CLIP.
   - Calcular a similaridade (softmax) e obter o score mais alto por imagem.

3. **Filtragem e ranqueamento por score**

   - Apenas imagens com score acima do limiar (`threshold`, ex: 0.25) são consideradas relevantes.
   - Essas imagens são ranqueadas por score para eventual download da imagem em alta resolução.

4. **Armazenamento de resultados com rastreabilidade completa**

   - Para cada par OSM–Mapillary com correspondência:
     - Registrar:
       - Atributos do ponto OSM
       - Atributos do ponto Mapillary
       - Score CLIP

5. **Download estruturado das imagens**

   - Estrutura de pasta (exemplo):
      - classe → ex: edif_ensino
       - tag=value → concatenação com underline: amenity_school (substituir * por any)
       - @osmId → formatado para node_1980234785, way_0000000 etc.

### Definir o modelo CLIP da OpenAI a ser utilizado

---
*Sobre os modelos CLIP disponíveis:*

- A OpenAI disponibiliza vários modelos no repositório do CLIP, cada um com diferentes tamanhos e desempenho:

| Modelo      | Arquitetura                         | Desempenho | Velocidade   | RAM/GPU |
| ----------- | ----------------------------------- | ---------- | ------------ | ------- |
| ViT-B/32    | Vision Transformer, Base, Patch 32  | Bom        | Muito rápido | Leve    |
| ViT-B/16    | Vision Transformer, Base, Patch 16  | Melhor     | Médio        | Média   |
| ViT-L/14    | Vision Transformer, Large, Patch 14 | Ótimo      | Lento        | Alta    |
| RN50, RN101 | ResNet 50, 101                      | Razoável   | Rápido       | Leve    |

---

In [16]:
# Pick the execution device and load OpenAI's CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the ViT-L/14 model together with its image preprocessing function
model, preprocess = clip.load("ViT-L/14", device=device)

#### Teste do modelo CLIP em imagem do repositório oficial

In [None]:
# If needed, download a sample image from the official CLIP repository for the smoke test.
# It is saved as data/CLIP.png, the path the smoke-test cell opens, and is fetched with
# `requests` so the cell also works where `wget` is not installed. An existing file is
# not downloaded again.
clip_sample_path = Path("data/CLIP.png")
if not clip_sample_path.exists():
    clip_sample_path.parent.mkdir(parents=True, exist_ok=True)
    clip_sample_resp = requests.get("https://raw.githubusercontent.com/openai/CLIP/main/CLIP.png", timeout=30)
    clip_sample_resp.raise_for_status()
    clip_sample_path.write_bytes(clip_sample_resp.content)

In [None]:
# Smoke test: compare one image against a few text descriptions and compute the
# probability that the image matches each description.

image = preprocess(Image.open("data/CLIP.png")).unsqueeze(0).to(device)
texts = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    # One forward pass yields the image-text logits. The separate encode_image /
    # encode_text calls were removed: their results were never used.
    logits_per_image, logits_per_text = model(image, texts)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Probabilidades:", probs)

### Definição dos descritores textuais (*Tokens*)

In [17]:
# Descriptions for CLIP: signage that may carry toponyms

# The descriptions are in English because the original OpenAI model was trained mostly on
# images paired with English descriptions collected from the web.
# The vocabulary and visual patterns the model "understands" tend to be better represented in English.

# Education buildings (schools, universities, kindergartens)
edif_ensino_descriptions = [
    "front of a school building with large name sign above the entrance",
    "school gate showing the institution name clearly displayed",
    "educational institution with painted name on outer wall",
    "school building facade with visible school logo and name",
    "university entrance sign with building in the background",
    "kindergarten entrance showing colorful sign with school name",
    "visible school name on a wall-mounted sign at the main entrance",
    "school signboard placed next to the entrance gate with institution name",
    "primary school entrance with large name painted above the doorway",
    "education building front with blue and white signage showing the name"
]

# Health buildings (hospitals, clinics, health posts, pharmacies)
edif_saude_descriptions = [
    "hospital building with red cross and visible institution name sign",
    "public health clinic with entrance sign showing name and medical icon",
    "medical facility with name on wall and symbol like red cross or stethoscope",
    "health center entrance with medical signboard and institution name",
    "urban clinic facade with mounted nameplate and health icon",
    "emergency care building with name and ambulance access sign",
    "dental clinic sign with name and dental symbol near sidewalk",
    "health post building with a large nameplate next to the main door",
    "pharmacy with red or green cross and store name clearly shown"
]

# Social development buildings (shelters, social assistance and community service centers)
edif_desenv_social_descriptions = [
    "public shelter with name sign at entrance and institutional logo",
    "social assistance center with government signage and building entrance",
    "social support building with plaque identifying the institution",
    "urban aid center with logo and painted name above the door",
    "government-run shelter with signage on gate or fence",
    "social facility for community services with name clearly displayed",
    "community service building with wall sign and public logo",
    "child welfare center entrance with name and emblem above door"
]

# Leisure and culture facilities (theaters, libraries, stadiums, parks, museums)
edif_constr_lazer_descriptions = [
    "theater building with name above main entrance",
    "library facade with identifying sign or logo",
    "cultural center with plaque or mural showing name",
    "stadium entrance with institution name in large letters",
    "park entrance with archway and signage showing name",
    "community center with name sign near front door",
    "museum with clear name sign near pedestrian entrance",
    "public playground area with wooden sign showing location name",
    "art center entrance with decorative nameplate"
]

# Civil public buildings (city hall, municipal and government offices, courts)
edif_pub_civil_descriptions = [
    "city hall building with official sign and institution name",
    "municipal building with plaque or wall text identifying it",
    "government office entrance with emblem and name",
    "administrative public building with nameboard above entrance",
    "urban public service building with logo and wall sign",
    "civil institution with coat of arms and name near front gate",
    "court building with national symbol and visible signage"
]

# Tourist sites (attractions, squares, monuments, hotels, viewpoints)
edif_turistica_descriptions = [
    "tourist attraction with descriptive sign or plaque at the site",
    "urban square with engraved name on stone or metal plaque",
    "art installation or sculpture with informational board",
    "hotel facade with illuminated or mounted name sign",
    "monument with text description at the base or entrance",
    "tourist spot with sign next to the sidewalk showing name",
    "fountain or landmark with name inscribed or signed nearby",
    "panoramic viewpoint area with name sign and railing",
    "historic building with sign explaining its significance"
]

# Metro and railway stations
edif_metro_ferroviaria_descriptions = [
    "subway entrance with metal sign showing station name",
    "train station facade with identifying text above gate",
    "metro station with illuminated nameboard and transit symbol",
    "rail terminal entrance with directional sign and name",
    "urban transport station with platform signage and location name",
    "public transit hub with wall-mounted route map and station name",
    "underground station entrance with curved roof and name plaque",
    "rail station nameboard with schedule display nearby"
]

In [18]:
# Descriptor lists keyed by class name.
# get_tokens_for_row looks up a row's lower-cased "classe" value in this dictionary.
CLASS_DESCRIPTION_MAP = {
    "edif_ensino": edif_ensino_descriptions,
    "edif_saude": edif_saude_descriptions,
    "edif_desenv_social": edif_desenv_social_descriptions,
    "edif_constr_lazer": edif_constr_lazer_descriptions,
    "edif_pub_civil": edif_pub_civil_descriptions,
    "edif_turistica": edif_turistica_descriptions,
    "edif_metro_ferroviaria": edif_metro_ferroviaria_descriptions
}

### CLIP com batch e salva parciais


In [23]:
# Improved version 2
# Setup: directories, logging, HTTP session, CLIP model, and per-class tokens

# --------------------------------------------------------------------------------------
# Output directories and logging
# --------------------------------------------------------------------------------------

# To write into a directory named after today's date (YYYYMMDD) instead, use:
#date_str = datetime.now().strftime('%Y%m%d')
#OUT_DIR = Path(f"results/4_CLIP_results/2_ViT_L14/{date_str}")
OUT_DIR = Path("results/4_CLIP_results/2_ViT_L14/20250819")
CHUNKS_DIR = OUT_DIR / "chunks"

# Create the directories if they do not exist yet
OUT_DIR.mkdir(parents=True, exist_ok=True)
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)

logfile = OUT_DIR / f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_clip_log.log"
# force=True replaces any handlers already attached to the root logger. Without it,
# basicConfig does nothing once the root logger has handlers, so re-running this cell
# would keep logging to the previous file and ignore the new `logfile`.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(logfile, mode='w', encoding="utf-8"),
        logging.StreamHandler()
    ],
    force=True,
)
logger = logging.getLogger()
logger.info("Inicializando pipeline...")

# --------------------------------------------------------------------------------------
# Incremental persistence helpers
# --------------------------------------------------------------------------------------
# One timestamp shared by the artifacts created in this run
_run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Reuse the most recent checkpoint found in OUT_DIR. A freshly timestamped name on every
# run never exists yet, so load_progress() could not resume after a kernel restart.
# Timestamped names sort chronologically, hence the last one is the newest.
_previous_progress = sorted(OUT_DIR.glob("*_progress.json"))
PROGRESS_FILE = _previous_progress[-1] if _previous_progress else OUT_DIR / f"{_run_stamp}_progress.json"
LOGS_NDJSON   = OUT_DIR / f"{_run_stamp}_clip_logs.ndjson"

def load_progress(progress_file=PROGRESS_FILE):
    """Read the checkpoint file and return what has already been processed.

    Args:
        progress_file: path object of the JSON checkpoint (must support `.exists()`).

    Returns:
        A tuple `(processed_osm_ids, batch_counter)`: the set of ids stored under
        "processed_osm_ids" and the integer stored under "batch_counter".
        `(set(), 0)` when the file does not exist.
    """
    if not progress_file.exists():
        return set(), 0

    with open(progress_file, "r", encoding="utf-8") as fh:
        state = json.load(fh)

    done_ids = set(state.get("processed_osm_ids", []))
    counter = int(state.get("batch_counter", 0))
    return done_ids, counter

def save_progress(processed_osm_ids, batch_counter, progress_file=PROGRESS_FILE):
    """Persist the checkpoint (processed OSM ids and batch counter) as JSON.

    The data is written to a sibling temporary file first and then moved over the
    target with `os.replace`, so an interruption in the middle of the write cannot
    leave a truncated checkpoint behind.

    Args:
        processed_osm_ids: iterable of OSM ids already processed.
        batch_counter: number of batches written so far.
        progress_file: destination path of the checkpoint.
    """
    target = Path(progress_file)
    tmp_file = target.with_name(target.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump({
            "processed_osm_ids": list(processed_osm_ids),
            "batch_counter": int(batch_counter),
            "last_update": datetime.now().isoformat()
        }, f, ensure_ascii=False, indent=2)
    os.replace(tmp_file, target)

def _ndjson_default(value):
    """Fallback encoder for values `json` cannot serialise natively.

    Missing-value markers (e.g. pd.NA, NaT) become null, objects exposing `.item()`
    (numpy scalars) become plain Python scalars, anything else is written as `str(value)`.
    """
    try:
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        # Not a scalar (pd.isna returned an array); fall through
        pass
    item = getattr(value, "item", None)
    if callable(item):
        try:
            return item()
        except Exception:
            pass
    return str(value)


def append_ndjson(records, ndjson_path=LOGS_NDJSON):
    """Append records to a newline-delimited JSON file, one JSON object per line.

    Records built from pandas rows may carry pd.NA or numpy scalars, which `json.dumps`
    rejects with TypeError; those are converted by `_ndjson_default` instead of
    aborting the write.

    Args:
        records: list of dicts to append; nothing is written when it is empty.
        ndjson_path: destination file, opened in append mode.
    """
    if not records:
        return
    with open(ndjson_path, "a", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False, default=_ndjson_default) + "\n")

def _geom_cols_to_wkb(df: pd.DataFrame) -> pd.DataFrame:
    """Converte quaisquer colunas de geometria shapely para WKB (_wkb) e remove as originais."""
    try:
        from shapely.geometry import base as shapely_base
        from shapely import wkb as shapely_wkb
    except Exception:
        # Se shapely não estiver disponível, apenas retorna sem converter
        return df

    df = df.copy()
    for col in list(df.columns):
        if col.startswith("geometry"):
            # Converte apenas se detectar objetos shapely
            if df[col].map(lambda v: isinstance(v, shapely_base.BaseGeometry)).any():
                df[col + "_wkb"] = df[col].map(
                    lambda g: shapely_wkb.dumps(g) if isinstance(g, shapely_base.BaseGeometry) else pd.NA
                )
                df.drop(columns=[col], inplace=True)
    return df


def _to_wkb_series(series):
    from shapely.geometry import base as shapely_base
    from shapely import wkb as shapely_wkb
    return series.apply(lambda g: shapely_wkb.dumps(g) if isinstance(g, shapely_base.BaseGeometry) else pd.NA)

def write_chunk_parquet(rows, batch_counter, chunks_dir=CHUNKS_DIR, crs=None):
    """Write one chunk of match records to a Parquet file and return its path.

    When the records have a 'geometry' column (the Mapillary point) the chunk is
    written as GeoParquet with that column as the active geometry; any other
    'geometry*' column holding shapely objects (e.g. 'geometry_osm') is stored as WKB
    in a '<name>_wkb' column. If the GeoParquet write fails for any reason, every
    geometry column is converted to WKB and a plain Parquet file is written instead.

    Args:
        rows: list of dicts, one per OSM/Mapillary match.
        batch_counter: integer used in the file name.
        chunks_dir: directory (path object) that receives the file.
        crs: CRS assigned to the active geometry. When None, the CRS of the
            module-level `gdf_osm` is used if that variable exists (previous behaviour).
            NOTE(review): the active geometry comes from Mapillary, so this assumes
            both layers share the same CRS; confirm.

    Returns:
        The path of the written file, or None when `rows` is empty.
    """
    if not rows:
        return None

    df = pd.DataFrame(rows)

    # Optional unique key per OSM/Mapillary pair
    if "@osmId" in df.columns and "image_id" in df.columns:
        df["unique_key"] = df["@osmId"].astype(str) + "_" + df["image_id"].astype(str)

    fname = chunks_dir / f"clip_matches_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{batch_counter:06d}.parquet"

    if "geometry" in df.columns:
        try:
            import geopandas as gpd
            from shapely.geometry import base as shapely_base

            # 1) Convert every geometry column EXCEPT the main 'geometry' to WKB,
            #    but only where the column really holds shapely objects
            secondary = [c for c in df.columns if c.startswith("geometry") and c != "geometry"]
            for col in secondary:
                if df[col].map(lambda v: isinstance(v, shapely_base.BaseGeometry)).any():
                    df[col + "_wkb"] = _to_wkb_series(df[col])
                    df = df.drop(columns=[col])

            # 2) Build the GeoDataFrame with 'geometry' (Mapillary) as active geometry
            if crs is None:
                crs = getattr(globals().get("gdf_osm", None), "crs", None)
            gdf = gpd.GeoDataFrame(df, geometry="geometry", crs=crs)

            # 3) Save as GeoParquet
            gdf.to_parquet(fname, index=False)
            return fname

        except Exception as e:
            logger.warning(f"GeoParquet indisponível ({type(e).__name__}: {e}); convertendo TODAS as geometrias para WKB e salvando Parquet comum.")
            # FALLBACK: convert the remaining geometry columns, main one included, to WKB
            try:
                df = _geom_cols_to_wkb(df)
            except Exception as conv_err:
                # Best effort: report the problem and try to write what we have
                logger.warning(f"Falha ao converter geometrias para WKB ({type(conv_err).__name__}: {conv_err}).")

    # Plain Parquet (no shapely columns, or already converted to WKB)
    df.to_parquet(fname, index=False)
    return fname
# --------------------------------------------------------------------------------------
# Sessão HTTP (otimização 1): Session + raise_for_status + timeout padrão menor
# --------------------------------------------------------------------------------------
def build_requests_session(total_retries=2, backoff=0.3, status_forcelist=(500, 502, 503, 504)):
    """Create a `requests.Session` with automatic retries and a larger connection pool.

    Args:
        total_retries: maximum number of retries per request.
        backoff: backoff factor applied between retries.
        status_forcelist: HTTP status codes that trigger a retry.

    Returns:
        The configured session. No timeout is stored on it; callers pass `timeout`
        on each request.
    """
    retry_policy = Retry(
        total=total_retries,
        backoff_factor=backoff,
        status_forcelist=status_forcelist,
        allowed_methods=["GET", "HEAD"]
    )
    pooled_adapter = HTTPAdapter(max_retries=retry_policy, pool_connections=50, pool_maxsize=50)

    http = requests.Session()
    # The same adapter serves both schemes
    for scheme in ("http://", "https://"):
        http.mount(scheme, pooled_adapter)
    return http

session = build_requests_session()
logger.info("Sessão HTTP configurada.")

# --------------------------------------------------------------------------------------
# CLIP model and the function that builds tokens dynamically per class
# --------------------------------------------------------------------------------------
# Assumes `model`, `preprocess` and `device` were already defined in an earlier cell.
# If not, adjust this block to load the model.

try:
    _ = device  # check that the name exists
except NameError:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.warning(f"`device` não estava definido; usando '{device}'.")

# Put the model in eval mode and move it to the device:
try:
    model.eval()
    model.to(device)  # in case it is not on the device yet
except NameError as e:
    raise RuntimeError("O objeto `model` não está definido no ambiente. Defina-o antes de seguir.") from e

# Uses the dictionary: CLASS_DESCRIPTION_MAP
def get_tokens_for_row(row, device="cuda"):
    """Tokenize the CLIP descriptions that belong to the class of an OSM row.

    Args:
        row: mapping-like object (dict or pandas Series) with an optional "classe" entry.
        device: device the token tensor is moved to.

    Returns:
        The tensor produced by `clip.tokenize` for the class descriptions, or for a
        single generic description when the class is missing or unknown.
    """
    raw_classe = row.get("classe", "")
    # A missing value arrives as None/NaN/pd.NA rather than "", and those have no .strip();
    # treat anything that is not a string as "no class" and use the generic fallback.
    classe = raw_classe.strip().lower() if isinstance(raw_classe, str) else ""
    descriptions = CLASS_DESCRIPTION_MAP.get(classe)

    if not descriptions:
        descriptions = ["urban building with visible nameplate or signage"]  # generic fallback

    return clip.tokenize(descriptions).to(device)

logger.info(f"Modelo em eval() e tokens movidos para '{device}'.")

2025-08-22 15:22:51,022 [INFO] Inicializando pipeline...
2025-08-22 15:22:51,024 [INFO] Sessão HTTP configurada.
2025-08-22 15:22:51,028 [INFO] Modelo em eval() e tokens movidos para 'cuda'.


In [24]:
def _row_classe(row):
    """Return the normalised class label of an OSM row ('' when missing or not a string)."""
    value = row.get("classe", "")
    return value.strip().lower() if isinstance(value, str) else ""


def _url_or_none(value):
    """Return `value` as a plain str, or None when it is missing or empty."""
    if value is None:
        return None
    try:
        # Must come before any truth test: `not pd.NA` raises TypeError
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        pass
    return str(value) if value else None


def _row_field(obj, key):
    """Read `key` from a dict/Series-like object, going through to_dict() if it has no .get."""
    if hasattr(obj, "get"):
        return obj.get(key, None)
    return obj.to_dict().get(key, None)


def batch_image_matches_clip(
    df_rows,
    threshold=0.25,
    preprocess=None,
    model=None,
    device="cuda",
    session=None,
    timeout=5.0
):
    """
    Score a batch of OSM/Mapillary pairs with CLIP.

    Pairs are grouped by the OSM row's class so that each group is compared against
    the text descriptors of its own class. For every pair the 1024 px thumbnail is
    tried first and the 256 px one second; the first that downloads and decodes is used.

    The score of an image is the largest softmax probability over the descriptors of
    its class. NOTE(review): the softmax is normalised over those descriptors only, so
    with n descriptors the score is never below 1/n; confirm that `threshold` is
    meaningful for the number of descriptors in use.

    Args:
        df_rows: list of dicts [{row, map_row, thumb_1024_url, thumb_256_url}]
        threshold: minimum score for a pair to count as a match.
        preprocess: CLIP image preprocessing callable (required).
        model: CLIP model (required).
        device: device used for inference.
        session: optional requests session; a temporary one is created (and closed)
            when omitted.
        timeout: per-request timeout in seconds.

    Returns:
        results: list of tuples (index_in_batch, score, match_bool); pairs whose image
                 could not be obtained are reported as (index_in_batch, 0.0, False)
        logs:    list of dicts with download/timing/error information

    Raises:
        ValueError: if `df_rows` is not empty and `preprocess` or `model` is missing.
    """
    logs = []
    results = []
    if not df_rows:
        return results, logs

    # Without this check a missing preprocess would be reported as a download error
    if preprocess is None or model is None:
        raise ValueError("`preprocess` e `model` devem ser informados.")

    owns_session = session is None
    if owns_session:
        session = requests.Session()

    try:
        # --------------------------------------------------------------------------------
        # Group the pairs by CLASS (different tokens per subgroup)
        # --------------------------------------------------------------------------------
        from collections import defaultdict
        grouped_rows = defaultdict(list)
        for idx, d in enumerate(df_rows):
            grouped_rows[_row_classe(d['row'])].append((idx, d))

        # --------------------------------------------------------------------------------
        # Process each subgroup (class) with its own tokens
        # --------------------------------------------------------------------------------
        for classe, items in grouped_rows.items():
            # Any row of the group gives the tokens of its class
            try:
                tokens_device = get_tokens_for_row(items[0][1]['row'], device=device)
            except Exception as e:
                logger.error(f"[ERRO TOKENIZER] Falha ao gerar tokens para classe '{classe}': {e}")
                # Report the group as non-matching instead of dropping it from the results
                results.extend((i, 0.0, False) for i, _ in items)
                continue

            images = []
            batch_info = []   # (index_in_batch, has_image), in the order of `items`
            sublogs = []

            for i_batch, d in items:
                img_url_used = None
                erro = ""
                tempo_ini = time.time()

                for urlcol in ('thumb_1024_url', 'thumb_256_url'):
                    url = _url_or_none(d.get(urlcol))
                    if url is None:
                        continue
                    try:
                        resp = session.get(url, timeout=timeout)
                        resp.raise_for_status()
                        image = Image.open(BytesIO(resp.content)).convert("RGB")
                        images.append(preprocess(image))
                        img_url_used = url
                        break
                    except Exception as e:
                        erro = f"{type(e).__name__}: {e}"
                        logger.warning(f"[ERRO DOWNLOAD] {erro} - {url}")
                        continue

                tempo_fim = time.time()
                sublogs.append({
                    "osm_id": _row_field(d['row'], "@osmId"),
                    "mapillary_image_id": _row_field(d['map_row'], "image_id"),
                    "classe": classe,
                    "url_1024": _url_or_none(d.get("thumb_1024_url")),
                    "url_256": _url_or_none(d.get("thumb_256_url")),
                    "url_used": img_url_used,
                    "error": erro if img_url_used is None else "",
                    "time_s": round(tempo_fim - tempo_ini, 3)
                })

                batch_info.append((i_batch, img_url_used is not None))

            logs.extend(sublogs)

            # CLIP inference, only for the images that were obtained
            scores = []
            if images:
                image_tensor = torch.stack(images).to(device)
                with torch.no_grad():
                    logits_per_image, _ = model(image_tensor, tokens_device)
                    probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()
                scores = probs.max(axis=1)

            # `scores` is aligned with the pairs that have an image, in group order
            img_idx = 0
            for i_batch, has_image in batch_info:
                if has_image:
                    score = float(scores[img_idx])
                    img_idx += 1
                    results.append((i_batch, score, score >= threshold))
                else:
                    # Keep the real batch index so failures remain traceable
                    results.append((i_batch, 0.0, False))
    finally:
        # Close only a session created here; a caller-provided one stays open
        if owns_session:
            session.close()

    return results, logs

In [None]:
# Improved version 2

# Main pipeline with:
#  - incremental checkpoint (progress.json)
#  - GeoParquet output (using write_chunk_parquet from the setup cell)
#  - optional bounding-box pre-filter + Haversine
#  - model.eval() and text tokens handled in the setup cell
#  - HTTP session with timeout=5s (setup cell)
#  - FINAL GEOMETRY = Mapillary (geometry), with OSM kept in geometry_osm

# ------------------------------------------------
# Tunable parameters
# ------------------------------------------------
BATCH_SIZE = 128
SAVE_EVERY_BATCHES = 50      # save every N batches
FLUSH_MIN_ROWS = 500        # or save once N results have accumulated
OSM_ID_COL = "@osmId"
MAX_K = 100                 # maximum k when using the plain BallTree (fallback)
RADIUS_METERS = 100
USE_BBOX_PREFILTER = False   # pre-filter by bbox before computing Haversine (faster, but returns overlapping results)

# ------------------------------------------------
# Load previous progress
# ------------------------------------------------
processed_osm_ids, batch_counter = load_progress()
logger.info(f"Retomando com {len(processed_osm_ids)} OSMs já processados; batch_counter={batch_counter}")

# ------------------------------------------------
# Minimal normalisation of the URL columns
# ------------------------------------------------
# Cast the URL columns that exist to pandas' "string" dtype (modifies the GeoDataFrame in place)
for col in ("thumb_256_url", "thumb_1024_url", "thumb_2048_url", "thumb_original_url"):
    if col in gdf_mapillary_candidatos_atualizado.columns:
        gdf_mapillary_candidatos_atualizado[col] = gdf_mapillary_candidatos_atualizado[col].astype("string")

# ------------------------------------------------
# Coords + BallTree (for the fallback)
# ------------------------------------------------
# (lat, lon) tuples taken from the point geometry; note that this adds a "coords" column
# to the GeoDataFrame itself. The tree is built on the coordinates converted to radians.
gdf_mapillary_candidatos_atualizado["coords"] = gdf_mapillary_candidatos_atualizado.geometry.apply(lambda g: (g.y, g.x))
coords_mapillary = gdf_mapillary_candidatos_atualizado["coords"].to_list()
coords_mapillary_rad = np.radians(coords_mapillary)
tree = BallTree(coords_mapillary_rad, metric='haversine')

# Arrays for the fast bbox test (read by bbox_mask)
lats = gdf_mapillary_candidatos_atualizado.geometry.y.values
lons = gdf_mapillary_candidatos_atualizado.geometry.x.values

# ------------------------------------------------
# Buffers of this run
# ------------------------------------------------
resultados_buffer = []
rows_batch = []
log_buffer = []
batches_since_last_save = 0

# ------------------------------------------------
# Filter out OSM features already processed
# ------------------------------------------------
# Ids are compared as strings on both sides
if OSM_ID_COL in gdf_osm.columns and processed_osm_ids:
    mask = ~gdf_osm[OSM_ID_COL].astype(str).isin({str(x) for x in processed_osm_ids})
    gdf_osm_iter = gdf_osm[mask]
else:
    gdf_osm_iter = gdf_osm

logger.info(f"Processando {len(gdf_osm_iter)} OSM restantes (de {len(gdf_osm)}) e {len(gdf_mapillary_candidatos_atualizado)} imagens Mapillary")

# ------------------------------------------------
# Utilitários
# ------------------------------------------------
EARTH_R = 6371000.0  # Earth radius used by the haversine formula, in meters

def haversine_dist_m(lat1_deg, lon1_deg, lats2_deg, lons2_deg):
    """Great-circle distance in meters from one point to one or many points.

    Args:
        lat1_deg, lon1_deg: latitude and longitude of the origin, in degrees.
        lats2_deg, lons2_deg: latitudes and longitudes of the targets, in degrees
            (scalars or array-likes of the same shape).

    Returns:
        Distance(s) in meters, with the shape of the target inputs.
    """
    lat1 = np.radians(lat1_deg)
    lon1 = np.radians(lon1_deg)
    lat2 = np.radians(lats2_deg)
    lon2 = np.radians(lons2_deg)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal points),
    # which would make sqrt/arcsin return NaN; clamp it to the valid domain.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return EARTH_R * c

def bbox_mask(lat_deg, lon_deg, radius_m=RADIUS_METERS):
    """Boolean mask over the module-level ``lats``/``lons`` arrays for a lat/lon box.

    The box is centred on (``lat_deg``, ``lon_deg``) and its half-sides are the
    angular size of ``radius_m``. It is a cheap prefilter: callers still apply
    the exact haversine distance afterwards.

    The angular size is derived from ``EARTH_R`` (the same sphere used by
    ``haversine_dist_m``) rather than the 111 320 m/deg constant, so the box is
    never tighter than the radius test it precedes.

    Parameters
    ----------
    lat_deg, lon_deg : float
        Box centre in decimal degrees.
    radius_m : float
        Half-side of the box in metres.

    Returns
    -------
    numpy.ndarray of bool
        True where the corresponding entry of ``lats``/``lons`` is inside the box.
    """
    # Angular half-height of the box (degrees of latitude)
    delta_lat = math.degrees(radius_m / EARTH_R)
    # A degree of longitude shrinks with cos(lat); the floor avoids a division by zero at the poles
    cos_lat = max(1e-9, math.cos(math.radians(lat_deg)))
    delta_lon = delta_lat / cos_lat
    return (
        (lats >= lat_deg - delta_lat) & (lats <= lat_deg + delta_lat) &
        (lons >= lon_deg - delta_lon) & (lons <= lon_deg + delta_lon)
    )

# ------------------------------------------------
# Processamento e flush de um batch
# (GEOMETRIA FINAL = MAPILLARY)
# ------------------------------------------------
def process_and_flush(rows_batch, resultados_buffer, log_buffer):
    """Score one batch of (OSM, Mapillary) pairs and collect the matching ones.

    Parameters
    ----------
    rows_batch : list of dict
        Pairs to score; each item holds the OSM row under ``"row"`` and the
        Mapillary row under ``"map_row"``.
    resultados_buffer : list of dict
        Receives one merged record per matching pair (mutated in place).
    log_buffer : list
        Receives the log records returned by ``batch_image_matches_clip``
        (mutated in place).

    Returns
    -------
    tuple
        ``(resultados_buffer, log_buffer)``, the same objects that were passed in.

    Notes
    -----
    The merged record keeps the Mapillary geometry under ``"geometry"`` and the
    OSM geometry under ``"geometry_osm"``. When both rows share a column name,
    the Mapillary value wins. ``preprocess``, ``model``, ``device`` and
    ``session`` are read from the notebook namespace.
    """
    if not rows_batch:
        return resultados_buffer, log_buffer

    started = time.time()
    results, logs = batch_image_matches_clip(
        df_rows=rows_batch,
        threshold=0.25,
        preprocess=preprocess,
        model=model,
        device=device,
        session=session,
        timeout=5.0,
    )
    finished = time.time()
    log_buffer.extend(logs)

    for pair_idx, score, match in results:
        if not match or pair_idx is None:
            continue

        pair = rows_batch[pair_idx]
        osm_row = pair["row"]
        mapillary_row = pair["map_row"]

        # Pull the geometries out first so they can be stored under distinct keys.
        geom_osm = osm_row.geometry
        geom_map = mapillary_row.geometry

        # OSM attributes first, Mapillary attributes second (both without 'geometry').
        record = {k: v for k, v in osm_row.to_dict().items() if k != "geometry"}
        record.update(
            (k, v) for k, v in mapillary_row.to_dict().items() if k != "geometry"
        )
        record["geometry_osm"] = geom_osm
        record["geometry"] = geom_map
        record["dist_m"] = float(mapillary_row["dist_m"])
        record["score"] = float(score)
        resultados_buffer.append(record)

    logger.info(f"Lote de {len(rows_batch)} imagens processado em {finished - started:.1f}s")
    return resultados_buffer, log_buffer

# ------------------------------------------------
# Loop principal com checkpoint e persistência
# ------------------------------------------------
try:
    for _, row in tqdm(gdf_osm_iter.iterrows(), total=len(gdf_osm_iter), desc="Processando POIs do OSM"):
        lat_osm = float(row.geometry.y)
        lon_osm = float(row.geometry.x)

        if USE_BBOX_PREFILTER:
            # 1) Cheap bounding-box pre-cut
            m = bbox_mask(lat_osm, lon_osm, RADIUS_METERS)
            if not np.any(m):
                # Nothing nearby: this OSM feature is done
                if OSM_ID_COL in row.index:
                    processed_osm_ids.add(str(row[OSM_ID_COL]))
                continue

            sub = gdf_mapillary_candidatos_atualizado.loc[m].copy()

            # 2) Exact haversine distance, computed only on the subset
            dists_m = haversine_dist_m(lat_osm, lon_osm, sub.geometry.y.values, sub.geometry.x.values)
            sub["dist_m"] = dists_m
            candidatos = sub[sub["dist_m"] <= RADIUS_METERS].copy()

            # Fallback: nothing within RADIUS_METERS but the box is not empty
            # (points in its corners) -> keep the MAX_K nearest of those.
            if len(candidatos) == 0 and len(sub) > 0:
                N = min(MAX_K, len(sub))
                idx_top = np.argpartition(dists_m, N-1)[:N]
                candidatos = sub.iloc[idx_top].copy()
                candidatos["dist_m"] = dists_m[idx_top]

        else:
            # No prefilter: query the BallTree for the nearest neighbours, then apply the radius
            coord_osm_rad = np.radians([[lat_osm, lon_osm]])
            dist_rad, ind = tree.query(coord_osm_rad, k=min(MAX_K, len(gdf_mapillary_candidatos_atualizado)))
            dist_m = dist_rad[0] * EARTH_R
            candidatos = gdf_mapillary_candidatos_atualizado.iloc[ind[0]].copy()
            candidatos["dist_m"] = dist_m
            candidatos = candidatos[candidatos["dist_m"] <= RADIUS_METERS].copy()

        if candidatos.empty:
            if OSM_ID_COL in row.index:
                processed_osm_ids.add(str(row[OSM_ID_COL]))
            continue

        # Queue the (OSM, Mapillary) pairs
        for _, map_row in candidatos.iterrows():
            rows_batch.append({
                "row": row,
                "map_row": map_row,
                "thumb_1024_url": map_row.get('thumb_1024_url'),
                "thumb_256_url": map_row.get('thumb_256_url')
            })

            if len(rows_batch) >= BATCH_SIZE:
                resultados_buffer, log_buffer = process_and_flush(rows_batch, resultados_buffer, log_buffer)
                rows_batch.clear()

                # Incremental persistence
                batches_since_last_save += 1
                if batches_since_last_save >= SAVE_EVERY_BATCHES or len(resultados_buffer) >= FLUSH_MIN_ROWS:
                    batch_counter += 1
                    out_path = write_chunk_parquet(resultados_buffer, batch_counter)  # GeoParquet ('geometry' = Mapillary)
                    append_ndjson(log_buffer)
                    save_progress(processed_osm_ids, batch_counter)

                    logger.info(f"Parcial salva: {out_path.name if out_path else '—'} | "
                                f"resultados={len(resultados_buffer)} | logs={len(log_buffer)} | "
                                f"OSMs concl.: {len(processed_osm_ids)}")

                    resultados_buffer.clear()
                    log_buffer.clear()
                    batches_since_last_save = 0
                gc.collect()

        # This OSM feature is complete (its last pairs may still be queued in rows_batch)
        if OSM_ID_COL in row.index:
            processed_osm_ids.add(str(row[OSM_ID_COL]))

    # Last (partial) batch
    if rows_batch:
        resultados_buffer, log_buffer = process_and_flush(rows_batch, resultados_buffer, log_buffer)
        rows_batch.clear()

finally:
    # Final flush (also runs on KeyboardInterrupt or any other exception).
    #
    # If the loop was interrupted, rows_batch can still hold pairs that were never
    # scored even though their OSM feature was already marked as processed above.
    # Un-mark those ids so a resumed run revisits them instead of silently dropping
    # their candidates. Pairs of those features scored in an earlier batch will be
    # scored again on resume, so downstream consumers may need to deduplicate.
    if rows_batch:
        pending_ids = {
            str(item["row"][OSM_ID_COL])
            for item in rows_batch
            if OSM_ID_COL in item["row"].index
        }
        processed_osm_ids.difference_update(pending_ids)
        logger.warning(f"Execução interrompida com {len(rows_batch)} pares pendentes; "
                       f"{len(pending_ids)} OSMs desmarcados para reprocessamento.")
        rows_batch.clear()

    batch_counter += 1
    out_path = write_chunk_parquet(resultados_buffer, batch_counter)  # GeoParquet
    append_ndjson(log_buffer)
    save_progress(processed_osm_ids, batch_counter)

    logger.info(f"Flush final: wrote {out_path.name if out_path else '—'}, "
                f"{len(resultados_buffer)} resultados, {len(log_buffer)} logs, "
                f"{len(processed_osm_ids)} OSMs concluídos.")

    # Flush every handler so the log is safely on disk
    for handler in logger.handlers:
        handler.flush()

    resultados_buffer.clear()
    log_buffer.clear()
    gc.collect()

##### Consolidate and save unique results

In [None]:
# Load every ".parquet" chunk and merge them into a single GeoDataFrame
# (the export to Parquet/GPKG happens in later cells)

# Chunk files, in name order
parts = sorted(glob.glob(str(CHUNKS_DIR / "clip_matches_*.parquet")))
if not parts:
    # Fail with an actionable message instead of the "No objects to concatenate" from pd.concat
    raise FileNotFoundError(f"Nenhum arquivo 'clip_matches_*.parquet' encontrado em {CHUNKS_DIR}")

# Read with a progress bar
gdfs = []
for p in tqdm(parts, desc="Lendo arquivos .parquet"):
    gdfs.append(gpd.read_parquet(p))

# Concatenate; the CRS is taken from the first chunk
gdf_final = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdfs[0].crs)

In [30]:
# Number of rows in the consolidated GeoDataFrame
len(gdf_final)

108475

In [31]:
# CRS of the consolidated GeoDataFrame (taken from the first parquet chunk)
gdf_final.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [32]:
# Column dtypes of the consolidated GeoDataFrame
print(gdf_final.dtypes)

id_celula             object
classe                object
tag                   object
value                 object
@changesetId           int64
                      ...   
dist_m               float64
geometry            geometry
score                float64
unique_key            object
geometry_osm_wkb      object
Length: 241, dtype: object


In [33]:
# Stringify values of container/binary types (tuple, list, dict, numpy.ndarray, bytes)
# in every column except "geometry", and report how many values changed per column.
_TIPOS_PROBLEMATICOS = (tuple, list, dict, np.ndarray, bytes)


def _precisa_converter(valor):
    """True when ``valor`` is one of the container/binary types listed above."""
    return isinstance(valor, _TIPOS_PROBLEMATICOS)


def _para_string(valor):
    """Return ``str(valor)`` for problematic values and the value itself otherwise."""
    return str(valor) if _precisa_converter(valor) else valor


colunas_convertidas = {}

for col in tqdm(gdf_final.columns, desc="Convertendo colunas problemáticas"):
    if col != "geometry":
        # Count the values that need conversion before touching the column
        mask = gdf_final[col].apply(_precisa_converter)
        n_convertidos = mask.sum()

        if n_convertidos > 0:
            gdf_final[col] = gdf_final[col].apply(_para_string)
            colunas_convertidas[col] = int(n_convertidos)

# Summary
if not colunas_convertidas:
    print("Nenhuma coluna precisava de conversão.")
else:
    print("Colunas convertidas para string:")
    for col, n in colunas_convertidas.items():
        print(f" - {col}: {n} valores convertidos")

Convertendo colunas problemáticas:   0%|          | 0/241 [00:00<?, ?it/s]

Colunas convertidas para string:
 - coords: 108475 valores convertidos
 - geometry_osm_wkb: 108475 valores convertidos


In [34]:
# Drop the intermediate helper columns, but only those that were stringified above
_colunas_intermediarias = {"coords", "geometry_osm_wkb"}
colunas_a_dropar = [
    col for col in colunas_convertidas if col in _colunas_intermediarias
]
gdf_para_exportar = gdf_final.drop(columns=colunas_a_dropar).copy()

In [None]:
# Columns that remain in the frame to be exported
gdf_para_exportar.columns.tolist()

In [37]:
# Write the export frame to a timestamped parquet file inside OUT_DIR
_carimbo_parquet = datetime.now().strftime('%Y%m%d_%H%M%S')
output_path = os.path.join(
    OUT_DIR,
    f"{_carimbo_parquet}_clip_mpl_resultado_final.parquet",
)
gdf_para_exportar.to_parquet(output_path, index=False)
print(f"Salvo com sucesso: {output_path}")

Salvo com sucesso: results\4_CLIP_results\2_ViT_L14\20250819\20250822_152754_clip_mpl_resultado_final.parquet


In [38]:
# Identify the OSM points that ended up with no matched Mapillary image

# 1. OSM ids that have at least one matched image
ids_com_imagem = set(gdf_para_exportar["@osmId"])

# 2. OSM points whose id is absent from that set
_tem_imagem = gdf_osm["@osmId"].isin(ids_com_imagem)
gdf_sem_imagens_mpl = gdf_osm[~_tem_imagem].copy()

print(f"OSMs com imagem CLIP: {len(ids_com_imagem)}")
print(f"OSMs sem imagem válida: {len(gdf_sem_imagens_mpl)}")

OSMs com imagem CLIP: 1254
OSMs sem imagem válida: 373


In [None]:
# Write the final result to one GeoPackage holding two layers.
# The file name carries a timestamp so every export is a distinct file.
_carimbo_gpkg = datetime.now().strftime('%Y%m%d_%H%M%S')
output_gpkg = os.path.join(
    OUT_DIR,
    f"{_carimbo_gpkg}_clip_osm_mpl_results_final.gpkg",
)
_opcoes_gpkg = {"driver": "GPKG"}

# Layer 1: matched pairs
gdf_para_exportar.to_file(output_gpkg, layer="clip_vitL14_mpl_matches", **_opcoes_gpkg)

# Layer 2: OSM points without any matched image
gdf_sem_imagens_mpl.to_file(output_gpkg, layer="osm_no_mpl_images", **_opcoes_gpkg)

print(f"GeoPackage salvo com duas camadas:\n {output_gpkg}")

In [15]:
# Reload "20250822_152831_clip_osm_mpl_results_final.gpkg" in chunks, with a progress bar

# Paths
out_path = "results/4_CLIP_results/2_ViT_L14/20250819"
fp_gpkg = os.path.join(out_path, "20250822_152831_clip_osm_mpl_results_final.gpkg")
layer_name = "clip_vitL14_mpl_matches"

# Make sure the layer exists before opening it
layers = fiona.listlayers(fp_gpkg)
if layer_name not in layers:
    raise ValueError(f"Layer '{layer_name}' não encontrada no arquivo. Layers disponíveis: {layers}")

# Chunked read
chunk_size = 10_000
dfs = []

with fiona.open(fp_gpkg, layer=layer_name) as src:
    # Feature count is only used for reporting and for the progress-bar total
    total_features = len(src)
    print(f"Total de feições na layer '{layer_name}': {total_features}")

    # Resolve the CRS once: fall back to the WKT form when src.crs is None
    crs = src.crs
    if crs is None and getattr(src, "crs_wkt", None):
        try:
            from pyproj import CRS
            crs = CRS.from_wkt(src.crs_wkt).to_wkt()
        except Exception:
            crs = None

    features = []
    for feat in tqdm(src, total=total_features, desc=f"Carregando '{layer_name}'"):
        features.append(feat)
        if len(features) >= chunk_size:
            dfs.append(gpd.GeoDataFrame.from_features(features, crs=crs))
            features.clear()
            gc.collect()

    # Flush whatever is left; this does not rely on the reported feature count being exact
    if features:
        dfs.append(gpd.GeoDataFrame.from_features(features, crs=crs))
        features.clear()

# Concatenate everything into the final GeoDataFrame
if not dfs:
    # Empty layer
    gdf_mpl_final = gpd.GeoDataFrame(geometry=[])
else:
    gdf_mpl_final = gpd.GeoDataFrame(
        pd.concat(dfs, ignore_index=True),
        crs=dfs[0].crs
    )

print("GeoDataFrame final:", gdf_mpl_final.shape)

# Free the intermediate chunks (trailing ';' keeps the collector's return value out of the cell output)
del dfs
gc.collect();

Total de feições na layer 'clip_vitL14_mpl_matches': 108475


Carregando 'clip_vitL14_mpl_matches':   0%|          | 0/108475 [00:00<?, ?it/s]

GeoDataFrame final: (108475, 239)


18

In [16]:
# Show the GeoDataFrame reloaded from the GeoPackage layer
display(gdf_mpl_final)

Unnamed: 0,geometry,id_celula,classe,tag,value,@changesetId,@contributionChangesetId,@creation,@osmId,@osmType,...,computed_geometry_type,computed_geometry_coordinates,captured_date,mesh,mesh_id,mesh_url,sfm_cluster_url,dist_m,score,unique_key
0,POINT (-44.04466 -19.99024),200ME60356N90866,edif_ensino,amenity,school,12291812,12291812.0,1,node/1830091373,node,...,Point,"[-44.044614809247, -19.990220050764]",2021-05-22T17:27:18.797000+00:00,"{'id': '577704926921396', 'url': 'https://scon...",577704926921396,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,37.792386,0.355469,node/1830091373_294025759131160
1,POINT (-44.04468 -19.99028),200ME60356N90866,edif_ensino,amenity,school,12291812,12291812.0,1,node/1830091373,node,...,Point,"[-44.044642263015, -19.990251174585]",2021-05-22T17:27:18.297000+00:00,"{'id': '1012597642863885', 'url': 'https://sco...",1012597642863885,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,37.804249,0.584961,node/1830091373_307923321060745
2,POINT (-44.04470 -19.99031),200ME60356N90866,edif_ensino,amenity,school,12291812,12291812.0,1,node/1830091373,node,...,Point,"[-44.044669286533, -19.99028178551]",2021-05-22T17:27:17.797000+00:00,"{'id': '787881018573286', 'url': 'https://scon...",787881018573286,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,38.413131,0.592773,node/1830091373_982441749168567
3,POINT (-44.04463 -19.99021),200ME60356N90866,edif_ensino,amenity,school,12291812,12291812.0,1,node/1830091373,node,...,Point,"[-44.044587591826, -19.990188639796]",2021-05-22T17:27:19.297000+00:00,"{'id': '402381914875580', 'url': 'https://scon...",402381914875580,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,38.629732,0.440186,node/1830091373_798420737483679
4,POINT (-44.04473 -19.99034),200ME60356N90866,edif_ensino,amenity,school,12291812,12291812.0,1,node/1830091373,node,...,Point,"[-44.044695073472, -19.990310530965]",2021-05-22T17:27:17.297000+00:00,"{'id': '569087531104456', 'url': 'https://scon...",569087531104456,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,39.079634,0.346680,node/1830091373_970234847060373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108470,POINT (-43.94716 -19.98322),200ME60458N90870,edif_comerc_serv,healthcare,*,124096290,124096290.0,1,node/711921964,node,...,Point,"[-43.947185846893, -19.98322623627]",2019-10-25T07:44:26+00:00,,,,,38.950672,0.511230,node/711921964_1265027787280010
108471,POINT (-43.94713 -19.98328),200ME60458N90870,edif_comerc_serv,healthcare,*,124096290,124096290.0,1,node/711921964,node,...,Point,"[-43.947116487133, -19.983318245942]",2019-10-25T15:20:34.169000+00:00,"{'id': '795145147801243', 'url': 'https://scon...",795145147801243,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,https://scontent.fplu40-1.fna.fbcdn.net/m1/v/t...,38.993018,0.531250,node/711921964_222881509639515
108472,POINT (-43.94715 -19.98325),200ME60458N90870,edif_comerc_serv,healthcare,*,124096290,124096290.0,1,node/711921964,node,...,Point,"[-43.94716990436, -19.983262058745]",2019-10-25T07:44:27+00:00,,,,,39.095704,0.391113,node/711921964_3919115544883393
108473,POINT (-43.94717 -19.98318),200ME60458N90870,edif_comerc_serv,healthcare,*,124096290,124096290.0,1,node/711921964,node,...,Point,"[-43.947201021744, -19.983170396422]",2019-10-25T07:44:25+00:00,,,,,39.095872,0.543945,node/711921964_1003959927024726


In [17]:
# CRS of the reloaded GeoDataFrame
gdf_mpl_final.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- undefined
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

#### Visualisation of results in folium

In [None]:
import folium

# --- Funções para tooltips customizados ---

def make_osm_tooltip(row):
    """Build the HTML popup text for one OSM feature.

    Parameters
    ----------
    row : mapping or pandas.Series
        OSM attributes; only the fields listed in ``campos`` are used, and
        missing or null fields are skipped.

    Returns
    -------
    str
        One ``<b>field:</b> value<br>`` fragment per available field
        (``classe`` and ``name`` are rendered in bold). Values are
        HTML-escaped because OSM tags are free text typed by contributors.
    """
    from html import escape  # local import: this cell does not rely on the top import cell for it

    campos = ['id_celula', 'classe', 'tag', 'value', '@osmId', '@timestamp', '@version', 'name']
    label = ""
    for k in campos:
        if k in row and pd.notnull(row[k]):
            valor = escape(str(row[k]))
            if k == "classe" or k == "name":
                label += f"<b>{k}:</b> <b>{valor}</b><br>"
            else:
                label += f"<b>{k}:</b> {valor}<br>"
    return label

def make_mapillary_tooltip(row):
    """Build the HTML popup text for one matched Mapillary image.

    Parameters
    ----------
    row : mapping or pandas.Series
        Image attributes; only the fields listed in ``campos`` are used, and
        missing or null fields are skipped.

    Returns
    -------
    str
        One ``<br>``-terminated fragment per available field. ``thumb_*``
        fields become links that open in a new tab; every value (including the
        link target) is HTML-escaped so it cannot break out of the markup.
    """
    from html import escape  # local import: this cell does not rely on the top import cell for it

    campos = ['image_id', 'sequence_id',
              'thumb_256_url', 'thumb_1024_url', 'thumb_2048_url', 'thumb_original_url',
              'captured_date', 'score', 'dist_m']
    label = ""
    for k in campos:
        if k in row and pd.notnull(row[k]):
            # quote=True also escapes quotes, which is required inside the href attribute
            valor = escape(str(row[k]), quote=True)
            if k.startswith("thumb_"):
                label += f'<b>{k}:</b> <a href="{valor}" target="_blank">Abrir imagem</a><br>'
            else:
                label += f"<b>{k}:</b> {valor}<br>"
    return label

# --- Centralizar o mapa em cima dos resultados ---
def get_center(gdf):
    """Return ``[lat, lon]`` of the midpoint of ``gdf.total_bounds``.

    ``total_bounds`` is read as ``(minx, miny, maxx, maxy)``; the result is in
    latitude-first order.
    """
    min_x, min_y, max_x, max_y = gdf.total_bounds
    return [(min_y + max_y) / 2, (min_x + max_x) / 2]

m = folium.Map(location=get_center(gdf_final), zoom_start=14)

# --- OSM points: blue markers ---
for _, row in gdf_osm.iterrows():
    geom = row.geometry
    if geom.geom_type != 'Point':
        continue
    icone_osm = folium.Icon(
        icon="glyphicon glyphicon-map-marker",
        prefix='glyphicon',
        color='blue',
        icon_color='white',
    )
    popup_osm = folium.Popup(make_osm_tooltip(row), max_width=350)
    folium.Marker(location=[geom.y, geom.x], icon=icone_osm, popup=popup_osm).add_to(m)

# --- Matched Mapillary points: red circles ---
_estilo_circulo = dict(radius=4, color='red', fill=True, fill_color='red', fill_opacity=0.7)
for _, row in gdf_final.iterrows():
    geom = row.geometry
    if geom.geom_type != 'Point':
        continue
    popup_mpl = folium.Popup(make_mapillary_tooltip(row), max_width=350)
    folium.CircleMarker(
        location=[geom.y, geom.x],
        popup=popup_mpl,
        **_estilo_circulo,
    ).add_to(m)

display(m)

Para visualizar determinada imagem do Mapillary no navegador:
https://www.mapillary.com/app/?pKey=295642088766022

## Testes de implementação

---
#### 1ª Implementação: CLIP por bounding box

* `Resultados limitados`

* Considera que as thumbs_urls estão atualizadas

* Estratégia:
  1. **Pré-processamento e organização espacial**
    - Criar coordenadas auxiliares (`(lat, lon)`) para facilitar o cálculo de distâncias geodésicas.
    - Construir um índice espacial (`R-tree`) para o conjunto de pontos do Mapillary, otimizando as consultas de vizinhança.
  2. **Busca inteligente dos vizinhos mais próximos**
    - Para cada ponto OSM:
      - Criar uma bounding box (0.0015°, aproximadamente 150 metros) para selecionar pontos candidatos do Mapillary Coverage tiles por sobreposição espacial via R-tree.
      - Calcular a distância geodésica (em metros) entre o ponto OSM e os pontos candidatos.
      - Ordenar os candidatos por distância crescente.
      - Selecionar os N (ex: 30) mais próximos para avaliação com CLIP.

In [None]:
from geopy.distance import geodesic
from shapely.geometry import box
from tqdm import tqdm

# Tokenize the descriptors once
text_tokens = clip.tokenize(all_descriptions).to(device)

# Pre-compute (lat, lon) tuples for the Mapillary points
gdf_mapillary["coords"] = gdf_mapillary.geometry.apply(lambda g: (g.y, g.x))

# Spatial index (R-tree)
map_idx = gdf_mapillary.sindex

# Half-size of the spatial pre-selection window, in degrees
buffer_deg = 0.0015  # roughly 150 m

resultados = []
bbox_list = []

for idx, row in tqdm(gdf_osm.iterrows(), total=len(gdf_osm), desc="Processando POIs do OSM"):
    pt = row.geometry
    lat_osm, lon_osm = pt.y, pt.x  # OSM point coordinates

    # Bounding box around the point for the fast spatial lookup
    bounds = pt.buffer(buffer_deg).bounds  # minx, miny, maxx, maxy
    bbox_geom = box(*bounds)
    bbox_list.append({"osm_id": row.get('@osmId', idx), "geometry": bbox_geom})
    possible_matches_index = list(map_idx.intersection(bounds))
    candidates = gdf_mapillary.iloc[possible_matches_index].copy()

    # Geodesic distance (metres) from the OSM point to every candidate
    candidates["dist_m"] = candidates["coords"].apply(
        lambda coord: geodesic((lat_osm, lon_osm), coord).meters
    )

    # Keep the 30 nearest
    candidates = candidates.sort_values("dist_m").head(30)

    # Score with CLIP
    for _, map_row in candidates.iterrows():
        url_thumb = map_row.get('thumb_256_url')
        # Skip missing URLs. NaN is truthy and pd.NA cannot be used in a boolean
        # test, so the null check has to come before the emptiness check.
        if pd.isna(url_thumb) or not url_thumb:
            continue

        match, score = image_matches_clip(
            url_thumb, text_tokens,
            threshold=0.25,
            preprocess=preprocess,
            model=model,
            device=device
        )

        if match:
            resultados.append({
                **row.to_dict(),
                **map_row.to_dict(),
                "dist_m": map_row["dist_m"],
                "score": score,
                "thumb_url": url_thumb
            })

# NOTE(review): the boxes are built in the coordinates of gdf_osm; confirm that
# gdf_osm is really in EPSG:4674 (other frames in this notebook report EPSG:4326).
gdf_bbox = gpd.GeoDataFrame(bbox_list, crs="EPSG:4674")

#### 2ª Implementação: CLIP sem batch

- Considera que as thumbs_urls estão atualizadas

In [None]:
def image_matches_clip(url, text_tokens, threshold=0.25, preprocess=None, model=None, device="cpu"):
    """Download one image and test it against the tokenized text prompts.

    Parameters
    ----------
    url : str
        Image URL to download (10 s timeout).
    text_tokens :
        Tokenized prompts passed to ``model`` together with the image.
    threshold : float
        Minimum score for the image to count as a match.
    preprocess, model :
        Image transform and model called as ``model(image_tensor, text_tokens)``.
    device : str
        Device the image tensor is moved to.

    Returns
    -------
    tuple of (bool, float)
        ``(score >= threshold, score)``, where ``score`` is the largest value of
        the softmax taken over the last axis of the image logits. Any failure
        (download, HTTP error status, decoding, inference) is printed and
        reported as ``(False, 0.0)``.
    """
    try:
        response = requests.get(url, timeout=10)
        # Surface HTTP errors (e.g. an expired signed URL) instead of handing an
        # error page to PIL and getting an unrelated decoding message.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
        image_tensor = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            logits_per_image, _ = model(image_tensor, text_tokens)
            probs = logits_per_image.softmax(dim=-1).cpu().numpy().flatten()
        # Plain Python float/bool so the result serialises cleanly downstream
        score = float(probs.max())
        return score >= threshold, score
    except Exception as e:
        print(f"[ERRO] {e} - {url}")
        return False, 0.0

In [None]:
# Tokenize the descriptors once
text_tokens = clip.tokenize(all_descriptions).to(device)

# (lat, lon) of every Mapillary point, converted to radians for the haversine metric
gdf_mapillary["coords"] = gdf_mapillary.geometry.apply(lambda g: (g.y, g.x))
coords_mapillary_rad = np.radians(gdf_mapillary["coords"].to_list())

# BallTree index with the haversine metric
tree = BallTree(coords_mapillary_rad, metric='haversine')

# Query parameters
k_vizinhos = 100
# BallTree.query raises when k exceeds the number of indexed points
k_consulta = min(k_vizinhos, len(gdf_mapillary))

resultados = []

for idx, row in tqdm(gdf_osm.iterrows(), total=len(gdf_osm), desc="Processando POIs do OSM"):
    lat_osm, lon_osm = row.geometry.y, row.geometry.x
    coord_osm_rad = np.radians([[lat_osm, lon_osm]])

    # K nearest neighbours from the BallTree
    dist_rad, ind = tree.query(coord_osm_rad, k=k_consulta)
    dist_m = dist_rad[0] * 6371000  # radians -> metres (6371000 m = Earth radius)

    candidatos = gdf_mapillary.iloc[ind[0]].copy()
    candidatos["dist_m"] = dist_m

    # Keep candidates within 200 metres of the OSM point
    candidatos = candidatos[candidatos["dist_m"] <= 200]
    if candidatos.empty:
        continue  # No image nearby: move on to the next point

    # Score only the valid candidates with CLIP
    for _, map_row in candidatos.iterrows():
        url_thumb = map_row.get('thumb_1024_url')
        # Null check first: pd.NA cannot be used in a boolean test
        if pd.isna(url_thumb) or not url_thumb:
            continue

        match, score = image_matches_clip(
            url_thumb, text_tokens,
            threshold=0.25,
            preprocess=preprocess,
            model=model,
            device=device
        )

        if match:
            resultados.append({
                **row.to_dict(),
                **map_row.to_dict(),
                "dist_m": map_row["dist_m"],
                "score": score
            })

#### 3ª Implementação: CLIP com batch

- Considera que as URLs dos thumbnails estão atualizadas

In [None]:
# Função para avaliação em batch
def batch_image_matches_clip(
    urls, text_tokens, threshold=0.25,
    preprocess=None, model=None, device="cpu"
):
    """Download a batch of images and score them in a single forward pass.

    Parameters
    ----------
    urls : sequence of str
        Image URLs; each is downloaded with a 10 s timeout.
    text_tokens :
        Tokenized prompts passed to ``model`` together with the image batch.
    threshold : float
        Minimum score for an image to count as a match.
    preprocess, model :
        Image transform and model called as ``model(image_tensor, text_tokens)``.
    device : str
        Device the stacked image tensor is moved to.

    Returns
    -------
    list of (int, float, bool)
        ``(position in urls, score, score >= threshold)`` for every image that
        could be downloaded and decoded; ``score`` is the row-wise maximum of the
        softmax over the last axis of the image logits. URLs that fail are
        printed and left out, so the list can be shorter than ``urls`` (and is
        empty when nothing could be loaded).
    """
    images = []
    indices = []
    for idx, url in enumerate(urls):
        try:
            response = requests.get(url, timeout=10)
            # Surface HTTP errors instead of handing an error page to PIL
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")
            images.append(preprocess(image))
            indices.append(idx)
        except Exception as e:
            print(f"[ERRO] {e} - {url}")
            continue

    if not images:
        return []

    image_tensor = torch.stack(images).to(device)
    with torch.no_grad():
        logits_per_image, _ = model(image_tensor, text_tokens)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()
        scores = probs.max(axis=1)

    # Plain Python float/bool so the results serialise cleanly downstream
    return [(i, float(score), bool(score >= threshold)) for i, score in zip(indices, scores)]


In [None]:
# Main pipeline: spatial search + CLIP in batches

batch_size = 64  # or 128/256, depending on the GPU
k_vizinhos = 100


def _match_batch(urls, pairs):
    """Score one batch of URLs and return the merged records of the matching pairs.

    ``pairs[i]`` is the ``(osm_row, mapillary_row)`` tuple that ``urls[i]`` belongs to.
    """
    matched = []
    results = batch_image_matches_clip(
        urls, text_tokens, threshold=0.25,
        preprocess=preprocess, model=model, device=device
    )
    for i, score, match in results:
        if match:
            row_, map_row_ = pairs[i]
            matched.append({
                **row_.to_dict(),
                **map_row_.to_dict(),
                "dist_m": map_row_["dist_m"],
                "score": score
            })
    return matched


resultados = []
urls_batch = []
rows_batch = []

# BallTree.query raises when k exceeds the number of indexed points.
# NOTE(review): assumes `tree` was built from gdf_mapillary by the previous cell.
k_consulta = min(k_vizinhos, len(gdf_mapillary))

for idx, row in tqdm(gdf_osm.iterrows(), total=len(gdf_osm), desc="Processando POIs do OSM"):
    lat_osm, lon_osm = row.geometry.y, row.geometry.x
    coord_osm_rad = np.radians([[lat_osm, lon_osm]])
    dist_rad, ind = tree.query(coord_osm_rad, k=k_consulta)
    dist_m = dist_rad[0] * 6371000  # radians -> metres

    candidatos = gdf_mapillary.iloc[ind[0]].copy()
    candidatos["dist_m"] = dist_m
    candidatos = candidatos[candidatos["dist_m"] <= 200]
    if candidatos.empty:
        continue

    for _, map_row in candidatos.iterrows():
        url_thumb = map_row.get('thumb_1024_url')
        # Null check first: pd.NA cannot be used in a boolean test
        if pd.isna(url_thumb) or not url_thumb:
            continue
        urls_batch.append(url_thumb)
        rows_batch.append((row, map_row))

        if len(urls_batch) == batch_size:
            resultados.extend(_match_batch(urls_batch, rows_batch))
            urls_batch = []
            rows_batch = []
            gc.collect()  # free RAM between batches

# Last batch (if anything is left)
if urls_batch:
    resultados.extend(_match_batch(urls_batch, rows_batch))
    urls_batch = []
    rows_batch = []
    gc.collect()

print(f"Total de correspondências: {len(resultados)}")

#### 4ª Implementação: CLIP com batch (não salva parciais)

In [None]:
# Configuração do Logging para o CLIP pipeline
import logging
from pathlib import Path
from datetime import datetime

out_dir = "results/4_CLIP_results"
Path(out_dir).mkdir(parents=True, exist_ok=True)

# Log to a timestamped file inside out_dir and to the console
logfile = f"{out_dir}/clip_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(logfile, mode='w', encoding="utf-8"),
        logging.StreamHandler()
    ],
    # Without force=True, basicConfig does nothing when the root logger already has
    # handlers (e.g. after re-running this cell or configuring logging earlier), and
    # the log file would never be written. Requires Python 3.8+.
    force=True,
)
logger = logging.getLogger("clip_pipeline")

In [None]:
# batch_image_matches_clip: download one batch of thumbnails and score them with CLIP

def batch_image_matches_clip(
    df_rows, text_tokens, threshold=0.25,
    preprocess=None, model=None, device="cpu"
):
    """
    Download the thumbnail of every candidate in ``df_rows`` and score the
    images that could be loaded against ``text_tokens`` in one forward pass.

    Parameters
    ----------
    df_rows : list of dict
        One dict per candidate with the keys ``row`` and ``map_row`` (objects
        exposing ``.get``; only read for the log) plus ``thumb_1024_url`` and
        ``thumb_256_url``. The 1024 px URL is tried first, then the 256 px one.
    text_tokens
        Passed unchanged to ``model`` - presumably the tokenized prompts,
        already on ``device`` (TODO confirm against the caller).
    threshold : float
        Minimum score for an image to be flagged as a match.
    preprocess : callable
        Turns a PIL RGB image into a tensor; the tensors are stacked.
    model : callable
        Called as ``model(image_tensor, text_tokens)``; must return a pair
        whose first item is ``logits_per_image``.
    device
        Target of ``image_tensor.to(device)``.

    Returns
    -------
    results : list of tuple
        ``(idx, score, match)`` for each entry of ``df_rows``, in order.
        ``idx`` is the position in ``df_rows``; entries whose thumbnail could
        not be loaded are reported as ``(None, 0.0, False)``. The list is
        empty when no image at all could be loaded. ``score`` is a Python
        float and ``match`` a Python bool.
    logs : list of dict
        One entry per item of ``df_rows`` with the ids, both URLs, the URL
        actually used, the last error (only when nothing loaded) and the
        elapsed time in seconds.

    Raises
    ------
    ValueError
        If ``preprocess`` or ``model`` is missing. Without this check a missing
        ``preprocess`` would be swallowed by the per-image error handling and
        every image would be reported as a download failure.

    Notes
    -----
    NOTE(review): the score is the maximum of a softmax over the last axis of
    ``logits_per_image`` (presumably one column per text prompt). With a single
    prompt that softmax is always 1.0 - confirm ``text_tokens`` holds several.
    """
    if preprocess is None or model is None:
        raise ValueError("batch_image_matches_clip requires both `preprocess` and `model`")

    images = []
    loaded = []  # one flag per df_rows entry: was a thumbnail loaded?
    logs = []

    for d in df_rows:
        img_url_used = None
        erro = ""
        tempo_ini = time.time()

        # Try thumb_1024_url first, fall back to thumb_256_url
        for urlcol in ('thumb_1024_url', 'thumb_256_url'):
            url = d[urlcol]
            if not url or pd.isna(url):
                continue
            try:
                response = requests.get(url, timeout=10)
                # Fail on HTTP errors here, so the log shows the real status
                # instead of a PIL "cannot identify image" on an error page.
                response.raise_for_status()
                image = Image.open(BytesIO(response.content)).convert("RGB")
                images.append(preprocess(image))
                img_url_used = url
                break  # success
            except Exception as e:
                # Best effort: remember the error and try the next URL
                erro = str(e)
                logger.warning(f"[ERRO] {e} - {url}")

        logs.append({
            "osm_id": d['row'].get("@osmId", None),
            "mapillary_image_id": d['map_row'].get("image_id", None),
            "url_1024": d.get("thumb_1024_url"),
            "url_256": d.get("thumb_256_url"),
            "url_used": img_url_used,
            "error": erro if img_url_used is None else "",
            "time_s": round(time.time() - tempo_ini, 3)
        })
        loaded.append(img_url_used is not None)

    if not images:
        return [], logs

    image_tensor = torch.stack(images).to(device)
    with torch.no_grad():
        logits_per_image, _ = model(image_tensor, text_tokens)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()
        scores = probs.max(axis=1)

    # `scores` only covers the loaded images, in order; walk it in step with
    # the per-entry flags to map every score back to its df_rows position.
    score_iter = iter(scores)
    results = []
    for idx, ok in enumerate(loaded):
        if ok:
            raw_score = next(score_iter)
            results.append((idx, float(raw_score), bool(raw_score >= threshold)))
        else:
            results.append((None, 0.0, False))  # thumbnail could not be processed

    return results, logs

In [None]:
# Main pipeline: CLIP matching in batches, with logging and a per-image download log

batch_size = 128          # images per CLIP call; tune to the GPU memory (e.g. 128 or 256)
CLIP_THRESHOLD = 0.25     # minimum CLIP score for an image to count as a match
MAX_NEIGHBORS = 100       # nearest Mapillary images queried per OSM POI
MAX_DIST_M = 200          # keep only candidates within this distance (metres)
EARTH_RADIUS_M = 6371000  # converts haversine distances (radians) to metres

resultados = []   # one dict per matching (OSM POI, Mapillary image) pair
log_lote = []     # per-image log entries returned by batch_image_matches_clip
rows_batch = []   # candidates waiting for the next CLIP batch


def _process_clip_batch(batch):
    """Score one batch with CLIP, append its matches to `resultados` and its
    log entries to `log_lote`; return the seconds spent in the CLIP call."""
    t0 = time.time()
    results, logs = batch_image_matches_clip(
        batch, text_tokens, threshold=CLIP_THRESHOLD,
        preprocess=preprocess, model=model, device=device
    )
    elapsed = time.time() - t0
    for i, score, match in results:
        if match and i is not None:
            row_, map_row_ = batch[i]["row"], batch[i]["map_row"]
            # On keys present in both rows the Mapillary value wins, because it
            # is unpacked last (presumably this includes "geometry" - confirm).
            resultados.append({
                **row_.to_dict(),
                **map_row_.to_dict(),
                "dist_m": map_row_["dist_m"],
                "score": score
            })
    log_lote.extend(logs)
    return elapsed


logger.info(f"Pipeline CLIP iniciado para {len(gdf_osm)} POIs e {len(gdf_mapillary_candidatos_atualizado)} imagens Mapillary")

# BallTree over the updated candidates; the haversine metric expects
# (lat, lon) in radians. The coordinates are kept in a local array instead of a
# "coords" column of tuples: that would mutate the input GeoDataFrame and put a
# tuple column into every result row.
geom_mapillary = gdf_mapillary_candidatos_atualizado.geometry
coords_mapillary_rad = np.radians(np.column_stack([geom_mapillary.y, geom_mapillary.x]))
tree = BallTree(coords_mapillary_rad, metric='haversine')
k_neighbors = min(MAX_NEIGHBORS, len(gdf_mapillary_candidatos_atualizado))

for idx, row in tqdm(gdf_osm.iterrows(), total=len(gdf_osm), desc="Processando POIs do OSM"):
    lat_osm, lon_osm = row.geometry.y, row.geometry.x
    coord_osm_rad = np.radians([[lat_osm, lon_osm]])
    # Always search in the updated candidates
    dist_rad, ind = tree.query(coord_osm_rad, k=k_neighbors)
    dist_m = dist_rad[0] * EARTH_RADIUS_M

    # Filter by distance before copying, so only the rows that are kept get copied
    within = dist_m <= MAX_DIST_M
    if not within.any():
        continue
    candidatos = gdf_mapillary_candidatos_atualizado.iloc[ind[0][within]].copy()
    candidatos["dist_m"] = dist_m[within]

    for _, map_row in candidatos.iterrows():
        rows_batch.append({
            "row": row,
            "map_row": map_row,
            "thumb_1024_url": map_row.get('thumb_1024_url'),
            "thumb_256_url": map_row.get('thumb_256_url')
        })
        if len(rows_batch) == batch_size:
            elapsed = _process_clip_batch(rows_batch)
            logger.info(f"Lote de {batch_size} imagens processado em {elapsed:.1f}s")
            rows_batch = []
            gc.collect()  # free memory between batches

# Final batch (whatever is left over)
if rows_batch:
    _process_clip_batch(rows_batch)
    logger.info(f"Último lote ({len(rows_batch)} imagens) processado.")

logger.info(f"Total de correspondências: {len(resultados)}")

In [None]:
# Build a DataFrame from `resultados` (one dict per OSM-Mapillary match),
# store the CLIP score as float64, then wrap the table as a GeoDataFrame
# using its "geometry" column and CRS EPSG:4674.
df_resultados = pd.DataFrame(resultados).astype({"score": "float64"})
gdf_resultados = gpd.GeoDataFrame(df_resultados, geometry="geometry", crs="EPSG:4674")

In [None]:
# Preview the first rows of the CLIP match GeoDataFrame
display(gdf_resultados.head())

In [None]:
# Number of rows in gdf_resultados (one per entry of `resultados`)
len(gdf_resultados)

Estratégia para salvar os pares OSM–Mapillary com correspondência positiva e também preservar os pontos OSM sem nenhuma imagem válida (onde não há cobertura do Mapillary):
0.   Verificar se tem alguma coluna com tupla e remover antes de salvar o GPKG
1.   Fazer um set dos OSM IDs que apareceram em gdf_resultados
2.   Filtrar os que não estão nesse set, e salvar como gdf_sem_imagem

In [None]:
# Show the dtype of every column
print(gdf_resultados.dtypes)

# Columns that may be a problem for GPKG hold tuples, lists, dicts or other
# complex objects; this check reports the ones containing at least one tuple.
tuple_columns = [
    col for col in gdf_resultados.columns
    if gdf_resultados[col].apply(lambda x: isinstance(x, tuple)).any()
]
for col in tuple_columns:
    print(f"Coluna '{col}' contém tupla!")

In [None]:
# Turn every tuple/list/dict value into its string form, in all columns
# except the geometry one.
container_types = (tuple, list, dict)
non_geometry_columns = [c for c in gdf_resultados.columns if c != "geometry"]
for col in non_geometry_columns:
    gdf_resultados[col] = gdf_resultados[col].apply(
        lambda x: str(x) if isinstance(x, container_types) else x
    )

In [None]:
# OSM ids that appear in at least one row of gdf_resultados
ids_com_imagem = set(gdf_resultados["@osmId"])

# OSM features whose id is not in that set
sem_correspondencia = ~gdf_osm["@osmId"].isin(ids_com_imagem)
gdf_sem_imagem = gdf_osm.loc[sem_correspondencia]

In [None]:
# Output folder and file name of the GeoPackage that will hold gdf_resultados
output_dir = "data/output_tests/CLIP_results_test"
gpkg_filename = "CLIP_osm_mapillary_results_test2_v2_ViT_L14.gpkg"

# Create the folder if needed, then show the full path as the cell output
os.makedirs(output_dir, exist_ok=True)
output_gpkg = os.path.join(output_dir, gpkg_filename)
output_gpkg

In [None]:
# Write two layers into the same GeoPackage, in this order:
#   "clip_matches" -> gdf_resultados (rows with a CLIP match)
#   "osm_no_image" -> gdf_sem_imagem (OSM features without any match)
for layer_name, layer_gdf in (
    ("clip_matches", gdf_resultados),
    ("osm_no_image", gdf_sem_imagem),
):
    layer_gdf.to_file(output_gpkg, driver="GPKG", layer=layer_name)

print(f"GeoPackage salvo com duas camadas: {output_gpkg}")

In [None]:
# Reload gdf_resultados from the "clip_matches" layer of the saved GeoPackage
# (the name now refers to the on-disk version) and preview its first rows
gdf_resultados = gpd.read_file(output_gpkg, layer="clip_matches")
display(gdf_resultados.head())

In [None]:
# Row count of gdf_resultados
len(gdf_resultados)

---
## Download estruturado das imagens do Mapillary

In [18]:
def clean_path(s):
    """Sanitize a value so it can be used as one folder or file name.

    Missing values (None / NaN) become "undefined". An asterisk is spelled out
    as "any". Forward and back slashes, colon, question mark, double quote,
    angle brackets and pipe become "_": they are path separators or characters
    that Windows rejects in names (OSM keys such as "addr:street" contain a
    colon). Leading and trailing whitespace is stripped.
    """
    if pd.isna(s):
        return "undefined"
    to_underscore = str.maketrans({ch: "_" for ch in '/\\:?"<>|'})
    return str(s).replace("*", "any").translate(to_underscore).strip()

def build_output_path(row, root_dir):
    """Return ``(filepath, url)`` for one row and create the target folder.

    The folder is ``root_dir/<classe>/<tag>_<value>/<osm id>``, every part
    passed through ``clean_path``. The file is ``<image_id>.jpg`` and the URL
    is the row's ``thumb_original_url``.
    """
    # 'id' is only used when the row has no '@osmId' key at all
    raw_osm_id = row.get('@osmId', row.get('id'))
    parts = {key: clean_path(row.get(key)) for key in ('classe', 'tag', 'value')}
    osm_folder = clean_path(raw_osm_id).replace("/", "_")

    target_dir = os.path.join(
        root_dir, parts['classe'], f"{parts['tag']}_{parts['value']}", osm_folder
    )
    # The folder is created here, before any download is attempted
    os.makedirs(target_dir, exist_ok=True)

    target_file = os.path.join(target_dir, f"{row.get('image_id')}.jpg")
    return target_file, row.get('thumb_original_url')

def download_image_from_url(url, filepath, cooldown=1, max_attempts=3, backoff=2.0, timeout=15):
    """Download an image from ``url`` and save it to ``filepath``.

    Parameters
    ----------
    url : str or missing
        Image URL; nothing is done when it is empty, None or NaN.
    filepath : str or path-like
        Destination file. An existing file is never downloaded again.
    cooldown : float
        Seconds to sleep after a successful download (0/None disables it).
    max_attempts : int
        Total tries for one image. Only connection errors and timeouts are
        retried; HTTP errors (e.g. an expired URL) fail immediately.
    backoff : float
        Base wait between retries; attempt ``n`` waits ``backoff * n`` seconds.
    timeout : float
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    bool
        True when ``filepath`` already existed or was downloaded, False when
        there was no URL or the download failed (the failure is printed and
        not raised).
    """
    if not url or pd.isna(url):
        return False
    if os.path.exists(filepath):
        return True  # already downloaded

    # Write to a temporary name first: an interrupted write must not leave a
    # truncated file that the exists-check above would skip on the next run.
    tmp_path = f"{filepath}.part"
    attempts = max(1, int(max_attempts))

    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                f.write(response.content)
            os.replace(tmp_path, filepath)
        except (requests.ConnectionError, requests.Timeout) as e:
            # Transient network problem (e.g. connection reset): retry
            if attempt < attempts:
                sleep(backoff * attempt)
                continue
            print(f"[ERRO] Falha ao baixar imagem: {url} → {e}")
            return False
        except Exception as e:
            # Best effort: report and move on to the next image
            print(f"[ERRO] Falha ao baixar imagem: {url} → {e}")
            return False

        if cooldown:
            sleep(cooldown)
        return True

    return False

def download_all_images_from_gdf(gdf, root_dir="results/5_mapillary_images", cooldown=1):
    """Download the image of every row of ``gdf`` into ``root_dir``.

    For each row, ``build_output_path`` gives the target file and the URL, and
    ``download_image_from_url`` fetches it with the given ``cooldown``.
    A progress bar follows the rows.
    """
    total = len(gdf)
    print(f"Iniciando download de {total} imagens para: {root_dir}")
    progress = tqdm(gdf.iterrows(), total=total, desc="Baixando imagens")
    for _index, record in progress:
        target_path, image_url = build_output_path(record, root_dir)
        download_image_from_url(image_url, target_path, cooldown=cooldown)

In [19]:
# Images go into a folder named after today's date (YYYYMMDD)
date_str = datetime.now().strftime('%Y%m%d')
root_dir = Path("results/5_CLIP_mapillary_images") / date_str

# Create the folder (and its parents) if it does not exist yet
root_dir.mkdir(parents=True, exist_ok=True)

# Run the download
# NOTE(review): gdf_mpl_final is not defined in this part of the notebook -
# confirm it is created earlier and holds the rows that should be downloaded.
download_all_images_from_gdf(gdf_mpl_final, root_dir)

Iniciando download de 108475 imagens para: results\5_CLIP_mapillary_images\20250825


Baixando imagens:   0%|          | 0/108475 [00:00<?, ?it/s]

[ERRO] Falha ao baixar imagem: https://scontent-ord5-3.xx.fbcdn.net/m1/v/t6/An9zRqyT5gcNHZUpNufLABTFX1Ml1YX8r1xwfPoLpQxuNPrkZXLybZRCZBsJhMb1_6cPSHosiGoYIkEEjSDYrPUxQRCnatQyAhvGmZKI_XDrrz8TCHx9hT070ehVz7YhYnrnDGK8LFMjBotKuRSF1Q?edm=ALXxkZ8EAAAA&_nc_gid=k2vznl4ESubKo0JIaLBZew&_nc_oc=Admf-5K-YJ95V_u4ZCuF7t3sRTxQFBcIBU7Zd5F-AU79mFF_SjuRZOygapjqfdjQa_w&ccb=10-5&oh=00_AfUwb3eOdWUaWB5wx-FbBkOEaaeP6COyU7uH3e_DCiPGqA&oe=68C03757&_nc_sid=201bca → ('Connection aborted.', ConnectionResetError(10054, 'Foi forçado o cancelamento de uma conexão existente pelo host remoto', None, 10054, None))
[ERRO] Falha ao baixar imagem: https://scontent-ord5-1.xx.fbcdn.net/m1/v/t6/An-OzuWVO7LMu6Olg5YXbSENSYLgZFN8OY2gGjK0rnvTlUlj_UUnpeBxefSoTV9k4ugVXxm--aY5V1Yaf5c_7CmUzxHKKM9XKxuTUp5HgsAG6aArTvIUa-osNVmUlvjyhFFp61lXprvXKRnQY5XqDA?edm=ALXxkZ8EAAAA&_nc_gid=Y8ZrB3vxlbc3wK2dh6nL5Q&_nc_oc=AdlRQEvCv9Q7E9xU8E06N0vBDNT1LRq1i3k4M4A08p9awDCrA4pTDp6UQOP6lg8LsMA&ccb=10-5&oh=00_AfWElTQ4opugmpamynr4BP18ZPzLa2d7BSWe39EBNxfiOQ&oe=