🚌 Projet MDM - Mobilité Durable en Montagne ⛰️

*Author: Laurent Sorba*

*Date: 14/09/2025*

**Description :**

This notebook combines up to 3 stop GeoJSONs, reprojects the inputs to EPSG:3857 if needed,
clusters the stops by proximity, represents each cluster by its centroid or medoid, and plots the result.

See https://github.com/data-for-good-grenoble/mobilite_durable/issues/37

In [None]:
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import MultiPoint
from sklearn.cluster import DBSCAN

from src.settings import EPSG_WEB_MERCATOR, EPSG_WGS84

# Default colour mapping (A, B, C below stand for the TDG, OSM and C2C sources)
"""
Colors:
- green = in A+B+C
- blue  = in any 2 sources
- orange/yellow/brown = only in TDG / OSM / C2C
"""

# Colour of a cluster whose stops all come from a single source, keyed by source label.
COLOR_SINGLE = {"TDG": "orange", "OSM": "yellow", "C2C": "brown"}
# Colour of a cluster present in exactly two sources.
COLOR_TWO = "blue"
# Colour of a cluster present in three (or more) sources.
COLOR_THREE = "green"
# Fallback colour: no source at all, or a single source label missing from COLOR_SINGLE.
COLOR_NONE = "gray"

TARGET_EPSG = EPSG_WEB_MERCATOR  # metric CRS for clustering and output


def read_and_ensure_crs(path, label, target_epsg=TARGET_EPSG):
    """Load a stop GeoJSON as Point features in ``target_epsg``, tagged with ``label``.

    Steps:
    1. Read the file and keep only ``Point`` geometries.
    2. Reproject to ``target_epsg`` when the file CRS differs.
    3. Add a ``source`` column holding ``label``.
    4. Best-effort normalisation of the ``stop_name`` and ``stop_id`` columns.

    Args:
        path: Path (or string) of the file to read.
        label: Source label stored in the ``source`` column (e.g. ``"TDG"``).
        target_epsg: CRS the returned frame is expressed in.

    Returns:
        A GeoDataFrame of Point features in ``target_epsg``.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        ValueError: the file holds no Point geometry, or has no CRS defined.
    """
    import warnings  # stdlib; local import keeps the notebook cell self-contained

    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"{path} not found")
    gdf = gpd.read_file(str(p))
    # `geom_type` is the documented per-geometry type accessor.
    gdf = gdf[gdf.geometry.geom_type == "Point"].copy()
    if gdf.empty:
        raise ValueError(f"{path} contains no Point geometries")
    if gdf.crs is None:
        raise ValueError(f"{path} has no CRS defined; cannot reproject")
    # reproject to target if needed
    if gdf.crs != target_epsg:
        gdf = gdf.to_crs(target_epsg)
    gdf["source"] = label

    # Normalise stop name column: unify 'name' and 'stop_name' into 'stop_name'.
    # Kept best-effort (a failure must not abort the load), but no longer silent.
    try:
        if "name" in gdf.columns:
            if "stop_name" in gdf.columns:
                stop_names = gdf["stop_name"]
                # A stop_name is usable only when it is non-null AND non-blank.
                # Testing notna() first matters: astype(str) alone would turn a
                # missing value into the truthy text "nan"/"None" and the
                # fallback to 'name' would never be taken.
                usable = stop_names.notna() & stop_names.astype(str).str.strip().ne("")
                gdf["stop_name"] = stop_names.where(usable, gdf["name"])
            else:
                gdf["stop_name"] = gdf["name"]
        # else: keep existing stop_name (if any) as is
    except Exception as exc:
        warnings.warn(f"{path}: could not normalise stop names ({exc!r})", stacklevel=2)

    # Normalise stop id column: when an 'id' column exists it becomes 'stop_id',
    # overriding any existing 'stop_id' (same outcome as the two original branches).
    # NOTE(review): the previous comment mentioned '@id', but only 'id' is read
    # here - confirm which property the OSM export actually carries.
    try:
        if "id" in gdf.columns:
            gdf["stop_id"] = gdf["id"]
        # else: keep existing stop_id (if any) as is
    except Exception as exc:
        warnings.warn(f"{path}: could not normalise stop ids ({exc!r})", stacklevel=2)
    return gdf


def medoid_point(points):
    """Return the member of ``points`` with the smallest summed distance to all others.

    ``points`` is an indexable sequence of objects exposing ``x`` and ``y``.
    Distances are Euclidean; on a tie the earliest point in the sequence wins.
    A single-element sequence returns that element.
    """
    xy = np.array([[pt.x, pt.y] for pt in points])
    if len(xy) == 1:
        return points[0]

    # Pairwise offsets via broadcasting: result has shape (n, n, 2).
    offsets = xy[:, np.newaxis, :] - xy[np.newaxis, :, :]
    pairwise = np.sqrt(np.sum(offsets ** 2, axis=2))

    # Row i holds the distances from point i to every point.
    totals = pairwise.sum(axis=1)
    best = int(np.argmin(totals))
    return points[best]


def cluster_and_represent(gdf_all, tol, use_medoid):
    """Cluster stops by proximity and return one representative row per cluster.

    Points are grouped with DBSCAN (``min_samples=1``, so every point belongs
    to a cluster) using Euclidean distance on the geometry coordinates.

    Args:
        gdf_all: GeoDataFrame of Point geometries with a ``source`` column;
            expected in TARGET_EPSG (metric). ``stop_id`` and ``stop_name``
            columns are optional.
        tol: DBSCAN ``eps``, expressed in the coordinate units of ``gdf_all``.
        use_medoid: If true, the representative is the cluster medoid (an
            actual member point); otherwise the centroid of the members.

    Returns:
        GeoDataFrame in the CRS of ``gdf_all`` with columns ``cluster``,
        ``geometry``, ``members``, ``count``, ``sources``, ``names``,
        ``color``, ``in_TDG``, ``in_OSM`` and ``in_C2C``.

    Raises:
        ValueError: ``gdf_all`` is empty.
    """
    if gdf_all.empty:
        # DBSCAN would reject this anyway, with a far less helpful message.
        raise ValueError("gdf_all is empty; nothing to cluster")

    # Vectorised coordinate extraction instead of a per-geometry Python loop.
    points = gdf_all.geometry
    coords = np.column_stack((points.x.to_numpy(), points.y.to_numpy()))
    db = DBSCAN(eps=tol, min_samples=1, metric="euclidean").fit(coords)
    gdf_all = gdf_all.copy()  # do not mutate the caller's frame
    gdf_all["cluster"] = db.labels_

    # Both columns are optional; check once rather than per cluster.
    has_ids = "stop_id" in gdf_all.columns
    has_names = "stop_name" in gdf_all.columns

    clusters = []
    for cid, sub in gdf_all.groupby("cluster"):
        geoms = list(sub.geometry)
        sources = sorted(sub["source"].unique().tolist())
        # Fall back to the row index when no stop_id column is available.
        members = sub["stop_id"].tolist() if has_ids else list(sub.index)
        if has_names:
            # Distinct, non-blank names, sorted for stable output.
            stripped = (str(x).strip() for x in sub["stop_name"].dropna())
            names = sorted({s for s in stripped if s})
        else:
            names = []
        rep_pt = medoid_point(geoms) if use_medoid else MultiPoint(geoms).centroid

        clusters.append(
            {
                "cluster": int(cid),
                "geometry": rep_pt,
                "members": members,
                "count": len(sub),
                "sources": sources,
                "names": names,
            }
        )
    agg = gpd.GeoDataFrame(clusters, geometry="geometry", crs=gdf_all.crs)

    def pick_color(sources):
        """Map the list of sources of a cluster to its display colour."""
        n = len(sources)
        if n >= 3:
            return COLOR_THREE
        if n == 2:
            return COLOR_TWO
        if n == 1:
            return COLOR_SINGLE.get(sources[0], COLOR_NONE)
        return COLOR_NONE

    agg["color"] = agg["sources"].apply(pick_color)
    # One boolean membership flag per known source.
    for src in ("TDG", "OSM", "C2C"):
        agg[f"in_{src}"] = agg["sources"].apply(lambda s, src=src: src in s)
    return agg

In [None]:
import folium


def plot_clusters_interactive(agg_gdf, title=None, point_radius=5, tiles="OpenStreetMap"):
    """
    Display an interactive Leaflet map (zoom/scroll) using folium.
    - agg_gdf: GeoDataFrame of Point geometries with a 'color' column. The optional
      columns 'names', 'members', 'cluster', 'sources' and 'count' are shown in the
      marker popup; a field appears only if its column is present in agg_gdf, so
      callers that subset columns must keep the ones they want displayed.
    - title: optional map title, inserted as-is into the page HTML (not escaped)
    - point_radius: circle marker radius in pixels
    - tiles: folium base tiles name or URL template
    Rows whose 'color' is not one of the known legend colours are not drawn.
    Returns the folium.Map object, which renders in notebooks.
    """
    import html  # stdlib; local import keeps this notebook cell self-contained

    def _as_text(value):
        """Render a popup value as HTML-safe text; collections are comma-joined."""
        if isinstance(value, (list, tuple, set)):
            # str() each item: members may be ints (row index) or NaN, not only str.
            value = ",".join(str(item) for item in value)
        # Values come from external datasets: escape so they cannot inject markup.
        return html.escape(str(value))

    # Convert to WGS84 for web maps
    gdf_ll = agg_gdf.to_crs(EPSG_WGS84)

    # Determine initial center
    if not gdf_ll.empty:
        center = [gdf_ll.geometry.y.mean(), gdf_ll.geometry.x.mean()]
    else:
        center = [45.1885, 5.7245]  # Grenoble as default

    m = folium.Map(location=center, zoom_start=11, tiles=tiles)

    # Optionally add a title
    if title:
        title_html = f'<h4 style="position: fixed; top: 10px; left: 50px; z-index: 9999; background: rgba(255,255,255,0.8); padding: 6px 10px; margin: 0;">{title}</h4>'
        m.get_root().html.add_child(folium.Element(title_html))

    # Group by colours for consistency
    color_labels = [
        ("green", "in A+B+C"),
        ("blue", "in any 2 sources"),
        ("orange", "only in TDG"),
        ("yellow", "only in OSM"),
        ("brown", "only in C2C"),
        ("gray", "none/other"),
    ]

    # Popup fields in display order, restricted to the columns actually present.
    popup_fields = [
        field
        for field in ("names", "members", "cluster", "sources", "count")
        if field in gdf_ll.columns
    ]

    bounds = []
    for color, label in color_labels:
        sub = gdf_ll[gdf_ll["color"] == color]
        if sub.empty:
            continue
        fg = folium.FeatureGroup(name=label, show=True)
        for _, row in sub.iterrows():
            pt = row.geometry
            popup_txt = [f"{field}: {_as_text(row[field])}" for field in popup_fields]
            popup = "<br>".join(popup_txt) if popup_txt else None

            folium.CircleMarker(
                location=(pt.y, pt.x),
                radius=point_radius,
                color=color,
                fill=True,
                fill_opacity=0.8,
                fill_color=color,
                popup=popup,
            ).add_to(fg)
            bounds.append((pt.y, pt.x))
        fg.add_to(m)

    folium.LayerControl(collapsed=False).add_to(m)

    # Fit bounds if we have any points
    if bounds:
        m.fit_bounds(bounds, padding=(20, 20))

    return m

In [None]:
# Set the input GeoJSONs (can be 1 to 3 sources); leave a path as None to skip it
a_path = Path("../data/transportdatagouv/2025-09-08_stops_38.geojson")
b_path = Path("../data/OSM/bus_stop_isere.geojson")
c_path = None  # no C2C file configured; set to a Path under ../data/C2C/ to include it
# Clustering tolerance, in TARGET_EPSG units.
# NOTE(review): Web Mercator units equal ground metres only at the equator; around
# 45°N a distance of 25 units is roughly 18 m on the ground - confirm this is intended.
tol_m = 25.0
use_medoid = False

gdfs = []
for pth, lab in ((a_path, "TDG"), (b_path, "OSM"), (c_path, "C2C")):
    if pth:
        gdfs.append(read_and_ensure_crs(pth, lab, target_epsg=TARGET_EPSG))
if not gdfs:
    # pd.concat([]) would fail with an unrelated-looking "No objects to concatenate".
    raise ValueError("No input GeoJSON configured: set at least one of a_path, b_path, c_path")
gdf_all = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True, sort=False), crs=gdfs[0].crs)
res = cluster_and_represent(gdf_all, tol_m, use_medoid)

# Keep 'names' and 'members' too: the popup only shows fields whose column is
# present, so dropping them here is what hid those values on the map.
out_nb = res[
    [
        "geometry",
        "cluster",
        "count",
        "sources",
        "names",
        "members",
        "in_TDG",
        "in_OSM",
        "in_C2C",
        "color",
    ]
].copy()
# Members can be non-string ids (row index ints, NaN); make them joinable text.
out_nb["members"] = out_nb["members"].apply(lambda ids: [str(i) for i in ids])

m = plot_clusters_interactive(out_nb, title="Stops diff clusters (colored by presence)")
m  # display in notebook