In [2]:
import random
import networkx as nx
import osmnx as ox
import matplotlib.pyplot as plt
import pandas as pd
from geopy.distance import geodesic
import requests
import json
import uuid
import os
from datetime import datetime
import folium
from folium.plugins import AntPath
from collections import defaultdict

In [3]:
# ===================== CONFIGURATION =====================
# Everything a reader may want to tune lives in this dict.
config = {
    "n_users": 100,  # Number of users (each user gets a distinct graph)
    "country_code": "FR",  # Country code for city selection
    # GeoNames API username. Read from the GEONAMES_USERNAME environment
    # variable when it is set so the account name does not have to be typed
    # into (and committed with) the notebook; falls back to the empty string.
    "username": os.environ.get("GEONAMES_USERNAME", ""),
    "output_base_dir": "Generated_Data"  # Base directory for saving data
}

# Generate timestamped output directory (one fresh directory per run)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = os.path.join(config["output_base_dir"], f"Data_For_{timestamp}")

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)
print(f"✅ Output directory created: {output_dir}")

# Load the sensitive POI categories from a JSON file.
# Other cells in this notebook read each category entry through its "tags"
# and "values" keys. The encoding is given explicitly so the file is decoded
# the same way on every platform.
with open("sensitive_poi_categories.json", "r", encoding="utf-8") as file:
    sensitive_poi_categories = json.load(file)

✅ Output directory created: Generated_Data/Data_For_2025-03-18_11-31-36


In [None]:
def get_random_cities_geonames(country_code, username, n_users):
    """
    Fetches up to `n_users` populated-place names from the GeoNames search API
    for a given country and returns them in random order.

    The names 'Gustavia' and 'Marigot' are always excluded (case-insensitive),
    and duplicate names are dropped so every returned city is unique.

    Parameters:
        country_code (str): The country ISO code (e.g., "FR" for France).
        username (str): GeoNames API username (requires a free account).
        n_users (int): Number of random cities to return.

    Returns:
        list: A list of at most `n_users` unique city names. An empty list is
        returned when `n_users` is not positive, when the request fails, or
        when the response does not have the expected shape.
    """
    if n_users <= 0:
        return []

    excluded_names = {"gustavia", "marigot"}

    # Ask for a few extra rows so that dropping the excluded names does not
    # leave us short of `n_users`. GeoNames caps maxRows at 1000.
    max_results = min(n_users + len(excluded_names), 1000)

    # Let `requests` build the query string so every value is URL-encoded.
    params = {
        "country": country_code,
        "featureClass": "P",
        "maxRows": max_results,
        "username": username,
    }

    try:
        response = requests.get("https://secure.geonames.org/searchJSON", params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"❌ Error fetching cities from GeoNames: {e}")
        return []

    if not isinstance(data, dict) or not isinstance(data.get("geonames"), list):
        print("⚠️ Warning: Unexpected response format from GeoNames API.")
        # GeoNames reports problems such as an unknown username inside a
        # "status" object; surface its message when present.
        status = data.get("status") if isinstance(data, dict) else None
        if isinstance(status, dict) and "message" in status:
            print(f"⚠️ GeoNames status message: {status['message']}")
        return []

    # Extract names, drop the excluded ones, and de-duplicate (order-preserving).
    cities = list(dict.fromkeys(
        place["name"]
        for place in data["geonames"]
        if isinstance(place, dict)
        and isinstance(place.get("name"), str)
        and place["name"].lower() not in excluded_names
    ))

    # Shuffle and select at most `n_users` cities
    random.shuffle(cities)
    return cities[:n_users]

In [5]:
def get_random_villages_geonames(country_code, username, n=5):
    """
    Fetches up to `n` populated-place names (feature code PPL) from the
    GeoNames search API for a given country, in random order.

    Parameters:
        country_code (str): The country ISO code (e.g., "FR" for France).
        username (str): GeoNames API username.
        n (int): Maximum number of names to return (default 5).

    Returns:
        list: At most `n` place names; an empty list when the request fails
        or the response carries no usable "geonames" list.
    """
    # Let `requests` build the query string so every value is URL-encoded.
    params = {
        "country": country_code,
        "featureClass": "P",
        "featureCode": "PPL",
        "maxRows": 1000,
        "username": username,
    }

    try:
        # A timeout keeps the notebook from hanging on a stalled connection.
        response = requests.get("https://secure.geonames.org/searchJSON", params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"❌ Error fetching villages from GeoNames: {e}")
        return []

    # GeoNames can answer HTTP 200 with an error object instead of results,
    # in which case there is no "geonames" list to read.
    places = data.get("geonames") if isinstance(data, dict) else None
    if not isinstance(places, list):
        print("⚠️ Warning: Unexpected response format from GeoNames API.")
        return []

    villages = [place["name"] for place in places if "name" in place]

    # Shuffle and pick randomly
    random.shuffle(villages)

    return villages[:n] if villages else []

In [6]:
def plot_graph_on_map(G, city, user_id, output_dir=None):
    """
    Displays the user's POI graph on a real map using Folium and saves it as
    an HTML file under `<output_dir>/map/`.

    Parameters:
        G (networkx.DiGraph): The directed graph of POIs. Nodes are expected
            to carry a "pos" attribute as (lon, lat) and a "category"
            attribute; edges may carry a "weight" list of movement dicts with
            the keys "ts", "td", "fs", "st" and "dt".
        city (str): The city where the graph is generated.
        user_id (str): Unique identifier of the user.
        output_dir (str, optional): Directory to save the map in. When omitted,
            the notebook-level `output_dir` variable is used.

    Returns:
        folium.Map: The interactive map object, or None when the graph has no
        nodes or no node carries a position.
    """
    if not G.nodes:
        print(f"⚠️ No nodes in graph for {user_id}. Skipping map generation.")
        return None

    if output_dir is None:
        # The parameter shadows the notebook-level name, so the module
        # namespace is read explicitly for the fallback.
        output_dir = globals()["output_dir"]

    # Get node positions (longitude, latitude) and categories once, up front.
    pos = nx.get_node_attributes(G, "pos")
    categories = nx.get_node_attributes(G, "category")

    if not pos:
        # Without positions there is nothing to centre the map on.
        print(f"⚠️ No node positions in graph for {user_id}. Skipping map generation.")
        return None

    # Get the map center (average lat/lon)
    avg_lat = sum(p[1] for p in pos.values()) / len(pos)
    avg_lon = sum(p[0] for p in pos.values()) / len(pos)

    # Create a Folium map centered at the average location
    m = folium.Map(location=[avg_lat, avg_lon], zoom_start=14, control_scale=True)

    # Define category colors
    category_colors = {
        "healthcare": "red",
        "religious_sites": "blue",
        "education": "green",
        "finance": "purple",
        "social_services": "orange",
        "residential": "cyan",
        "transportation": "magenta",
        "law_enforcement": "brown",
        "government": "black",
        "shopping": "yellow"
    }

    # Map each tag value to the color of the category it belongs to
    # (node "category" attributes are looked up in this table below).
    value_color_map = {}
    for category, details in sensitive_poi_categories.items():
        color = category_colors.get(category, "gray")
        for value in details["values"]:
            value_color_map[value] = color

    # Add POI nodes (markers)
    for node, (lon, lat) in pos.items():
        node_value = categories.get(node, None)
        color = value_color_map.get(node_value, "gray")
        folium.CircleMarker(
            location=[lat, lon],
            radius=6,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.8,
            popup=f"POI-Category: {node_value}",
        ).add_to(m)

    # Add edges with directional arrows
    for u, v, edge_data in G.edges(data=True):
        if u not in pos or v not in pos:
            # An edge touching a node without a position cannot be drawn.
            continue
        coords = [[pos[u][1], pos[u][0]], [pos[v][1], pos[v][0]]]  # Format: [lat, lon]
        # Retrieve weights (list of movement dicts)
        weight_info = edge_data.get("weight", [])
        # Format weight details for the popup. Popups are rendered as HTML,
        # so entries are separated with <br> rather than a newline.
        weight_text = "<br>".join([
            f"⏳ TS: {w['ts']}, TD: {w['td']}, FS: {w['fs']}, ST: {w['st']} min, DT: {w['dt']} min"
            for w in weight_info
        ]) if weight_info else "No movement data"
        # Use AntPath for animated edges
        AntPath(locations=coords, color="black", delay=800).add_to(m)  # Animated path
        # Static fallback using PolyLine
        folium.PolyLine(locations=coords, color="blue", weight=3, opacity=0.7, popup=weight_text).add_to(m)

    # Ensure the 'map' directory exists within output_dir
    map_dir = os.path.join(output_dir, "map")
    os.makedirs(map_dir, exist_ok=True)  # Create directory if it doesn't exist

    # A path separator inside the city name would be read as a sub-directory,
    # so it is replaced before the name is used in the file name.
    safe_city = str(city).replace("/", "_").replace("\\", "_")

    # Save the map
    map_filename = os.path.join(map_dir, f"user_map_{user_id}_from_{safe_city}.html")
    m.save(map_filename)
    print(f"✅ Map for user {user_id} saved: {map_filename}")

    return m

In [7]:
def generate_multiple_weights():
    """
    Build the randomly generated, aggregated movement statistics for one edge.

    Between 1 and 400 raw movement samples are drawn. Each sample has:
    - ts: time segment, an integer from 0 to 3.
    - td: type of day, either 1 or 2.
    - fs: frequency of movement, always 1 per raw sample.
    - st: stay time at the source in minutes, an integer from 5 to 70.
    - dt: travel duration in minutes, a float from 1 to 30 rounded to 2 places.

    Samples sharing the same (ts, td) pair are merged into one entry whose
    `fs` is the sum of the sample frequencies and whose `st` and `dt` are the
    means of the sample values, each rounded to 2 decimal places.

    Returns:
        list[dict]: One dict per distinct (ts, td) pair, in order of first
        appearance, with the keys "ts", "td", "fs", "st" and "dt".
    """
    sample_count = random.randint(1, 400)

    # (ts, td) -> running frequency plus the raw stay/travel values to average
    buckets = {}
    for _ in range(sample_count):
        # The draw order below is fixed: ts, td, fs, st, dt.
        segment = random.randint(0, 3)
        day_type = random.choice([1, 2])
        frequency = random.randint(1, 1)
        stay_minutes = random.randint(5, 70)
        travel_minutes = round(random.uniform(1, 30), 2)

        bucket = buckets.get((segment, day_type))
        if bucket is None:
            bucket = {"fs": 0, "st": [], "dt": []}
            buckets[(segment, day_type)] = bucket
        bucket["fs"] += frequency
        bucket["st"].append(stay_minutes)
        bucket["dt"].append(travel_minutes)

    summary = []
    for (segment, day_type), bucket in buckets.items():
        stays = bucket["st"]
        travels = bucket["dt"]
        summary.append({
            "ts": segment,
            "td": day_type,
            "fs": bucket["fs"],
            "st": round(sum(stays) / len(stays), 2),
            "dt": round(sum(travels) / len(travels), 2),
        })
    return summary

In [8]:
# ===================== MAIN FUNCTION =====================
def _lookup_category_pois(city, category):
    """Return (tag value, features) for the first tag value of `category` that has POIs in `city`, or None."""
    for value in sensitive_poi_categories[category]["values"]:
        try:
            pois = ox.features_from_place(city, {sensitive_poi_categories[category]["tags"][0]: value})
        except Exception:
            # Best effort: a failed lookup for this value is treated like
            # "no POIs" and the next value is tried. `Exception` (not a bare
            # except) so Ctrl-C / kernel interrupt still stops the run.
            continue
        if not pois.empty:
            return value, pois
    return None


def _collect_user_pois(city, n_poi):
    """Collect up to `n_poi` single-row POI frames for `city`, sampling across the sensitive categories."""
    candidates = list(sensitive_poi_categories.keys())
    lookups = {}  # category -> result of _lookup_category_pois, queried once per city
    collected = []

    # Each sampled category either contributes one POI or is dropped from the
    # candidates, so the loop always ends: with enough POIs, or with no
    # category left to try (a city without any matching POI).
    while len(collected) < n_poi and candidates:
        remaining_needed = n_poi - len(collected)
        for category in random.sample(candidates, min(remaining_needed, len(candidates))):
            if category not in lookups:
                lookups[category] = _lookup_category_pois(city, category)
            hit = lookups[category]
            if hit is None:
                candidates.remove(category)
                continue
            value, pois = hit
            # assign() returns a new frame, so the cached query result is not modified.
            collected.append(pois.sample(1).assign(category=value))

    return collected


def _build_poi_graph(poi_df, max_out_degree, max_in_degree, radius):
    """Build the directed POI graph: one node per row, random edges between POIs closer than `radius` km."""
    G = nx.DiGraph()

    # Add nodes (ids are the 0..n-1 row positions of poi_df)
    for idx, row in poi_df.iterrows():
        G.add_node(int(idx), pos=(float(row["lon"]), float(row["lat"])), category=row["category"])

    # Candidate edges: the distance is computed once per unordered pair and
    # both directions are offered, since the distance is the same either way.
    coords = list(zip(poi_df["lat"], poi_df["lon"]))
    edges = []
    for i in range(len(coords)):
        for j in range(i + 1, len(coords)):
            if geodesic(coords[i], coords[j]).km < radius:
                edges.append((i, j))
                edges.append((j, i))

    # Shuffle edges to introduce randomness
    random.shuffle(edges)

    # Track degrees
    out_degree = {node: 0 for node in G.nodes}
    in_degree = {node: 0 for node in G.nodes}

    # Add edges while respecting max degree constraints
    for u, v in edges:
        if out_degree[u] < max_out_degree and in_degree[v] < max_in_degree:
            G.add_edge(u, v, weight=generate_multiple_weights())  # Assign multiple weight tuples
            out_degree[u] += 1
            in_degree[v] += 1

    return G


def generate_user_graphs(cities, output_dir, max_out_degree=2, max_in_degree=2, radius=2):
    """
    Generates one directed POI graph per city (one synthetic user per city),
    saves each graph as JSON in `output_dir`, and renders it on a map.

    Parameters:
    - cities (list): List of city names to generate graphs for.
    - output_dir (str): Directory to save generated graphs (created if missing).
    - max_out_degree (int): Maximum outgoing connections per node.
    - max_in_degree (int): Maximum incoming connections per node.
    - radius (int): Connection radius in km.

    Notes:
    - Cities for which no POI can be found are skipped with a message.
    - Edges added to join disconnected components are not subject to the
      degree limits.
    """
    os.makedirs(output_dir, exist_ok=True)

    for city in cities:
        user_id = str(uuid.uuid4())  # Unique user identifier
        print(f"Processing user {user_id} in city {city}...")

        # Random number of POIs (between 5 and 20)
        n_poi_per_user = random.randint(5, 20)

        # Fetch POIs
        poi_data = _collect_user_pois(city, n_poi_per_user)
        if not poi_data:
            print(f"No POIs found for user {user_id} in {city}. Skipping...")
            continue

        # Merge POIs into DataFrame
        poi_df = pd.concat(poi_data)
        poi_df = poi_df[["geometry", "category"]].reset_index(drop=True)

        # Extract coordinates (centroid, so non-point geometries also get one position)
        poi_df["lon"] = poi_df["geometry"].apply(lambda geom: geom.centroid.x)
        poi_df["lat"] = poi_df["geometry"].apply(lambda geom: geom.centroid.y)

        # Build Directed Graph
        G = _build_poi_graph(poi_df, max_out_degree, max_in_degree, radius)

        # Ensure Graph Connectivity: link every smaller component to a random
        # node of the largest one.
        components = list(nx.weakly_connected_components(G))
        if len(components) > 1:
            print(f"User {user_id} graph is disconnected. Connecting components...")
            largest_component = max(components, key=len)
            for component in components:
                if component != largest_component:
                    node_from_largest = random.choice(list(largest_component))
                    node_from_other = random.choice(list(component))
                    G.add_edge(node_from_largest, node_from_other, weight=generate_multiple_weights())

        # Save the graph as JSON
        graph_data = {
            "user_id": user_id,
            "city": city,
            "num_poi": len(G.nodes),
            "nodes": [{"id": n, "category": G.nodes[n]["category"], "position": G.nodes[n]["pos"]} for n in G.nodes],
            "edges": [
                {
                    "source": u,
                    "target": v,
                    "weight": d["weight"]  # List of aggregated weight dicts
                }
                for u, v, d in G.edges(data=True)
            ]
        }

        # A path separator inside the city name would be read as a
        # sub-directory, so it is replaced before building the file name.
        safe_city = str(city).replace("/", "_").replace("\\", "_")
        file_path = os.path.join(output_dir, f"user_graph_{user_id}_from_{safe_city}.json")
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(graph_data, f, indent=4)

        print(f"Graph for user {user_id} with {len(G.nodes)} POIs saved in {file_path}")

        # Render the graph on an interactive map
        plot_graph_on_map(G, city, user_id)

In [None]:
# ===================== EXECUTION =====================
# Pick the cities (one per user) from GeoNames using the notebook configuration,
# then generate, save and map one POI graph per city.
cities = get_random_cities_geonames(
    country_code=config["country_code"],
    username=config["username"],
    n_users=config["n_users"],
)
generate_user_graphs(cities, output_dir=output_dir)

# Alternative run on villages instead of cities:
#villages = get_random_villages_geonames(config["country_code"], config["username"], config["n_users"])
#generate_user_graphs(villages, output_dir)

Processing user b782cbd0-59f7-47d1-b9a0-2d7a08b705e9 in city Paris 19 Buttes-Chaumont...
Graph for user b782cbd0-59f7-47d1-b9a0-2d7a08b705e9 with 7 POIs saved in Generated_Data/Data_For_2025-03-18_11-31-36/user_graph_b782cbd0-59f7-47d1-b9a0-2d7a08b705e9_from_Paris 19 Buttes-Chaumont.json
✅ Map for user b782cbd0-59f7-47d1-b9a0-2d7a08b705e9 saved: Generated_Data/Data_For_2025-03-18_11-31-36/map/user_map_b782cbd0-59f7-47d1-b9a0-2d7a08b705e9_from_Paris 19 Buttes-Chaumont.html
Processing user b39e1b6e-80b2-4e61-9b16-e187a9e753ec in city Le Mans...
User b39e1b6e-80b2-4e61-9b16-e187a9e753ec graph is disconnected. Connecting components...
Graph for user b39e1b6e-80b2-4e61-9b16-e187a9e753ec with 6 POIs saved in Generated_Data/Data_For_2025-03-18_11-31-36/user_graph_b39e1b6e-80b2-4e61-9b16-e187a9e753ec_from_Le Mans.json
✅ Map for user b39e1b6e-80b2-4e61-9b16-e187a9e753ec saved: Generated_Data/Data_For_2025-03-18_11-31-36/map/user_map_b39e1b6e-80b2-4e61-9b16-e187a9e753ec_from_Le Mans.html
Process