## OD Pairs

In this notebook, we use the ADVAN foot traffic data for July 2024 (67,000+ records) to create a set of origin-destination (OD) pairs. The steps are:

- Create Graph for pedestrian network using data from [DVRPC](https://www.arcgis.com/home/item.html?id=5959ca82848f4833a65cd90ef991c080)
- Load the ADVAN foot traffic data to the nodes in the Graph
- Create OD pairs by running `networkx.single_source_dijkstra_path_length()` on the Graph, and set the "cutoff" parameter to 2640 feet (0.5 miles)
- Compute the weight of each OD pair using the formula, for each OD pair $(i,j)$:

$$
\begin{equation}
w_{ij} = \text{visits}_i \times \frac{\text{visits}_j}{\sum_{k \in R(i)} \text{visits}_k}, \qquad R(i) = \{\text{POIs reachable from } i \text{ within the cutoff}\}
\end{equation}
$$

- For each OD pair, we then compute the 5 shortest paths using the `networkx.shortest_simple_paths()` function. After the graph is updated, the best path is selected from these candidates only: people are unlikely to choose a much longer route even if it is fully shaded, so restricting the choice to a reasonable pool of routes is more realistic. It also saves computing resources.


In [None]:
import geopandas as gpd
import networkx as nx
from shapely.geometry import Point
from tqdm import tqdm

# Projected CRS used throughout; the notebook treats EPSG:6565 units as feet.
CRS = "EPSG:6565"

# Read the DVRPC pedestrian (sidewalk) network and reproject it into CRS.
sidewalk_edges = gpd.read_file("../data/sites/test/DVRPC_Pedestrian_Network.geojson")
sidewalk_edges = sidewalk_edges.to_crs(CRS)

# Segment length measured in the projected CRS, stored per sidewalk row.
sidewalk_edges['length_ft'] = sidewalk_edges.geometry.length

# Only the sidewalk id, its geometry and its length are carried forward.
columns_to_keep = ['objectid', 'geometry', 'length_ft']
sidewalk_edges = sidewalk_edges[columns_to_keep]

# Node registry: rounded (x, y) coordinate -> integer node id.
# `node_id_counter` always holds the next id that will be handed out.
node_id_counter = 0
node_id_map = {}

def make_node_id(point, precision=3):
    """Return the integer id registered for ``point``, allocating one if needed.

    The point's ``x`` and ``y`` are rounded to ``precision`` decimals and the
    resulting pair is used as the lookup key in ``node_id_map``, so points
    that round to the same pair share one id. New keys receive the current
    value of ``node_id_counter``, which is then incremented.

    Args:
        point: Any object exposing numeric ``x`` and ``y`` attributes.
        precision: Number of decimals kept when rounding the coordinates.

    Returns:
        The integer node id associated with the rounded coordinate.
    """
    global node_id_counter
    rounded_xy = (round(point.x, precision), round(point.y, precision))
    try:
        return node_id_map[rounded_xy]
    except KeyError:
        # First time this coordinate is seen: hand out the next free id.
        assigned = node_id_counter
        node_id_map[rounded_xy] = assigned
        node_id_counter = assigned + 1
        return assigned

# Initialize graph (undirected, at most one edge per node pair)
G = nx.Graph()

# Add edges with progress bar and unique edge IDs
edge_id_counter = 0
skipped_geometries = 0  # rows whose geometry is missing, empty or not a LineString
overwritten_edges = 0   # rows whose endpoints already had an edge in G

for idx, row in tqdm(sidewalk_edges.iterrows(), total=sidewalk_edges.shape[0], desc="Building Graph"):
    geom = row.geometry
    if geom is None or geom.is_empty or geom.geom_type != "LineString":
        # Count the row instead of dropping it silently, so data loss is visible.
        skipped_geometries += 1
        continue

    # The first and last vertices of the segment become the edge's end nodes.
    start_point = Point(geom.coords[0])
    end_point = Point(geom.coords[-1])

    u = make_node_id(start_point)
    v = make_node_id(end_point)

    # nx.Graph keeps a single edge per (u, v): adding again replaces the
    # attributes stored for the earlier segment. Track how often that happens.
    if G.has_edge(u, v):
        overwritten_edges += 1

    G.add_edge(
        u, v,
        geometry=geom,
        length=row['length_ft'],
        objectid=row['objectid'],
        edge_id=edge_id_counter
    )
    edge_id_counter += 1

# Attach coordinates to each node
for (coord, node_id) in node_id_map.items():
    G.nodes[node_id]['x'] = coord[0]
    G.nodes[node_id]['y'] = coord[1]

print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
if skipped_geometries or overwritten_edges:
    print(
        f"Skipped {skipped_geometries} rows without a usable LineString; "
        f"{overwritten_edges} rows replaced an existing edge between the same two nodes."
    )

from scipy.spatial import cKDTree
import numpy as np
from tqdm import tqdm
import pandas as pd

# foot_traffic = gpd.read_file("../data/foot_traffic/advan_sg.geojson").to_crs(CRS)
foot_traffic = pd.read_csv("../data/foot_traffic/df_philly_24.csv")

# Keep only POIs that have a visit count AND both coordinates. Rows with a
# missing coordinate cannot be snapped to a node (the nearest-neighbour query
# needs finite coordinates), so they are dropped here in a single pass.
poi_columns = ['LATITUDE', 'LONGITUDE', 'RAW_VISIT_COUNTS']
foot_traffic = foot_traffic.dropna(subset=poi_columns)[poi_columns]
foot_traffic = gpd.GeoDataFrame(
    foot_traffic,
    geometry=gpd.points_from_xy(foot_traffic['LONGITUDE'], foot_traffic['LATITUDE']),
    crs="EPSG:4326"
).to_crs(CRS)
foot_traffic = foot_traffic[["geometry", "RAW_VISIT_COUNTS"]]

# KDTree using node coordinates; node_ids[i] is the graph node stored at row i
node_ids = [node for node in G.nodes]
node_coords = np.array([[G.nodes[node]['x'], G.nodes[node]['y']] for node in node_ids])

kdtree = cKDTree(node_coords)

# Initialize node visit counts
for node in G.nodes:
    G.nodes[node]['visit_count'] = 0

# Map POIs to nearest graph nodes: one vectorised query for all POIs instead
# of one query per row. The distance to the nearest node is not used, so a POI
# is snapped to its nearest node however far away that node is.
poi_xy = np.column_stack([foot_traffic.geometry.x, foot_traffic.geometry.y])
_, nearest_node_idx = kdtree.query(poi_xy)

# Accumulate visits in the original row order so the sums match a row-by-row pass.
poi_visits = foot_traffic["RAW_VISIT_COUNTS"].tolist()
for node_idx, visits in tqdm(
    zip(nearest_node_idx.tolist(), poi_visits),
    total=foot_traffic.shape[0],
    desc="Mapping POIs",
):
    nearest_node = node_ids[node_idx]
    G.nodes[nearest_node]['visit_count'] += visits

print("POIs successfully mapped to graph nodes.")

Here we load the precomputed OD pairs and compute the shortest-path length of each pair on the `G_ideal` graph.

In [None]:
import pickle
import pandas as pd
import networkx as nx
import nx_cugraph as nxcg
from collections import defaultdict
from tqdm import tqdm
import numpy as np

# ---------------------------------------------
# 1. Load the OD-pair DataFrame from Parquet
# ---------------------------------------------
OD_pairs_df = pd.read_parquet("/mnt/e/Capstone/data/sites/test/OD_pairs.parquet")
OD_pairs = OD_pairs_df.to_dict(orient="records")
print(f"Loaded {len(OD_pairs)} OD pairs from file.")

# ------------------------------------------------
# 2. Load and Prepare the NetworkX Graph (G_ideal)
# ------------------------------------------------
# NOTE: unpickling can execute arbitrary code; only load a pickle you created yourself.
with open("/mnt/e/Capstone/data/sites/test/G_ideal.pkl", "rb") as f:
    G_ideal = pickle.load(f)

# Mirror each edge's 'length' into 'lambda_e', the weight attribute passed to
# Dijkstra below.
for u, v, data in G_ideal.edges(data=True):
    data["lambda_e"] = data["length"]

# ----------------------------------------------------------
# 3. Convert the NetworkX graph to an nx_cugraph-backed graph
# ----------------------------------------------------------
# NOTE(review): `nxcg_G` is built here but is not used by the Dijkstra loop in
# step 5, which is called on the plain NetworkX graph `G_ideal`. Before
# switching step 5 to `nxcg_G`, confirm that this conversion carries the
# 'lambda_e' edge attribute over — TODO verify against the nx_cugraph docs.
nxcg_G = nxcg.from_networkx(G_ideal)

# --------------------------------------------------------
# 4. Build a lookup that groups destinations by each origin
# --------------------------------------------------------
od_lookup = defaultdict(list)
for od in OD_pairs:
    origin = od["origin_node"]
    dest   = od["dest_node"]
    od_lookup[origin].append(dest)

# ------------------------------------------------------------
# 5. For each origin, run one single-source Dijkstra on G_ideal,
#    collecting lengths for just those destinations of interest
# ------------------------------------------------------------
# Search radius along the network: 2640 feet (0.5 miles), as stated in the
# notebook introduction.
MAX_PATH_LENGTH_FT = 2640

OD_results_ideal = []

# tqdm shows progress over origins; each origin costs one Dijkstra run.
for origin_node in tqdm(od_lookup.keys(), desc="Computing ideal shortest paths"):
    # weight='lambda_e' tells Dijkstra to use the numeric attribute we set.
    lengths = nx.single_source_dijkstra_path_length(
        G_ideal, source=origin_node, weight="lambda_e", cutoff=MAX_PATH_LENGTH_FT
    )

    # Extract only the distances to the specific destination nodes we care about;
    # a destination not reached within the cutoff is recorded as infinity.
    for dest_node in od_lookup[origin_node]:
        path_length = lengths.get(dest_node, np.inf)
        OD_results_ideal.append({
            "origin_node": origin_node,
            "dest_node":   dest_node,
            "path_length": path_length
        })

In [None]:
# ----------------------------------------------------
# 6. Persist the per-pair shortest-path lengths to Parquet
#    so later steps can look them up without recomputing
# ----------------------------------------------------
results_df = pd.DataFrame(OD_results_ideal)
results_df.to_parquet(
    "/mnt/e/Capstone/data/sites/test/OD_results_ideal.parquet",
    index=False,
)

print("Finished computing and storing all OD shortest-path lengths.")


Here we build the OD pairs between nodes that have foot traffic and compute the weight for each OD pair.

In [None]:
import pickle
import pandas as pd
import networkx as nx
from tqdm import tqdm

# 1) Read the pickled graph and copy every edge's 'length' into 'lambda_e',
#    the attribute used as the Dijkstra weight below
with open("data/foot_traffic/G.pkl", "rb") as f:
    G_ideal = pickle.load(f)
for u, v, data in G_ideal.edges(data=True):
    data["lambda_e"] = data["length"]

# 2) One row per graph node: its id plus all of its node attributes
nodes = [{"node": n, **attrs} for n, attrs in G_ideal.nodes(data=True)]
nodes_df = pd.DataFrame(nodes)

# 3) Origins and destinations are the same set: nodes with visit_count > 0
has_visits = nodes_df["visit_count"] > 0
high_visit = set(nodes_df.loc[has_visits, "node"].tolist())

records = []
for origin in tqdm(high_visit, desc="Computing ideal shortest paths"):
    # Network distances from this origin to everything within the 2640 cutoff
    lengths = nx.single_source_dijkstra_path_length(
        G_ideal, source=origin, weight="lambda_e", cutoff=2640
    )
    lengths.pop(origin, None)  # the origin itself is not a destination

    # Keep a pair only when the destination also has visits; `origin < dest`
    # stores each unordered pair once.
    records.extend(
        {"origin_node": origin, "dest_node": dest, "path_length": dist}
        for dest, dist in lengths.items()
        if dest in high_visit and origin < dest
    )

# 4) Write the OD table to CSV
od_df = pd.DataFrame.from_records(records)
od_df.to_csv("data/foot_traffic/od_results_ideal.csv", index=False)


import pickle
import pandas as pd
import networkx as nx
import numpy as np

# --- 1) Load the graph and extract visit_count per node ---
# NOTE: unpickling can execute arbitrary code; only load a pickle you created yourself.
with open("data/foot_traffic/G.pkl", "rb") as f:
    G_ideal = pickle.load(f)

# Build a DataFrame mapping node -> visit_count (plus any other node attributes)
nodes = [{"node": n, **data} for n, data in G_ideal.nodes(data=True)]
nodes_df = pd.DataFrame(nodes)
# Fail early with a clear message if the graph was saved before POIs were mapped.
if "visit_count" not in nodes_df.columns:
    raise KeyError("Graph nodes have no 'visit_count' attribute; map the POIs to the graph first.")

# --- 2) Load your precomputed OD pairs + baseline distances ---
od_df = pd.read_csv("data/foot_traffic/od_results_ideal.csv")
# od_df has columns: origin_node, dest_node, path_length

# --- 3) For each origin, compute sum of dest traffic ---
node_traffic = nodes_df[["node", "visit_count"]]

# Attach the destination's visit count as 'traffic_d'. validate="many_to_one"
# makes pandas raise if a node id appears more than once on the right side,
# which would otherwise duplicate OD rows silently.
od_df = od_df.merge(
    node_traffic.rename(columns={"node": "dest_node", "visit_count": "traffic_d"}),
    on="dest_node",
    how="left",
    validate="many_to_one",
)

# Attach the origin's visit count as 'traffic_o'.
od_df = od_df.merge(
    node_traffic.rename(columns={"node": "origin_node", "visit_count": "traffic_o"}),
    on="origin_node",
    how="left",
    validate="many_to_one",
)

# Then compute, per origin, the total traffic of all its destinations.
# The total covers only the destinations listed for that origin in od_df.
# NOTE(review): if od_df stores each unordered pair once, an origin's total
# omits partners that appear only in the origin column — confirm this is the
# intended normalisation.
sum_traffic = od_df.groupby("origin_node")["traffic_d"].transform("sum")

# --- 4) Compute W(o,d) = traffic_d / sum_traffic_for_that_origin ---
od_df["W"] = od_df["traffic_d"] / sum_traffic

# Full pair weight from the notebook introduction:
# w_ij = visits_i * visits_j / (sum of visits over i's destinations).
# Added as an extra column; 'W' itself is unchanged.
od_df["w_ij"] = od_df["traffic_o"] * od_df["W"]

# --- 5) (Optional) Save W as a numpy array aligned with od_df order ---
# W = od_df["W"].values
# np.save("data/foot_traffic/W.npy", W)

# If you want, also write back the full table with W:
od_df.to_csv("data/foot_traffic/od_results_with_W.csv", index=False)


Here we calculate the top 5 shortest paths for each OD pair.

In [None]:
import pickle
import pandas as pd
import networkx as nx
from tqdm import tqdm

# 1) Unpickle the graph whose edges carry the 'length' and 'edge_id' attributes
with open("data/foot_traffic/G.pkl", "rb") as f:
    G = pickle.load(f)

# 2) Read the baseline OD table and turn it into (origin, destination) tuples
od_df = pd.read_csv("data/foot_traffic/od_results_ideal.csv")
origin_column = od_df["origin_node"]
dest_column = od_df["dest_node"]
od_pairs = list(zip(origin_column, dest_column))

def get_edge_ids_from_node_path(G, node_path):
    """Translate a path given as nodes into the ids of the edges it traverses.

    Args:
        G: Graph-like object where ``G[u][v]`` yields the attribute mapping of
            the edge between ``u`` and ``v``; that mapping must contain
            ``"edge_id"``.
        node_path: Sequence of nodes; consecutive entries form the edges.

    Returns:
        A list with one ``int`` edge id per consecutive node pair, in path
        order. A path with fewer than two nodes gives an empty list.
    """
    consecutive_pairs = zip(node_path, node_path[1:])
    return [int(G[a][b]["edge_id"]) for a, b in consecutive_pairs]

from itertools import islice

# 3) For each OD pair, compute the 5 shortest edge-id paths
od_top5_paths = []
pairs_without_path = 0  # OD pairs for which NetworkX reports no path at all

for origin, dest in tqdm(od_pairs, desc="Building OD → top-5 edge paths"):
    try:
        # The generator yields simple paths from shortest to longest by 'length';
        # islice stops after five so no further paths are enumerated.
        all_paths_gen = nx.shortest_simple_paths(G, origin, dest, weight="length")
        top5 = [
            get_edge_ids_from_node_path(G, node_path)
            for node_path in islice(all_paths_gen, 5)
        ]
    except nx.NetworkXNoPath:
        # No row is written for this pair, so od_top5_df can end up shorter
        # than od_df; count these pairs so the gap is reported below.
        pairs_without_path += 1
        continue

    # If less than 5 paths, pad with empty lists (a fresh list per slot)
    top5.extend([] for _ in range(5 - len(top5)))

    od_top5_paths.append({
        "origin_node": int(origin),
        "dest_node": int(dest),
        "top_1_shortest": top5[0],
        "top_2_shortest": top5[1],
        "top_3_shortest": top5[2],
        "top_4_shortest": top5[3],
        "top_5_shortest": top5[4],
    })

# 4) Save as Pickle
od_top5_df = pd.DataFrame(od_top5_paths)
od_top5_df.to_pickle("data/foot_traffic/od_top5_shortest_paths.pkl")

if pairs_without_path:
    print(f"{pairs_without_path} OD pairs had no path and were left out of the output.")
print("Top-5 shortest paths for all OD pairs saved!")


Building OD → top-5 edge paths: 100%|██████████| 1162489/1162489 [8:27:35<00:00, 38.17it/s]   


Top-5 shortest paths for all OD pairs saved!
