# Clean outputs

Post-processing step after notebooks 02 and 03.

Removes features with NaN or infinite corner coordinates from `hexes.geojson`,
drops orphan rows referencing those hex IDs from all connectivity parquets,
and rounds the connectivity weights to 6 decimal places.

In [1]:
import json
import math
import pandas as pd
from pathlib import Path

# Directory holding the files this notebook rewrites in place: hexes.geojson and
# the connectivity_*.pq parquets. The path is relative, so it is resolved against
# the kernel's current working directory.
OUT_DIR = Path("../../database/data")

In [2]:
# Collect the IDs of hexes whose first coordinate ring holds a NaN or Inf value
with open(OUT_DIR / "hexes.geojson") as geojson_file:
    gj = json.load(geojson_file)

# A feature is "bad" as soon as one number in its first ring is not finite.
bad_ids = {
    feat["properties"]["id"]
    for feat in gj["features"]
    if not all(
        math.isfinite(value)
        for corner in feat["geometry"]["coordinates"][0]
        for value in corner
    )
}

print(f"Hex IDs with NaN/Inf corners: {sorted(bad_ids)} ({len(bad_ids)} total)")

Hex IDs with NaN/Inf corners: [5519] (1 total)


In [3]:
# Drop bad features from hexes.geojson.
# `gj` (the parsed GeoJSON) and `bad_ids` (IDs to remove) come from the previous
# cell. The result is dumped to a temporary sibling file that is then renamed
# over the original, so an interrupted dump cannot leave a truncated
# hexes.geojson behind.
hexes_path = OUT_DIR / "hexes.geojson"
tmp_path = hexes_path.with_name(hexes_path.name + ".tmp")

before = len(gj["features"])
gj["features"] = [f for f in gj["features"] if f["properties"]["id"] not in bad_ids]
with open(tmp_path, "w") as f:
    json.dump(gj, f)
# Both paths live in the same directory, so the rename stays on one filesystem.
tmp_path.replace(hexes_path)
print(f"hexes.geojson: {before} -> {len(gj['features'])} features")

hexes.geojson: 8415 -> 8414 features


In [4]:
# Drop orphan rows from all connectivity parquets and round weights to 6 dp.
# `bad_ids` is the set of hex IDs collected earlier in this notebook; a row is an
# orphan when either of its endpoints (start_id or end_id) is one of those IDs.
for path in sorted(OUT_DIR.glob("connectivity_*.pq")):
    df = pd.read_parquet(path)
    # Cast BOTH endpoint columns to int before matching. Previously only end_id
    # was cast, so a start_id stored with a non-integer dtype (e.g. strings) could
    # never match the integer IDs in bad_ids and its orphan rows would be kept.
    mask = df["start_id"].astype(int).isin(bad_ids) | df["end_id"].astype(int).isin(bad_ids)
    dropped = int(mask.sum())
    # Copy the filtered frame so the weight assignment below works on its own data.
    df = df[~mask].copy()
    df["weight"] = df["weight"].round(6)
    # Overwrites the input parquet in place, without the pandas index.
    df.to_parquet(path, index=False)
    print(f"{path.name}: dropped {dropped} rows, {len(df):,} remaining, weight rounded to 6dp")

connectivity_05m_00d-07d.pq: dropped 0 rows, 1,028,478 remaining, weight rounded to 6dp


connectivity_05m_07d-14d.pq: dropped 0 rows, 1,886,555 remaining, weight rounded to 6dp


connectivity_05m_07d-28d.pq: dropped 0 rows, 3,407,838 remaining, weight rounded to 6dp
connectivity_10m_00d-07d.pq: dropped 1564 rows, 1,015,297 remaining, weight rounded to 6dp


connectivity_10m_07d-14d.pq: dropped 2345 rows, 1,864,143 remaining, weight rounded to 6dp


connectivity_10m_07d-28d.pq: dropped 3348 rows, 3,320,873 remaining, weight rounded to 6dp
connectivity_15m_00d-07d.pq: dropped 1497 rows, 967,342 remaining, weight rounded to 6dp


connectivity_15m_07d-14d.pq: dropped 2296 rows, 1,802,339 remaining, weight rounded to 6dp


connectivity_15m_07d-28d.pq: dropped 3268 rows, 3,215,520 remaining, weight rounded to 6dp
