# Clean outputs

Post-processing step after notebooks 02 and 03.

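Removes features with NaN or Inf corner coordinates from `hexes.geojson`,
drops the matching entries from `meta.json`, and drops orphan rows referencing
those hex IDs from all connectivity parquets. Connectivity weights are also
rounded to 3 significant figures, and rows whose weight is not positive are dropped.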

In [1]:
import json
import math
import numpy as np
import pandas as pd
from pathlib import Path

# Directory (relative to the notebook's working directory) holding the files this
# notebook reads and rewrites in place: hexes.geojson, meta.json and connectivity_*.pq.
OUT_DIR = Path("../../database/data")

In [2]:
# Collect the IDs of hexes whose first coordinate ring (`coordinates[0]`)
# contains at least one NaN or Inf value.
with open(OUT_DIR / "hexes.geojson") as geojson_file:
    gj = json.load(geojson_file)

bad_ids = {
    feature["properties"]["id"]
    for feature in gj["features"]
    # A hex is "bad" as soon as any single coordinate value of any corner
    # is not a finite number.
    if not all(
        math.isfinite(value)
        for corner in feature["geometry"]["coordinates"][0]
        for value in corner
    )
}

print(f"Hex IDs with NaN/Inf corners: {sorted(bad_ids)} ({len(bad_ids)} total)")

Hex IDs with NaN/Inf corners: [] (0 total)


In [3]:
# Remove the flagged features from the in-memory GeoJSON and write the
# result back over hexes.geojson.
n_features_before = len(gj["features"])
kept_features = []
for feature in gj["features"]:
    if feature["properties"]["id"] in bad_ids:
        continue  # flagged as having a non-finite corner; leave it out
    kept_features.append(feature)
gj["features"] = kept_features
with open(OUT_DIR / "hexes.geojson", "w") as geojson_out:
    json.dump(gj, geojson_out)
print(f"hexes.geojson: {n_features_before} -> {len(gj['features'])} features")

# meta.json is columnar: {"id": {"0": 0, ...}, "lon": {...}, ...}
# Every column maps a row key to a value, so removing a hex means removing its
# row key from every column.
with open(OUT_DIR / "meta.json") as f:
    meta = json.load(f)
if bad_ids:
    ids_col = meta["id"]  # {"0": 0, "1": 1, ...}
    # Row keys (not hex IDs) of the entries that belong to a bad hex.
    bad_row_keys = {k for k, v in ids_col.items() if v in bad_ids}
    before_meta = len(ids_col)
    meta = {col: {k: v for k, v in col_data.items() if k not in bad_row_keys}
            for col, col_data in meta.items()}
    with open(OUT_DIR / "meta.json", "w") as f:
        json.dump(meta, f)
    print(f"meta.json: {before_meta} -> {len(meta['id'])} entries")
else:
    print(f"meta.json: no bad IDs, unchanged ({len(meta['id'])} entries)")

# Consistency check. `bad_ids` is recomputed from hexes.geojson on every run, and
# that file is overwritten by this notebook. If an earlier run rewrote
# hexes.geojson but did not get as far as meta.json, `bad_ids` is empty now and
# the stale meta rows can no longer be found from coordinates. Nothing is deleted
# here; the mismatch is only reported so it does not go unnoticed.
feature_ids = {feat["properties"]["id"] for feat in gj["features"]}
meta_only_ids = set(meta["id"].values()) - feature_ids
if meta_only_ids:
    print(f"meta.json: WARNING {len(meta_only_ids)} ids have no feature in hexes.geojson")

hexes.geojson: 8414 -> 8414 features
meta.json: no bad IDs, unchanged (8425 entries)


In [4]:
# Drop orphan rows from all connectivity parquets, round weights to 3 sig figs, and drop zero-weight rows
for path in sorted(OUT_DIR.glob("connectivity_*.pq")):
    df = pd.read_parquet(path)

    # Orphans: rows whose start or end hex was removed. Both ID columns get the
    # same int cast so the membership test treats them identically; casting only
    # one of them could silently miss matches in the other.
    mask = df["start_id"].astype(int).isin(bad_ids) | df["end_id"].astype(int).isin(bad_ids)
    dropped = mask.sum()
    df = df[~mask]

    # Discard unusable weights *before* rounding. log10 of a zero, negative or
    # NaN weight yields -inf/NaN (with RuntimeWarnings), and such rows would then
    # only disappear because NaN fails the `> 0` test further down.
    before_zero = len(df)
    w = df["weight"].values
    df = df[np.isfinite(w) & (w > 0)].copy()

    # Round to 3 significant figures: scale each weight so its leading digit
    # sits in the hundreds place, round to an integer, then scale back.
    w = df["weight"].values
    factor = 10 ** (2 - np.floor(np.log10(w)))
    df["weight"] = np.round(w * factor) / factor

    # Guard: keep the original post-rounding filter so that any weight that did
    # not survive rounding as a positive number is still dropped and counted.
    df = df[df["weight"] > 0]
    dropped_zero = before_zero - len(df)

    # Overwrites the input parquet in place.
    df.to_parquet(path, index=False)
    print(f"{path.name}: dropped {dropped} orphans, {dropped_zero} zero-weight rows, {len(df):,} remaining")

connectivity_05m_00d-07d.pq: dropped 0 orphans, 0 zero-weight rows, 1,028,478 remaining


connectivity_05m_07d-14d.pq: dropped 0 orphans, 0 zero-weight rows, 1,892,431 remaining


connectivity_05m_07d-28d.pq: dropped 0 orphans, 0 zero-weight rows, 3,407,838 remaining
connectivity_10m_00d-07d.pq: dropped 0 orphans, 0 zero-weight rows, 1,015,297 remaining


connectivity_10m_07d-14d.pq: dropped 0 orphans, 0 zero-weight rows, 1,864,143 remaining


connectivity_10m_07d-28d.pq: dropped 0 orphans, 0 zero-weight rows, 3,320,873 remaining
connectivity_15m_00d-07d.pq: dropped 0 orphans, 0 zero-weight rows, 967,342 remaining


connectivity_15m_07d-14d.pq: dropped 0 orphans, 0 zero-weight rows, 1,802,339 remaining


connectivity_15m_07d-28d.pq: dropped 0 orphans, 0 zero-weight rows, 3,215,520 remaining
