# Clean outputs

Post-processing step after notebooks 02 and 03.

Removes features with NaN or infinite corner coordinates from `hexes.geojson`
(and the matching rows from `meta.json`), then drops orphan rows referencing
those hex IDs from all connectivity parquets.

In [None]:
import json
import math
import numpy as np
import pandas as pd
from pathlib import Path

# Resolve the data directory relative to the current working directory:
# prefer ../database/data when it exists, otherwise fall back to
# ../../database/data.
_cwd = Path.cwd()
_near = _cwd / "../database/data"
_far = _cwd / "../../database/data"
OUT_DIR = (_near if _near.exists() else _far).resolve()
print(f"OUT_DIR: {OUT_DIR}")

In [None]:
# Collect the IDs of hexes whose first coordinate ring contains a
# non-finite value (NaN or +/-Inf).
with open(OUT_DIR / "hexes.geojson") as f:
    gj = json.load(f)

bad_ids = {
    feature["properties"]["id"]
    for feature in gj["features"]
    if not all(
        math.isfinite(value)
        for corner in feature["geometry"]["coordinates"][0]
        for value in corner
    )
}

print(f"Hex IDs with NaN/Inf corners: {sorted(bad_ids)} ({len(bad_ids)} total)")

In [None]:
# Rewrite hexes.geojson without the features whose ID is in bad_ids.
before = len(gj["features"])
kept_features = []
for feature in gj["features"]:
    if feature["properties"]["id"] in bad_ids:
        continue
    kept_features.append(feature)
gj["features"] = kept_features
with open(OUT_DIR / "hexes.geojson", "w") as f:
    json.dump(gj, f)
print(f"hexes.geojson: {before} -> {len(gj['features'])} features")

# meta.json is stored column-wise, one dict per column keyed by row label:
#   {"id": {"0": 0, ...}, "lon": {...}, ...}
# Removing a hex therefore means removing its row key from every column.
with open(OUT_DIR / "meta.json") as f:
    meta = json.load(f)

if not bad_ids:
    # Nothing to remove: leave the file on disk untouched.
    print(f"meta.json: no bad IDs, unchanged ({len(meta['id'])} entries)")
else:
    ids_col = meta["id"]  # {"0": 0, "1": 1, ...}
    before_meta = len(ids_col)
    bad_row_keys = {row_key for row_key, hex_id in ids_col.items() if hex_id in bad_ids}

    pruned = {}
    for col, col_data in meta.items():
        pruned[col] = {k: v for k, v in col_data.items() if k not in bad_row_keys}
    meta = pruned

    with open(OUT_DIR / "meta.json", "w") as f:
        json.dump(meta, f)
    print(f"meta.json: {before_meta} -> {len(meta['id'])} entries")

In [None]:
# Clean every connectivity parquet in place:
#   1. drop orphan rows whose start or end hex ID is in bad_ids,
#   2. drop rows whose weight is not strictly positive,
#   3. round the surviving weights to 3 significant figures.
# Non-positive weights are removed *before* rounding: log10 of zero or a
# negative number gives -inf/NaN and triggers NumPy RuntimeWarnings, so the
# rounding step must only ever see positive values.
for path in sorted(OUT_DIR.glob("connectivity_*.pq")):
    df = pd.read_parquet(path)

    # NOTE(review): only end_id is cast to int before the membership test;
    # presumably start_id is already an integer column -- confirm.
    mask = df["end_id"].astype(int).isin(bad_ids) | df["start_id"].isin(bad_ids)
    dropped = mask.sum()
    df = df[~mask]

    before_zero = len(df)
    # NaN weights also fail this comparison and are dropped here.
    df = df[df["weight"] > 0].copy()

    # Round to 3 significant figures: scale each value so its leading three
    # digits sit left of the decimal point, round, then scale back.
    w = df["weight"].to_numpy()
    scale = 10 ** (2 - np.floor(np.log10(w)))
    df["weight"] = np.round(w * scale) / scale

    # Weights at the extremes of the float range (inf, subnormals) can still
    # come out of the rounding as NaN; this guard removes them.
    df = df[df["weight"] > 0]
    dropped_zero = before_zero - len(df)

    df.to_parquet(path, index=False)
    print(f"{path.name}: dropped {dropped} orphans, {dropped_zero} zero-weight rows, {len(df):,} remaining")

## Prune small disconnected hex clusters

Uses cube-coordinate adjacency to find connected components of the hex grid.
Components with ≤ 3 hexes (i.e. fewer than `MIN_CLUSTER_SIZE` = 4) are removed from `hexes.geojson`, `meta.json`, and the connectivity parquets.

In [None]:
# Components with fewer hexes than this are pruned.
MIN_CLUSTER_SIZE = 4

# Load the label -> hex ID map; labels are cube-coordinate strings
# such as "(-1, -19, 20)".
label_to_id = json.loads((OUT_DIR / "hex_label_to_id.json").read_text())

def parse_cube(label: str) -> tuple:
    """Convert a hex label such as "(-1, -19, 20)" into a tuple of ints."""
    inner = label.strip("()")
    return tuple(map(int, inner.split(",")))

from collections import Counter

# Map every cube coordinate to its integer hex ID.
# NOTE(review): hex_label_to_id.json is not rewritten by the earlier cleaning
# cells shown here, so hexes already removed from hexes.geojson may still
# count toward component sizes -- confirm whether that is intended.
cubes = {}
for lbl, hex_id in label_to_id.items():
    cubes[parse_cube(lbl)] = int(hex_id)
cube_set = set(cubes)

# Offsets to the six neighbours of a hex in cube coordinates.
DIRECTIONS = [
    (1, -1, 0), (-1, 1, 0),
    (1, 0, -1), (-1, 0, 1),
    (0, 1, -1), (0, -1, 1),
]

# Flood-fill the grid to find connected components. Cells are taken from the
# end of the list, so the traversal is depth-first; the resulting components
# do not depend on the visiting order.
visited = set()
components = []   # list of sets of int IDs

for seed in cube_set:
    if seed in visited:
        continue
    members = set()
    stack = [seed]
    while stack:
        cur = stack.pop()
        if cur in visited:
            continue
        visited.add(cur)
        members.add(cubes[cur])
        neighbours = ((cur[0] + d[0], cur[1] + d[1], cur[2] + d[2]) for d in DIRECTIONS)
        stack.extend(nb for nb in neighbours if nb in cube_set and nb not in visited)
    components.append(members)

components.sort(key=len)
print(f"Total components: {len(components)}")
print("Size distribution (size: count):")
size_counts = Counter(map(len, components))
for size, count in sorted(size_counts.items()):
    print(f"  {size:4d} hex: {count:4d} component(s)")

# Gather the IDs of every hex that belongs to an undersized component.
small_ids = set()
for members in components:
    if len(members) < MIN_CLUSTER_SIZE:
        small_ids.update(members)
print(f"\nHex IDs to prune (cluster size < {MIN_CLUSTER_SIZE}): {len(small_ids)}")

In [None]:
# Reload hexes.geojson from disk and rewrite it without the features whose
# ID is in small_ids.
with open(OUT_DIR / "hexes.geojson") as f:
    gj = json.load(f)
before = len(gj["features"])
surviving = []
for feature in gj["features"]:
    if feature["properties"]["id"] not in small_ids:
        surviving.append(feature)
gj["features"] = surviving
with open(OUT_DIR / "hexes.geojson", "w") as f:
    json.dump(gj, f)
print(f"hexes.geojson: {before} -> {len(gj['features'])} features")

# Reload meta.json and remove, from every column, the row keys whose "id"
# value is in small_ids; the file is rewritten unconditionally.
with open(OUT_DIR / "meta.json") as f:
    meta = json.load(f)
ids_col = meta["id"]
before_meta = len(ids_col)
bad_row_keys = {row_key for row_key, hex_id in ids_col.items() if hex_id in small_ids}
pruned = {}
for col, col_data in meta.items():
    pruned[col] = {k: v for k, v in col_data.items() if k not in bad_row_keys}
meta = pruned
with open(OUT_DIR / "meta.json", "w") as f:
    json.dump(meta, f)
print(f"meta.json: {before_meta} -> {len(meta['id'])} entries")

# Rewrite each connectivity parquet without the rows that start or end at a
# hex in small_ids.
for path in sorted(OUT_DIR.glob("connectivity_*.pq")):
    df = pd.read_parquet(path)
    before_pq = len(df)
    end_hit = df["end_id"].astype(int).isin(small_ids)
    start_hit = df["start_id"].isin(small_ids)
    mask = end_hit | start_hit
    n_dropped = mask.sum()
    df = df.loc[~mask]
    df.to_parquet(path, index=False)
    print(f"{path.name}: {before_pq:,} -> {len(df):,} rows ({n_dropped} dropped)")