<a href="https://colab.research.google.com/github/eth0-02/Astro-Theme-Creek/blob/master/Spatial_Join_Hydrshed_with_Counties.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# ===== ONE-CELL COLAB: Tag each hydroshed with County name → ONE CSV (robust filenames) =====

# Deps
# Notebook shell magics (lines starting with "!") run in the runtime's shell, not in Python.
# -qq / -q and the >/dev/null redirects keep installer output out of the cell log.
!apt-get -qq update
# NOTE(review): libspatialindex-dev is presumably the native library needed by `rtree` — confirm.
!apt-get -qq install -y libspatialindex-dev >/dev/null
# Python geospatial stack; geopandas and fiona are imported further down in this cell.
!pip -q install geopandas shapely pyproj fiona rtree >/dev/null

import geopandas as gpd
import pandas as pd
import re, os
from pathlib import Path
from google.colab import files

# Prompt for the two GeoPackages and open Colab's upload widget.
print("📤 Upload the two GPKGs:\n  • Hydrosheds (e.g., Kencounty_hydroshed_storage.gpkg)\n  • Counties (e.g., kenya counties topojson.gpkg)")
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the widget, so there is nothing to process.
    raise SystemExit("No files uploaded.")

# Build one path per uploaded file name (the keys of the returned mapping).
# NOTE(review): assumes uploads land in /content — confirm if the working directory changes.
uploaded_paths = [f"/content/{name}" for name in uploaded]
print("Uploaded:", uploaded_paths)

# --- Fuzzy file role detection (handles spaces, (1), (3), etc.) ---
def norm(s: str) -> str:
    """Return *s* lower-cased with every character outside a-z / 0-9 removed."""
    lowered = s.lower()
    # Splitting on runs of unwanted characters and re-joining the pieces
    # leaves only the ASCII letters and digits, in their original order.
    kept_pieces = re.split(r"[^a-z0-9]+", lowered)
    return "".join(kept_pieces)

# Guess each upload's role from its normalised file name.
# None means "role not identified yet"; the content-based fallback below handles that.
hydros_path = None
counties_path = None
for p in uploaded_paths:
    n = norm(Path(p).name)
    # look for words regardless of spaces: 'kencounty' + 'hydroshed' → hydros layer
    if ("kencounty" in n and "hydroshed" in n) or ("hydroshed" in n and "storage" in n and "county" in n):
        hydros_path = p
    # look for 'kenya' + 'counties' → counties layer
    if ("kenya" in n and "counties" in n) or ("county" in n and "topojson" in n):
        counties_path = p

# The two tests above are independent, so one file name can satisfy both and
# end up in both roles. Treat that as "not identified" so the content-based
# fallback decides instead of silently joining a file against itself.
if hydros_path is not None and hydros_path == counties_path:
    hydros_path = None
    counties_path = None

# If still ambiguous, fall back to reading both and scoring
import fiona
def read_first(path):
    """Load the first layer listed for *path*.

    Returns a ``(GeoDataFrame, layer_name)`` pair. When no usable layer name
    is listed, the file is read without a ``layer`` argument and the second
    element is that empty/None value.
    """
    listed = fiona.listlayers(path)
    first = listed[0] if listed else None
    if not first:
        return gpd.read_file(path), first
    return gpd.read_file(path, layer=first), first

# Content-based fallback: runs only when name matching left a role unassigned.
if not hydros_path or not counties_path:
    if len(uploaded_paths) != 2:
        raise SystemExit("Please upload exactly two files.")
    g1, _ = read_first(uploaded_paths[0])
    g2, _ = read_first(uploaded_paths[1])

    def county_score(gdf):
        """Score how much *gdf* looks like the counties layer (higher = more likely)."""
        score = 0
        # Kenya counties ~47 features
        if 40 <= len(gdf) <= 60:
            score += 2
        # name-like county fields; text columns may be stored as plain `object`
        # or as pandas' dedicated string dtype, so accept both.
        name_cols = [
            c for c in gdf.columns
            if (gdf[c].dtype == object or isinstance(gdf[c].dtype, pd.StringDtype))
            and re.search(r"(county|name|adm1)", c, re.I)
        ]
        if name_cols:
            score += 2
        return score

    s1, s2 = county_score(g1), county_score(g2)
    if s1 > s2:
        counties_path, hydros_path = uploaded_paths[0], uploaded_paths[1]
    elif s2 > s1:
        counties_path, hydros_path = uploaded_paths[1], uploaded_paths[0]
    else:
        # tie-breaker: smaller feature count likely counties
        counties_path, hydros_path = (uploaded_paths[0], uploaded_paths[1]) if len(g1) < len(g2) else (uploaded_paths[1], uploaded_paths[0])

# Report which uploaded file ended up in which role.
print(f"🗂️ Hydros file:   {hydros_path}")
print(f"🗂️ Counties file: {counties_path}")

# --- Read data ---
# read_first returns the loaded frame together with the layer name it used.
hydros, hydros_layer = read_first(hydros_path)
counties, counties_layer = read_first(counties_path)
print(f"✅ Hydros:   {len(hydros)} features | CRS={hydros.crs} | layer={hydros_layer}")
print(f"✅ Counties: {len(counties)} features | CRS={counties.crs} | layer={counties_layer}")

# Stop here if either layer has no features.
if len(hydros) == 0 or len(counties) == 0:
    raise SystemExit("One of the layers is empty.")

# --- CRS handling: prefer counties CRS if projected; else EPSG:21037 (Kenya UTM 37S) ---
def ensure_projected(gdf, target="EPSG:21037"):
    """Return *gdf* in a non-geographic CRS.

    * CRS missing    -> *target* is assigned via ``set_crs`` (coordinates untouched).
    * geographic CRS -> the frame is reprojected to *target* via ``to_crs``.
    * anything else  -> *gdf* is returned unchanged.
    """
    current = gdf.crs
    if current is not None and not current.is_geographic:
        # Already in a non-geographic CRS: nothing to do.
        return gdf

    if current is None:
        print(f"• CRS missing → assigning {target}")
        return gdf.set_crs(target, allow_override=True)

    print(f"• Geographic CRS → reprojecting to {target}")
    return gdf.to_crs(target)

# Work in the counties' own CRS when it is set and not geographic; otherwise use EPSG:21037.
if counties.crs and not counties.crs.is_geographic:
    target_crs = counties.crs
else:
    target_crs = "EPSG:21037"

counties = ensure_projected(counties, target_crs)
hydros = ensure_projected(hydros, target_crs)

# ensure_projected leaves an already-projected frame untouched, so the two
# layers can still disagree; bring the hydrosheds into the counties' CRS.
if not (counties.crs == hydros.crs):
    hydros = hydros.to_crs(counties.crs)

# --- Pick county name field robustly ---
def pick_county_name_field(df):
    """Return the column of *df* that most likely holds the county name.

    Lookup order:
      1. an exact match against the preferred names, in preference order;
      2. a case-insensitive match against the same names (so ``county`` or
         ``name`` are honoured before any fuzzy guess);
      3. the first text column whose name contains ``county``, ``name`` or
         ``adm1`` (case-insensitive).

    Raises:
        ValueError: if no column qualifies.
    """
    prefs = ["CNAME", "COUNTY", "County", "COUNTY_NAME", "COUNTY_NAM", "ADM1_EN", "NAME_1", "NAME", "Name", "BNAME"]
    columns = list(df.columns)

    # 1) exact preferred names
    for c in prefs:
        if c in columns:
            return c

    # 2) same preferences ignoring case; the first column wins for each lowered name
    by_lower = {}
    for col in columns:
        by_lower.setdefault(str(col).lower(), col)
    for c in prefs:
        hit = by_lower.get(c.lower())
        if hit is not None:
            return hit

    # 3) fuzzy fallback over text columns; text may be stored as plain `object`
    #    or as pandas' dedicated string dtype, so accept both.
    cand = [
        c for c in columns
        if (df[c].dtype == object or isinstance(df[c].dtype, pd.StringDtype))
        and re.search(r"(county|name|adm1)", str(c), re.I)
    ]
    if cand:
        return cand[0]
    raise ValueError("Could not find a county name field in counties. Please rename to something like 'COUNTY' or 'NAME'.")

COUNTY_NAME_FIELD = pick_county_name_field(counties)
print(f"🔎 County name field: {COUNTY_NAME_FIELD}")

# --- Spatial join: 'within' first, then fill ONLY the leftovers from 'intersects' ---
# Replacing the whole result with an 'intersects' join would emit one row per
# (hydroshed, county) pair, duplicating every hydroshed that straddles a
# boundary. Instead each hydroshed keeps exactly one row and leftovers get the
# county they overlap the most.
# NOTE(review): relies on `hydros` having a unique index — confirm if the loading code changes.
county_for_join = counties[[COUNTY_NAME_FIELD, "geometry"]].rename(columns={COUNTY_NAME_FIELD: "COUNTY_NAME"})
hydros_tagged = gpd.sjoin(hydros, county_for_join, how="left", predicate="within").drop(columns=["index_right"])
# Overlapping county polygons could match one hydroshed twice; keep one row each.
hydros_tagged = hydros_tagged[~hydros_tagged.index.duplicated(keep="first")].copy()
unmatched_mask = hydros_tagged["COUNTY_NAME"].isna()
unmatched = int(unmatched_mask.sum())
print(f"🧩 Join 'within': unmatched = {unmatched} / {len(hydros_tagged)}")
if unmatched > 0:
    leftovers = hydros.loc[hydros_tagged.index[unmatched_mask.to_numpy()]]
    candidates = gpd.sjoin(leftovers, county_for_join, how="inner", predicate="intersects")
    if not candidates.empty:
        # County geometry for every candidate pair, in the same row order.
        county_shapes = county_for_join.geometry.loc[candidates["index_right"].to_numpy()]
        # buffer(0) is used only for scoring, to keep invalid polygons from
        # breaking the overlay; align=False pairs rows by position.
        overlap_area = candidates.geometry.buffer(0).intersection(county_shapes.buffer(0), align=False).area
        ranked = candidates.assign(_overlap_area=overlap_area.to_numpy()).sort_values(
            "_overlap_area", ascending=False, kind="stable"
        )
        # First row per hydroshed after sorting = its largest-overlap county.
        best = ranked[~ranked.index.duplicated(keep="first")]
        hydros_tagged.loc[best.index, "COUNTY_NAME"] = best["COUNTY_NAME"]
    unmatched2 = int(hydros_tagged["COUNTY_NAME"].isna().sum())
    if unmatched2 < unmatched:
        print(f"↪️ Filled leftovers via 'intersects' (largest overlap) — unmatched now {unmatched2}")

# --- Outputs: one GeoPackage (with geometry) and one CSV (attributes only) ---
OUT_GPKG  = "/content/hydrosheds_with_county.gpkg"
OUT_LAYER = "hydros_with_county"
OUT_CSV   = "/content/hydrosheds_with_county.csv"   # single CSV with ALL attributes + COUNTY_NAME

hydros_tagged.to_file(OUT_GPKG, layer=OUT_LAYER, driver="GPKG")
# The CSV carries every attribute column but not the geometry.
attributes_only = hydros_tagged.drop(columns="geometry")
attributes_only.to_csv(OUT_CSV, index=False)

print("\n✅ Done!")
print(f"GeoPackage: {OUT_GPKG}  (layer: {OUT_LAYER})")
print(f"Single CSV: {OUT_CSV}")

# Best-effort browser downloads; on any failure point the user at the Files pane.
try:
    for out_path in (OUT_GPKG, OUT_CSV):
        files.download(out_path)
except Exception:
    print("If auto-download is blocked, use the Files pane (left) to download.")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
📤 Upload the two GPKGs:
  • Hydrosheds (e.g., Kencounty_hydroshed_storage.gpkg)
  • Counties (e.g., kenya counties topojson.gpkg)


Saving Kencounty_hydroshed_storage.gpkg to Kencounty_hydroshed_storage (4).gpkg
Saving kenya counties topojson.gpkg to kenya counties topojson (2).gpkg
Uploaded: ['/content/Kencounty_hydroshed_storage (4).gpkg', '/content/kenya counties topojson (2).gpkg']
🗂️ Hydros file:   /content/Kencounty_hydroshed_storage (4).gpkg
🗂️ Counties file: /content/kenya counties topojson (2).gpkg
✅ Hydros:   2502 features | CRS=EPSG:4326 | layer=Kencounty_hydroshed_storage1
✅ Counties: 47 features | CRS=EPSG:4326 | layer=Kenya counties topojson
• Geographic CRS → reprojecting to EPSG:21037
• Geographic CRS → reprojecting to EPSG:21037
🔎 County name field: name
🧩 Join 'within': unmatched = 957 / 2502
↪️ Switched to 'intersects' — unmatched now 18

✅ Done!
GeoPackage: /content/hydrosheds_with_county.gpkg  (layer: hydros_with_county)
Single CSV: /content/hydrosheds_with_county.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>