In [2]:
# %%
import geopandas as gpd
import pandas as pd
import numpy as np
from datetime import datetime
import re

from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
from shapely.ops import unary_union

# -----------------------
# Params
# -----------------------
# GeoPackage holding both input layers; the tagged result is written back
# into this same file as a new layer.
project_db   = r"C:\Users\bsf31\Documents\data\NL060\CA_FIRES.gpkg"
faces_layer  = "california_fir_featuretopoly"   # non-overlapping faces (Feature to Polygon)
perims_layer = "california_fire_perimet_clip"   # original clipped perimeters
n_years      = 20                                # "recent" window, in years back from run time

# Columns (looked up on the perimeter layer)
name_col = "FIRE_NAME"   # fire name; combined with the event key to identify a burn event
date_col = "ALARM_DATE"  # e.g., "1/7/2025 08:00:00 (UTC)"
year_col = "YEAR_"       # optional fallback if ALARM_DATE is missing

# -----------------------
# Helpers
# -----------------------
def parse_alarm_datetime_utc(series: pd.Series) -> pd.Series:
    """Parse ALARM_DATE values into tz-aware UTC timestamps (NaT on failure).

    A trailing parenthetical such as ``(UTC)``, ``(UTC-08:00)`` or ``(PDT)``
    is stripped before parsing. The remaining wall-clock text is then
    interpreted as UTC; the stripped label is NOT applied as an offset.

    Parsing is done in two passes so that a column mixing several layouts
    (``12/31/2024 23:59:59``, ``3/15/2020``, ISO-like strings, ...) still
    parses: recent pandas versions infer one format from the first value and
    coerce everything that does not match to NaT, so values that fail the
    fast vectorised pass are retried one at a time.

    Args:
        series: Raw date values (strings, datetimes, or missing values).

    Returns:
        A Series with the same index as ``series`` and a tz-aware UTC
        datetime dtype; unparseable or missing entries are NaT.
    """
    # strip any trailing parenthetical timezone like (UTC), (UTC-08:00), (PDT), etc.
    cleaned = (
        series.astype(str)
        .str.replace(r"\s*\([^)]*\)\s*$", "", regex=True)
        .str.strip()
    )

    # Fast path: one vectorised parse, forced to UTC.
    parsed = pd.to_datetime(cleaned, errors="coerce", utc=True)

    # Slow path: retry only values that were present but failed, so a single
    # inferred format cannot silently discard differently formatted dates.
    # Work positionally so a non-unique index cannot break the assignment.
    needs_retry = (
        parsed.isna().to_numpy()
        & series.notna().to_numpy()
        & cleaned.ne("").to_numpy()
    )
    retry_pos = np.flatnonzero(needs_retry)
    if retry_pos.size:
        reparsed = [
            pd.to_datetime(value, errors="coerce", utc=True)
            for value in cleaned.iloc[retry_pos]
        ]
        # Re-wrap so the replacement values are tz-aware UTC even if all are NaT.
        parsed.iloc[retry_pos] = pd.to_datetime(reparsed, errors="coerce", utc=True)

    return parsed

def polygonal_part_or_none(geom):
    """Reduce a geometry to its areal (polygon) content.

    Returns the geometry itself when it is already a Polygon/MultiPolygon,
    the merged polygon members when it is a GeometryCollection, and None for
    missing, empty, or purely linear/point geometries.
    """
    areal_types = ("Polygon", "MultiPolygon")

    if geom is None or geom.is_empty:
        return None

    kind = geom.geom_type
    if kind in areal_types:
        return geom
    if kind != "GeometryCollection":
        # LineString/Point etc. -> no area
        return None

    areal_members = [part for part in geom.geoms if part.geom_type in areal_types]
    if not areal_members:
        return None

    # Several polygon pieces are merged into one (still polygonal) geometry.
    try:
        return unary_union(areal_members)
    except Exception:
        # Union failed: fall back to a MultiPolygon built from the flattened
        # individual polygons.
        singles = []
        for member in areal_members:
            singles.extend(
                [member] if member.geom_type == "Polygon" else list(member.geoms)
            )
        return MultiPolygon(singles) if singles else None

# -----------------------
# Load
# -----------------------
faces  = gpd.read_file(project_db, layer=faces_layer)
perims = gpd.read_file(project_db, layer=perims_layer)

# CRS match: the faces layer is the reference; perimeters are reprojected to it
if faces.crs != perims.crs:
    perims = perims.to_crs(faces.crs)

# Quick geom repair on perimeters (faces from FeatureToPoly are usually fine)
# (buffer(0) is safe; if using geographic CRS it's still okay with distance=0)
perims["geometry"] = perims.geometry.buffer(0)

# Ensure faces have a stable id (positional, assigned only if none exists yet)
if "piece_id" not in faces.columns:
    faces = faces.copy()
    faces["piece_id"] = np.arange(len(faces), dtype=int)

# Parse ALARM_DATE robustly
has_date_col = date_col in perims.columns
if has_date_col:
    perims["ALARM_DATE_UTC"] = parse_alarm_datetime_utc(perims[date_col])
else:
    # Keep the placeholder tz-aware (UTC) like the parsed case: a bare pd.NaT
    # would create a tz-naive column, and later code compares this column
    # against a tz-aware cutoff and mixes it with tz-aware fallback timestamps.
    perims["ALARM_DATE_UTC"] = pd.Series(
        pd.NaT, index=perims.index, dtype="datetime64[ns, UTC]"
    )

# YEAR_ fallback (if present): coerce to numeric, non-numeric values become NaN
has_year_col = year_col in perims.columns
if has_year_col:
    perims[year_col] = pd.to_numeric(perims[year_col], errors="coerce")

# -----------------------
# Intersection with polygonal cleanup
# -----------------------
faces_slim = faces[["piece_id", "geometry"]].copy()

# Carry only the perimeter attributes used downstream. Geometry is always part
# of the selection so the result stays spatial, and it is always an
# independent copy regardless of whether YEAR_ is present.
perim_cols = [name_col, "ALARM_DATE_UTC"]
if has_year_col:
    perim_cols.append(year_col)
perims_slim = perims[perim_cols + ["geometry"]].copy()

# Use keep_geom_type=False to avoid losing GeometryCollections;
# then convert intersections to polygonal parts only.
# NOTE(review): faces that merely share an edge with a perimeter may still
# yield near-zero-area polygon slivers here — consider an area threshold; confirm.
inter_raw = gpd.overlay(faces_slim, perims_slim, how="intersection", keep_geom_type=False)

# Keep only polygonal area from any GeometryCollections
inter_raw["geometry"] = inter_raw.geometry.apply(polygonal_part_or_none)
has_area = inter_raw.geometry.notna() & ~inter_raw.geometry.is_empty
inter = inter_raw[has_area].copy()

# ---- Diagnostics
print(f"Faces: {len(faces)}  Perims: {len(perims)}")
n_dates = perims["ALARM_DATE_UTC"].notna().sum()
print(f"Perimeters with parsed ALARM_DATE: {n_dates} / {len(perims)}")
print(f"Intersections total (raw): {len(inter_raw)}  polygonal kept: {len(inter)}")

# Cutoff for the "recent" flag: n_years before the moment the script runs (UTC).
cutoff_ts = pd.Timestamp.now(tz="UTC") - pd.DateOffset(years=n_years)
recent_col = f"RECENT_{n_years}Y"

if inter.empty:
    # No face overlaps any perimeter. Emit the same columns with the same
    # dtypes as the populated branch so the output schema is data-independent.
    out = faces.copy()
    out["LATEST_YEAR"]  = pd.Series(dtype="Int64")    # aligns to all-<NA>
    out["LATEST_FIRE"]  = pd.Series(dtype="object")
    out["LATEST_ALARM"] = pd.Series(dtype="object")   # ISO date string (or None)
    out["BURN_COUNT"]   = np.zeros(len(out), dtype="int32")
    out["HAS_OVERLAP"]  = np.zeros(len(out), dtype="uint8")
    out[recent_col]     = np.zeros(len(out), dtype="uint8")
else:
    # Everything below uses .dt accessors and compares against a tz-aware
    # cutoff, so make sure the column really is tz-aware UTC first (it is
    # tz-naive if it was created as a bare NaT placeholder).
    inter["ALARM_DATE_UTC"] = pd.to_datetime(
        inter["ALARM_DATE_UTC"], errors="coerce", utc=True
    )

    # Build an event key with tolerance:
    # 1) Prefer calendar date of ALARM_DATE
    inter["ALARM_DATE_DATE"] = inter["ALARM_DATE_UTC"].dt.date

    # 2) If ALARM_DATE is NaT but YEAR_ exists, use YEAR_ as fallback
    # (so big events with missing timestamps don't vanish)
    if has_year_col:
        inter["EVENT_KEY"] = inter["ALARM_DATE_DATE"].astype("object")
        mask_nat = inter["EVENT_KEY"].isna()
        inter.loc[mask_nat, "EVENT_KEY"] = inter.loc[mask_nat, year_col].astype("Int64")

        # For "latest" selection: construct a sortable proxy timestamp
        # Prefer ALARM_DATE_UTC; else use Jan 1 of YEAR_ (UTC)
        inter["ALARM_OR_YEAR_TS"] = inter["ALARM_DATE_UTC"]
        mask_ts_nat = inter["ALARM_OR_YEAR_TS"].isna() & inter[year_col].notna()
        if mask_ts_nat.any():
            inter.loc[mask_ts_nat, "ALARM_OR_YEAR_TS"] = pd.to_datetime(
                inter.loc[mask_ts_nat, year_col].astype(int).astype(str) + "-01-01",
                utc=True, errors="coerce"
            )
    else:
        inter["EVENT_KEY"] = inter["ALARM_DATE_DATE"]
        inter["ALARM_OR_YEAR_TS"] = inter["ALARM_DATE_UTC"]

    # Count unique events per face: distinct (FIRE_NAME, EVENT_KEY).
    # Rows with neither a date nor a year have no key and are not counted.
    unique_events = (
        inter.dropna(subset=["EVENT_KEY"])
             .drop_duplicates(subset=["piece_id", name_col, "EVENT_KEY"])
    )
    counts = unique_events.groupby("piece_id").size().rename("BURN_COUNT")

    # Latest event per face by ALARM_OR_YEAR_TS (UTC). Descending sort puts
    # NaT last, so an undated row wins only when a face has no dated rows.
    top = (
        inter.sort_values(["piece_id", "ALARM_OR_YEAR_TS"], ascending=[True, False])
             .drop_duplicates(subset="piece_id")
             .loc[:, ["piece_id", name_col, "ALARM_DATE_UTC", "ALARM_OR_YEAR_TS"]]
             .rename(columns={name_col: "LATEST_FIRE"})
    )
    top["LATEST_YEAR"]  = top["ALARM_OR_YEAR_TS"].dt.year.astype("Int64")
    top["LATEST_ALARM"] = top["ALARM_DATE_UTC"].dt.strftime("%Y-%m-%d")  # ISO if available

    # Merge onto faces (left joins: faces without any intersection are kept)
    out = faces.merge(top[["piece_id", "LATEST_FIRE", "LATEST_YEAR", "LATEST_ALARM"]],
                      on="piece_id", how="left") \
               .merge(counts, on="piece_id", how="left")

    # Flags
    out["BURN_COUNT"]  = out["BURN_COUNT"].fillna(0).astype("int32")
    out["HAS_OVERLAP"] = (out["BURN_COUNT"] > 1).astype("uint8")

    # Recent flag using ALARM_OR_YEAR_TS (so YEAR_ fallback still participates)
    latest_ts = top.set_index("piece_id")["ALARM_OR_YEAR_TS"]
    out = out.join(latest_ts.rename("__LATEST_TS__"), on="piece_id")
    out[recent_col] = out["__LATEST_TS__"].ge(cutoff_ts).fillna(False).astype("uint8")
    out = out.drop(columns="__LATEST_TS__")

# -----------------------
# Write a clean schema
# -----------------------
# The result goes into the same GeoPackage, under a layer named after the
# faces layer with a "_tagged" suffix.
out_layer = f"{faces_layer}_tagged"

# Export only the face id, its geometry and the derived tag columns.
keep_cols = (
    ["piece_id", "geometry"]
    + ["BURN_COUNT", "HAS_OVERLAP"]
    + ["LATEST_YEAR", "LATEST_FIRE", "LATEST_ALARM"]
    + [f"RECENT_{n_years}Y"]
)
out_to_write = out[keep_cols].copy()

out_to_write.to_file(project_db, driver="GPKG", layer=out_layer)
print(f"✅ Wrote tagged faces to layer: {out_layer}")


Faces: 2583  Perims: 487
Perimeters with parsed ALARM_DATE: 329 / 487
Intersections total (raw): 8857  polygonal kept: 5452
✅ Wrote tagged faces to layer: california_fir_featuretopoly_tagged
