In [4]:
# Utilities to compute Conditional Flame Length (CFL) from FLP_English.csv files
# and (optionally) rasterize CFL using XPos/YPos as a grid.
#
# Assumptions:
# - Columns: XPos, YPos, PBurn, FIL1..FIL6.  FILk are probabilities for the 6 flame-length bins.
# - Midpoints (feet) for the 6 classes: [1, 3, 5, 7, 10, 14]
# - Some datasets store FILk as conditional-on-burn probabilities that sum to ~1; others
#   store unconditional probabilities that sum to ~PBurn. We'll auto-detect row-by-row.
#
# Output:
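# - DataFrame with CFL_ft and a `mode` flag for how each row was interpreted
#   ('conditional', 'unconditional', or 'unknown' for rows where PBurn and the FIL sum are both ~0),
# - Optional GeoTIFF, written only when requested; a CRS is attached only if an EPSG code is provided.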
#
# NOTE: This cell defines functions only. Nothing is executed on your files here.

import os
from typing import Iterable, Tuple, Optional, Dict

import numpy as np
import pandas as pd

import rasterio
from rasterio.transform import Affine



# Representative flame length (feet) of each of the 6 flame-length classes, ordered FIL1 -> FIL6
FL_MIDPOINTS_FT = np.asarray((1.0, 3.0, 5.0, 7.0, 10.0, 14.0), dtype=np.float64)


def _auto_detect_conditional(row: pd.Series, fil_cols: Iterable[str], pburn_col: str = "PBurn", tol: float = 1e-3) -> str:
    """
    Heuristic to determine whether the FIL columns of one row are conditional
    probabilities (sum≈1) or unconditional probabilities (sum≈PBurn).

    Parameters:
    - row: one record holding the FIL columns and the burn-probability column.
    - fil_cols: labels of the FIL columns; any iterable (list, tuple, generator) is accepted.
    - pburn_col: label of the burn-probability column.
    - tol: threshold below which PBurn and the FIL sum are both treated as zero.
      The closeness slack is max(tol, 0.05), so tol only widens it above 0.05.

    Returns one of:
    - 'conditional'
    - 'unconditional'
    - 'unknown' (PBurn and the FIL sum are both below tol; the caller decides how to treat it)
    """
    # pandas treats a tuple key as ONE label (KeyError on a flat index),
    # so materialise the iterable as a list before indexing.
    fil_sum = float(row[list(fil_cols)].sum())
    pb = float(row[pburn_col])

    # guard against weird/empty rows
    if pb < tol and fil_sum < tol:
        return "unknown"

    slack = max(tol, 0.05)  # allow some slack for rounding/noise
    dist_to_one = abs(fil_sum - 1.0)
    dist_to_pburn = abs(fil_sum - pb)

    # Order matters: "sum≈1" wins when both tests would pass (e.g. PBurn≈1).
    if dist_to_one <= slack:
        return "conditional"
    if dist_to_pburn <= slack:
        return "unconditional"

    # If neither is close, choose the closer one (ties go to unconditional)
    return "conditional" if dist_to_one < dist_to_pburn else "unconditional"


def process_csv(csv_path: str,
                x_col: str = "XPos",
                y_col: str = "YPos",
                pburn_col: str = "PBurn",
                fil_cols: Iterable[str] = ("FIL1", "FIL2", "FIL3", "FIL4", "FIL5", "FIL6"),
                midpoint_ft: np.ndarray = FL_MIDPOINTS_FT,
                drop_na_rows: bool = True) -> pd.DataFrame:
    """
    Read one FLP_English.csv and compute Conditional Flame Length (CFL) in feet.
    Auto-detects if FIL columns are conditional or unconditional per row.

    Parameters:
    - csv_path: path of the CSV to read.
    - x_col, y_col, pburn_col: column names for the coordinates and burn probability.
    - fil_cols: names of the flame-length probability columns (any iterable).
    - midpoint_ft: one flame-length midpoint (feet) per FIL column.
    - drop_na_rows: drop rows with a missing PBurn or FIL value before computing.

    Returns a DataFrame with columns: [XPos, YPos, PBurn, CFL_ft, mode]
      - mode is 'conditional', 'unconditional' or 'unknown'. 'unknown' marks rows where
        PBurn and the FIL sum are both ~0; such rows use the unconditional formula,
        which yields 0 when PBurn is 0.

    Raises ValueError if required columns are missing or midpoint_ft does not have
    one entry per FIL column.
    """
    # pandas treats a tuple key as ONE column label, so `df[fil_cols] = ...` with the
    # default tuple does not address six columns. Work with a list throughout.
    fil_list = list(fil_cols)

    df = pd.read_csv(csv_path)
    print("\nColumns:", df.columns.tolist())
    # Basic validation
    needed = [x_col, y_col, pburn_col, *fil_list]
    missing = [c for c in needed if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns in {os.path.basename(csv_path)}: {missing}")

    midpoints = np.asarray(midpoint_ft, dtype=np.float64)
    if midpoints.shape != (len(fil_list),):
        raise ValueError(
            f"midpoint_ft must have one value per FIL column "
            f"(expected {len(fil_list)}, got shape {midpoints.shape})"
        )

    if drop_na_rows:
        df = df.dropna(subset=[*fil_list, pburn_col])

    # Coerce to numeric without assigning back into `df` (avoids chained-assignment
    # warnings on the dropna result); unparseable values become 0.0.
    fil_numeric = df[fil_list].apply(pd.to_numeric, errors="coerce").fillna(0.0)
    pburn_numeric = pd.to_numeric(df[pburn_col], errors="coerce").fillna(0.0)

    fil_vals = fil_numeric.to_numpy(dtype=np.float64)   # (n, k)
    pburn = pburn_numeric.to_numpy(dtype=np.float64)    # (n,)
    fil_sum = fil_vals.sum(axis=1)                      # (n,)

    # Two candidate CFLs: the expected midpoint as-is, or renormalised by PBurn
    cfl_if_conditional = fil_vals @ midpoints           # sum(FILk * mid_k)
    with np.errstate(divide='ignore', invalid='ignore'):
        cfl_if_unconditional = np.divide(cfl_if_conditional,
                                         pburn,
                                         out=np.zeros_like(fil_sum, dtype=np.float64),
                                         where=pburn > 0)

    # Vectorized form of the per-row heuristic in _auto_detect_conditional (default tol).
    # Building a Series for every row made this step very slow on large grids.
    tol = 1e-3
    slack = max(tol, 0.05)
    dist_to_one = np.abs(fil_sum - 1.0)
    dist_to_pburn = np.abs(fil_sum - pburn)
    is_unknown = (pburn < tol) & (fil_sum < tol)
    is_conditional = ~is_unknown & (
        (dist_to_one <= slack)
        | ((dist_to_pburn > slack) & (dist_to_one < dist_to_pburn))
    )

    modes = np.full(len(df), "unconditional", dtype=object)
    modes[is_conditional] = "conditional"
    modes[is_unknown] = "unknown"

    # Anything not classified 'conditional' (including 'unknown') uses the unconditional formula
    cfl_result = np.where(is_conditional, cfl_if_conditional, cfl_if_unconditional)

    out = df[[x_col, y_col, pburn_col]].copy()
    out[pburn_col] = pburn_numeric
    out = out.rename(columns={x_col: "XPos", y_col: "YPos", pburn_col: "PBurn"})
    out["CFL_ft"] = cfl_result
    out["mode"] = modes

    return out


def infer_grid_params(df_xy: pd.DataFrame,
                      x_col: str = "XPos",
                      y_col: str = "YPos") -> Tuple[np.ndarray, np.ndarray, float, float]:
    """
    Derive the grid implied by the coordinate columns of `df_xy`.

    The distinct X and Y values are sorted, and the cell size along each axis is
    taken as the most frequent gap between neighbouring distinct values (gaps are
    rounded to 6 decimals first so floating-point noise does not split the count).

    Returns:
      unique_x, unique_y, dx, dy

    Raises ValueError when either axis has fewer than two distinct positions.
    """
    unique_x = np.sort(df_xy[x_col].unique())
    unique_y = np.sort(df_xy[y_col].unique())

    if min(len(unique_x), len(unique_y)) < 2:
        raise ValueError("Not enough unique X/Y positions to form a grid.")

    spacings = []
    for axis_values in (unique_x, unique_y):
        gaps = np.round(np.diff(axis_values), 6)
        candidates, frequency = np.unique(gaps, return_counts=True)
        # argmax returns the first maximum, i.e. the smallest gap among equally common ones
        spacings.append(float(candidates[np.argmax(frequency)]))

    dx, dy = spacings
    return unique_x, unique_y, dx, dy


def df_to_geotiff(df_cfl: pd.DataFrame,
                  out_path: str,
                  crs_epsg: Optional[int] = None,
                  x_col: str = "XPos",
                  y_col: str = "YPos",
                  value_col: str = "CFL_ft",
                  nodata: float = np.nan) -> str:
    """
    Rasterize CFL values to a single-band float32 GeoTIFF using the grid implied by XPos/YPos.

    Parameters:
    - df_cfl: table holding the coordinate columns and the value column.
    - out_path: destination file; its parent directory is created if it does not exist.
    - crs_epsg: EPSG code to tag the raster with. If None, the GeoTIFF is written without a CRS.
    - x_col, y_col, value_col: column names to read.
    - nodata: value stored in cells that have no input row and declared as the raster's nodata.

    Returns the output path.

    Raises RuntimeError if rasterio is unavailable, and ValueError (from
    infer_grid_params) if the coordinates do not span at least a 2x2 grid.
    """
    if rasterio is None:
        raise RuntimeError("rasterio is not available in this environment. Install rasterio to write GeoTIFFs.")

    xs, ys, dx, dy = infer_grid_params(df_cfl, x_col=x_col, y_col=y_col)

    ncols = len(xs)
    nrows = len(ys)

    # Empty cells must hold the same value that is declared as nodata in the profile;
    # the fill used to be hard-coded NaN regardless of `nodata`.
    fill_value = np.nan if nodata is None else nodata
    arr = np.full((nrows, ncols), fill_value, dtype=np.float32)

    # xs/ys are the sorted distinct coordinates, so searchsorted returns each point's
    # exact position among them. Row 0 is the top of the raster, so flip Y (maxY -> minY).
    # NOTE(review): cells are placed by rank among the distinct coordinates, while the
    # transform assumes a constant dx/dy. A grid with a whole row or column missing
    # would be shifted past the gap — confirm inputs are gap-free.
    col_idx = np.searchsorted(xs, df_cfl[x_col].to_numpy())
    row_idx = (nrows - 1) - np.searchsorted(ys, df_cfl[y_col].to_numpy())
    arr[row_idx, col_idx] = df_cfl[value_col].to_numpy(dtype=np.float32)

    # Define geotransform (Affine): top-left corner of the top-left pixel
    # Using half-cell offset so that cell centers align to XPos/YPos
    min_x = xs.min()
    max_y = ys.max()
    transform = Affine.translation(min_x - dx / 2.0, max_y + dy / 2.0) * Affine.scale(dx, -dy)

    profile = {
        "driver": "GTiff",
        "height": nrows,
        "width": ncols,
        "count": 1,
        "dtype": "float32",
        "transform": transform,
        "compress": "lzw",
        "nodata": nodata,
        "tiled": True,
        "interleave": "band"
    }
    if crs_epsg is not None:
        profile["crs"] = f"EPSG:{crs_epsg}"

    # dirname is "" for a bare file name, and os.makedirs("") raises FileNotFoundError
    out_folder = os.path.dirname(out_path)
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)

    with rasterio.open(out_path, "w", **profile) as dst:
        dst.write(arr, 1)

    return out_path


def batch_process_folder(root_dir: str,
                         pattern_filename: str = "FLP_English.csv",
                         write_geotiffs: bool = False,
                         out_dir: Optional[str] = None,
                         crs_epsg: Optional[int] = None) -> Dict[str, pd.DataFrame]:
    """
    Recursively scan `root_dir` for files named exactly `pattern_filename`, compute
    CFL for each one and, when `write_geotiffs` is set, rasterize each result.

    GeoTIFFs go to `out_dir` (default: `<root_dir>/_CFL_geotiffs`) and are named
    after the CSV's folder relative to `root_dir`, with path separators replaced
    by underscores. A CSV that fails to process, or a raster that fails to write,
    is reported with a [WARN] line and does not stop the batch.

    Returns a dict: {csv_path: df_with_CFL}
    """
    collected: Dict[str, pd.DataFrame] = {}

    for folder, _subdirs, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename != pattern_filename:
                continue

            csv_path = os.path.join(folder, filename)
            try:
                frame = process_csv(csv_path)
            except Exception as e:
                print(f"[WARN] Skipping {csv_path}: {e}")
                continue

            collected[csv_path] = frame

            if not write_geotiffs:
                continue

            # Resolve the default output folder lazily, on the first raster to write
            if out_dir is None:
                out_dir = os.path.join(root_dir, "_CFL_geotiffs")
            os.makedirs(out_dir, exist_ok=True)

            rel = os.path.relpath(folder, root_dir).replace(os.sep, "_")
            label = "root" if rel == "." else rel
            out_path = os.path.join(out_dir, f"CFL_{label}.tif")
            try:
                df_to_geotiff(frame, out_path=out_path, crs_epsg=crs_epsg)
                print(f"[OK] Wrote {out_path}")
            except Exception as e:
                print(f"[WARN] Could not write GeoTIFF for {csv_path}: {e}")

    return collected


In [6]:
# Patch: make `process_csv` robust to header whitespace and minor naming variations.
# It now normalizes column names by stripping spaces and removing internal whitespace,
# and it will auto-detect FIL1..FIL6 even if written like " FIL 1" etc.

import re
from typing import List

def _normalize_columns(cols: Iterable[str]) -> List[str]:
    """Return each label in `cols` as a string with all whitespace (outer and inner) removed."""
    return [re.sub(r"\s+", "", str(label).strip()) for label in cols]

def _resolve_columns(df: pd.DataFrame,
                     x_col: str,
                     y_col: str,
                     pburn_col: str,
                     fil_cols: Iterable[str]) -> Tuple[str, str, str, Tuple[str, ...]]:
    """
    Resolve actual column names in df for the requested logical columns.
    Works case-insensitively and ignores whitespace differences.

    Parameters:
    - df: frame whose column labels are searched (labels need not be strings).
    - x_col, y_col, pburn_col: preferred names, tried before the built-in aliases
      (x/xpos/lon/longitude, y/ypos/lat/latitude, pb/pburn/burnprob/burnprobability).
    - fil_cols: preferred names for flame-length classes 1..6, by position. Each is
      tried before the built-in spellings fil<k>, fl<k>, flp<k>, with an optional
      underscore and leading zeros.

    Returns (x, y, pburn, (fil1, ..., fil6)) using the labels exactly as they appear in df.

    Raises KeyError('XPos' | 'YPos' | 'PBurn' | 'FIL<k>') for the first logical
    column that cannot be found.
    """
    def _norm(label: object) -> str:
        # str() first: a non-string header (e.g. an integer label) has no .strip()
        return re.sub(r"\s+", "", str(label).strip()).lower()

    # normalized -> actual; if two headers normalize identically the later one wins
    norm_map = {_norm(actual): actual for actual in df.columns}

    def find_one(target: str, aliases: Iterable[str]) -> str:
        # first alias (in priority order) whose normalized form is a known column
        for alias in aliases:
            key = _norm(alias)
            if key in norm_map:
                return norm_map[key]
        # if missing, raise with the logical name so the caller can report it
        raise KeyError(target)

    # resolve X, Y, PBurn — the caller-supplied name always has priority
    resolved_x = find_one("XPos", [x_col, "x", "xpos", "lon", "longitude"])
    resolved_y = find_one("YPos", [y_col, "y", "ypos", "lat", "latitude"])
    resolved_pb = find_one("PBurn", [pburn_col, "pb", "pburn", "burnprob", "burnprobability"])

    # resolve FIL columns: allow "FIL1", "FIL 1", " FL1 " etc.
    requested_fils = list(fil_cols) if fil_cols is not None else []
    resolved_fils = []
    for k in range(1, 7):
        # Honour the caller's name for class k first (this argument used to be ignored)
        candidates = []
        if k <= len(requested_fils):
            candidates.append(_norm(requested_fils[k - 1]))
        candidates += [f"fil{k}", f"fl{k}", f"flp{k}", f"fil_{k}", f"fl_{k}"]

        found = None
        for c in candidates:
            if c in norm_map:
                found = norm_map[c]
                break

        if found is None:
            # Fallback: also accept zero-padded class numbers such as "FIL01" / "fl_01"
            pattern = re.compile(rf"(fil|fl|flp)_?0*{k}")
            for norm, actual in norm_map.items():
                if pattern.fullmatch(norm):
                    found = actual
                    break

        if found is None:
            raise KeyError(f"FIL{k}")
        resolved_fils.append(found)

    return resolved_x, resolved_y, resolved_pb, tuple(resolved_fils)


def process_csv(csv_path: str,
                x_col: str = "XPos",
                y_col: str = "YPos",
                pburn_col: str = "PBurn",
                fil_cols: Iterable[str] = ("FIL1", "FIL2", "FIL3", "FIL4", "FIL5", "FIL6"),
                midpoint_ft: np.ndarray = FL_MIDPOINTS_FT,
                drop_na_rows: bool = True) -> pd.DataFrame:
    """
    Read one FLP_English.csv and compute Conditional Flame Length (CFL) in feet.
    Robust to header whitespace and minor naming variants (e.g., ' FIL1', 'FIL 1', 'fl1').

    Parameters:
    - csv_path: path of the CSV to read.
    - x_col, y_col, pburn_col, fil_cols: preferred column names, passed to _resolve_columns.
    - midpoint_ft: one flame-length midpoint (feet) per resolved FIL column.
    - drop_na_rows: drop rows with a missing PBurn or FIL value before computing.

    Returns a DataFrame with columns: [XPos, YPos, PBurn, CFL_ft, mode]
      - mode is 'conditional', 'unconditional' or 'unknown'. 'unknown' marks rows where
        PBurn and the FIL sum are both ~0; such rows use the unconditional formula,
        which yields 0 when PBurn is 0.

    Raises ValueError if a required column cannot be resolved or midpoint_ft does not
    have one entry per FIL column.
    """
    # Use engine='python' to be tolerant; strip leading spaces after delimiter.
    df = pd.read_csv(csv_path, engine="python", skipinitialspace=True)

    # Show original columns for debugging
    print("\n[DEBUG] Raw columns:", list(df.columns))

    # Resolve the actual header spellings; the frame keeps its original column names.
    try:
        resolved_x, resolved_y, resolved_pb, resolved_fils = _resolve_columns(
            df, x_col=x_col, y_col=y_col, pburn_col=pburn_col, fil_cols=fil_cols
        )
    except KeyError as ke:
        # Provide a friendlier error that shows what's present and what's missing
        missing = str(ke).strip("'")
        raise ValueError(
            f"Missing required column '{missing}' in {os.path.basename(csv_path)}.\n"
            f"Seen columns: {list(df.columns)}\n"
            f"Tip: headers sometimes contain leading spaces — this function now tries to fix that. "
            f"If it still fails, please share a sample header line."
        ) from ke

    fil_names = list(resolved_fils)

    midpoints = np.asarray(midpoint_ft, dtype=np.float64)
    if midpoints.shape != (len(fil_names),):
        raise ValueError(
            f"midpoint_ft must have one value per FIL column "
            f"(expected {len(fil_names)}, got shape {midpoints.shape})"
        )

    if drop_na_rows:
        df = df.dropna(subset=[*fil_names, resolved_pb])

    # Coerce to numeric without assigning back into `df` (avoids chained-assignment
    # warnings on the dropna result); unparseable values become 0.0.
    fil_numeric = df[fil_names].apply(pd.to_numeric, errors="coerce").fillna(0.0)
    pburn_numeric = pd.to_numeric(df[resolved_pb], errors="coerce").fillna(0.0)

    fil_vals = fil_numeric.to_numpy(dtype=np.float64)   # (n, 6)
    pburn = pburn_numeric.to_numpy(dtype=np.float64)    # (n,)
    fil_sum = fil_vals.sum(axis=1)                      # (n,)

    # Two candidate CFLs: the expected midpoint as-is, or renormalised by PBurn
    cfl_if_conditional = fil_vals @ midpoints
    with np.errstate(divide='ignore', invalid='ignore'):
        cfl_if_unconditional = np.divide(cfl_if_conditional,
                                         pburn,
                                         out=np.zeros_like(fil_sum, dtype=np.float64),
                                         where=pburn > 0)

    # Vectorized form of the per-row heuristic in _auto_detect_conditional (default tol).
    # The previous loop built a pandas Series for every row, which made large grids
    # take very long to process.
    tol = 1e-3
    slack = max(tol, 0.05)
    dist_to_one = np.abs(fil_sum - 1.0)
    dist_to_pburn = np.abs(fil_sum - pburn)
    is_unknown = (pburn < tol) & (fil_sum < tol)
    is_conditional = ~is_unknown & (
        (dist_to_one <= slack)
        | ((dist_to_pburn > slack) & (dist_to_one < dist_to_pburn))
    )

    modes = np.full(len(df), "unconditional", dtype=object)
    modes[is_conditional] = "conditional"
    modes[is_unknown] = "unknown"

    # Anything not classified 'conditional' (including 'unknown') uses the unconditional formula
    cfl_result = np.where(is_conditional, cfl_if_conditional, cfl_if_unconditional)

    # Build output with canonical names (positional, so the result gets a fresh 0..n-1 index)
    out = pd.DataFrame({
        "XPos": df[resolved_x].values,
        "YPos": df[resolved_y].values,
        "PBurn": pburn_numeric.values,
        "CFL_ft": cfl_result,
        "mode": modes
    })

    return out


In [7]:
# Folder that is searched recursively for FLP_English.csv files (machine-specific path).
root_dir = r"C:\Users\bsf31\Documents\data\NL060\WFM Outputs\run_97th_percentiles"

# 1) Process all subfolders, just get DataFrames back
#    (write_geotiffs defaults to False, so no rasters are written here)
results = batch_process_folder(root_dir)

# Each key is a CSV path; each value is a DataFrame with columns: XPos, YPos, PBurn, CFL_ft, mode
# Peek at the first result; next(iter(...)) raises StopIteration if no CSV was processed.
df_one = next(iter(results.values()))
df_one.head()





[DEBUG] Raw columns: ['XPos', 'YPos', 'PBurn', 'FIL1', 'FIL2', 'FIL3', 'FIL4', 'FIL5', 'FIL6']

[DEBUG] Raw columns: ['XPos', 'YPos', 'PBurn', 'FIL1', 'FIL2', 'FIL3', 'FIL4', 'FIL5', 'FIL6']


KeyboardInterrupt: 

In [None]:
# Display the {csv_path: DataFrame} mapping returned by batch_process_folder
results

In [None]:
# 2) (Optional) Also write CFL rasters as GeoTIFFs
# This re-reads and re-processes every CSV, then writes one GeoTIFF per CSV into
# <root_dir>/_CFL_geotiffs (the default, because out_dir is not passed).
# Provide the EPSG code of the coordinate reference system that XPos/YPos are expressed in
# (e.g., 26911 for NAD83 / UTM zone 11N)
results = batch_process_folder(
    root_dir,
    write_geotiffs=True,
    crs_epsg=26911  # <- change if needed, or None to write without CRS
)

In [3]:
# Build a small FLP-style table with one row per interpretation case, then display it
df_example = pd.DataFrame(
    [
        # FIL values sum to ~1.0 -> conditional-style row (PBurn is the probability of burning)
        (100.0, 200.0, 0.75, 0.10, 0.20, 0.30, 0.20, 0.15, 0.05),
        # FIL values sum to ~PBurn -> unconditional-style row
        (110.0, 200.0, 0.60, 0.06, 0.12, 0.18, 0.12, 0.09, 0.03),
        # nothing burns here: PBurn and every FIL value are zero
        (100.0, 210.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
        # a second conditional-style row
        (110.0, 210.0, 0.90, 0.05, 0.15, 0.25, 0.25, 0.15, 0.15),
    ],
    columns=["XPos", "YPos", "PBurn", "FIL1", "FIL2", "FIL3", "FIL4", "FIL5", "FIL6"],
)

# print the table followed by its column names
print(df_example)
print("\nColumns:", df_example.columns.tolist())

    XPos   YPos  PBurn  FIL1  FIL2  FIL3  FIL4  FIL5  FIL6
0  100.0  200.0   0.75  0.10  0.20  0.30  0.20  0.15  0.05
1  110.0  200.0   0.60  0.06  0.12  0.18  0.12  0.09  0.03
2  100.0  210.0   0.00  0.00  0.00  0.00  0.00  0.00  0.00
3  110.0  210.0   0.90  0.05  0.15  0.25  0.25  0.15  0.15

Columns: ['XPos', 'YPos', 'PBurn', 'FIL1', 'FIL2', 'FIL3', 'FIL4', 'FIL5', 'FIL6']
