<a href="https://colab.research.google.com/github/eth0-02/Astro-Theme-Creek/blob/master/Storage_potential_calc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === Colab: ONE output per file (LONG format; IDs safe; blanks->0; single FID) ===
!pip -q install pandas numpy

import pandas as pd, numpy as np, re
from calendar import month_abbr
from pathlib import Path
from google.colab import files

# Root folder for every generated CSV; created at import time so later writes cannot fail on a missing directory.
OUT_ROOT = Path("outputs_csv_long_one_per_file"); OUT_ROOT.mkdir(exist_ok=True, parents=True)

# ---------- helpers ----------
def _month_avgs_any(row: pd.Series, base: str) -> pd.Series:
    """Return the Jan–Dec mean of the ``base_<n>`` timestep values in *row*.

    Columns are matched as ``<base>_<digits>``; N can be 12, 24, 36, 48, ...
    Each value is assigned to a calendar month from its numeric suffix
    (``base_1`` and ``base_13`` are both January), so a missing timestep
    column no longer shifts the following values into the wrong month.
    A series that contains a ``base_0`` column is treated as 0-based.

    Args:
        row: One record of the wide table, indexed by column name.
        base: Column prefix without the trailing underscore, e.g. ``"H"``.

    Returns:
        A float Series indexed 1..12. Months with no usable value are NaN;
        if no matching column exists all twelve entries are NaN.
    """
    # Escape the prefix so a base containing regex metacharacters matches literally.
    pattern = re.compile(fr"{re.escape(base)}_(\d+)")
    pairs = []
    for key in row.index:
        if not isinstance(key, str):
            continue  # non-string labels can never be timestep columns
        m = pattern.fullmatch(key)
        if m is None:
            continue
        try:
            val = float(row[key])
        except (TypeError, ValueError, OverflowError):
            # Blank / textual cells count as missing rather than aborting the row.
            val = float("nan")
        pairs.append((int(m.group(1)), val))
    if not pairs:
        return pd.Series([float("nan")] * 12, index=range(1, 13))
    # Keep contiguous 0-based exports (base_0..base_11) mapping base_0 -> January.
    origin = 0 if min(idx for idx, _ in pairs) == 0 else 1
    months = [(idx - origin) % 12 + 1 for idx, _ in pairs]
    vals = np.array([v for _, v in pairs], dtype="float64")
    tmp = pd.DataFrame({"month": months, "val": vals})
    # groupby().mean() skips NaN; reindex guarantees all twelve months are present.
    return tmp.groupby("month")["val"].mean().reindex(range(1, 13))

def _coerce_float(x):
    """Convert *x* to ``float``, returning NaN when it cannot be converted.

    Only the errors ``float()`` raises for bad input are swallowed, so
    KeyboardInterrupt / SystemExit still propagate.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return float("nan")

def _non_timestep_cols(df: pd.DataFrame):
    """List the columns of *df*, in order, that are not per-timestep series.

    A per-timestep column is one named exactly H_#, overflow_#, R_# or balance_#.
    """
    timestep = re.compile(r"(H|overflow|R|balance)_\d+")
    kept = []
    for col in df.columns:
        if timestep.fullmatch(col) is None:
            kept.append(col)
    return kept

def _unify_fid_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with at most one feature-id column, named 'FID'.

    - both 'FID' & 'fid' -> FID nulls are filled from 'fid', then 'fid' is dropped
    - only 'fid'         -> renamed to 'FID'
    - otherwise          -> *df* is returned unchanged

    The input frame is never modified; when a change is needed a new frame
    is returned, so callers must use the return value.
    """
    has_upper = "FID" in df.columns
    has_lower = "fid" in df.columns
    if has_upper and has_lower:
        merged = df["FID"].where(df["FID"].notna(), df["fid"])
        # assign() keeps FID in its original position and leaves the caller's frame intact.
        return df.assign(FID=merged).drop(columns=["fid"])
    if has_lower:
        return df.rename(columns={"fid": "FID"})
    return df

def _id_columns_present(df: pd.DataFrame):
    """Return the known identifier columns found in *df* (these are never coerced/filled).

    The result follows the fixed candidate order, not the frame's column order.
    """
    candidates = ("HYBAS_ID", "SUB_HYBAS_ID", "BASIN_HYBAS_ID", "FID")
    return [name for name in candidates if name in df.columns]

def _metric_fill_spaces_and_nans_to_zero(df: pd.DataFrame, id_cols: list):
    """Coerce metric columns to numeric and fill blanks/space-only/NaN with 0.

    Skipped entirely: the columns in *id_cols*, a fixed set of label columns,
    and any column whose name contains "name" or "geom".

    For the remaining columns every cell is parsed as a number; cells that do
    not parse become 0. A column in which nothing parses is left untouched
    unless it already had a (non-boolean) numeric dtype, so text, boolean and
    datetime columns are not wiped to zeros regardless of how pandas stores
    them (``object``, ``string``, ...).

    NOTE(review): a column mixing numbers and text still has its text cells
    replaced by 0 — confirm that is intended for every input file.

    Args:
        df: Frame to clean; modified in place.
        id_cols: Column names that must never be coerced or filled.

    Returns:
        The same *df* object, for chaining.
    """
    never = set(id_cols) | {"MonthName", "County", "BASIN_NAME", "BASIN_NAME_JOIN", "geometry"}

    def is_text_col(c):
        lowered = str(c).lower()
        return ("name" in lowered) or ("geom" in lowered)

    for c in df.columns:
        if c in never or is_text_col(c):
            continue
        s_orig = df[c]
        s_num = pd.to_numeric(
            s_orig.astype(str).str.strip().replace({"": np.nan, "None": np.nan, "nan": np.nan}, regex=False),
            errors="coerce",
        )
        # bool counts as "numeric" for pandas but its string form never parses,
        # so treat it like text to avoid replacing flags with zeros.
        is_numeric_input = (
            pd.api.types.is_numeric_dtype(s_orig)
            and not pd.api.types.is_bool_dtype(s_orig)
        )
        # Nothing parsed and the column was not numeric to begin with: leave as-is.
        if not s_num.notna().any() and not is_numeric_input:
            continue
        df[c] = s_num.fillna(0)
    return df

def _standardize_columns(base: pd.DataFrame) -> pd.DataFrame:
    """Rename legacy headers to the canonical ones, using only this file's own columns.

    COUNTY_NAME -> County and NP_per_km2 -> ponds_per_sqkm, each applied only
    when the canonical column is not already present. With nothing to rename
    the input frame itself is returned.
    """
    existing = base.columns
    renames = {}
    for legacy, canonical in (("COUNTY_NAME", "County"), ("NP_per_km2", "ponds_per_sqkm")):
        if legacy in existing and canonical not in existing:
            renames[legacy] = canonical
    if not renames:
        return base
    return base.rename(columns=renames)

def _ensure_area_km2(row):
    """Return the row's area in km², falling back to ``area_m2 / 1e6``.

    ``area_km2`` is used whenever it parses as a number. If it is missing,
    NaN, or unparseable (e.g. a whitespace-only cell, which is not NaN but
    also not a number), ``area_m2`` is converted instead. The result is NaN
    when neither value is usable.

    Args:
        row: Mapping-like record supporting ``.get`` (a pandas Series or dict).
    """
    km2 = _coerce_float(row.get("area_km2"))
    if pd.notna(km2):
        return km2
    # NaN propagates through the division, so a missing area_m2 yields NaN.
    return _coerce_float(row.get("area_m2")) / 1e6

def _choose_level(df: pd.DataFrame) -> str:
    """Classify a file as SUBBASIN, BASIN, COUNTY or GENERIC from its columns.

    Rules are tried from most specific to most general; the first rule with
    any marker column present wins, and GENERIC is the fallback.
    """
    cols = df.columns
    rules = (
        ("SUBBASIN", ("SUB_HYBAS_ID",)),
        ("BASIN", ("BASIN_NAME",)),
        ("COUNTY", ("County", "COUNTY_NAME")),
    )
    for level, markers in rules:
        if any(marker in cols for marker in markers):
            return level
    return "GENERIC"

# ---------- processor ----------
def process_file(upload_name: str):
    """Convert one wide CSV into a single long-format monthly-means CSV.

    Steps:
      1. Read the CSV, standardize header names and unify the FID column.
      2. Pick ONE output level (SUBBASIN / BASIN / COUNTY / GENERIC) from the columns.
      3. For every input row, average the H_#, overflow_#, R_# and balance_#
         series per calendar month and emit 12 records (one per month).
         overflow, R and balance are multiplied by the row's SA.
      4. Carry every non-timestep column of the input through unchanged.
      5. Fill blank/NaN metric values with 0 (ID and text columns excluded),
         order the leading columns for the level, write the CSV under
         OUT_ROOT/<file stem>/ and trigger a browser download.

    Args:
        upload_name: Path of the uploaded CSV.

    Returns:
        None. The result is written to disk and offered as a download.
    """
    stem = Path(upload_name).stem
    out_dir = OUT_ROOT / stem
    out_dir.mkdir(exist_ok=True, parents=True)

    base = pd.read_csv(upload_name)
    base = _standardize_columns(base)
    base = _unify_fid_cols(base)

    level = _choose_level(base)            # ONE output level per file
    passthrough_cols = _non_timestep_cols(base)

    long_rows = []
    for _, r in base.iterrows():
        H_m        = _month_avgs_any(r, "H")
        overflow_m = _month_avgs_any(r, "overflow")
        R_m        = _month_avgs_any(r, "R")
        balance_m  = _month_avgs_any(r, "balance")

        SA = _coerce_float(r.get("SA", np.nan))
        area_km2 = _ensure_area_km2(r)

        # Scale the per-month means by SA; the output columns carry the _m3 suffix.
        overflow_m3 = overflow_m * SA
        R_m3        = R_m * SA
        balance_m3  = balance_m * SA

        for m in range(1, 13):
            rec = {
                "MonthofYear": m,
                "MonthName": month_abbr[m],
                "H": H_m.loc[m],                    # m
                "R_m3": R_m3.loc[m],                # m³
                "balance_m3": balance_m3.loc[m],    # m³
                "overflow_m3": overflow_m3.loc[m],  # m³
                "Area_km2": area_km2,
                "Mean_Pond_Depth": r.get("D", np.nan),
                "D_max": r.get("D_max", np.nan),
                "SA": SA,
                "ponds_per_sqkm": r.get("ponds_per_sqkm", r.get("NP_per_km2", np.nan)),
            }
            # Keep ALL non-timestep columns from this file (includes IDs).
            # A passthrough column sharing a name with a key above overwrites it.
            for c in passthrough_cols:
                rec[c] = r.get(c, np.nan)
            long_rows.append(rec)

    df = pd.DataFrame.from_records(long_rows)
    df = _unify_fid_cols(df)    # single FID in the final output
    id_cols = _id_columns_present(df)

    # Include/exclude rules by level: BASIN_HYBAS_ID is dropped at county &
    # basin level (SUBBASIN keeps it; GENERIC is left untouched).
    if level in {"COUNTY", "BASIN"} and "BASIN_HYBAS_ID" in df.columns:
        df = df.drop(columns=["BASIN_HYBAS_ID"])

    # Fill metrics only
    df = _metric_fill_spaces_and_nans_to_zero(df, id_cols=id_cols)

    # Front column order per level (others appended to keep everything supplied).
    front_by_level = {
        "COUNTY": [
            "HYBAS_ID", "County", "MonthofYear", "MonthName", "Area_km2",
            "ponds_per_sqkm", "D_max", "H", "R_m3", "balance_m3", "overflow_m3",
            "Mean_Pond_Depth", "SA", "FID",
        ],
        "BASIN": [
            "HYBAS_ID", "BASIN_NAME", "MonthofYear", "MonthName", "Area_km2",
            "ponds_per_sqkm", "D_max", "H", "R_m3", "balance_m3", "overflow_m3",
            "Mean_Pond_Depth", "SA", "FID",
        ],
        "SUBBASIN": [
            "SUB_HYBAS_ID", "BASIN_HYBAS_ID", "BASIN_NAME", "HYBAS_ID",
            "MonthofYear", "MonthName", "Area_km2", "ponds_per_sqkm", "D_max", "H",
            "R_m3", "balance_m3", "overflow_m3", "Mean_Pond_Depth", "SA", "FID",
        ],
    }
    generic_front = [
        "HYBAS_ID", "MonthofYear", "MonthName", "Area_km2",
        "D_max", "H", "R_m3", "balance_m3", "overflow_m3", "Mean_Pond_Depth", "SA", "FID",
    ]
    front = [c for c in front_by_level.get(level, generic_front) if c in df.columns]

    df = df[front + [c for c in df.columns if c not in front]]

    out_path = out_dir / f"{stem}__{level}_monthly_means_LONG.csv"
    df.to_csv(out_path, index=False)
    files.download(str(out_path))
    print(f"[{stem}] Wrote ONE output:", out_path.name, "| Level:", level)

# ---------- run ----------
# Ask for the input CSVs, then convert each uploaded file in turn.
print("Upload one or more CSVs (e.g., 'Kenbasin_hydrosheds.csv', 'Kencounty_hydrosheds.csv', 'sub_basins_with_basin_attributes.csv')")
uploaded = files.upload()
for name in uploaded:
    # Each key is an uploaded file name; process_file reads it from disk.
    process_file(name)

print("Done. You should have exactly one CSV per uploaded file.")


Upload one or more CSVs (e.g., 'Kenbasin_hydrosheds.csv', 'Kencounty_hydrosheds.csv', 'sub_basins_with_basin_attributes.csv')


Saving Kenbasin_hydrosheds.csv to Kenbasin_hydrosheds.csv
Saving Kencounty_hydrosheds.csv to Kencounty_hydrosheds.csv
Saving sub_basins_with_basin_attributes.csv to sub_basins_with_basin_attributes.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[Kenbasin_hydrosheds] Wrote ONE output: Kenbasin_hydrosheds__GENERIC_monthly_means_LONG.csv | Level: GENERIC


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[Kencounty_hydrosheds] Wrote ONE output: Kencounty_hydrosheds__COUNTY_monthly_means_LONG.csv | Level: COUNTY


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[sub_basins_with_basin_attributes] Wrote ONE output: sub_basins_with_basin_attributes__SUBBASIN_monthly_means_LONG.csv | Level: SUBBASIN
Done. You should have exactly one CSV per uploaded file.
