<a href="https://colab.research.google.com/github/eth0-02/Astro-Theme-Creek/blob/master/SP_FINAL_SCRIPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# === Colab: Source CSVs -> LONG outputs (now with MonthNumber + MonthName) ===
!pip -q install pandas numpy

import pandas as pd, numpy as np, re, os
from calendar import month_abbr
from pathlib import Path
from google.colab import files

# Folder that receives every generated CSV. It is created up front
# (including any missing parents) and reused if it already exists.
OUT_DIR = Path("outputs_bi_clean")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- helpers ----------
def _find_timestep_cols(df, base):
    """Return the columns named ``<base>_<number>``, ordered by that number.

    Matching is case-insensitive and anchored at both ends of the label, so
    for base ``"H"`` the label ``h_3`` matches while ``PH_3`` and ``H_3b``
    do not.

    Args:
        df: DataFrame whose column labels are scanned.
        base: Literal prefix of the series (e.g. ``"H"`` or ``"overflow"``).

    Returns:
        The matching column labels, unchanged, sorted ascending by their
        numeric suffix (numerically, so ``H_2`` comes before ``H_10``).
    """
    # re.escape keeps a prefix that contains regex metacharacters literal.
    pat = re.compile(fr"^{re.escape(base)}_(\d+)$", flags=re.IGNORECASE)
    found = []
    for c in df.columns:
        # Non-string labels (e.g. integer headers) can never be timestep
        # columns, and passing them to pat.match would raise TypeError.
        if not isinstance(c, str):
            continue
        m = pat.match(c)
        if m:
            found.append((int(m.group(1)), c))
    found.sort(key=lambda x: x[0])
    return [c for _, c in found]

def _month_means(row, cols):
    """Average one row's timestep values per month slot (1..12).

    The value at position ``i`` of ``cols`` is assigned to month
    ``(i % 12) + 1``; the assignment is purely positional, so the first
    column in ``cols`` is always treated as month 1. Values that cannot be
    parsed as numbers become NaN and are ignored by the mean.

    Args:
        row: Series (one DataFrame row) indexed by column label.
        cols: Ordered list of timestep column labels to read from ``row``.

    Returns:
        Series indexed 1..12 holding the mean per month slot; NaN where a
        slot has no usable value (all slots are NaN when ``cols`` is empty).
    """
    all_months = range(1, 13)
    if cols:
        values = pd.to_numeric(row[cols], errors="coerce").to_numpy(dtype="float64")
        slot = (np.arange(values.size) % 12) + 1
        table = pd.DataFrame({"m": slot, "v": values})
        per_month = table.groupby("m")["v"].mean()
        return per_month.reindex(all_months)
    return pd.Series([np.nan] * 12, index=all_months)

def _non_timestep_cols(df):
    """List the columns that should be carried through to the long output.

    Excluded are the raw per-timestep columns (``H_<n>``, ``overflow_<n>``,
    ``R_<n>``, ``balance_<n>``, case-insensitive) and any source column named
    ``MonthNumber`` / ``MonthofYear`` (case-insensitive), since the transform
    adds its own ``MonthNumber``.

    Returns:
        Column labels of ``df`` in their original order, minus the exclusions.
    """
    step_pat = re.compile(r"^(H|overflow|R|balance)_\d+$", flags=re.IGNORECASE)
    source_month_names = {"monthnumber", "monthofyear"}

    kept = []
    for col in df.columns:
        if step_pat.match(col):
            continue
        if col.lower() in source_month_names:
            continue
        kept.append(col)
    return kept

def _choose_level(df, fname):
    """Classify a source file as Sub-Basin, Basin, County or Generic.

    Both the column names and the file name are compared case-insensitively.
    The rules below are tried in order and the first one that holds wins, so
    sub-basin evidence outranks basin evidence, which outranks county
    evidence.

    Args:
        df: The loaded source DataFrame (only its column names are used).
        fname: The source file name.

    Returns:
        One of ``"Sub-Basin"``, ``"Basin"``, ``"County"`` or ``"Generic"``.
    """
    col_names = {c.lower() for c in df.columns}
    file_name = fname.lower()

    # (label, condition) pairs in priority order. A file name containing
    # "sub-basin" or "sub_basin" necessarily contains both "sub" and "basin",
    # and by the time the basin_hybas_id rule is reached sub_hybas_id is
    # known to be absent, so neither needs a separate check.
    rules = (
        ("Sub-Basin", "sub_hybas_id" in col_names),
        ("Sub-Basin", "sub" in file_name and "basin" in file_name),
        ("Basin", "basin_name" in col_names),
        ("Basin", "basin_hybas_id" in col_names),
        ("Basin", "basin" in file_name),
        (
            "County",
            "county" in col_names or "county_name" in col_names or "county" in file_name,
        ),
    )
    for label, holds in rules:
        if holds:
            return label
    return "Generic"

def _unify_fid_keep(df):
    """Collapse a ``FID`` / ``fid`` column pair into a single column.

    When both spellings are present, the one that appears first in the frame
    is kept (in its original position); its missing values are filled from
    the other column, which is then dropped. When only one spelling (or
    neither) is present the frame is returned unchanged.

    The input frame is never modified: the merged result is built on a new
    DataFrame, so callers that keep a reference to ``df`` still see their
    original data.

    Args:
        df: DataFrame that may contain ``FID`` and/or ``fid`` columns.

    Returns:
        A new DataFrame with the pair merged, or ``df`` itself when there is
        nothing to merge.
    """
    if "FID" in df.columns and "fid" in df.columns:
        names = list(df.columns)
        first_is_FID = names.index("FID") < names.index("fid")
        tgt, src = ("FID", "fid") if first_is_FID else ("fid", "FID")
        merged = df[tgt].where(pd.notna(df[tgt]), df[src])
        # assign() returns a new frame (keeping tgt's column position) instead
        # of writing into the caller's object.
        return df.assign(**{tgt: merged}).drop(columns=[src])
    return df

def _id_cols_present(df):
    """Return the identifier columns that exist in ``df``.

    A column counts as an identifier when its lower-cased name is one of
    ``hybas_id``, ``sub_hybas_id``, ``basin_hybas_id``, ``fid`` or
    ``adm0_code``. Labels are returned with their original casing and in
    frame order.
    """
    known_ids = {"hybas_id", "sub_hybas_id", "basin_hybas_id", "fid", "adm0_code"}
    present = []
    for col in df.columns:
        if col.lower() in known_ids:
            present.append(col)
    return present

def _metric_fill_spaces_to_zero(df, id_cols):
    """Coerce metric columns to numbers and replace blanks/NaN with 0.

    Columns are skipped (left untouched) when they are:
      * listed in ``id_cols``, or one of ``MonthName``, ``MonthNumber``,
        ``Level``, ``geometry``;
      * named like text/geometry (name contains ``"name"`` or ``"geom"``,
        case-insensitive);
      * text columns with no numeric content at all, whether stored as
        ``object`` or as a pandas string dtype.

    Every other column is parsed with ``pd.to_numeric(errors="coerce")`` after
    stripping whitespace; anything unparseable (including empty strings,
    ``"None"`` and ``"nan"``) becomes 0.

    Args:
        df: Frame to clean. It is modified in place.
        id_cols: Column labels that must never be converted.

    Returns:
        The same ``df`` object, for call chaining.
    """
    never = set(id_cols) | {"MonthName", "MonthNumber", "Level", "geometry"}

    def is_text(c):
        return ("name" in c.lower()) or ("geom" in c.lower())

    def holds_text(series):
        # Text may be stored as object or as a pandas string extension dtype;
        # checking only `object` would let string-dtype text columns fall
        # through and be overwritten with zeros.
        return series.dtype == object or isinstance(series.dtype, pd.StringDtype)

    for c in df.columns:
        if c in never or is_text(c):
            continue
        s2 = pd.to_numeric(
            df[c].astype(str).str.strip().replace({"": np.nan, "None": np.nan, "nan": np.nan}, regex=False),
            errors="coerce"
        )
        # Nothing numeric in a text column: treat it as free text, not a metric.
        if s2.notna().sum() == 0 and holds_text(df[c]):
            continue
        df[c] = s2.fillna(0)
    return df

def _drop_dup_names(df):
    """Return ``df`` keeping only the first column for each repeated label."""
    is_repeat = pd.Index(df.columns).duplicated(keep="first")
    return df.loc[:, ~is_repeat]

def _unique_outpath(base_dir: Path, base_name: str) -> Path:
    """Pick a path in ``base_dir`` that does not exist yet.

    ``base_name`` is used as-is when it is free. Otherwise a counter is
    inserted before the extension -- ``"name (2).ext"``, ``"name (3).ext"``,
    ... -- and the first unused candidate is returned. Nothing is created on
    disk.
    """
    candidate = base_dir / base_name
    if candidate.exists():
        stem, ext = os.path.splitext(base_name)
        n = 2
        candidate = base_dir / f"{stem} ({n}){ext}"
        while candidate.exists():
            n += 1
            candidate = base_dir / f"{stem} ({n}){ext}"
    return candidate

# ---------- core transform ----------
def transform_source_to_long(upload_name: str) -> Path:
    """Convert one wide source CSV into a 12-rows-per-feature monthly CSV.

    Steps:
      1. Read the CSV and merge any ``FID``/``fid`` column pair.
      2. Locate the ``H_<n>``, ``overflow_<n>``, ``R_<n>`` and ``balance_<n>``
         timestep columns and average each series per month slot (timesteps
         are mapped to months by position, 12 per cycle).
      3. Multiply the overflow, R and balance means by the row's ``SA`` value
         to produce the ``(M3)`` columns; ``H (m)`` is written unscaled.
         (``SA`` is presumably a surface area -- confirm against the data
         dictionary.) A missing or non-numeric ``SA`` yields NaN products,
         which the fill step later turns into 0.
      4. Emit one record per source row and month, carrying along every
         non-timestep column.
      5. Drop ``BASIN_HYBAS_ID`` for County/Basin files, zero-fill metric
         columns, write the CSV into ``OUT_DIR`` under a non-clashing name
         and trigger a Colab browser download.

    Args:
        upload_name: Path of the uploaded source CSV.

    Returns:
        Path of the CSV that was written.
    """
    src = Path(upload_name)
    base = pd.read_csv(src)
    base = _unify_fid_keep(base)

    # timestep detection and years
    Hc = _find_timestep_cols(base, "H")
    Oc = _find_timestep_cols(base, "overflow")
    Rc = _find_timestep_cols(base, "R")
    Bc = _find_timestep_cols(base, "balance")
    steps = max(len(Hc), len(Oc), len(Rc), len(Bc))
    # Whole years covered by the longest series (a partial trailing year is
    # not counted here, although its months still enter the means).
    years = steps // 12

    level = _choose_level(base, src.name)
    passthrough = _non_timestep_cols(base)

    rows = []
    for _, r in base.iterrows():
        Hm = _month_means(r, Hc)  # m
        Om = _month_means(r, Oc)  # m
        Rm = _month_means(r, Rc)  # m
        Bm = _month_means(r, Bc)  # m

        # r.get returns None when the column is absent (TypeError in float)
        # and unparseable text raises ValueError; both mean "no usable SA".
        try:
            SA_val = float(r.get("SA"))
        except (TypeError, ValueError):
            SA_val = np.nan

        Om3 = Om * SA_val
        Rm3 = Rm * SA_val
        Bm3 = Bm * SA_val

        for m in range(1, 13):
            rec = {
                "Level": level,
                "YearsAveraged": years,
                "MonthNumber": m,
                "MonthName": month_abbr[m],
                "H (m)": Hm.loc[m],
                "R (M3)": Rm3.loc[m],
                "balance (M3)": Bm3.loc[m],
                "overflow (M3)": Om3.loc[m],
            }
            # Passthrough columns are written last, after the computed ones.
            for c in passthrough:
                rec[c] = r.get(c, np.nan)
            rows.append(rec)

    df = pd.DataFrame.from_records(rows)
    df = _drop_dup_names(df)
    df = _unify_fid_keep(df)

    # Enforce BASIN_HYBAS_ID rule
    if level in {"County", "Basin"}:
        drop = [c for c in df.columns if c.lower() == "basin_hybas_id"]
        if drop:
            df = df.drop(columns=drop)

    # Fill metrics only (IDs/text untouched)
    ids = _id_cols_present(df)
    df = _metric_fill_spaces_to_zero(df, ids)

    # Output name
    if level == "County":
        out_path = _unique_outpath(OUT_DIR, "County Output BI.csv")
    elif level == "Basin":
        out_path = _unique_outpath(OUT_DIR, "Basin Output BI.csv")
    elif level == "Sub-Basin":
        out_path = _unique_outpath(OUT_DIR, "Sub-Basin Output BI.csv")
    else:
        out_path = _unique_outpath(OUT_DIR, f"{src.stem}__Generic Output BI.csv")

    df.to_csv(out_path, index=False)
    files.download(str(out_path))
    print(f"[{src.name}] → {out_path.name} | Level: {level} | YearsAveraged: {years}")
    return out_path

# ---------- run ----------
# Prompt for the source files, then convert each uploaded file in turn.
print("Upload one or more SOURCE CSVs (e.g., Kenbasin_hydrosheds.csv, Kencounty_hydrosheds.csv, sub_basins_with_basin_attributes.csv)")
uploaded = files.upload()
# Iterating the upload result yields the uploaded file names (its keys).
for name in uploaded:
    transform_source_to_long(name)
print("Done.")


Upload one or more SOURCE CSVs (e.g., Kenbasin_hydrosheds.csv, Kencounty_hydrosheds.csv, sub_basins_with_basin_attributes.csv)


Saving Kenbasin_hydrosheds.csv to Kenbasin_hydrosheds.csv
Saving Kencounty_hydrosheds.csv to Kencounty_hydrosheds.csv
Saving sub_basins_with_basin_attributes.csv to sub_basins_with_basin_attributes.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[Kenbasin_hydrosheds.csv] → Basin Output BI.csv | Level: Basin | YearsAveraged: 4


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[Kencounty_hydrosheds.csv] → County Output BI.csv | Level: County | YearsAveraged: 4


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[sub_basins_with_basin_attributes.csv] → Sub-Basin Output BI.csv | Level: Sub-Basin | YearsAveraged: 4
Done.
