<a href="https://colab.research.google.com/github/eth0-02/Astro-Theme-Creek/blob/master/WA_sheets_extractor_and_combiner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ==============================
# Accurate Water Accounting Extractor
# (Colab Ready with Sanity Check)
# ==============================

!pip install pdfplumber

import pdfplumber
from google.colab import files
import os, re, csv

# --------------------------
# Step 1: Parameter Template
# --------------------------
# Land-use classes shared by the "Exploitable Water" and "Utilized Flow"
# parameter families; each family lists the classes in this same order.
_LAND_USE_CLASSES = (
    "Protected Land Use",
    "Utilized Land Use",
    "Modified Land Use",
    "Managed Water Use",
)

# Ordered parameter labels. Extracted values are paired with these labels
# positionally (via zip), so the order here decides which label each value
# receives. The trailing "NaN" entries are placeholder labels that pad the
# template to 32 entries.
PARAMETER_TEMPLATE = [
    "Sheet Number",
    "P",
    "Gross Inflow",
    "Net Inflow",
    "$Q_{sw}ir$",
    "$Q_{desal}$",
    "Landscape ET",
    "Rainfall ET",
    "Incremental ET",
    "ET",
    "Available Water",
    "+AS",
    "-AS",
    "Consumed water (Total)",
    "Depleted water (Total)",
    *[f"Exploitable Water: {land_use}" for land_use in _LAND_USE_CLASSES],
    *[f"Utilized Flow: {land_use}" for land_use in _LAND_USE_CLASSES],
    "$Q_{SW}$ outlet",
    "Utilizable outflow (Component 1)",
    "Utilizable outflow (Component 2)",
    "Non-recoverable flow",
    "Non-utilizable outflow",
    "Reserved outflow",
    *["NaN"] * 3,
]

# Per-PDF output name; filled in with the lower-cased county and the year.
FILENAME_PATTERN = "{county}_water_accounting_{year}.csv"
# Name of the combined file that gathers the rows of every per-year CSV.
MASTER_FILENAME = "combined_water_accounting_all_years.csv"

# --------------------------
# Step 2: Upload PDFs
# --------------------------
# Prompt the user, then open the Colab upload widget.
# The keys of `uploaded` are used further down as the PDF paths to process.
# NOTE(review): presumably the values are the raw file bytes and the files are
# also saved into the working directory — confirm against the Colab docs.
print("📂 Please upload your water accounting PDF files...")
uploaded = files.upload()

# --------------------------
# Step 3: Helper functions
# --------------------------
def extract_text(pdf_path):
    """Return the text of every page of the PDF, joined by newlines.

    A page for which no text is extracted contributes an empty string, so
    the result always has one segment per page.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)

def extract_numbers(pdf_path, max_values=32):
    """Collect numeric tokens from a PDF: tables first, page text as fallback.

    Every non-empty table cell on every page is scanned for integers and
    decimals. If that yields fewer than ``max_values`` tokens, the plain text
    of *every* page is scanned as well and its tokens are appended.

    Args:
        pdf_path: Path of the PDF to read.
        max_values: Maximum number of tokens to return; also the threshold
            below which the text fallback is triggered. Defaults to 32.

    Returns:
        A list of at most ``max_values`` numeric strings (not converted to
        float), in the order they were found. Empty for a PDF without pages.
    """
    # Unsigned integers or decimals only; signs and thousands separators are
    # not part of a match.
    number_pattern = re.compile(r"\d+\.\d+|\d+")
    numbers = []
    with pdfplumber.open(pdf_path) as pdf:
        # Pass 1: structured extraction from tables on all pages.
        for page in pdf.pages:
            for table in page.extract_tables():
                for row in table:
                    for cell in row:
                        if cell:
                            numbers.extend(number_pattern.findall(cell))

        # Pass 2: text fallback. This iterates the pages explicitly instead of
        # reusing the loop variable left over from pass 1, which covered only
        # the last page and raised NameError when the PDF had no pages.
        # NOTE(review): text tokens are appended to any partial table result,
        # as before; confirm whether a partial table result should instead be
        # discarded when the fallback kicks in.
        if len(numbers) < max_values:
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                numbers.extend(number_pattern.findall(page_text))
    return numbers[:max_values]

def parse_county_year(text):
    """Pull the county and the year out of the sheet text.

    The county is the run of ASCII letters following ``Basin:`` and the year
    is the four digits following ``Period:`` (e.g. "Basin: Embu" /
    "Period: 2012"). Either field falls back to "Unknown" when its label is
    not found. Returns a ``(county, year)`` tuple of strings.
    """
    field_patterns = (
        r"Basin:\s*([A-Za-z]+)",
        r"Period:\s*(\d{4})",
    )
    fields = []
    for pattern in field_patterns:
        hit = re.search(pattern, text)
        if hit:
            fields.append(hit.group(1))
        else:
            fields.append("Unknown")
    return tuple(fields)

def write_year_csv(county, year, parameters, values):
    """Save one county/year as a long-format CSV and return its filename.

    The file name comes from FILENAME_PATTERN with the county lower-cased.
    After the header, one row is written per (parameter, value) pair;
    pairing stops at the shorter of the two sequences.
    """
    out_name = FILENAME_PATTERN.format(county=county.lower(), year=year)
    with open(out_name, mode="w", newline="", encoding="utf-8") as handle:
        sheet = csv.writer(handle)
        sheet.writerow(["County", "Year", "Parameters", "Values"])
        sheet.writerows(
            [county, year, label, value]
            for label, value in zip(parameters, values)
        )
    return out_name

def combine_csvs(file_list):
    """Combine all per-year CSVs into one master file.

    Files are processed in sorted path order. The header row of the first
    non-empty file becomes the master header; the header row of every later
    file is dropped and only its data rows are copied.

    Args:
        file_list: Iterable of CSV paths to merge. A path listed more than
            once is merged only once, so its rows are not duplicated.

    Returns:
        MASTER_FILENAME, the path of the combined file that was written.
    """
    header_written = False
    with open(MASTER_FILENAME, mode="w", newline="", encoding="utf-8") as out_f:
        writer = csv.writer(out_f)
        # set() drops repeated paths; sorted() keeps the output order stable.
        for fp in sorted(set(file_list)):
            with open(fp, mode="r", newline="", encoding="utf-8") as in_f:
                reader = csv.reader(in_f)
                # A default avoids StopIteration on a completely empty file.
                header = next(reader, None)
                if header is None:
                    continue
                if not header_written:
                    writer.writerow(header)
                    header_written = True
                writer.writerows(reader)
    return MASTER_FILENAME

# --------------------------
# Step 4: Process PDFs
# --------------------------
produced_files = []
for pdf_name in uploaded:
    print(f"\n🔎 Processing {pdf_name}...")
    text = extract_text(pdf_name)
    county, year = parse_county_year(text)
    values = extract_numbers(pdf_name)

    # Skip PDFs that did not yield one value per template entry; writing a
    # short list would leave the later parameters without values.
    if len(values) < len(PARAMETER_TEMPLATE):
        print(f"⚠️ Warning: Only {len(values)} values found in {pdf_name} (expected {len(PARAMETER_TEMPLATE)}).")
        continue

    # Sanity Check Preview
    print(f"📊 Preview for {county} {year}:")
    for p, v in zip(PARAMETER_TEMPLATE, values):
        print(f"  {p:<40} {v}")
    print("-" * 50)

    # Write CSV
    csv_file = write_year_csv(county, year, PARAMETER_TEMPLATE, values)
    if csv_file in produced_files:
        # Two PDFs resolved to the same county/year (e.g. both "Unknown"):
        # the earlier CSV has just been overwritten. Record the name only
        # once so the combined file does not contain the same rows twice.
        print(f"⚠️ Warning: {csv_file} was overwritten by {pdf_name} (same county/year as an earlier PDF).")
    else:
        produced_files.append(csv_file)
    print(f"✅ Saved {csv_file}")

# --------------------------
# Step 5: Combine into master
# --------------------------
if produced_files:
    master_file = combine_csvs(produced_files)
    print(f"\n✅ Combined master file created: {master_file}")
else:
    print("⚠️ No CSVs created. Please check parsing logic.")


📂 Please upload your water accounting PDF files...




Saving sheet1_2012.pdf to sheet1_2012 (1).pdf
Saving sheet1_2013.pdf to sheet1_2013 (1).pdf
Saving sheet1_2014.pdf to sheet1_2014 (1).pdf
Saving sheet1_2015.pdf to sheet1_2015 (1).pdf
Saving sheet1_2016.pdf to sheet1_2016 (1).pdf
Saving sheet1_2017.pdf to sheet1_2017 (1).pdf
Saving sheet1_2018.pdf to sheet1_2018 (1).pdf
Saving sheet1_2019.pdf to sheet1_2019 (1).pdf
Saving sheet1_2020.pdf to sheet1_2020 (1).pdf
Saving sheet1_2021.pdf to sheet1_2021 (1).pdf

🔎 Processing sheet1_2012 (1).pdf...
📊 Preview for Kenya 2012:
  Sheet Number                             371.2
  P                                        317.0
  Gross Inflow                             276.2
  Net Inflow                               29.8
  $Q_{sw}ir$                               100.3
  $Q_{desal}$                              40.5
  Landscape ET                             7.6
  Rainfall ET                              178.1
  Incremental ET                           299.4
  ET                                    



📊 Preview for Kenya 2014:
  Sheet Number                             365.3
  P                                        286.8
  Gross Inflow                             231.3
  Net Inflow                               29.8
  $Q_{sw}ir$                               99.3
  $Q_{desal}$                              32.5
  Landscape ET                             16.6
  Rainfall ET                              178.2
  Incremental ET                           261.6
  ET                                       241.2
  Available Water                          241.2
  +AS                                      45.6
  -AS                                      10.5
  Consumed water (Total)                   38.4
  Depleted water (Total)                   4.2
  Exploitable Water: Protected Land Use    53.1
  Exploitable Water: Utilized Land Use     9.9
  Exploitable Water: Modified Land Use     328.3
  Exploitable Water: Managed Water Use     0.0
  Utilized Flow: Protected Land Use        55.5
  Utilize



📊 Preview for Kenya 2017:
  Sheet Number                             344.9
  P                                        377.4
  Gross Inflow                             323.4
  Net Inflow                               35.2
  $Q_{sw}ir$                               116.1
  $Q_{desal}$                              47.5
  Landscape ET                             15.0
  Rainfall ET                              213.8
  Incremental ET                           357.4
  ET                                       341.3
  Available Water                          341.3
  +AS                                      36.1
  -AS                                      18.1
  Consumed water (Total)                   76.7
  Depleted water (Total)                   14.9
  Exploitable Water: Protected Land Use    109.6
  Exploitable Water: Utilized Land Use     17.9
  Exploitable Water: Modified Land Use     316.8
  Exploitable Water: Managed Water Use     0.0
  Utilized Flow: Protected Land Use        54.0
  Uti



📊 Preview for Kenya 2019:
  Sheet Number                             496.3
  P                                        532.3
  Gross Inflow                             411.8
  Net Inflow                               47.2
  $Q_{sw}ir$                               157.8
  $Q_{desal}$                              60.5
  Landscape ET                             11.0
  Rainfall ET                              276.5
  Incremental ET                           474.2
  ET                                       427.2
  Available Water                          427.2
  +AS                                      105.1
  -AS                                      21.4
  Consumed water (Total)                   97.6
  Depleted water (Total)                   16.4
  Exploitable Water: Protected Land Use    135.3
  Exploitable Water: Utilized Land Use     15.3
  Exploitable Water: Modified Land Use     445.4
  Exploitable Water: Managed Water Use     0.0
  Utilized Flow: Protected Land Use        120.5
  U