In [None]:
import pandas as pd
from pathlib import Path
from PIL import Image, ImageFilter, ImageEnhance
import numpy as np
import io
import json
from tqdm.notebook import tqdm
from google.colab import drive
import os

In [None]:
# --- Mount Google Drive (Colab) and configure paths ---
from google.colab import drive
from pathlib import Path
import pandas as pd
import numpy as np
from PIL import Image, ImageFilter
from tqdm.auto import tqdm
import io, os, random

print("Mounting Google Drive...")
try:
    drive.mount('/content/drive/', force_remount=True)
except Exception as e:
    # Report and stop: every path below lives on Drive, so continuing without
    # a mount would make mkdir() create a stray local directory tree under the
    # mount point instead of touching the real dataset.
    print(f"Error mounting drive: {e}")
    raise

# Dataset root on Drive; all other paths are derived from it.
ROOT = Path("/content/drive/MyDrive/HyacinthWatch_workspace/HyacinthWatch_data")
if not ROOT.is_dir():
    # Fail before mkdir(parents=True) can silently create a wrong tree.
    raise FileNotFoundError(f"Data root not found (is Drive mounted?): {ROOT}")
CSV_PATH = ROOT / "processed/metadata.csv"

# Degraded images are written below DEG_ROOT; the manifest sits next to them.
DEG_ROOT = ROOT / "processed" / "degraded_sets"
DEG_ROOT.mkdir(parents=True, exist_ok=True)
DEGRAD_MANIFEST = DEG_ROOT / "degraded_manifest.parquet"

def set_seed(s=1337):
    """Seed Python's `random` module and NumPy's global RNG with the same value `s`."""
    random.seed(s)
    np.random.seed(s)


# Seed once up front so the random draws made later in the notebook repeat across runs.
set_seed(1337)

# Echo the resolved locations so a wrong mount/root is visible immediately.
for label, value in (
    ("Using ROOT:", ROOT),
    ("Metadata:", CSV_PATH),
    ("Degraded root:", DEG_ROOT),
    ("Manifest will be written to:", DEGRAD_MANIFEST),
):
    print(label, value)

Mounting Google Drive...
Mounted at /content/drive/
Using ROOT: /content/drive/MyDrive/HyacinthWatch_workspace/HyacinthWatch_data
Metadata: /content/drive/MyDrive/HyacinthWatch_workspace/HyacinthWatch_data/processed/metadata.csv
Degraded root: /content/drive/MyDrive/HyacinthWatch_workspace/HyacinthWatch_data/processed/degraded_sets
Manifest will be written to: /content/drive/MyDrive/HyacinthWatch_workspace/HyacinthWatch_data/processed/degraded_sets/degraded_manifest.parquet


In [None]:
import pandas as pd

# --- Load metadata & choose which split/source to degrade ---
df_meta = pd.read_csv(CSV_PATH)

# One entry per evaluation task:
#   name       - short label for the task
#   query      - pandas query string selecting the task's rows from df_meta
#   needs_mask - whether the task's manifest rows keep their mask_path
TASKS = [
    {
        "name": "seg",
        "query": "source=='aquavplant' and split=='val'",
        "needs_mask": True,
    },
    {
        "name": "pres",
        "query": "source=='bangladesh_augmented' and split=='val'",
        "needs_mask": False,
    },
]

# Union of all task selections, used here only for a quick preview of the rows.
SOURCE_QUERY = ' or '.join(task['query'] for task in TASKS)
df_src = df_meta.query(SOURCE_QUERY).reset_index(drop=True)

print("Rows selected by SOURCE_QUERY:", SOURCE_QUERY, "->", len(df_src))
df_src.head(3)

Rows selected by SOURCE_QUERY: source=='aquavplant' and split=='val' or source=='bangladesh_augmented' and split=='val' -> 670


Unnamed: 0,image_path,mask_path,has_hyacinth,source,split,species_name,base_stem
0,/content/drive/MyDrive/HyacinthWatch_workspace...,/content/drive/MyDrive/HyacinthWatch_workspace...,1,aquavplant,val,,
1,/content/drive/MyDrive/HyacinthWatch_workspace...,/content/drive/MyDrive/HyacinthWatch_workspace...,1,aquavplant,val,,
2,/content/drive/MyDrive/HyacinthWatch_workspace...,/content/drive/MyDrive/HyacinthWatch_workspace...,1,aquavplant,val,,


In [None]:
# --- Degradation functions ---
from PIL import ImageOps

def degrade(img: Image.Image, op: str, severity: int) -> Image.Image:
    """Return a degraded RGB copy of `img`.

    Args:
        img: Input PIL image; it is converted to RGB before any operation.
        op: One of "lowres", "jpeg", "defocus", "brightness", "noise".
        severity: Level 1, 2 or 3; picks the per-operation strength from the
            lookup tables below (higher means stronger degradation).

    Returns:
        A new RGB image with the same pixel dimensions as the input.

    Raises:
        ValueError: If `op` is not one of the known operations.
        KeyError: If `severity` is not 1, 2 or 3 for a known operation.
    """
    rgb = img.convert("RGB")

    if op == "lowres":
        # Downsample by an integer factor, then upsample back to the original size.
        factor = {1: 2, 2: 4, 3: 8}[severity]
        width, height = rgb.size
        reduced = (max(1, width // factor), max(1, height // factor))
        return rgb.resize(reduced, Image.BICUBIC).resize((width, height), Image.BICUBIC)

    if op == "jpeg":
        # Round-trip through an in-memory JPEG at the chosen quality.
        quality = {1: 60, 2: 30, 3: 10}[severity]
        stream = io.BytesIO()
        rgb.save(stream, format="JPEG", quality=quality)
        stream.seek(0)
        return Image.open(stream).convert("RGB")

    if op == "defocus":
        radius = {1: 2, 2: 4, 3: 6}[severity]
        return rgb.filter(ImageFilter.GaussianBlur(radius))

    if op == "brightness":
        # Gamma curve on [0, 1] values; the exponent 1/gamma is > 1 here, so
        # pixel values are pushed down (the image gets darker) as severity rises.
        gamma = {1: 0.85, 2: 0.65, 3: 0.45}[severity]
        unit = np.asarray(rgb).astype(np.float32) / 255.0
        unit = np.clip(unit ** (1.0 / gamma), 0, 1)
        return Image.fromarray((unit * 255).astype(np.uint8))

    if op == "noise":
        # Additive zero-mean Gaussian noise drawn from NumPy's global RNG.
        sigma = {1: 5, 2: 15, 3: 30}[severity]
        pixels = np.asarray(rgb).astype(np.float32)
        noisy = np.clip(pixels + np.random.normal(0, sigma, pixels.shape), 0, 255)
        return Image.fromarray(noisy.astype(np.uint8))

    raise ValueError(f"Unknown op: {op}")


In [None]:
# --- Config ---
# Each setting is defined exactly once (the cell previously repeated three of them).
CORRUPTIONS = ["jpeg", "lowres", "defocus", "brightness", "noise"]  # ops passed to degrade()
SEVERITIES = [1, 2, 3]                                              # severity levels per op
KEEP_SPLIT_SUBFOLDER = True  # keep 'val' subfolder
USE_SOURCE_SUBFOLDER = True  # avoid filename collisions across sources

In [None]:
# --- Generate degraded images + write manifest ---
def _claim_output_name(in_path, claimed):
    """Pick an output file name for `in_path` that no other source image holds.

    `claimed` maps output file name -> source path already written under that
    name in the current output folder. The plain basename is used when it is
    free (or already held by this same source); otherwise the parent folder
    name is appended to the stem, followed by a counter if that is taken too.
    """
    name = in_path.name
    tagged_stem = f"{in_path.stem}__{in_path.parent.name}"
    attempt = 1
    while claimed.get(name, in_path) != in_path:
        counter = "" if attempt == 1 else f"_{attempt}"
        name = f"{tagged_stem}{counter}{in_path.suffix}"
        attempt += 1
    return name


all_rows = []

for task in TASKS:
    df_src = df_meta.query(task["query"]).reset_index(drop=True)
    print(f"Building degraded for: {task['query']} -> {len(df_src)} images")

    # Sub-folder names depend only on the task's rows, so compute them once
    # instead of once per (op, severity) pair.
    has_rows = len(df_src) > 0
    split_sub = f"{df_src['split'].iloc[0]}" if KEEP_SPLIT_SUBFOLDER and has_rows else ""
    source_sub = (
        df_src['source'].iloc[0]
        if USE_SOURCE_SUBFOLDER and 'source' in df_src.columns and has_rows
        else ""
    )

    for op in CORRUPTIONS:
        for sev in SEVERITIES:
            out_dir = DEG_ROOT / op / f"S{sev}"
            if split_sub:
                out_dir = out_dir / split_sub
            if source_sub:
                out_dir = out_dir / source_sub
            out_dir.mkdir(parents=True, exist_ok=True)

            # Output name -> source path, for this folder only. Without it,
            # source images that share a basename overwrite each other.
            claimed = {}

            for _, row in tqdm(df_src.iterrows(), total=len(df_src), desc=f"{op}/S{sev}/{source_sub or ''}"):
                in_path = Path(row.image_path)
                if not in_path.exists():
                    in_path = (ROOT / in_path).resolve()  # robustness for relative paths

                out_name = _claim_output_name(in_path, claimed)
                if out_name in claimed:
                    # Same source image listed more than once: already written
                    # and recorded for this (op, severity), so skip the repeat.
                    continue
                claimed[out_name] = in_path
                out_path = out_dir / out_name

                # Close the source file promptly; convert() returns a loaded copy.
                with Image.open(in_path) as src_img:
                    img = src_img.convert("RGB")

                deg_img = degrade(img, op, sev)
                # Save as JPEG for consistency.
                # NOTE: the file keeps the source's extension even when that is
                # not .jpg; the content is JPEG regardless.
                deg_img.save(out_path, format="JPEG", quality=95)

                rec = {
                    # Falls back to the output stem, which equals the source
                    # stem unless the name had to be disambiguated above.
                    "image_id": row.get("image_id", out_path.stem),
                    "split": row.get("split", "val"),
                    "source": row.get("source", None),
                    "corruption": op,
                    "severity": int(sev),
                    "image_path": str(out_path),
                }
                # Keep mask_path for segmentation rows
                rec["mask_path"] = row.get("mask_path", None) if task["needs_mask"] else None
                # Keep presence label when the metadata provides one; a missing
                # (NaN) label is left unset instead of crashing int().
                label = row.get("has_hyacinth", None)
                if label is not None and pd.notna(label):
                    rec["has_hyacinth"] = int(label)
                all_rows.append(rec)

# Write/merge manifest
df_deg = pd.DataFrame(all_rows)
if len(df_deg):
    # Merge with an existing manifest so earlier runs' rows are preserved.
    if DEGRAD_MANIFEST.exists():
        old = pd.read_parquet(DEGRAD_MANIFEST)
        df_deg = pd.concat([old, df_deg], ignore_index=True)

    # One row per file on disk. Always dedup (not only when merging), and keep
    # the LAST occurrence: new rows are appended after old ones and the most
    # recent write is what the file actually contains.
    df_deg = (
        df_deg.drop_duplicates(subset=["image_path"], keep="last")
              .reset_index(drop=True)
    )

    # Ensure types
    if df_deg["severity"].dtype.kind != "i":
        df_deg["severity"] = df_deg["severity"].astype(int)

    df_deg.to_parquet(DEGRAD_MANIFEST, index=False)
    print("Wrote manifest:", DEGRAD_MANIFEST, "rows:", len(df_deg))
else:
    print("No rows written.")

Building degraded for: source=='aquavplant' and split=='val' -> 27 images


jpeg/S1/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

jpeg/S2/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

jpeg/S3/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

lowres/S1/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

lowres/S2/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

lowres/S3/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

defocus/S1/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

defocus/S2/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

defocus/S3/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

brightness/S1/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

brightness/S2/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

brightness/S3/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

noise/S1/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

noise/S2/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

noise/S3/aquavplant:   0%|          | 0/27 [00:00<?, ?it/s]

Building degraded for: source=='bangladesh_augmented' and split=='val' -> 643 images


jpeg/S1/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

jpeg/S2/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

jpeg/S3/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

lowres/S1/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

lowres/S2/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

lowres/S3/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

defocus/S1/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

defocus/S2/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

defocus/S3/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

brightness/S1/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

brightness/S2/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

brightness/S3/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

noise/S1/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

noise/S2/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

noise/S3/bangladesh_augmented:   0%|          | 0/643 [00:00<?, ?it/s]

Wrote manifest: /content/drive/MyDrive/HyacinthWatch_workspace/HyacinthWatch_data/processed/degraded_sets/degraded_manifest.parquet rows: 3300


In [None]:
# --- Validation: ensure different files per severity ---
# Guard before querying: an empty df_deg has no columns, so an unguarded
# query("corruption=='jpeg'") would raise instead of reporting "no data".
_needed_cols = ["corruption", "severity", "image_path", "image_id"]
if df_deg.empty or not set(_needed_cols).issubset(df_deg.columns):
    jpeg_rows = pd.DataFrame(columns=_needed_cols)
else:
    jpeg_rows = df_deg.query("corruption=='jpeg'")

check = jpeg_rows.groupby("severity")["image_path"].nunique()
print("Distinct paths per severity for 'jpeg':")
print(check)

if jpeg_rows.empty:
    print("No 'jpeg' corruption data found in df_deg for validation.")
else:
    sample_id = jpeg_rows["image_id"].iloc[0]
    sample_rows = jpeg_rows.loc[jpeg_rows["image_id"] == sample_id, ["severity", "image_path"]]
    paths_by_sev = sample_rows.set_index("severity")["image_path"].to_dict()
    print("Sample image_id:", sample_id, "paths:", paths_by_sev)

    # Actually check the property this cell is named after.
    if len(set(paths_by_sev.values())) != len(paths_by_sev):
        print("WARNING: sample image maps to the same file for more than one severity.")

    # Several rows with one image_id inside a severity mean different files
    # share an id, so per-image lookups by image_id are ambiguous.
    n_shared_ids = int(jpeg_rows.duplicated(subset=["severity", "image_id"]).sum())
    if n_shared_ids:
        print(f"WARNING: {n_shared_ids} 'jpeg' rows reuse an image_id within a severity level.")

Distinct paths per severity for 'jpeg':
severity
1    220
2    220
3    220
Name: image_path, dtype: int64
Sample image_id: image_1 paths: {1: '/content/drive/MyDrive/HyacinthWatch_workspace/HyacinthWatch_data/processed/degraded_sets/jpeg/S1/val/aquavplant/image_1.jpg', 2: '/content/drive/MyDrive/HyacinthWatch_workspace/HyacinthWatch_data/processed/degraded_sets/jpeg/S2/val/aquavplant/image_1.jpg', 3: '/content/drive/MyDrive/HyacinthWatch_workspace/HyacinthWatch_data/processed/degraded_sets/jpeg/S3/val/aquavplant/image_1.jpg'}
