In [1]:
import os
import json
import numpy as np
import pandas as pd
from pycocotools.coco import COCO
import cv2
from tqdm import tqdm
from sklearn.model_selection import train_test_split

Save the mucilage (positive) patches as a NumPy archive

In [None]:
# Cached split to filter, and the archive that will receive only its positive patches.
# To process another split, point both paths at that split's files
# (e.g. train_cache.npz -> train_positive.npz).
cache_file = "/home/ubuntu/mucilage_pipeline/mucilage-detection/saved_npy/test_cache.npz"
out_file = "/home/ubuntu/mucilage_pipeline/mucilage-detection/saved_npy/test_positive.npz"

# Load the cached dataset. np.load returns a lazily-read NpzFile that keeps the
# archive open; the context manager closes it once both arrays are in memory.
with np.load(cache_file) as data:
    X, y = data["X"], data["y"]  # expected: X.shape = (N,H,W,C), y.shape = (N,)

# Boolean selector for patches labelled 1 (mucilage)
mask = y == 1
X_pos = X[mask]
y_pos = y[mask]  # all ones; stored so the archive keeps the same X/y layout

print(f"Selected {len(X_pos)} positive patches out of {len(y)} total.")

# Save to a new npz file
np.savez_compressed(out_file, X=X_pos, y=y_pos)

Convert the positive patches to RGB and save them as PNG images

In [None]:
path = "/home/ubuntu/mucilage_pipeline/mucilage-detection/saved_npy/test_positive.npz"
output_dir = "/home/ubuntu/mucilage_pipeline/mucilage-detection/rgb_patches"
os.makedirs(output_dir, exist_ok=True)

# Read the patch stack and close the .npz archive right away.
with np.load(path) as arr:
    X = arr['X']  # expected layout: (N, H, W, C)

for i, patch in enumerate(X):
    # Channels 3, 2, 1 are used as R, G, B.
    # NOTE(review): presumably the red/green/blue bands of the sensor stack — confirm band order.
    rgb = patch[:, :, [3, 2, 1]]

    # Percentile stretch computed jointly over the three channels, ignoring NaNs.
    p2, p98 = np.nanpercentile(rgb, (2, 98))
    rgb = np.clip((rgb - p2) / (p98 - p2 + 1e-6), 0, 1)

    # NaN pixels pass through np.clip unchanged, and casting NaN to uint8 is
    # undefined; map them to 0 (black) before quantising to 8 bits.
    rgb = np.nan_to_num(rgb, nan=0.0)
    rgb = (rgb * 255).astype(np.uint8)

    # Save each patch as PNG. OpenCV expects BGR channel order, and imwrite
    # reports failure through its return value instead of raising.
    filename = os.path.join(output_dir, f"patch_{i:04d}.png")
    if not cv2.imwrite(filename, cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)):
        print(f"⚠️ Could not write {filename}")

Convert annotations from Roboflow to binary masks

In [7]:
# Root of the Roboflow export; each split folder holds its images plus a
# COCO-format "_annotations.coco.json" file.
base_dir = "/home/ubuntu/mucilage_pipeline/mucilage-detection/roboflow_dataset"
splits = ["train", "valid", "test"]

for split in splits:
    img_dir = os.path.join(base_dir, split)
    ann_path = os.path.join(img_dir, "_annotations.coco.json")
    mask_dir = os.path.join(base_dir, f"masks_{split}")
    os.makedirs(mask_dir, exist_ok=True)

    coco = COCO(ann_path)

    for img_id in coco.getImgIds():
        img_info = coco.loadImgs(img_id)[0]
        img_name = img_info["file_name"]
        h, w = img_info["height"], img_info["width"]

        # Start from an empty (all-background) mask; images without any
        # annotation therefore still get an all-zero mask file.
        mask = np.zeros((h, w), dtype=np.uint8)

        ann_ids = coco.getAnnIds(imgIds=img_id)
        anns = coco.loadAnns(ann_ids)
        for ann in anns:
            # Convert polygon or RLE to a 0/1 mask and merge it in as 0/255;
            # annotations of every category are unioned into one mask.
            m = coco.annToMask(ann)
            mask = np.maximum(mask, m * 255)

        # The mask is written under the image's own file name, so that name's
        # extension selects the encoder.
        # NOTE(review): if the export uses a lossy format (e.g. .jpg) the mask
        # edges will not be exactly 0/255 — confirm, or threshold when reading.
        out_path = os.path.join(mask_dir, img_name)
        # cv2.imwrite signals failure via its return value rather than raising.
        if not cv2.imwrite(out_path, mask):
            print(f"⚠️ Could not write {out_path}")

    print(f"✅ Masks for {split} saved to {mask_dir}")

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
✅ Masks for train saved to /home/ubuntu/mucilage_pipeline/mucilage-detection/roboflow_dataset/masks_train
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
✅ Masks for valid saved to /home/ubuntu/mucilage_pipeline/mucilage-detection/roboflow_dataset/masks_valid
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
✅ Masks for test saved to /home/ubuntu/mucilage_pipeline/mucilage-detection/roboflow_dataset/masks_test


Reconstruct original order

In [3]:
def split_data(labels_file, test_size=0.3, val_size=0.5, seed=42):
    """Split the label table into train, validation and test subsets.

    The CSV is read with pandas and split twice with ``train_test_split``,
    both times stratified on the ``label`` column and with the same seed.

    Args:
        labels_file: Path to a CSV file that contains a ``label`` column.
        test_size: ``test_size`` passed to the first split, i.e. the share of
            rows held out from the training subset.
        val_size: ``test_size`` passed to the second split, i.e. the share of
            the held-out rows that becomes the validation subset; the rest
            becomes the test subset.
        seed: ``random_state`` used for both splits.

    Returns:
        Tuple ``(train, val, test)`` of DataFrames.
    """
    labels = pd.read_csv(labels_file)

    # Stage 1: carve the held-out portion off the training rows.
    train_part, holdout = train_test_split(
        labels,
        test_size=test_size,
        stratify=labels["label"],
        random_state=seed,
    )

    # Stage 2: divide the held-out rows into test (first output) and
    # validation (second output).
    test_part, val_part = train_test_split(
        holdout,
        test_size=val_size,
        stratify=holdout["label"],
        random_state=seed,
    )

    return train_part, val_part, test_part

# Re-create the train/val/test label tables from the patch CSV, relying on
# split_data's default sizes and seed. Note that `df` holds the CSV *path* here.
df = "/home/ubuntu/mucilage_pipeline/mucilage-detection/csv/patches_final.csv"
df_train, df_val, df_test = split_data(labels_file=df)

In [8]:
base_dir = "/home/ubuntu/mucilage_pipeline/mucilage-detection"
splits = {
    "train": df_train,
    "val": df_val,
    "test": df_test
}

# Spatial size of the mask arrays that are written out.
H, W = 256, 256

# Refined masks directory (from Roboflow export). It is the same for every
# split, so the lookup table is built once instead of once per split.
mask_dir = os.path.join(base_dir, "roboflow_dataset/masks")

# Map the file-name part before "_png" (e.g. "patch_0000") to the mask's path.
mask_lookup = {}
for fname in os.listdir(mask_dir):
    if fname.startswith("patch_"):
        prefix = fname.split("_png")[0]
        mask_lookup[prefix] = os.path.join(mask_dir, fname)

# NOTE(review): the per-split label tables (`df`) are not consulted below; only
# the cached arrays are used.
for split, df in splits.items():
    print(f"\n=== Processing {split.upper()} ===")

    # Load the original npz cache; the context manager closes the archive once
    # both arrays have been read.
    cache_file = os.path.join(base_dir, f"saved_npy/{split}_cache.npz")
    with np.load(cache_file) as data:
        X, y = data["X"], data["y"]

    # Positions (within this split's cache) of the positive patches.
    pos_indices = np.where(y == 1)[0]

    # One mask per patch in X; patches without a refined mask stay all-zero.
    M = np.zeros((len(X), H, W), dtype=np.uint8)

    print(f"Found {len(mask_lookup)} refined masks for {split}.")

    # Fill masks for positive patches.
    # NOTE(review): `i` restarts at 0 for every split while all splits share one
    # mask directory, so the first positive patch of train, val and test all
    # resolve to "patch_0000". If the exported patches were numbered across
    # splits (the recorded run found 222 masks = 154 + 33 + 35 positives), this
    # pairing is wrong for val and test — confirm how the patches were numbered.
    for i, idx in enumerate(tqdm(pos_indices)):
        prefix = f"patch_{i:04d}"  # matches your exported patch names
        mask_path = mask_lookup.get(prefix, None)
        if mask_path:
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
            if mask is not None:
                # Binarise to 0/1; thresholding at 127 also absorbs compression noise.
                M[idx] = (mask > 127).astype(np.uint8)
            else:
                print(f"⚠️ Could not read {mask_path}")
        else:
            print(f"⚠️ Missing mask for {prefix} ({split})")

    # Save mask array aligned with original X
    out_path = os.path.join(base_dir, f"saved_npy/{split}_masks_refined.npz")
    np.savez_compressed(out_path, M=M)
    print(f"✅ Saved refined masks to {out_path}")


=== Processing TRAIN ===
Found 222 refined masks for train.


100%|██████████| 154/154 [00:00<00:00, 4052.98it/s]


✅ Saved refined masks to /home/ubuntu/mucilage_pipeline/mucilage-detection/saved_npy/train_masks_refined.npz

=== Processing VAL ===
Found 222 refined masks for val.


100%|██████████| 33/33 [00:00<00:00, 6921.99it/s]

✅ Saved refined masks to /home/ubuntu/mucilage_pipeline/mucilage-detection/saved_npy/val_masks_refined.npz

=== Processing TEST ===





Found 222 refined masks for test.


100%|██████████| 35/35 [00:00<00:00, 3230.58it/s]

✅ Saved refined masks to /home/ubuntu/mucilage_pipeline/mucilage-detection/saved_npy/test_masks_refined.npz



