In [5]:
import json
import os
import random
import shutil
from collections import defaultdict
from pathlib import Path

from PIL import Image, ImageDraw
from tqdm import tqdm


In [6]:
# Root directories of the two source datasets.
DATASET_DIR_CRACK = Path("./datasets/crack_dataset")
DATASET_DIR_JOIN = Path("./datasets/join_dataset")

# Root of the merged dataset; its images/ and masks/ subfolders are created
# up front (along with any missing parents) so later cells can write into them.
MERGED_DIR = Path("./datasets/merged_clipseg")
(MERGED_DIR / "images").mkdir(parents=True, exist_ok=True)
(MERGED_DIR / "masks").mkdir(parents=True, exist_ok=True)


In [7]:
def load_all_coco(base_dir):
    """Load and merge the COCO annotation files of every split under ``base_dir``.

    Looks for ``<base_dir>/<split>/_annotations.coco.json`` for the splits
    ``train``, ``valid`` and ``test``; splits without that file are skipped.

    Each split file carries its own id space, so ids from different splits can
    collide once the lists are concatenated. To keep ``image_id`` lookups on the
    merged result unambiguous, images and annotations are renumbered with ids
    that are unique across all loaded splits.

    Args:
        base_dir: Dataset root (``str`` or ``pathlib.Path``).

    Returns:
        dict with the keys ``"images"``, ``"annotations"`` and ``"categories"``.
        Every image dict gains a ``"split"`` key naming the split it came from.
        Categories are concatenated as-is (one copy per split file).
    """
    base_dir = Path(base_dir)
    coco_data = {"images": [], "annotations": [], "categories": []}
    next_image_id = 0
    next_ann_id = 0

    for split in ["train", "valid", "test"]:
        ann_path = base_dir / split / "_annotations.coco.json"
        if not ann_path.exists():
            continue

        with open(ann_path, encoding="utf-8") as f:
            data = json.load(f)

        # Map this split's original image ids onto globally unique ones.
        id_map = {}
        for img in data.get("images", []):
            img["split"] = split  # remembered so the image file can be located later
            id_map[img["id"]] = next_image_id
            img["id"] = next_image_id
            next_image_id += 1
            coco_data["images"].append(img)

        for ann in data.get("annotations", []):
            new_image_id = id_map.get(ann.get("image_id"))
            if new_image_id is None:
                # The annotation points at an image this split does not list;
                # keeping its stale id could match an unrelated image.
                continue
            ann["image_id"] = new_image_id
            ann["id"] = next_ann_id
            next_ann_id += 1
            coco_data["annotations"].append(ann)

        coco_data["categories"].extend(data.get("categories", []))

    return coco_data


In [8]:
# Merge the annotation files of every available split for each source dataset.
coco_crack = load_all_coco(DATASET_DIR_CRACK)
coco_join = load_all_coco(DATASET_DIR_JOIN)

# Print image/annotation totals as a quick sanity check on what was loaded.
print("✅ Loaded Cracks: {} imgs, {} anns".format(
    len(coco_crack["images"]), len(coco_crack["annotations"])))
print("✅ Loaded Joins:  {} imgs, {} anns".format(
    len(coco_join["images"]), len(coco_join["annotations"])))


✅ Loaded Cracks: 5369 imgs, 8511 anns
✅ Loaded Joins:  1022 imgs, 1424 anns


In [9]:
def make_mask(img_path, anns, save_path, is_crack):
    """Rasterise the annotations of one image into a binary mask file.

    The mask is a single-channel ("L") image with the same size as the source
    image: annotated pixels are 255, everything else is 0. An empty ``anns``
    list produces an all-black mask.

    Args:
        img_path: Path of the source image; only its pixel size is read.
        anns: COCO annotation dicts belonging to this image.
        save_path: Where the mask image is written.
        is_crack: If true, fill the polygon ``segmentation`` of each annotation;
            otherwise fill each annotation's ``bbox`` as a rectangle.
    """
    # Only the dimensions are needed, so release the file handle immediately.
    with Image.open(img_path) as img:
        size = img.size

    mask = Image.new("L", size, 0)
    draw = ImageDraw.Draw(mask)

    if is_crack:
        for ann in anns:
            segmentation = ann.get("segmentation")
            # Polygon segmentations are a list of flat [x0, y0, x1, y1, ...]
            # rings. Anything else (e.g. a dict holding an RLE mask) cannot be
            # drawn as a polygon and is skipped.
            if not isinstance(segmentation, list):
                continue
            # Draw every ring, not just the first: one annotation may be made
            # of several disjoint polygons.
            for poly in segmentation:
                # Pair up coordinates; a dangling odd value is ignored.
                xy = list(zip(poly[0::2], poly[1::2]))
                if len(xy) < 2:
                    # Too few points for Pillow to draw a polygon.
                    continue
                draw.polygon(xy, fill=255)
    else:
        # bbox is [x, y, width, height] -> filled rectangle
        for ann in anns:
            if "bbox" in ann:
                x, y, w, h = ann["bbox"]
                draw.rectangle([x, y, x + w, y + h], fill=255)

    mask.save(save_path)


In [10]:
# Build the merged dataset: for every source image, copy it into
# MERGED_DIR/images, write its mask into MERGED_DIR/masks, and record an
# (image, mask, prompt) entry that is finally dumped to metadata.json.
metadata = []

for dataset, coco_data, is_crack, prompt in [
    ("crack", coco_crack, True, "segment crack"),
    ("join",  coco_join,  False, "segment taping area")
]:
    source_root = DATASET_DIR_CRACK if is_crack else DATASET_DIR_JOIN

    # Index annotations by image once instead of rescanning the full
    # annotation list for every image.
    # NOTE(review): this lookup (like the original filter) assumes image ids
    # are unique across the merged splits — confirm load_all_coco ensures it.
    anns_by_image = defaultdict(list)
    for ann in coco_data["annotations"]:
        anns_by_image[ann["image_id"]].append(ann)

    skipped = 0
    for img in tqdm(coco_data["images"], desc=f"Processing {dataset}"):
        src_img = source_root / img["split"] / img["file_name"]
        if not src_img.exists():
            skipped += 1
            continue

        # Prefix with the dataset name so files from the two sources cannot clash.
        dst_img = MERGED_DIR / "images" / f"{dataset}_{img['file_name']}"
        dst_mask = MERGED_DIR / "masks" / f"{dataset}_{Path(img['file_name']).stem}.png"

        # Byte-for-byte copy: re-saving through PIL would re-encode the image
        # (lossy for JPEG) and leave the opened file handle dangling.
        shutil.copy(src_img, dst_img)

        # Images without annotations still get an (all-black) mask.
        make_mask(src_img, anns_by_image.get(img["id"], []), dst_mask, is_crack)

        metadata.append({
            "image": dst_img.name,
            "mask": dst_mask.name,
            "prompt": prompt
        })

    if skipped:
        # Surface missing files instead of dropping them silently.
        print(f"⚠️ {dataset}: skipped {skipped} images listed in the annotations but missing on disk")

# save metadata
with open(MERGED_DIR / "metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

print(f"✅ Merged dataset ready: {len(metadata)} total images")


Processing crack: 100%|██████████| 5369/5369 [02:00<00:00, 44.71it/s]
Processing join: 100%|██████████| 1022/1022 [00:22<00:00, 45.48it/s]


✅ Merged dataset ready: 6391 total images


In [11]:
import json
import random
import os
import shutil

# Paths
# Built with os.path.join so the cell works on every OS; a hard-coded
# backslash is only a path separator on Windows.
dataset_dir = os.path.join("datasets", "merged_clipseg")
metadata_path = os.path.join(dataset_dir, "metadata.json")
output_dir = os.path.join(dataset_dir, "splits")

# Fixed seed: re-running the cell reproduces the same split, so files copied
# by an earlier run never end up stranded in a different split folder.
SPLIT_SEED = 42

# Load metadata
with open(metadata_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Shuffle dataset (private RNG, leaves the global random state untouched)
random.Random(SPLIT_SEED).shuffle(data)

# Split ratios: 70% train, 20% val, 10% test
total = len(data)
train_end = int(0.7 * total)
val_end = int(0.9 * total)  # 70 + 20

train_data = data[:train_end]
val_data = data[train_end:val_end]
test_data = data[val_end:]

splits = {
    "train": train_data,
    "val": val_data,
    "test": test_data
}

# Create folders
for split in splits.keys():
    os.makedirs(os.path.join(output_dir, split, "images"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, split, "masks"), exist_ok=True)

# Copy files and write one JSON index per split
for split_name, split_items in splits.items():
    split_json = []
    for item in split_items:
        # Source paths
        img_src = os.path.join(dataset_dir, "images", item["image"])
        mask_src = os.path.join(dataset_dir, "masks", item["mask"])

        # Destination paths
        img_dst = os.path.join(output_dir, split_name, "images", item["image"])
        mask_dst = os.path.join(output_dir, split_name, "masks", item["mask"])

        # Copy files
        shutil.copy(img_src, img_dst)
        shutil.copy(mask_src, mask_dst)

        # Paths stored in the JSON are relative to the split folder and always
        # use forward slashes, so the index reads the same on any OS.
        split_json.append({
            "image": f"images/{item['image']}",
            "mask": f"masks/{item['mask']}",
            "prompt": item["prompt"]
        })

    # Save split JSON
    with open(os.path.join(output_dir, f"{split_name}.json"), "w", encoding="utf-8") as f:
        json.dump(split_json, f, indent=4)

print("✅ Dataset fully split with images, masks, and JSON files ready.")
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")


✅ Dataset fully split with images, masks, and JSON files ready.
Train: 4473, Val: 1278, Test: 640
