# In [None]:
import os
import numpy as np
import cv2
from PIL import Image
import yaml

def convert_png_to_yolo_txt(mask_path, output_path, class_id=0):
    """Convert an instance-mask PNG into a YOLO segmentation label file.

    Every distinct non-zero pixel value in the mask is treated as one
    instance. For each instance the largest external contour is written as
    one line of the form ``<class_id> x1 y1 x2 y2 ...``, with x divided by the
    image width and y divided by the image height.

    Args:
        mask_path: Path of the mask image to read.
        output_path: Path of the label text file to write (overwritten).
        class_id: Class index written at the start of every line.
    """
    # Use a context manager so the image file handle is released as soon as
    # the pixel data has been copied into the array.
    with Image.open(mask_path) as img:
        mask = np.array(img.convert("I"))
    height, width = mask.shape
    scale = np.array([width, height])

    instance_ids = np.unique(mask)
    instance_ids = instance_ids[instance_ids != 0]  # value 0 is skipped

    lines = []
    for inst_id in instance_ids:
        instance_mask = (mask == inst_id).astype(np.uint8)
        contours, _ = cv2.findContours(
            instance_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
        )

        if not contours:
            continue

        # Use only the largest contour so each instance yields one polygon.
        largest = max(contours, key=cv2.contourArea)

        # A polygon needs at least three vertices. Shorter rows are not valid
        # segmentation labels: a two-point row has the same column count as a
        # bounding-box label (class + 4 values) and would be misread.
        if largest.shape[0] < 3:
            continue

        contour = largest.reshape(-1, 2)
        norm = contour / scale
        flat = norm.flatten().tolist()
        lines.append(f"{class_id} " + " ".join(f"{x:.6f}" for x in flat))

    with open(output_path, "w") as f:
        f.write("\n".join(lines))

def process_folder(mask_folder, label_folder, class_id=0):
    """Convert every ``.png`` mask in a folder into a YOLO label file.

    Files are processed in sorted name order. Each ``<name>.png`` in
    ``mask_folder`` produces ``<name>.txt`` in ``label_folder``.

    Args:
        mask_folder: Directory containing the mask PNG files.
        label_folder: Directory for the label files; created if missing.
        class_id: Class index forwarded to ``convert_png_to_yolo_txt``.
    """
    png_suffix = ".png"
    os.makedirs(label_folder, exist_ok=True)
    for filename in sorted(os.listdir(mask_folder)):
        if not filename.endswith(png_suffix):
            continue
        # Strip only the trailing extension; str.replace would also rewrite
        # any ".png" appearing earlier in the file name.
        label_name = filename[: -len(png_suffix)] + ".txt"
        mask_path = os.path.join(mask_folder, filename)
        label_path = os.path.join(label_folder, label_name)
        convert_png_to_yolo_txt(mask_path, label_path, class_id=class_id)
        print(f"Saved: {label_path}")

def generate_data_yaml(root_dir, train_img_dir, val_img_dir, yaml_path, class_name="GobletCell"):
    """Write a single-class dataset description YAML file.

    The file contains the absolute dataset root (``path``), the train and
    validation image directories expressed relative to that root (``train``,
    ``val``), the class name list (``names``) and the class count (``nc``).

    Args:
        root_dir: Dataset root directory.
        train_img_dir: Directory holding the training images.
        val_img_dir: Directory holding the validation images.
        yaml_path: Destination path of the YAML file (overwritten).
        class_name: Name of the single class.
    """
    data_yaml = {
        "path": os.path.abspath(root_dir),
        "train": os.path.relpath(train_img_dir, root_dir),
        "val": os.path.relpath(val_img_dir, root_dir),
        "names": [class_name],
        "nc": 1,
    }
    with open(yaml_path, "w") as f:
        # Pass the stream: without it yaml.dump only returns a string and the
        # file on disk is left empty.
        yaml.dump(data_yaml, f)
    print(f"Generated data.yaml at {yaml_path}")

if __name__ == "__main__":
    # Dataset root; every other path below is derived from it.
    dataset_root = "insert_root_path"

    # (mask folder, label folder) pairs, converted in order: train, then val.
    mask_label_pairs = [
        (
            os.path.join(dataset_root, "train/masks"),
            os.path.join(dataset_root, "train/labels"),
        ),
        (
            os.path.join(dataset_root, "val/masks"),
            os.path.join(dataset_root, "val/labels"),
        ),
    ]
    for masks_dir, labels_dir in mask_label_pairs:
        process_folder(masks_dir, labels_dir)

    # Describe the dataset (image folders + class) in data.yaml at the root.
    generate_data_yaml(
        dataset_root,
        os.path.join(dataset_root, "train/images"),
        os.path.join(dataset_root, "val/images"),
        os.path.join(dataset_root, "data.yaml"),
    )