# YOLO cross-validation datasets

**Objective:** Convert a Supervisely cross-validation (CV) dataset to YOLO format and apply data augmentation

**Workflow:**
1. Convert binary masks to YOLO segmentation format
2. Create cross-validation datasets with augmentation

## Imports

In [None]:
import glob
import json
import multiprocessing
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from loguru import logger
from PIL import Image
from tqdm.auto import tqdm
# from ultralytics import YOLO
import albumentations as A

## Configuration

In [None]:
# Generate timestamp for unique naming
todaysdate = datetime.now().strftime("%Y%m%d_%H%M%S")

# Output paths
CREATE_DATASET_PROCESSED_PATH = f"datasets/supervisely/yolo_processed_{todaysdate}"
DEVICE = "2"  # presumably the GPU index used for training/evaluation - TODO confirm

# Directory of the pre-processed source run that every input path below points
# into; defined once so switching to another run is a single-line change.
SOURCE_DATASET_DIR = "datasets/supervisely/dataset_processed_20250523-173715"

# Binary mask conversion paths
ANNOTATIONS_BINARY_PNG_PATH = f"{SOURCE_DATASET_DIR}/masks"
YOLO_ANNOTATIONS_OUTPUT_PATH = f"{CREATE_DATASET_PROCESSED_PATH}/labels"
TEST_MASK_OUTPUT_PATH = os.path.join(CREATE_DATASET_PROCESSED_PATH, "test_masks")

# Dataset paths
DATASET_PATH = "datasets/supervisely/341575_free_space_rooftop_geneva_20250511_yolo"
NUM_FOLDS = 5  # number of cross-validation fold list files
FOLD_PATHS = {
    fold: f"{SOURCE_DATASET_DIR}/fold_{fold}_dataset.txt" for fold in range(NUM_FOLDS)
}
TEST_DATASET_TXT_PATH = f"{SOURCE_DATASET_DIR}/test_dataset.txt"
IMG_DATASET_PATH = f"{SOURCE_DATASET_DIR}/images"

# Augmentation parameters
NUM_AUGMENTATIONS_PER_IMAGE = 10  # Number of augmented versions per original
AUGMENTATION_WORKERS = 8  # Number of parallel workers for augmentation

# Training configuration
# NOTE(review): this points at a fixed earlier run, not at
# CREATE_DATASET_PROCESSED_PATH generated above - confirm this is intended.
DATASET_PROCESSED_PATH = "datasets/supervisely/yolo_processed_20250618_201019"
MODEL_NAME = "yolo12x-seg.yaml"  # yolo11n-seg.pt
OUTPUT_DIR_YOLO = "training_yolo"
PROJECT_NAME = f"yolo_free_space_rooftop_{todaysdate}"
CLASS_NAMES = ["free_space"]  # single source of truth, shared by training and evaluation
CUSTOM_PARAMS = {
    'epochs': 5000,
    'batch': 1,
    'imgsz': 1280,
    'patience': 30,
    'lr0': 0.005,
}

# Evaluation parameters
OUTPUT_EVALUATE_TEST_DIR = os.path.join(OUTPUT_DIR_YOLO, f"auto_cv_evaluation_results_{todaysdate}")
CONF_THRESHOLD = 0.5
IOU_THRESHOLD = 0.7

# Feature flags
CONVERT_BINARY_MASKS_TO_YOLO_FORMAT = True
SPLIT_DATASET = True
APPLY_AUGMENTATION = True
TRAIN_YOLO = False
EVALUATE_YOLO = False

In [None]:
# Collect the output directories needed by the enabled stages, then create
# them all in one pass (existing directories are left untouched).
_required_dirs = []
if CONVERT_BINARY_MASKS_TO_YOLO_FORMAT:
    _required_dirs += [
        CREATE_DATASET_PROCESSED_PATH,
        YOLO_ANNOTATIONS_OUTPUT_PATH,
        TEST_MASK_OUTPUT_PATH,
    ]
if EVALUATE_YOLO:
    _required_dirs.append(OUTPUT_EVALUATE_TEST_DIR)

for _required_dir in _required_dirs:
    os.makedirs(_required_dir, exist_ok=True)

In [None]:
# Augmentation pipeline applied jointly to an image and its mask.
# The blur and noise alternatives are named separately so the top-level
# sequence of stages stays readable.
_blur_variants = [
    A.GaussianBlur(blur_limit=(3, 7), p=0.5),
    A.MedianBlur(blur_limit=5, p=0.5),
    A.MotionBlur(blur_limit=(3, 7), p=0.5),
]
_noise_variants = [
    A.GaussNoise(p=0.5),
    A.ISONoise(color_shift=(0.01, 0.05), intensity=(0.1, 0.5), p=0.5),
    A.MultiplicativeNoise(multiplier=(0.9, 1.1), per_channel=True, p=0.5),
    A.SaltAndPepper(p=0.5),
]

AUGMENTATION_PIPELINE = A.Compose(
    [
        # Geometric: symmetry, then a mild affine transform
        A.SquareSymmetry(p=0.5),
        A.Affine(scale=(0.95, 1.05), translate_percent=0.1, rotate=(-45, 45), p=0.6),
        # Photometric: at most one blur and at most one noise model
        A.OneOf(_blur_variants, p=0.2),
        A.OneOf(_noise_variants, p=0.2),
        # Weather effects
        A.RandomSunFlare(p=0.2),
        A.RandomFog(p=0.2),
    ],
    additional_targets={'mask': 'mask'},
)

## Binary Mask to YOLO Format Conversion

In [None]:
def contours_join(parent_contour, child_contour):
    """
    Join an outer contour with an inner (hole) contour into one polygon.

    Handles masks with holes (e.g. donut shapes): the two contours are given
    opposite orientations and connected at their closest pair of points, so
    the resulting single polygon walks the outer boundary, crosses over to
    the hole, walks the hole and comes back. The inside of the hole is then
    not part of the positive area.

    Parameters:
        parent_contour: Outer contour points, array of shape (N, 1, 2)
        child_contour: Inner contour points (hole), array of shape (M, 1, 2)

    Returns:
        np.ndarray: Merged contour of shape (N + M + 2, 1, 2), dtype int32

    Reference:
        https://github.com/ultralytics/ultralytics/issues/3085
    """
    def is_clockwise(contour):
        # Sum (x2 - x1) * (y2 + y1) over all edges (last point wraps to the
        # first). Done in one vectorized pass and in int64: the per-edge
        # products overflow int32 for large coordinates, which would flip
        # the sign and therefore the detected orientation.
        pts = contour.reshape(-1, 2).astype(np.int64)
        nxt = np.roll(pts, -1, axis=0)
        value = np.sum((nxt[:, 0] - pts[:, 0]) * (nxt[:, 1] + pts[:, 1]))
        return bool(value < 0)

    def get_merge_point_idx(contour1, contour2):
        # int64 copies keep the squared distances from overflowing int32.
        pts1 = contour1.reshape(-1, 2).astype(np.int64)
        pts2 = contour2.reshape(-1, 2).astype(np.int64)

        min_distance = float('inf')
        idx1, idx2 = 0, 0
        # One vectorized distance computation per point of contour1; the
        # strict '<' keeps the first closest pair on ties.
        for i, p1 in enumerate(pts1):
            distances = np.sum((pts2 - p1) ** 2, axis=1)
            min_idx = int(np.argmin(distances))
            if distances[min_idx] < min_distance:
                min_distance = distances[min_idx]
                idx1, idx2 = i, min_idx
        return idx1, idx2

    def merge_contours(contour1, contour2, idx1, idx2):
        # outer[..idx1] -> hole[idx2..] -> hole[..idx2] -> outer[idx1..]
        # Both bridge points appear twice so the path returns along the
        # same edge it used to reach the hole.
        contour = np.concatenate(
            [
                contour1[:idx1 + 1],
                contour2[idx2:],
                contour2[:idx2 + 1],
                contour1[idx1:],
            ],
            axis=0,
        )
        return contour.astype(np.int32)

    # Normalize orientations: parent clockwise, hole counter-clockwise
    # (as defined by is_clockwise above).
    if not is_clockwise(parent_contour):
        parent_contour = parent_contour[::-1]
    if is_clockwise(child_contour):
        child_contour = child_contour[::-1]

    idx1, idx2 = get_merge_point_idx(parent_contour, child_contour)
    return merge_contours(parent_contour, child_contour, idx1, idx2)


def group_child_contours_with_parent(hierarchy):
    """
    Group child contours with their parent contours based on hierarchy.

    Each hierarchy row is read as [next, previous, first_child, parent];
    only the parent column (index 3) is used. A row whose parent is -1 is a
    top-level contour and becomes a group of its own; any other row is
    appended to its parent's child list.

    Parameters:
        hierarchy: OpenCV contour hierarchy array, shape (1, N, 4). Any
                   array whose size is a multiple of 4 is accepted, so a
                   single-contour hierarchy (1, 1, 4) works as well.

    Returns:
        dict: Dictionary mapping parent indices to their child indices
              Format: {parent_idx: {"parent": idx, "child": [child_indices]}}
              Keys and indices are plain Python ints; groups appear in the
              order in which their parent index is first encountered.
    """
    # reshape(-1, 4) instead of squeeze(): squeeze() would collapse a
    # single-contour hierarchy to shape (4,) and break the row iteration.
    rows = np.asarray(hierarchy).reshape(-1, 4)

    groups = {}
    for i, row in enumerate(rows):
        parent_index = int(row[3])
        if parent_index != -1:
            group = groups.setdefault(parent_index, {"parent": parent_index, "child": []})
            group["child"].append(i)
        else:
            groups.setdefault(i, {"parent": i, "child": []})
    return groups


def convert_mask_to_yolo_seg_label(mask_path, create_test_mask=True):
    """
    Turn one binary mask image into YOLO segmentation label text.

    The mask is read as grayscale, binarized, and its contours are extracted
    with a two-level hierarchy. Hole contours are merged into their outer
    contour, and each resulting polygon becomes one label line for class 0
    with coordinates normalized by the mask size.

    Parameters:
        mask_path: Path to the binary mask file
        create_test_mask: Whether to also rasterize the polygons into a
                          verification mask

    Returns:
        tuple: (label_str, test_mask, error_msg)
            - label_str: YOLO label text, one polygon per line ("" if none)
            - test_mask: uint8 mask redrawn from the polygons, or None
            - error_msg: Description of the failure, None on success
    """
    try:
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        if mask is None:
            return "", None, f"Could not read mask from {mask_path}"

        height, width = mask.shape

        # Any non-zero pixel counts as foreground (masks are stored as 0/1)
        _, binary = cv2.threshold(mask, 0.5, 255, cv2.THRESH_BINARY)
        contours, hierarchy = cv2.findContours(binary, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        if not contours:
            return "", None, None

        test_mask = np.zeros((height, width), dtype=np.uint8) if create_test_mask else None
        label_lines = []

        if len(contours) > 1 and hierarchy is not None:
            # Several contours: fold every hole into its outer contour first
            for group in group_child_contours_with_parent(hierarchy).values():
                merged = contours[group["parent"]]
                for child_idx in group["child"]:
                    merged = contours_join(parent_contour=merged, child_contour=contours[child_idx])

                coords = process_contour(merged, width, height)
                if coords:
                    label_lines.append(f"0{coords}")

                if test_mask is not None:
                    cv2.drawContours(test_mask, np.expand_dims(merged, axis=0), -1, 255, -1)
        else:
            # Exactly one contour: no hierarchy handling required
            only_contour = contours[0]
            coords = process_contour(only_contour, width, height)
            if coords:
                label_lines.append(f"0{coords}")

            if test_mask is not None:
                cv2.drawContours(test_mask, [only_contour], -1, 255, -1)

        return "\n".join(label_lines), test_mask, None

    except Exception as e:
        return "", None, str(e)


def process_contour(contour, width, height):
    """
    Format one contour as normalized YOLO polygon coordinates.

    Parameters:
        contour: OpenCV contour array
        width: Image width used to normalize x
        height: Image height used to normalize y

    Returns:
        str: " x1 y1 x2 y2 ..." (each pair preceded by a space, values
             rounded to 6 decimals), or "" when the contour has fewer than
             three points
    """
    points = contour.squeeze()

    # A lone point squeezes down to a 1-D array
    if points.ndim == 1:
        return ""

    rows = points.tolist()
    if len(rows) < 3:
        return ""

    w, h = float(width), float(height)
    return "".join(
        f" {round(float(pt[0]) / w, 6)} {round(float(pt[1]) / h, 6)}"
        for pt in rows
        if isinstance(pt, list) and len(pt) == 2  # keep only well-formed (x, y) pairs
    )


def process_single_mask(args):
    """
    Convert one mask file and write its outputs (multiprocessing worker).

    Parameters:
        args: Tuple (mask_path, yolo_output_path, test_output_path, create_test_masks)

    Returns:
        tuple: (success, message)
            - success: True when a label file was written
            - message: The mask's file stem on success, otherwise a
                       description of why nothing was written
    """
    mask_path, yolo_output_path, test_output_path, create_test_masks = args
    stem = Path(mask_path).stem

    label_str, test_mask, error = convert_mask_to_yolo_seg_label(mask_path, create_test_masks)

    # Guard clauses: conversion error first, then "nothing to write"
    if error:
        return False, f"Error processing {stem}: {error}"
    if not label_str:
        return False, f"No valid contours found in {stem}"

    # YOLO label named after the mask
    (Path(yolo_output_path) / f"{stem}.txt").write_text(label_str)

    # Optional verification mask
    if create_test_masks and test_mask is not None:
        cv2.imwrite(str(Path(test_output_path) / f"{stem}_test.png"), test_mask)

    return True, stem


def batch_convert_masks_to_yolo(
    ANNOTATIONS_BINARY_PNG_PATH,
    YOLO_ANNOTATIONS_OUTPUT_PATH,
    TEST_MASK_OUTPUT_PATH,
    create_test_masks=True,
    num_workers=None,
    create_empty_labels=True,
):
    """
    Convert every mask in a directory to a YOLO label using a process pool.

    Mask files are discovered by extension, converted in parallel by
    process_single_mask, and a summary is printed at the end. Masks that
    yield no valid contour can optionally receive an empty label file.

    Parameters:
        ANNOTATIONS_BINARY_PNG_PATH: Input directory with mask files
        YOLO_ANNOTATIONS_OUTPUT_PATH: Output directory for YOLO labels
        TEST_MASK_OUTPUT_PATH: Output directory for test masks
        create_test_masks: Whether to create test masks for verification
        num_workers: Number of worker processes (None = min(CPU count, file count))
        create_empty_labels: Whether to write an empty label file for masks
                             without valid contours
    """
    print("Searching for mask files...")
    mask_files = [
        found
        for pattern in ('*.png', '*.jpg', '*.jpeg', '*.bmp', '*.tif', '*.tiff')
        for found in glob.glob(os.path.join(ANNOTATIONS_BINARY_PNG_PATH, pattern))
    ]

    if not mask_files:
        print(f"No mask files found in {ANNOTATIONS_BINARY_PNG_PATH}")
        return

    total_files = len(mask_files)
    print(f"Found {total_files} mask files to process...")

    if num_workers is None:
        num_workers = min(multiprocessing.cpu_count(), total_files)
    print(f"Using {num_workers} workers for parallel processing...")

    processed_count = 0
    error_count = 0
    errors = []
    failed_files = []  # stems of masks that produced no valid contour

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Queue every mask up front; results are consumed as they finish
        pending = [
            executor.submit(
                process_single_mask,
                (path, YOLO_ANNOTATIONS_OUTPUT_PATH, TEST_MASK_OUTPUT_PATH, create_test_masks),
            )
            for path in mask_files
        ]

        with tqdm(total=total_files, desc="Converting masks", unit="files") as pbar:
            for finished in as_completed(pending):
                try:
                    success, message = finished.result()
                except Exception as e:
                    error_count += 1
                    errors.append(f"Unexpected error: {str(e)}")
                else:
                    if success:
                        processed_count += 1
                    else:
                        error_count += 1
                        errors.append(message)
                        # The worker reports the file stem at the end of this message
                        if "No valid contours found in" in message:
                            failed_files.append(message.split("No valid contours found in ")[-1])

                pbar.set_postfix({"Success": processed_count, "Errors": error_count})
                pbar.update(1)

    wrote_empty_labels = bool(create_empty_labels and failed_files)

    if wrote_empty_labels:
        print(f"\nCreating empty label files for {len(failed_files)} images without valid contours...")
        empty_labels_created = 0

        for stem in failed_files:
            try:
                # Opening for write and closing immediately leaves a zero-byte file
                open(os.path.join(YOLO_ANNOTATIONS_OUTPUT_PATH, f"{stem}.txt"), 'w').close()
                empty_labels_created += 1
            except Exception as e:
                print(f"Failed to create empty label for {stem}: {e}")

        print(f"Created {empty_labels_created} empty label files")
        processed_count += empty_labels_created  # empty labels count as processed

    # Summary
    reported_errors = error_count - len(failed_files) if create_empty_labels else error_count
    success_rate = processed_count / total_files * 100
    separator = "=" * 50

    print("\n" + separator)
    print("CONVERSION COMPLETED!")
    print(separator)
    print(f"Successfully processed: {processed_count} files")
    print(f"Errors: {reported_errors} files")
    if wrote_empty_labels:
        print(f"Empty labels created: {len(failed_files)} files (images without annotations)")
    print(f"Success rate: {success_rate:.1f}%")
    print(f"YOLO labels saved to: {YOLO_ANNOTATIONS_OUTPUT_PATH}")
    if create_test_masks:
        print(f"Test masks saved to: {TEST_MASK_OUTPUT_PATH}")

    # "No contours" messages are not listed when empty labels were requested for them
    remaining_errors = [
        msg for msg in errors
        if not create_empty_labels or "No valid contours found in" not in msg
    ]
    if remaining_errors:
        print(f"\nRemaining errors ({len(remaining_errors)}):")
        for msg in remaining_errors[:5]:
            print(f"  - {msg}")
        if len(remaining_errors) > 5:
            print(f"  ... and {len(remaining_errors) - 5} more errors")

    if wrote_empty_labels:
        print(f"\nNote: {len(failed_files)} images had no valid annotations and got empty label files.")
        print("This is normal for datasets where some images contain no objects of interest.")

In [None]:
def analyze_failed_masks(ANNOTATIONS_BINARY_PNG_PATH, failed_filenames, sample_size=5):
    """
    Print a diagnostic report for masks that could not be converted.

    For up to sample_size masks, the file is located (trying the known
    image extensions in order), and its size, pixel values, foreground
    share and external contours are printed together with a likely cause.

    Parameters:
        ANNOTATIONS_BINARY_PNG_PATH: Path to mask directory
        failed_filenames: List of mask file stems that failed
        sample_size: Number of files to analyze in detail
    """
    if not failed_filenames:
        print("No failed masks to analyze")
        return

    print(f"\nAnalyzing {min(sample_size, len(failed_filenames))} failed masks...")

    known_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')

    for filename in failed_filenames[:sample_size]:
        # First existing candidate wins; '.png' is tried first
        candidates = (
            os.path.join(ANNOTATIONS_BINARY_PNG_PATH, f"{filename}{ext}")
            for ext in known_extensions
        )
        mask_path = next((path for path in candidates if os.path.exists(path)), None)

        if mask_path is None:
            print(f"  {filename}: File not found")
            continue

        try:
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
            if mask is None:
                print(f"  {filename}: Could not read image")
                continue

            height, width = mask.shape
            unique_values = np.unique(mask)
            foreground_pixels = np.sum(mask > 0)
            foreground_percentage = (foreground_pixels / (height * width)) * 100

            print(f"  {filename}:")
            print(f"    - Size: {width}x{height}")
            print(f"    - Unique values: {unique_values}")
            print(f"    - Foreground pixels: {foreground_pixels} ({foreground_percentage:.2f}%)")

            if foreground_pixels == 0:
                print("    - Issue: Completely empty mask (no annotations)")
                continue
            if foreground_pixels < 10:
                print("    - Issue: Very few foreground pixels (likely noise)")
                continue

            # Enough foreground: inspect the outer contours instead
            _, binary = cv2.threshold(mask, 0.5, 255, cv2.THRESH_BINARY)
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            if not contours:
                print("    - Issue: No contours detected")
                continue

            contour_sizes = [len(c) for c in contours]
            print(f"    - Contours found: {len(contours)}")
            print(f"    - Contour sizes: {contour_sizes}")
            print("    - Issue: Contours too small (< 3 points) or invalid shape")

        except Exception as e:
            print(f"  {filename}: Error analyzing - {e}")

In [None]:
# Run the mask -> YOLO label conversion only when the feature flag is set.
if CONVERT_BINARY_MASKS_TO_YOLO_FORMAT:
    print("Converting binary masks to YOLO format...")
    batch_convert_masks_to_yolo(
        ANNOTATIONS_BINARY_PNG_PATH=ANNOTATIONS_BINARY_PNG_PATH,
        YOLO_ANNOTATIONS_OUTPUT_PATH=YOLO_ANNOTATIONS_OUTPUT_PATH,
        TEST_MASK_OUTPUT_PATH=TEST_MASK_OUTPUT_PATH,
        create_test_masks=True,  # also write masks redrawn from the polygons, for verification
        num_workers=16,  # fixed worker count instead of the CPU-count-based default
        create_empty_labels=True,  # masks without valid contours get an empty label file
    )

In [None]:
# Uncomment the lines below if you want to investigate the failed conversions
# failed_files = [
#     "24991113_tile_1_3_14c592", "25001124_tile_18_16_c9d875", 
#     "24971118_tile_15_17_5212bb", "25001121_tile_16_12_6e2b70", 
#     "24921119_tile_4_17_b09eb4"
# ]
# analyze_failed_masks(ANNOTATIONS_BINARY_PNG_PATH, failed_files)

## Dataset Splitting and Augmentation

In [None]:
class FastYOLOAugmentationPipeline:
    """
    Data augmentation pipeline for YOLO segmentation datasets.

    Each image is augmented together with a binary mask rasterized from its
    YOLO polygon label; the augmented mask is then converted back to YOLO
    polygon format, so image and label always stay in sync.
    """

    def __init__(self, augmentation_pipeline=None, num_augmentations=10):
        """
        Initialize augmentation pipeline.

        Parameters:
            augmentation_pipeline: Albumentations pipeline called as
                pipeline(image=..., mask=...); a light default (flips,
                90-degree rotations, brightness/contrast) is built if None
            num_augmentations: Number of augmented versions per image
        """
        if augmentation_pipeline is None:
            # Imported here so that the class also works with a caller-supplied
            # pipeline when albumentations has not been imported by the caller.
            import albumentations as A

            # Light default - heavy operations such as blur are left out for speed
            augmentation_pipeline = A.Compose([
                A.HorizontalFlip(p=0.5),
                A.VerticalFlip(p=0.3),
                A.RandomRotate90(p=0.5),
                A.RandomBrightnessContrast(brightness_limit=0.15, contrast_limit=0.15, p=0.4),
            ], additional_targets={'mask': 'mask'})

        self.aug_pipeline = augmentation_pipeline
        self.num_augmentations = num_augmentations

    def yolo_label_to_mask_fast(self, label_path, img_width, img_height):
        """
        Rasterize a YOLO segmentation label file into a binary mask.

        Every label line with at least three points is filled into the mask
        with value 255. A missing, empty or unreadable label file yields an
        all-zero mask. Malformed lines are logged and skipped; the remaining
        lines are still drawn.

        Parameters:
            label_path: Path to YOLO label file
            img_width: Image width
            img_height: Image height

        Returns:
            np.ndarray: uint8 mask of shape (img_height, img_width)
        """
        mask = np.zeros((img_height, img_width), dtype=np.uint8)

        label_path = Path(label_path)
        if not label_path.exists() or label_path.stat().st_size == 0:
            return mask

        try:
            content = label_path.read_text().strip()
        except (OSError, UnicodeDecodeError) as e:
            logger.warning(f"Could not read label file {label_path}: {e}")
            return mask

        scale = np.array([img_width, img_height], dtype=np.float64)
        upper = np.array([img_width - 1, img_height - 1])

        for line_no, line in enumerate(content.split('\n'), start=1):
            parts = line.split()
            # class id + at least 3 (x, y) pairs; this also skips blank lines
            if len(parts) < 7:
                continue

            try:
                coords = np.array([float(x) for x in parts[1:]]).reshape(-1, 2)

                # Round to the nearest pixel before the integer cast. A plain
                # cast truncates, so a coordinate stored as e.g. 0.003906
                # (5 / 1280 rounded to 6 decimals) would come back as pixel 4
                # instead of 5.
                pixel_coords = np.clip(np.rint(coords * scale), 0, upper).astype(np.int32)

                if len(pixel_coords) >= 3:
                    cv2.fillPoly(mask, [pixel_coords], 255)
            except (ValueError, cv2.error) as e:
                # Best effort: one bad line must not discard the other polygons
                logger.warning(f"Skipping malformed line {line_no} in {label_path}: {e}")

        return mask

    def mask_to_yolo_label_fast(self, mask, img_width, img_height, class_id=0):
        """
        Convert a binary mask to YOLO segmentation label text.

        Parameters:
            mask: Binary mask array (non-zero = foreground)
            img_width: Width used to normalize x coordinates
            img_height: Height used to normalize y coordinates
            class_id: Object class ID written at the start of each line

        Returns:
            str: One "class_id x1 y1 x2 y2 ..." line per kept contour,
                 joined by newlines ("" if nothing is kept)
        """
        if mask.max() == 0:  # Empty mask
            return ""

        # NOTE(review): RETR_EXTERNAL returns outer contours only, so holes
        # inside a region are filled in by this round trip.
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        if not contours:
            return ""

        scale = np.array([img_width, img_height])
        label_lines = []

        for contour in contours:
            # Mild polygon simplification: tolerance is 0.2% of the perimeter
            epsilon = 0.002 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)

            # Drop degenerate polygons and tiny specks
            if len(approx) < 3 or cv2.contourArea(approx) < 5:
                continue

            normalized = np.clip(approx.reshape(-1, 2) / scale, 0, 1)
            coords_str = ' '.join(f"{coord:.6f}" for coord in normalized.flatten())
            label_lines.append(f"{class_id} {coords_str}")

        return '\n'.join(label_lines)

    def augment_single_image_batch(self, image_path, label_path, output_images_dir, output_labels_dir):
        """
        Generate all augmentations for a single image.

        Writes "<stem>_aug<k>.png" and "<stem>_aug<k>.txt" for
        k = 1..num_augmentations into the output directories.

        Parameters:
            image_path: Path to source image
            label_path: Path to source label
            output_images_dir: Output directory for augmented images
            output_labels_dir: Output directory for augmented labels

        Returns:
            tuple: (successful_count, failed_count, message)
        """
        try:
            output_images_dir = Path(output_images_dir)
            output_labels_dir = Path(output_labels_dir)

            # Load the image and rasterize its label once; reuse for every augmentation
            image = cv2.imread(str(image_path))
            if image is None:
                return 0, self.num_augmentations, f"Could not load image: {image_path}"

            img_height, img_width = image.shape[:2]
            stem = Path(image_path).stem
            mask = self.yolo_label_to_mask_fast(label_path, img_width, img_height)

            successful = 0
            failed = 0

            for aug_idx in range(1, self.num_augmentations + 1):
                try:
                    augmented = self.aug_pipeline(image=image, mask=mask)
                    aug_image = augmented['image']
                    aug_mask = augmented['mask']

                    aug_image_path = output_images_dir / f"{stem}_aug{aug_idx}.png"
                    aug_label_path = output_labels_dir / f"{stem}_aug{aug_idx}.txt"

                    # Low PNG compression level for faster saving. cv2.imwrite
                    # signals failure through its return value, not an exception.
                    if not cv2.imwrite(str(aug_image_path), aug_image, [cv2.IMWRITE_PNG_COMPRESSION, 1]):
                        raise OSError(f"Could not write image: {aug_image_path}")

                    # Normalize with the augmented mask's own size: a 90-degree
                    # rotation of a non-square image swaps width and height.
                    aug_height, aug_width = aug_mask.shape[:2]
                    yolo_label = self.mask_to_yolo_label_fast(aug_mask, aug_width, aug_height)
                    with open(aug_label_path, 'w') as f:
                        f.write(yolo_label)

                    successful += 1

                except Exception as e:
                    failed += 1
                    if failed <= 2:  # Limit error messages
                        logger.warning(f"Aug {aug_idx} failed for {stem}: {e}")

            return successful, failed, f"Processed {stem}"

        except Exception as e:
            return 0, self.num_augmentations, f"Error processing {Path(image_path).name}: {e}"

    def augment_dataset_folder_fast(self, images_dir, labels_dir, output_images_dir, output_labels_dir,
                                   num_workers=None, use_threads=True):
        """
        Apply augmentation to an entire dataset folder.

        Parameters:
            images_dir: Source images directory
            labels_dir: Source labels directory (label = "<image stem>.txt")
            output_images_dir: Output images directory (created if missing)
            output_labels_dir: Output labels directory (created if missing)
            num_workers: Number of worker threads
                         (None = min(CPU count, image count, 8))
            use_threads: Use a ThreadPoolExecutor if True (and there is more
                         than one image); otherwise images are processed
                         sequentially in the calling thread

        Returns:
            tuple: (total_successful, total_failed)
        """
        images_dir = Path(images_dir)
        labels_dir = Path(labels_dir)
        output_images_dir = Path(output_images_dir)
        output_labels_dir = Path(output_labels_dir)

        # Find all images
        image_files = []
        for ext in ['*.png', '*.jpg', '*.jpeg', '*.tif']:
            image_files.extend(images_dir.glob(ext))

        if not image_files:
            logger.warning(f"No images found in {images_dir}")
            return 0, 0

        # Without these, every write below would fail
        output_images_dir.mkdir(parents=True, exist_ok=True)
        output_labels_dir.mkdir(parents=True, exist_ok=True)

        logger.info(f"Found {len(image_files)} images to augment")
        logger.info(f"Creating {self.num_augmentations} versions each = {len(image_files) * self.num_augmentations} total")

        if num_workers is None:
            num_workers = min(multiprocessing.cpu_count(), len(image_files), 8)

        total_successful = 0
        total_failed = 0

        start_time = time.time()

        if use_threads and len(image_files) > 1:
            logger.info(f"Using {num_workers} threads for parallel augmentation")

            with ThreadPoolExecutor(max_workers=num_workers) as executor:
                future_to_image = {
                    executor.submit(
                        self.augment_single_image_batch,
                        image_path,
                        labels_dir / f"{image_path.stem}.txt",
                        output_images_dir,
                        output_labels_dir,
                    ): image_path
                    for image_path in image_files
                }

                with tqdm(total=len(image_files), desc="Augmenting images (threaded)") as pbar:
                    for future in as_completed(future_to_image):
                        try:
                            successful, failed, _ = future.result()
                            total_successful += successful
                            total_failed += failed
                            elapsed = time.time() - start_time
                            rate = total_successful / elapsed if elapsed > 0 else 0.0
                            pbar.set_postfix({
                                "Success": total_successful,
                                "Failed": total_failed,
                                "Rate": f"{rate:.1f}/s"
                            })
                        except Exception as e:
                            total_failed += self.num_augmentations
                            logger.error(f"Task failed: {e}")
                        pbar.update(1)

        else:
            logger.info("Using sequential processing")

            for image_path in tqdm(image_files, desc="Augmenting images (sequential)"):
                label_path = labels_dir / f"{image_path.stem}.txt"
                successful, failed, _ = self.augment_single_image_batch(
                    image_path, label_path, output_images_dir, output_labels_dir
                )
                total_successful += successful
                total_failed += failed

        duration = time.time() - start_time
        rate = total_successful / duration if duration > 0 else 0

        logger.info(f"Augmentation completed in {duration:.1f}s")
        logger.info(f"Rate: {rate:.1f} augmentations/second")
        logger.info(f"Results: {total_successful} successful, {total_failed} failed")

        return total_successful, total_failed

In [None]:
def read_image_list(txt_path):
    """
    Load the non-blank lines of a text file as a list of image filenames.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped. File order is preserved.

    Parameters:
        txt_path: Path to a text file holding one image filename per line

    Returns:
        list: Stripped, non-empty lines in the order they appear in the file
    """
    with open(txt_path, 'r') as handle:
        stripped_lines = map(str.strip, handle)
        # filter(None, ...) keeps only truthy (non-empty) strings
        return list(filter(None, stripped_lines))

def get_image_name_without_ext(filename):
    """
    Return the base name of an image file with its last extension removed.

    Any directory components are discarded and only the final suffix is
    stripped (``"dir/tile.v2.png"`` gives ``"tile.v2"``).

    Parameters:
        filename: Image filename or path

    Returns:
        str: Final path component without its last suffix
    """
    image_path = Path(filename)
    return image_path.stem

def create_directory_structure(base_path):
    """
    Build the train/val/test folder skeleton of a YOLO dataset.

    Under ``base_path`` each of ``train``, ``val`` and ``test`` receives an
    ``images`` and a ``labels`` subdirectory. Missing parents are created
    and directories that already exist are left untouched.

    Parameters:
        base_path: Root directory of the dataset
    """
    dataset_root = Path(base_path)
    for split_name in ('train', 'val', 'test'):
        split_root = dataset_root / split_name
        (split_root / 'images').mkdir(parents=True, exist_ok=True)
        (split_root / 'labels').mkdir(parents=True, exist_ok=True)

def convert_tiff_to_png(tiff_path, png_path, quality=95):
    """
    Re-encode a TIFF image as an RGB PNG file.

    The source is opened with PIL, converted to mode ``'RGB'`` when it is in
    any other mode, and written to ``png_path``. Any exception raised along
    the way is logged as a warning and reported through the return value
    instead of propagating.

    Parameters:
        tiff_path: Path to the source TIFF file
        png_path: Destination path for the PNG file
        quality: Accepted for interface compatibility; the body never reads it

    Returns:
        bool: True if the PNG was written, False if any step raised
    """
    try:
        with Image.open(tiff_path) as source_img:
            # The output is always 3-channel RGB, whatever mode the TIFF uses.
            rgb_img = source_img if source_img.mode == 'RGB' else source_img.convert('RGB')

            # NOTE(review): Pillow's PNG writer docs state that optimize=True
            # forces the maximum zlib level, so compress_level=1 probably has
            # no effect here — confirm which of the two was intended.
            rgb_img.save(png_path, 'PNG', optimize=True, compress_level=1)
    except Exception as e:
        logger.warning(f"Failed to convert {tiff_path} to {png_path}: {e}")
        return False
    else:
        return True

def copy_label_file(source_labels_dir, target_labels_dir, image_filename):
    """
    Copy the YOLO label that belongs to an image into another directory.

    The label name is the image's stem with a ``.txt`` extension. A missing
    source label is logged as a warning and reported as a failure.

    Parameters:
        source_labels_dir: Directory holding the existing label files
        target_labels_dir: Directory the label is copied into
        image_filename: Image filename whose label should be copied

    Returns:
        bool: True if the label was copied, False if the source is missing
    """
    # Labels share the image's stem; only the extension differs
    label_filename = f"{get_image_name_without_ext(image_filename)}.txt"
    src_label = Path(source_labels_dir) / label_filename
    dst_label = Path(target_labels_dir) / label_filename

    if not src_label.exists():
        logger.warning(f"Label file not found: {src_label}")
        return False

    shutil.copy2(src_label, dst_label)
    return True

def validate_setup():
    """
    Check that every input needed for dataset creation is in place.

    Verifies the original dataset path, the YOLO labels directory, the image
    directory, each fold list and the test list, logging the outcome of every
    check. The processed output directory is created when it does not exist.
    Finally, the first 10 entries of fold 0 are spot-checked for a matching
    ``.txt`` label and ``.tif`` image.

    Returns:
        bool: True if every check passed, False if any of them failed
    """
    logger.info("Validating setup for YOLO dataset creation...")

    setup_ok = True

    # Original dataset path (kept for reference, but still required)
    if Path(DATASET_PATH).exists():
        logger.success(f"Original dataset path exists: {DATASET_PATH}")
    else:
        logger.warning(f"Original dataset path does not exist: {DATASET_PATH}")
        setup_ok = False

    # YOLO labels directory (main requirement)
    labels_root = Path(YOLO_ANNOTATIONS_OUTPUT_PATH)
    if labels_root.exists():
        label_count = sum(1 for _ in labels_root.glob("*.txt"))
        logger.success(f"YOLO labels directory exists: {labels_root} ({label_count} label files)")
    else:
        logger.error(f"YOLO labels directory does not exist: {labels_root}")
        logger.error("Please run the binary mask to YOLO conversion first!")
        setup_ok = False

    # Source images (only .tif files are counted)
    images_root = Path(IMG_DATASET_PATH)
    if images_root.exists():
        image_count = sum(1 for _ in images_root.glob("*.tif"))
        logger.success(f"Image dataset path exists: {IMG_DATASET_PATH} ({image_count} image files)")
    else:
        logger.warning(f"Image dataset path does not exist: {IMG_DATASET_PATH}")
        setup_ok = False

    # Output directory: created on demand, never a failure
    output_root = Path(CREATE_DATASET_PROCESSED_PATH)
    if output_root.exists():
        logger.success(f"Processed dataset path exists: {CREATE_DATASET_PROCESSED_PATH}")
    else:
        logger.info(f"Creating processed dataset directory: {CREATE_DATASET_PROCESSED_PATH}")
        output_root.mkdir(parents=True, exist_ok=True)

    # Fold list files
    total_fold_images = 0
    for fold_num, txt_path in FOLD_PATHS.items():
        if Path(txt_path).exists():
            fold_images = read_image_list(txt_path)
            fold_size = len(fold_images)
            total_fold_images += fold_size
            logger.success(f"Fold {fold_num}: {fold_size} images")
        else:
            logger.warning(f"Fold {fold_num} txt file does not exist: {txt_path}")
            setup_ok = False

    # Test list file
    if Path(TEST_DATASET_TXT_PATH).exists():
        test_size = len(read_image_list(TEST_DATASET_TXT_PATH))
        logger.success(f"Test set: {test_size} images")
        logger.info(f"Total dataset size: {total_fold_images + test_size} images")
    else:
        logger.warning(f"Test dataset txt file does not exist: {TEST_DATASET_TXT_PATH}")
        setup_ok = False

    # Spot-check image/label correspondence on the first 10 entries of fold 0
    if labels_root.exists() and FOLD_PATHS.get(0) and Path(FOLD_PATHS[0]).exists():
        logger.info("Validating image-label correspondence...")

        sample_stems = [
            get_image_name_without_ext(entry)
            for entry in read_image_list(FOLD_PATHS[0])[:10]
        ]
        missing_labels = [
            f"{stem}.txt"
            for stem in sample_stems
            if not (labels_root / f"{stem}.txt").exists()
        ]
        missing_images = [
            f"{stem}.tif"
            for stem in sample_stems
            if not (images_root / f"{stem}.tif").exists()
        ]

        if missing_labels:
            logger.error(f"Missing label files: {missing_labels}")
            setup_ok = False

        if missing_images:
            logger.error(f"Missing image files: {missing_images}")
            setup_ok = False

        if not (missing_labels or missing_images):
            logger.success("Image-label correspondence validation passed!")

    # Summary
    if setup_ok:
        logger.success("All validations passed! Ready to create cross-validation datasets.")
        logger.info(f"Output will be saved to: {CREATE_DATASET_PROCESSED_PATH}")
    else:
        logger.error("Setup validation failed. Please fix the issues above before proceeding.")

    return setup_ok

def process_image_list(image_list, split_name, target_base_dir):
    """
    Populate one dataset split with PNG images and their YOLO labels.

    For every entry the matching ``.tif`` under ``IMG_DATASET_PATH`` is
    converted to PNG in ``<target_base_dir>/<split_name>/images`` and the
    label from ``YOLO_ANNOTATIONS_OUTPUT_PATH`` is copied to
    ``<target_base_dir>/<split_name>/labels``. An entry counts as successful
    only when both steps succeed.

    Parameters:
        image_list: List of image filenames
        split_name: Split name (train/val/test)
        target_base_dir: Root directory of the dataset being built

    Returns:
        tuple: (successful_copies, failed_copies)
    """
    labels_src_dir = Path(YOLO_ANNOTATIONS_OUTPUT_PATH)
    split_root = Path(target_base_dir) / split_name
    images_dst_dir = split_root / "images"
    labels_dst_dir = split_root / "labels"

    successful_copies = failed_copies = 0

    for image_filename in tqdm(image_list, desc=f"Processing {split_name}", leave=False):
        stem = get_image_name_without_ext(image_filename)

        # Sources are looked up with a .tif extension only
        tif_path = Path(IMG_DATASET_PATH) / f"{stem}.tif"
        png_path = images_dst_dir / f"{stem}.png"

        if not tif_path.exists():
            logger.warning(f"Source image not found: {stem}.tif")
            failed_copies += 1
            continue

        # The label is only copied once the image conversion succeeded.
        # A PNG that was written stays in place even if its label is missing.
        if convert_tiff_to_png(tif_path, png_path) and copy_label_file(
            labels_src_dir, labels_dst_dir, image_filename
        ):
            successful_copies += 1
        else:
            failed_copies += 1

    logger.info(f"{split_name}: {successful_copies} successful, {failed_copies} failed")
    return successful_copies, failed_copies

def create_single_fold_dataset(val_fold, fold_data, test_data):
    """
    Build the train/val/test dataset for one cross-validation fold.

    The fold ``val_fold`` becomes the validation split, all remaining folds
    are concatenated into the training split, and ``test_data`` is used as
    the test split. Output goes to
    ``CREATE_DATASET_PROCESSED_PATH/fold_<val_fold>_dataset``.

    Parameters:
        val_fold: Validation fold number
        fold_data: Dictionary mapping fold number to its list of images
        test_data: List of test images

    Returns:
        tuple: (total_success, total_fail) summed over the three splits
    """
    dataset_name = f"fold_{val_fold}_dataset"
    dataset_path = Path(CREATE_DATASET_PROCESSED_PATH) / dataset_name

    logger.info(f"Creating {dataset_name}")

    create_directory_structure(dataset_path)

    # Held-out fold validates; every other fold trains
    val_data = fold_data[val_fold]
    train_data = [
        image
        for fold_num, images in fold_data.items()
        if fold_num != val_fold
        for image in images
    ]

    logger.info(f"Training: {len(train_data)} images")
    logger.info(f"Validation: {len(val_data)} images")
    logger.info(f"Test: {len(test_data)} images")

    started_at = time.time()

    total_success = 0
    total_fail = 0
    for split_name, split_images in (
        ("train", train_data),
        ("val", val_data),
        ("test", test_data),
    ):
        split_success, split_fail = process_image_list(split_images, split_name, dataset_path)
        total_success += split_success
        total_fail += split_fail

    duration = time.time() - started_at

    logger.success(f"{dataset_name} completed in {duration:.1f}s")
    logger.success(f"Total: {total_success} successful, {total_fail} failed")

    return total_success, total_fail

def create_single_fold_dataset_with_augmentation_fast(val_fold, fold_data, test_data):
    """
    Build one fold's dataset, then augment its training split in place.

    The base dataset is produced by ``create_single_fold_dataset``. When the
    module-level ``APPLY_AUGMENTATION`` flag is truthy, the training images
    and labels are augmented with ``FastYOLOAugmentationPipeline``, writing
    the results back into the same train directories.

    Parameters:
        val_fold: Validation fold number
        fold_data: Dictionary mapping fold number to its list of images
        test_data: List of test images

    Returns:
        tuple: (total_success, total_fail) including augmentation counts
    """
    total_success, total_fail = create_single_fold_dataset(val_fold, fold_data, test_data)

    # This reads the module-level flag, not an argument of this function
    if not APPLY_AUGMENTATION:
        return total_success, total_fail

    logger.info(f"Applying FAST augmentation to fold {val_fold} training set...")

    train_root = Path(CREATE_DATASET_PROCESSED_PATH) / f"fold_{val_fold}_dataset" / "train"
    train_images_dir = train_root / "images"
    train_labels_dir = train_root / "labels"

    if not (train_images_dir.exists() and train_labels_dir.exists()):
        logger.warning(f"Training directories not found for fold {val_fold}")
        return total_success, total_fail

    # Snapshot the image count before augmented files are added
    original_count = sum(1 for _ in train_images_dir.glob("*.png"))
    logger.info(f"  Original training images: {original_count}")

    augmenter = FastYOLOAugmentationPipeline(
        augmentation_pipeline=AUGMENTATION_PIPELINE,
        num_augmentations=NUM_AUGMENTATIONS_PER_IMAGE
    )

    started_at = time.time()
    aug_success, aug_fail = augmenter.augment_dataset_folder_fast(
        images_dir=train_images_dir,
        labels_dir=train_labels_dir,
        output_images_dir=train_images_dir,  # In-place
        output_labels_dir=train_labels_dir,  # In-place
        num_workers=AUGMENTATION_WORKERS,
        use_threads=True  # Set to False for CPU-intensive augmentations
    )
    duration = time.time() - started_at

    final_count = sum(1 for _ in train_images_dir.glob("*.png"))

    logger.success(f"Fold {val_fold} augmentation completed in {duration:.1f}s:")
    logger.success(f"  Created: {aug_success} augmented images")
    logger.success(f"  Failed: {aug_fail} augmentations")
    logger.success(f"  Final training set: {final_count} images ({original_count} → {final_count})")

    return total_success + aug_success, total_fail + aug_fail

def create_cross_validation_datasets(APPLY_AUGMENTATION):
    """
    Create all cross-validation datasets with optional augmentation.

    One dataset is generated per fold listed in ``FOLD_PATHS``: that fold is
    used for validation, the remaining folds for training, and the shared
    test list for testing.

    NOTE: ``create_single_fold_dataset_with_augmentation_fast`` consults the
    module-level ``APPLY_AUGMENTATION`` flag rather than this parameter, so
    augmentation only runs when both agree. Keep them in sync.

    Parameters:
        APPLY_AUGMENTATION: Whether to apply augmentation to training sets

    Returns:
        tuple: (total_success, total_fail); ``(0, 0)`` when dataset creation
        is disabled through ``SPLIT_DATASET``
    """

    if not SPLIT_DATASET:
        logger.warning("Dataset creation is disabled. Set SPLIT_DATASET = True to enable.")
        # Keep the documented tuple shape so callers can always unpack
        return 0, 0

    logger.info("Starting cross-validation dataset creation...")
    if APPLY_AUGMENTATION:
        logger.info("Augmentation is ENABLED - will augment training sets")
        logger.info(f"Augmentations per image: {NUM_AUGMENTATIONS_PER_IMAGE}")
    else:
        logger.info("Augmentation is DISABLED")

    start_time = time.time()

    # Read all fold datasets
    fold_data = {}
    for fold_num, txt_path in FOLD_PATHS.items():
        fold_data[fold_num] = read_image_list(txt_path)
        logger.info(f"Fold {fold_num}: {len(fold_data[fold_num])} images")

    # Read test dataset
    test_data = read_image_list(TEST_DATASET_TXT_PATH)
    logger.info(f"Test set: {len(test_data)} images")

    total_success = 0
    total_fail = 0

    # One dataset per configured fold. Iterating the loaded folds (instead of
    # a fixed range(5)) keeps this working for any number of folds.
    for val_fold in fold_data:
        logger.info(f"\n{'='*60}")
        logger.info(f"PROCESSING FOLD {val_fold}")
        logger.info(f"{'='*60}")

        if APPLY_AUGMENTATION:
            logger.info(f"Creating dataset for fold {val_fold} WITH augmentation...")
            success, fail = create_single_fold_dataset_with_augmentation_fast(val_fold, fold_data, test_data)
        else:
            logger.info(f"Creating dataset for fold {val_fold} WITHOUT augmentation...")
            success, fail = create_single_fold_dataset(val_fold, fold_data, test_data)

        total_success += success
        total_fail += fail

        logger.info(f"Fold {val_fold} completed: {success} successful, {fail} failed")

    total_duration = time.time() - start_time

    logger.success("\n" + "="*60)
    logger.success("ALL CROSS-VALIDATION DATASETS COMPLETED!")
    logger.success("="*60)
    logger.success(f"Total time: {total_duration/60:.1f} minutes")
    logger.success(f"Overall: {total_success} successful, {total_fail} failed")

    if APPLY_AUGMENTATION:
        logger.success(f"Augmentation applied to all training sets with {NUM_AUGMENTATIONS_PER_IMAGE} versions per image")

    return total_success, total_fail

In [None]:
# Driver cell: build the cross-validation datasets only when splitting is
# enabled and the setup validation passes.
if SPLIT_DATASET:
    if not validate_setup():
        logger.error("Setup validation failed. Please fix the issues before proceeding.")
    else:
        create_cross_validation_datasets(APPLY_AUGMENTATION=APPLY_AUGMENTATION)