# Model Evaluation
 
**Objective:** Evaluate the trained segmentation models on the test dataset and analyze their performance.

**Workflow:**
1. Load trained models and configurations
2. Run inference on test dataset
3. Calculate comprehensive metrics (IoU, mAP, F1-score)
4. Generate visualizations and performance analysis
5. Create ensemble predictions from k-fold models

## Imports

In [None]:
import gc
import json
import os
import re
import sys
import time
import warnings
from datetime import datetime
import torch

import numpy as np
import pandas as pd

from torch.utils.data import DataLoader
from concurrent.futures import ThreadPoolExecutor, as_completed

from torch.utils.data import Dataset
from tqdm.auto import tqdm

from PIL import Image

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns

from loguru import logger
import segmentation_models_pytorch as smp

from pathlib import Path
from ultralytics import YOLO
import cv2
from collections import defaultdict

# Silence every Python warning for the rest of the session (applies to all
# libraries, not only this notebook's code).
warnings.filterwarnings('ignore')

# GPU speed-over-reproducibility switches (original note: Ampere).
# Allow TensorFloat-32 in CUDA matmuls and cuDNN convolutions.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Let cuDNN benchmark convolution algorithms and keep the fastest one;
# non-deterministic algorithms are explicitly permitted.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
# Enable the flash kernel for scaled-dot-product attention.
torch.backends.cuda.enable_flash_sdp(True)
# Allow reduced-precision accumulation in fp16 matmuls.
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True

## Configuration

In [None]:
# Timestamp of this run; used to version the timestamped output folders below
todays_date = datetime.now().strftime("%Y%m%d-%H%M%S")

# Root directory that is searched for trained model results
INPUT_MODELS_BASE_DIR_PATH = "data/training"

# Test dataset: images, masks and the text file listing the test filenames
BASE_DATASET_DIR = "datasets/supervisely/dataset_processed_20250523-173715"
DATASET_IMAGES_DIR = f"{BASE_DATASET_DIR}/images"
DATASET_MASKS_DIR = f"{BASE_DATASET_DIR}/masks"
TEST_DATASET_FILE = f"{BASE_DATASET_DIR}/test_dataset.txt"
IMG_SIZE = (1280, 1280)

# Output locations (the base path intentionally ends with a slash)
OUTPUT_BASE_DIR_PATH = "data/notebook_10/"
OUTPUT_PARQUET_PATH = f"{OUTPUT_BASE_DIR_PATH}parquet/"
GRAPHICS_PATH = f"{OUTPUT_BASE_DIR_PATH}graphics/{todays_date}/"

# Previously saved evaluation results, reloaded instead of re-evaluating
SAVED_MODEL_RESULTS_PARQUET_PATH = (
    f"{OUTPUT_PARQUET_PATH}model_evaluation_models_20250629_210320.parquet"
)
SAVED_PER_IMAGE_RESULTS_PARQUET_PATH = (
    f"{OUTPUT_PARQUET_PATH}model_evaluation_per_image_20250629_210320.parquet"
)

# Test split of the YOLO-formatted dataset
TEST_DATASET_PATH = "datasets/supervisely/yolo_processed_20250619_151249/fold_0_dataset/test"

# Per-run directories for the qualitative and "quartier" analyses
VIZ_OUTPUT_DIR = Path(f"{OUTPUT_BASE_DIR_PATH}analyse_qualitative/{todays_date}/")
QUARTIER_OUTPUT_DIR = Path(f"{OUTPUT_BASE_DIR_PATH}analyse_quartier/{todays_date}/")

# Feature flags selecting which (expensive) stages run
MAJ_DATASET = False       # Update dataset evaluation
YOLO_EVALUATE = False     # Evaluate YOLO models
EVALUER_ENSEMBLE = False  # Evaluate ensemble models

In [None]:
# Create output directories.
# exist_ok=True makes re-running this cell harmless when they already exist.
# GRAPHICS_PATH embeds todays_date, so it is a fresh folder for every run.
os.makedirs(OUTPUT_BASE_DIR_PATH, exist_ok=True)
os.makedirs(OUTPUT_PARQUET_PATH, exist_ok=True)
os.makedirs(GRAPHICS_PATH, exist_ok=True)

In [None]:
# Configure logging.
# Drop loguru's existing handlers first so messages are not emitted twice,
# then log INFO and above to stderr (logger.debug calls are filtered out).
logger.remove()
logger.add(sys.stderr, level="INFO", format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}")

## Load Model Data

In [None]:
if MAJ_DATASET:
    def get_best_model_paths(base_dir):
        """
        Find all model result JSON files in directory tree.

        Only files named exactly ``single_fold_complete_results.json`` are
        returned. Look-alikes such as ``...json.bak`` are ignored, because the
        model checkpoint path is later derived by stripping that exact filename
        from the returned path.

        Parameters:
            base_dir: Root directory to search

        Returns:
            List of paths to model result files (empty if base_dir does not
            exist or contains no result file)
        """
        results_filename = "single_fold_complete_results.json"

        model_paths = []
        for root, _dirs, files in os.walk(base_dir):
            # A directory holds at most one file with a given name
            if results_filename in files:
                model_paths.append(os.path.join(root, results_filename))
        return model_paths

    def test_json_parsing(json_file_path):
        """
        Smoke-test the JSON -> DataFrame pipeline on a single file.

        Loads the file, runs it through json_files_to_dataframe and
        completer_df, and prints the intermediate shapes plus a few sample
        values from the first row.

        Parameters:
            json_file_path: Path to JSON file to test

        Returns:
            The completed DataFrame, or None if any step raised
        """
        print("Testing JSON parsing...")

        preview_columns = (
            'config', 'validation_fold', 'architecture',
            'backbone', 'test_iou', 'model_path',
        )

        try:
            # Step 1: the file itself must be valid JSON
            with open(json_file_path, 'r') as handle:
                payload = json.load(handle)

            print("JSON file loads successfully")
            print(f"Structure: {list(payload.keys())}")

            # Step 2: flatten into a one-row DataFrame
            raw_frame = json_files_to_dataframe([json_file_path])
            print(f"DataFrame created with shape: {raw_frame.shape}")
            print(f"Columns: {raw_frame.columns.tolist()}")

            # Step 3: derive the extra columns
            completed_frame = completer_df(raw_frame)
            print(f"DataFrame completed with shape: {completed_frame.shape}")

            # Step 4: preview the first row
            print("\nSample data:")
            for col in preview_columns:
                if col not in completed_frame.columns:
                    continue
                print(f"  {col}: {completed_frame[col].iloc[0]}")

            return completed_frame

        except Exception as e:
            print(f"Error: {e}")
            return None
    
    def json_files_to_dataframe(model_paths):
        """
        Flatten a list of result JSON files into one DataFrame (one row per file).

        Each output column is read from a fixed key path inside the JSON
        document; a key path that is absent yields an empty string. Files that
        cannot be opened or parsed are reported with print() and skipped.

        Parameters:
            model_paths: List of JSON file paths

        Returns:
            DataFrame containing all model results
        """
        # Output column -> key path inside the JSON document, in column order
        column_sources = (
            ('config', ('config',)),
            ('validation_fold', ('fold',)),

            # Training configuration
            ('architecture', ('training_config', 'architecture')),
            ('backbone', ('training_config', 'backbone')),
            ('encoder_weights', ('training_config', 'encoder_weights')),
            ('img_size', ('training_config', 'img_size')),
            ('num_classes', ('training_config', 'num_classes')),
            ('batch_size', ('training_config', 'batch_size')),
            ('learning_rate', ('training_config', 'learning_rate')),
            ('epochs', ('training_config', 'epochs')),
            ('patience', ('training_config', 'patience')),
            ('accumulation_steps', ('training_config', 'accumulation_steps')),
            ('auto_batch_size', ('training_config', 'auto_batch_size')),
            ('min_batch_size_search', ('training_config', 'min_batch_size_search')),
            ('max_batch_size_search', ('training_config', 'max_batch_size_search')),
            ('batch_size_test_steps', ('training_config', 'batch_size_test_steps')),

            # Validation metrics
            ('best_val_iou', ('results', 'best_val_iou')),
            ('final_val_iou', ('results', 'final_val_metrics', 'iou')),
            ('final_val_f1_score', ('results', 'final_val_metrics', 'f1_score')),
            ('final_val_accuracy', ('results', 'final_val_metrics', 'accuracy')),
            ('final_val_recall', ('results', 'final_val_metrics', 'recall')),
            ('final_val_precision', ('results', 'final_val_metrics', 'precision')),

            # Test metrics
            ('test_iou', ('results', 'test_metrics', 'iou')),
            ('test_f1_score', ('results', 'test_metrics', 'f1_score')),
            ('test_accuracy', ('results', 'test_metrics', 'accuracy')),
            ('test_recall', ('results', 'test_metrics', 'recall')),
            ('test_precision', ('results', 'test_metrics', 'precision')),

            # Timing information
            ('total_seconds', ('results', 'timing', 'total_seconds')),
            ('total_time_formatted', ('results', 'timing', 'total_time_formatted')),
            ('total_epochs_trained', ('results', 'timing', 'total_epochs_trained')),
            ('average_seconds_per_epoch', ('results', 'timing', 'average_seconds_per_epoch')),
            ('start_timestamp', ('results', 'timing', 'start_timestamp')),
            ('end_timestamp', ('results', 'timing', 'end_timestamp')),

            # Paths
            ('plot_path', ('results', 'plot_path')),
            ('pred_path', ('results', 'pred_path')),
        )

        def lookup(document, key_path, default=''):
            """Walk nested dicts along key_path; return default on any miss."""
            node = document
            for part in key_path:
                if not (isinstance(node, dict) and part in node):
                    return default
                node = node[part]
            return node

        records = []
        for json_path in model_paths:
            try:
                with open(json_path, 'r') as handle:
                    document = json.load(handle)

                record = {'file_path': json_path}
                for column, key_path in column_sources:
                    record[column] = lookup(document, key_path)
                records.append(record)

            except Exception as e:
                # Best effort: report the unreadable file and keep going
                print(f"Error reading {json_path}: {e}")

        df = pd.DataFrame(records)
        print(f"Successfully loaded {len(df)} records from {len(model_paths)} JSON files")
        return df

    def completer_df(df):
        """
        Complete DataFrame by processing timing data and creating model paths.

        Works on a copy of ``df`` and adds:
          * ``model_path``: checkpoint path derived from ``file_path`` and
            ``validation_fold`` (``<run dir>/best_model_fold_<fold>.pth``)
          * ``model_start_date`` / ``model_end_date``: parsed timestamps
          * ``total_training_time_hour``: ``total_seconds`` / 3600
          * ``img_size_processed``: ``img_size`` with tuple-like strings parsed

        Known numeric columns are coerced to numbers (unparseable values,
        including the '' placeholder for missing data, become NaN) and
        ``auto_batch_size`` is coerced to bool. Each step is best effort: a
        failure is printed and the step falls back to a neutral value.

        Parameters:
            df: Raw DataFrame from JSON files

        Returns:
            DataFrame with additional processed columns
        """
        import ast  # local import: only needed to parse img_size literals

        results_filename = "single_fold_complete_results.json"
        df_temp = df.copy()

        # Create model path for evaluation
        try:
            fold_as_text = df_temp["validation_fold"].astype(str)
            # regex=False: the filename contains '.', which must match literally
            run_dir = df_temp["file_path"].str.replace(results_filename, "", regex=False)
            df_temp["model_path"] = run_dir + "best_model_fold_" + fold_as_text + ".pth"
        except Exception as e:
            print(f"Error creating model_path: {e}")
            # Fallback path generation
            df_temp["model_path"] = df_temp["file_path"].str.replace(
                results_filename, "best_model.pth", regex=False
            )

        # Convert timestamps to datetime
        try:
            df_temp["model_start_date"] = pd.to_datetime(df_temp["start_timestamp"], errors='coerce')
            df_temp["model_end_date"] = pd.to_datetime(df_temp["end_timestamp"], errors='coerce')
        except Exception as e:
            print(f"Error converting timestamps: {e}")
            df_temp["model_start_date"] = pd.NaT
            df_temp["model_end_date"] = pd.NaT

        # Convert training time to hours
        try:
            df_temp["total_training_time_hour"] = pd.to_numeric(df_temp["total_seconds"], errors='coerce') / 3600
        except Exception as e:
            print(f"Error converting training time: {e}")
            df_temp["total_training_time_hour"] = np.nan

        # Convert numeric fields
        numeric_fields = [
            'learning_rate', 'epochs', 'patience', 'accumulation_steps', 'batch_size',
            'num_classes', 'min_batch_size_search', 'max_batch_size_search', 'batch_size_test_steps',
            'best_val_iou', 'final_val_iou', 'final_val_f1_score', 'final_val_accuracy',
            'final_val_recall', 'final_val_precision',
            'test_iou', 'test_f1_score', 'test_accuracy', 'test_recall', 'test_precision',
            'total_seconds', 'total_epochs_trained', 'average_seconds_per_epoch', 'validation_fold'
        ]

        for field in numeric_fields:
            if field in df_temp.columns:
                try:
                    # errors='coerce' already maps '' (missing) to NaN
                    df_temp[field] = pd.to_numeric(df_temp[field], errors='coerce')
                except Exception as e:
                    print(f"Error converting {field} to numeric: {e}")

        # Convert boolean fields
        boolean_fields = ['auto_batch_size']
        for field in boolean_fields:
            if field in df_temp.columns:
                try:
                    df_temp[field] = df_temp[field].replace({'true': True, 'false': False, 'True': True, 'False': False})
                    df_temp[field] = df_temp[field].astype('bool', errors='ignore')
                except Exception as e:
                    print(f"Error converting {field} to boolean: {e}")

        def parse_img_size(value):
            """Turn a tuple-like string such as '(1280, 1280)' into a tuple."""
            if isinstance(value, str) and value.startswith('('):
                # SECURITY: this value comes from files on disk. literal_eval
                # only accepts Python literals, whereas the previous eval()
                # would have executed arbitrary expressions.
                return ast.literal_eval(value)
            return value

        # Process img_size field
        if 'img_size' in df_temp.columns:
            try:
                df_temp['img_size_processed'] = df_temp['img_size'].apply(parse_img_size)
            except Exception as e:
                print(f"Error processing img_size: {e}")
                df_temp['img_size_processed'] = df_temp['img_size']

        print(f"Data processing complete. DataFrame shape: {df_temp.shape}")
        return df_temp

In [None]:
# Driver cell: discover result JSON files, build the model table, run data
# quality checks and drop unusable models. Only runs when MAJ_DATASET is set.
if MAJ_DATASET:
    # Get all JSON files
    model_paths = get_best_model_paths(INPUT_MODELS_BASE_DIR_PATH)
    print(f"Found {len(model_paths)} JSON files")

    if len(model_paths) == 0:
        print("No JSON files found! Check your INPUT_MODELS_BASE_DIR_PATH")
        print(f"Looking in: {INPUT_MODELS_BASE_DIR_PATH}")
        # NOTE(review): exit() inside a notebook cell acts on the whole
        # kernel/session rather than just this cell - confirm this is intended.
        exit()

    # Test with first file
    print("Testing with first JSON file...")
    test_result = test_json_parsing(model_paths[0])
    if test_result is None:
        print("JSON parsing test failed! Check your JSON structure.")
        exit()

    print("JSON parsing test successful!")

    # Process all files
    print(f"\nProcessing all {len(model_paths)} JSON files...")
    df_test_json = json_files_to_dataframe(model_paths)
    df_test_models = completer_df(df_test_json)

    # Show summary
    print("\n" + "="*50)
    print("DATASET SUMMARY")
    print("="*50)
    print(f"Total models: {len(df_test_models)}")
    print(f"Unique configs: {df_test_models['config'].nunique()}")
    print(f"Validation folds: {sorted(df_test_models['validation_fold'].unique())}")
    print(f"Architectures: {df_test_models['architecture'].unique()}")
    print(f"Backbones: {df_test_models['backbone'].nunique()} unique")

    # Data quality checks
    print("\nDATA QUALITY CHECKS:")
    print("-" * 20)

    # Check for missing values.
    # Only NaN counts as missing here; text columns use '' for absent values
    # (see json_files_to_dataframe), which isna() does not flag.
    key_columns = ['config', 'architecture', 'backbone', 'test_iou', 'best_val_iou']
    for col in key_columns:
        if col in df_test_models.columns:
            missing = df_test_models[col].isna().sum()
            print(f"Missing {col}: {missing}/{len(df_test_models)} ({missing/len(df_test_models)*100:.1f}%)")

    # Check for incomplete configs (each config is expected to have exactly
    # 5 result files, one per fold; the 5 is hard-coded)
    config_counts = df_test_models['config'].value_counts()
    incomplete_configs = config_counts[config_counts != 5]
    if len(incomplete_configs) > 0:
        print(f"\nWarning: {len(incomplete_configs)} configs don't have 5 folds:")
        for config, count in incomplete_configs.head().items():
            print(f"  {config}: {count} folds")
    else:
        print("All configs have 5 folds")

    # Check for missing model files
    missing_models = df_test_models[~df_test_models['model_path'].apply(os.path.exists)]
    if len(missing_models) > 0:
        print(f"\nWarning: {len(missing_models)} model files not found:")
        for path in missing_models['model_path'].head(5):
            print(f"  {path}")

        print(f"\n{len(missing_models)} out of {len(df_test_models)} model files are missing.")
        # Interactive confirmation: this blocks until the user answers
        response = input("Continue with available models? (y/n): ").lower().strip()
        if response != 'y':
            print("Exiting...")
            exit()
        else:
            # Filter out missing models
            df_test_models = df_test_models[df_test_models['model_path'].apply(os.path.exists)]
            print(f"Continuing with {len(df_test_models)} models that have available files")
    else:
        print("All model files found")

    # Filter out low performance models.
    # NaN >= 0.1 is False, so models without a recorded test IoU are dropped
    # here as well, without any message.
    df_test_models = df_test_models[df_test_models['test_iou'] >= 0.1]

    # Show performance range
    if 'test_iou' in df_test_models.columns:
        test_iou_clean = df_test_models['test_iou'].dropna()
        if len(test_iou_clean) > 0:
            print("\nPERFORMANCE RANGE:")
            print(f"Test IoU: {test_iou_clean.min():.3f} - {test_iou_clean.max():.3f}")
            print(f"Mean Test IoU: {test_iou_clean.mean():.3f}")

    print("="*50)
    print("Ready to proceed with evaluation!")
    print("="*50)

## Evaluate SMP Models on Test Set

In [None]:
def load_test_data(test_file, images_dir, masks_dir):
    """
    Load test dataset paths from file.

    ``test_file`` lists one filename per line; the same filename is looked up
    in both ``images_dir`` and ``masks_dir``. A sample is kept only when both
    files exist. Blank lines are ignored.

    Parameters:
        test_file: Path to file containing test image filenames
        images_dir: Directory containing images
        masks_dir: Directory containing masks

    Returns:
        Dictionary with 'images' and 'masks' lists of paths (same length,
        same order as the listing file)
    """
    with open(test_file, "r") as f:
        # Skip blank lines: an empty name would join to the directory itself,
        # which exists, and would end up in the dataset as a bogus sample.
        filenames = [name for name in (line.strip() for line in f) if name]

    images = []
    masks = []

    for filename in filenames:
        image_path = os.path.join(images_dir, filename)
        mask_path = os.path.join(masks_dir, filename)

        # isfile (not exists): directories must never be treated as samples
        if os.path.isfile(image_path) and os.path.isfile(mask_path):
            images.append(image_path)
            masks.append(mask_path)

    logger.debug(f"Loaded {len(images)} test samples")
    return {"images": images, "masks": masks}


class SimpleSegmentationDataset(Dataset):
    """
    Image/mask dataset for binary segmentation with a small in-memory cache.

    Samples are resized to ``img_size``, masks are binarised, and each item is
    returned as ``(image, mask)`` float32 tensors: the image scaled to [0, 1]
    in channel-first layout, the mask holding 0.0 / 1.0.

    The cache stores the first ``cache_size`` distinct samples that are
    requested and never evicts (it is a fill-once cache, not an LRU).

    Parameters:
        image_paths: List of image file paths
        mask_paths: List of mask file paths
        img_size: Target image size, passed as-is to PIL's ``resize``
        transform: Optional callable invoked as ``transform(image=, mask=)``
            and returning a mapping with "image" and "mask"
        cache_size: Number of images to cache in memory
    """

    def __init__(
        self, image_paths, mask_paths, img_size, transform=None, cache_size=250
    ):
        self.image_paths = image_paths
        self.mask_paths = mask_paths
        self.img_size = img_size
        self.transform = transform
        self.cache_size = cache_size
        self._cache = {}  # idx -> (image array, mask array)

        # Both lists must describe the same samples
        assert len(image_paths) == len(mask_paths), "Mismatch between images and masks"

        # Keep only the pairs whose two files are present on disk
        kept_images = []
        kept_masks = []
        for img_path, mask_path in zip(image_paths, mask_paths):
            if not (os.path.exists(img_path) and os.path.exists(mask_path)):
                logger.warning(f"Missing files: {img_path} or {mask_path}")
                continue
            kept_images.append(img_path)
            kept_masks.append(mask_path)

        self.image_paths = kept_images
        self.mask_paths = kept_masks

        logger.debug(f"Dataset initialized with {len(kept_images)} valid pairs")

    def __len__(self):
        return len(self.image_paths)

    def _load_and_process(self, idx):
        """Read sample ``idx`` from disk; return (RGB array, uint8 0/1 mask)."""
        try:
            # Image: force RGB, bilinear resize
            rgb = Image.open(self.image_paths[idx]).convert("RGB")
            image = np.array(rgb.resize(self.img_size, Image.Resampling.BILINEAR))

            # Mask: single channel, nearest-neighbour resize so that label
            # values are never blended
            raw_mask = Image.open(self.mask_paths[idx])
            if raw_mask.mode != "L":
                raw_mask = raw_mask.convert("L")
            mask = np.array(raw_mask.resize(self.img_size, Image.Resampling.NEAREST))

            # Binarise: masks with values above 1 are treated as 0-255 and
            # thresholded at half range; otherwise any non-zero is foreground
            if mask.max() > 1:
                foreground = (mask.astype(np.float32) / 255.0) > 0.5
            else:
                foreground = mask > 0

            return image, foreground.astype(np.uint8)

        except Exception as e:
            logger.error(f"Error loading sample {idx}: {e}")
            raise

    def __getitem__(self, idx):
        cached = self._cache.get(idx)
        if cached is not None:
            image, mask = cached
        else:
            image, mask = self._load_and_process(idx)

            # Remember the sample while there is still room in the cache
            if len(self._cache) < self.cache_size:
                self._cache[idx] = (image.copy(), mask.copy())

        # Optional augmentations
        if self.transform:
            augmented = self.transform(image=image, mask=mask)
            image, mask = augmented["image"], augmented["mask"]

        # HWC uint8 -> CHW float in [0, 1]; mask -> float
        image_tensor = torch.from_numpy(image.astype(np.float32) / 255.0).permute(2, 0, 1)
        mask_tensor = torch.from_numpy(mask.astype(np.float32))

        return image_tensor, mask_tensor


def calculate_metrics(pred_logits, target):
    """
    Calculate binary segmentation metrics for one batch.

    Predictions are ``sigmoid(pred_logits) > 0.5`` and the target is
    binarised at 0.5. IoU, F1, accuracy, recall and precision are computed
    over all pixels of the batch pooled together. ``mean_iou`` is the mean of
    the per-image IoUs, and each ``map_XX`` is the fraction of images in the
    batch whose IoU is >= 0.XX (XX = 50, 55, ..., 95).

    Note that an image where both prediction and target are empty gets an IoU
    of 0 (0 / eps), not 1.

    Parameters:
        pred_logits: Raw model outputs [B, H, W] or [B, 1, H, W]
        target: Ground truth masks [B, H, W]

    Returns:
        Dictionary of Python floats with keys iou, f1_score, accuracy, recall,
        precision, map_50 ... map_95 and mean_iou. If anything raises, the
        error is printed and every metric is 0.0.
    """
    threshold_percents = range(50, 100, 5)

    try:
        # Handle different input shapes
        if pred_logits.dim() == 4:
            pred_logits = pred_logits.squeeze(1)

        batch_size = pred_logits.shape[0]
        if batch_size == 0:
            raise ValueError("cannot compute metrics on an empty batch")

        # Convert to binary predictions
        pred_binary = (torch.sigmoid(pred_logits) > 0.5).float()
        target_binary = (target > 0.5).float()

        # reshape (not view): element-wise ops keep the strides of their
        # input, so a permuted/sliced input would make view() raise and the
        # handler below would silently report all-zero metrics.
        pred_flat = pred_binary.reshape(-1)
        target_flat = target_binary.reshape(-1)

        # Confusion matrix over every pixel of the batch
        tp = (pred_flat * target_flat).sum()
        fp = (pred_flat * (1 - target_flat)).sum()
        fn = ((1 - pred_flat) * target_flat).sum()
        tn = ((1 - pred_flat) * (1 - target_flat)).sum()

        # Zero-division protection
        eps = 1e-7

        precision = tp / (tp + fp + eps)
        recall = tp / (tp + fn + eps)
        f1_score = 2 * precision * recall / (precision + recall + eps)
        iou = tp / (tp + fp + fn + eps)
        accuracy = (tp + tn) / (tp + fp + fn + tn + eps)

        # Per-image IoU, one row per image
        pred_rows = pred_binary.reshape(batch_size, -1)
        target_rows = target_binary.reshape(batch_size, -1)
        img_tp = (pred_rows * target_rows).sum(dim=1)
        img_fp = (pred_rows * (1 - target_rows)).sum(dim=1)
        img_fn = ((1 - pred_rows) * target_rows).sum(dim=1)
        batch_ious = img_tp / (img_tp + img_fp + img_fn + eps)

        metrics = {
            "iou": iou.item(),
            "f1_score": f1_score.item(),
            "accuracy": accuracy.item(),
            "recall": recall.item(),
            "precision": precision.item(),
        }
        # Share of images reaching each IoU threshold
        for percent in threshold_percents:
            hit_rate = (batch_ious >= percent / 100).float().mean()
            metrics[f"map_{percent}"] = hit_rate.item()
        metrics["mean_iou"] = batch_ious.mean().item()
        return metrics

    except Exception as e:
        print(f"Metrics calculation error: {e}")
        # Return zero metrics on error
        metric_names = (
            ["iou", "f1_score", "accuracy", "recall", "precision"]
            + [f"map_{percent}" for percent in threshold_percents]
            + ["mean_iou"]
        )
        return {key: 0.0 for key in metric_names}


def create_model(config):
    """
    Build the segmentation_models_pytorch model described by ``config``.

    Every model is created with 3 input channels, no output activation and an
    auxiliary classification head (dropout 0.2, 1 class, no activation).

    Parameters:
        config: Dictionary with architecture, backbone, encoder_weights, num_classes

    Returns:
        Tuple of (model, aux_params)

    Raises:
        ValueError: if the architecture name is not supported. Any error is
            logged together with the config before being re-raised.
    """
    # Lower-cased architecture name -> class name in smp. Kept as strings and
    # resolved with getattr so that only the requested class has to exist in
    # the installed smp version. Note "deeplabv3" builds DeepLabV3Plus.
    smp_class_names = {
        "unet": "Unet",
        "unetplusplus": "UnetPlusPlus",
        "manet": "MAnet",
        "linknet": "Linknet",
        "fpn": "FPN",
        "pan": "PAN",
        "pspnet": "PSPNet",
        "segformer": "Segformer",
        "deeplabv3": "DeepLabV3Plus",
        "dpt": "DPT",
        "upernet": "UPerNet",
    }

    try:
        architecture = config["architecture"].lower()

        aux_params = {
            "dropout": 0.2,
            "activation": None,
            "classes": 1
        }

        if architecture not in smp_class_names:
            raise ValueError(f"Architecture {config['architecture']} not supported")

        model_class = getattr(smp, smp_class_names[architecture])

        model_kwargs = {
            "encoder_name": config["backbone"],
            "encoder_weights": config["encoder_weights"],
            "in_channels": 3,
            "classes": config["num_classes"],
            "activation": None,
            "aux_params": aux_params,
        }
        if architecture == "dpt":
            # DPT is the only architecture built with this extra option
            model_kwargs["dynamic_img_size"] = True

        model = model_class(**model_kwargs)
        return model, aux_params

    except Exception as e:
        logger.error(f"Error creating model: {e}")
        logger.error(f"Config: {config}")
        raise


def corriger_state_dict(model, model_path, logger):
    """
    Load a checkpoint into ``model``, tolerating renamed parameter keys.

    The checkpoint may be a bare state dict or a dict holding it under
    'model_state_dict'. If the key sets match exactly the state dict is loaded
    as-is. Otherwise each model key is looked up under a series of alternative
    names (encoder/encoder.model, 'module.' prefix, encoder/backbone,
    decoder/segmentation_head); each checkpoint entry is used for at most one
    renamed key. The weights are loaded non-strictly when at least 80% of the
    model keys were matched, so up to 20% of the parameters may keep their
    current values.

    Parameters:
        model: PyTorch model instance
        model_path: Path to model checkpoint
        logger: Logger instance

    Returns:
        Boolean indicating success (never raises; failures are logged)
    """

    def alternative_names(key):
        """Checkpoint names to try for a model key, in priority order."""
        return (
            key,
            # FastViT fix
            key.replace('encoder.', 'encoder.model.'),
            key.replace('encoder.model.', 'encoder.'),
            # Module prefix fixes
            f"module.{key}",
            key.replace('module.', ''),
            # Backbone/encoder swaps
            key.replace('encoder.', 'backbone.'),
            key.replace('backbone.', 'encoder.'),
            # Decoder fixes
            key.replace('decoder.', 'segmentation_head.'),
            key.replace('segmentation_head.', 'decoder.'),
        )

    try:
        # Load checkpoint and unwrap the state dict if needed
        checkpoint = torch.load(model_path, map_location="cpu")
        state_dict = (
            checkpoint['model_state_dict']
            if 'model_state_dict' in checkpoint
            else checkpoint
        )

        model_keys = set(model.state_dict().keys())

        # Fast path: identical key sets
        if model_keys == set(state_dict.keys()):
            model.load_state_dict(state_dict)
            logger.debug("State dict loaded directly (perfect match)")
            return True

        logger.debug("Keys don't match exactly, trying to fix...")

        remapped = {}
        consumed = set()  # checkpoint keys already assigned to a model key

        for model_key in model_keys:
            if model_key in state_dict:
                remapped[model_key] = state_dict[model_key]
                consumed.add(model_key)
                continue

            for candidate in alternative_names(model_key):
                if candidate in state_dict and candidate not in consumed:
                    remapped[model_key] = state_dict[candidate]
                    consumed.add(candidate)
                    break
            else:
                logger.debug(f"Could not find match for: {model_key}")

        # Check if we have enough keys
        match_percentage = len(remapped) / len(model_keys) * 100
        logger.debug(f"Matched {len(remapped)}/{len(model_keys)} keys ({match_percentage:.1f}%)")

        if match_percentage < 80:
            logger.error(f"Too few keys matched ({match_percentage:.1f}%), skipping model")
            return False

        try:
            model.load_state_dict(remapped, strict=False)
        except Exception as e:
            logger.error(f"Failed to load fixed state dict: {e}")
            return False

        logger.debug("State dict loaded with fixes")
        return True

    except Exception as e:
        logger.error(f"Error loading state dict: {e}")
        return False

def clear_memory():
    """
    Run garbage collection and, when CUDA is available, release cached GPU memory.

    Garbage collection always runs (also on CPU-only hosts) and runs first, so
    objects kept alive only by reference cycles are freed before the CUDA cache
    is emptied.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Wait for pending kernels before handing cached blocks back
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

# Test single model evaluation
def test_single_model(df_models, test_paths, img_size, batch_size=4):
    """
    Evaluate only the first model of df_models on GPU 0 to debug issues.

    The dataset and DataLoader are created here and handed to
    evaluate_single_model_with_per_image, which expects them as arguments.

    Parameters:
        df_models: DataFrame with model information (the first row is used)
        test_paths: Dictionary with test image and mask paths ("images", "masks")
        img_size: Target image size
        batch_size: Batch size for inference

    Returns:
        Evaluation result dictionary
    """
    print("Testing single model evaluation...")

    # Take first model
    model_info = df_models.iloc[0].to_dict()

    print(f"Testing: {model_info['architecture']}_{model_info['backbone']}")
    print(f"Model path: {model_info['model_path']}")
    print(f"Path exists: {os.path.exists(model_info['model_path'])}")

    # Same dataset/loader settings as the multi-GPU worker uses
    test_dataset = SimpleSegmentationDataset(
        test_paths["images"],
        test_paths["masks"],
        img_size,
        None,
        cache_size=50,
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        drop_last=False,
    )

    result, per_image_data = evaluate_single_model_with_per_image(
        model_info,
        test_paths,
        img_size,
        batch_size,
        0,
        track_per_image=True,
        test_loader=test_loader,
        test_dataset=test_dataset,
    )

    print(f"Result: {result['evaluation_status']}")
    if result['evaluation_status'] == 'success':
        print(f"IoU: {result['eval_test_iou']:.3f}")
        print(f"Parameters: {result['num_params_M']:.1f}M")
        print(f"Images processed: {len(per_image_data)}")

    return result

def worker_thread_safe(gpu_id, models_subset, test_paths, img_size, batch_size, track_per_image):
    """
    Evaluate a subset of models one after another on a single GPU.

    Intended to be submitted to a thread pool, one call per GPU. One dataset
    and DataLoader are created up front and reused for every model of the
    subset. A model whose evaluation raises is recorded with NaN metrics and a
    "failed: ..." status, and the remaining models are still processed.

    Parameters:
        gpu_id: GPU device ID
        models_subset: List of model info dictionaries to process
        test_paths: Test dataset paths (dictionary with "images" and "masks")
        img_size: Target image size
        batch_size: Batch size for inference
        track_per_image: Whether to track per-image metrics

    Returns:
        Tuple of (model_results, per_image_results); both lists are empty when
        models_subset is empty or the DataLoader could not be created
    """
    # Nothing assigned to this GPU (more GPUs than models): return before
    # touching CUDA; this also keeps the final success-rate division safe.
    if not models_subset:
        return [], []

    # Initialize CUDA context
    with torch.cuda.device(gpu_id):
        torch.randn(1).to(f'cuda:{gpu_id}')

    thread_name = f"GPU-{gpu_id}"

    # Create individual progress bar for this thread
    pbar = tqdm(
        total=len(models_subset),
        desc=f"GPU {gpu_id}",
        position=gpu_id,
        leave=True,
        colour=['red', 'green', 'blue', 'yellow', 'magenta', 'cyan'][gpu_id % 6]
    )

    # Create dataset and loader for this thread
    try:
        test_dataset = SimpleSegmentationDataset(
            test_paths["images"],
            test_paths["masks"],
            img_size,
            None,
            cache_size=50,
        )

        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=True,
            drop_last=False,
            persistent_workers=False,
        )
    except Exception as loader_error:
        pbar.set_description(f"GPU {gpu_id}: DataLoader Failed")
        pbar.close()
        print(f"{thread_name}: Failed to create DataLoader: {loader_error}")
        return [], []

    # Metric columns set to NaN for a model that raised (same order as the
    # columns written for a successful evaluation)
    metric_names = (
        ["iou", "f1_score", "accuracy", "recall", "precision"]
        + [f"map_{threshold}" for threshold in range(50, 100, 5)]
        + ["mean_iou"]
    )

    model_results = []
    per_image_results = []
    successful = 0

    try:
        for i, model_info in enumerate(models_subset):
            model_start_time = time.time()
            model_name = f"{model_info['architecture']}_{model_info['backbone']}"

            # Update progress bar
            pbar.set_description(f"GPU {gpu_id}: {model_name[:20]}")

            try:
                # Suppress warnings during evaluation
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")

                    # Evaluate model
                    result, per_image_data = evaluate_single_model_with_per_image(
                        model_info, test_paths, img_size, batch_size, gpu_id,
                        track_per_image, test_loader, test_dataset
                    )

                model_results.append(result)
                per_image_results.extend(per_image_data)

                model_time = time.time() - model_start_time

                if result["evaluation_status"] == "success":
                    successful += 1

                    # Update progress bar with success info
                    pbar.set_postfix({
                        'Status': '✓',
                        'IoU': f"{result['eval_test_iou']:.3f}",
                        'Time': f"{model_time:.1f}s",
                        'Success': f"{successful}/{i+1}"
                    })
                else:
                    # Update progress bar with failure info
                    pbar.set_postfix({
                        'Status': '✗',
                        'Error': 'Failed',
                        'Time': f"{model_time:.1f}s",
                        'Success': f"{successful}/{i+1}"
                    })

            except Exception as model_error:
                # Update progress bar with exception info
                pbar.set_postfix({
                    'Status': '!',
                    'Error': str(model_error)[:10],
                    'Success': f"{successful}/{i+1}"
                })

                # Add failed result
                result = model_info.copy()
                result.update({f"eval_test_{name}": np.nan for name in metric_names})
                result.update({
                    "num_params_M": np.nan,
                    "evaluation_status": f"failed: {str(model_error)[:50]}",
                    "device_id": gpu_id,
                    "num_test_images": 0
                })
                model_results.append(result)

            # Update progress
            pbar.update(1)

    except Exception as thread_error:
        pbar.set_postfix({'Status': 'X', 'Error': 'Thread Failed'})
        print(f"{thread_name}: Thread failed with error: {thread_error}")

    finally:
        # Final progress bar update (models_subset is non-empty here)
        pbar.set_description(f"GPU {gpu_id}: Complete")
        pbar.set_postfix({
            'Status': 'Done',
            'Success': f"{successful}/{len(models_subset)}",
            'Rate': f"{successful/len(models_subset)*100:.0f}%"
        })
        pbar.close()

        try:
            # Clean up resources
            del test_loader, test_dataset
            torch.cuda.empty_cache()
            gc.collect()
        except Exception as e:
            print(f"{thread_name}: Error clearing memory: {e}")

    return model_results, per_image_results


def evaluate_single_model_with_per_image(model_info, test_paths, img_size, batch_size, device_id, track_per_image, test_loader=None, test_dataset=None):
    """
    Evaluate single model with optional per-image tracking.

    The model is built from model_info, its weights are loaded from
    model_info["model_path"], and the test set is run under autocast with
    gradients disabled. Any exception is caught and turned into a result with
    NaN metrics and an evaluation_status of "failed: <message>".

    Parameters:
        model_info: Dictionary with model configuration
        test_paths: Test dataset paths (dictionary with "images" and "masks");
            only read when no loader/dataset is supplied
        img_size: Target image size
        batch_size: Batch size for inference
        device_id: GPU device ID
        track_per_image: Whether to track per-image metrics
        test_loader: DataLoader instance; built from test_dataset when None
        test_dataset: Dataset instance; built from test_paths when both it and
            test_loader are None

    Returns:
        Tuple of (model_results, per_image_results); per_image_results is an
        empty list when track_per_image is False or the evaluation failed
    """
    model = None

    # Order defines the order of the eval_test_* columns in the result
    metric_names = (
        ["iou", "f1_score", "accuracy", "recall", "precision"]
        + [f"map_{threshold}" for threshold in range(50, 100, 5)]
        + ["mean_iou"]
    )

    try:
        # Set device
        device = torch.device(f'cuda:{device_id}')

        # Use context manager for CUDA device
        with torch.cuda.device(device_id):

            # Extract config from model_info
            config = {
                "architecture": model_info["architecture"],
                "backbone": model_info["backbone"],
                "encoder_weights": model_info.get("encoder_weights"),
                "num_classes": int(model_info.get("num_classes", 1)),
                "model_path": model_info["model_path"],
            }

            # Check if model file exists
            if not os.path.exists(config["model_path"]):
                raise FileNotFoundError(f"Model file not found: {config['model_path']}")

            # Create and load model
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                model, _ = create_model(config)

            # Load weights with a quiet stdlib logger; a separate name keeps
            # the module-level logger usable inside this function
            import logging
            silent_logger = logging.getLogger('silent')
            silent_logger.setLevel(logging.CRITICAL)

            if not corriger_state_dict(model, config["model_path"], silent_logger):
                raise RuntimeError("Failed to load model weights")

            model = model.to(device)

            # Disable auxiliary head for speed
            if hasattr(model, 'aux_params'):
                model.aux_params = None

            num_params_M = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1_000_000

            # Build the data pipeline when the caller did not provide one
            # (same settings as the multi-GPU worker)
            if test_loader is None:
                if test_dataset is None:
                    test_dataset = SimpleSegmentationDataset(
                        test_paths["images"],
                        test_paths["masks"],
                        img_size,
                        None,
                        cache_size=50,
                    )
                test_loader = DataLoader(
                    test_dataset,
                    batch_size=batch_size,
                    shuffle=False,
                    num_workers=0,
                    pin_memory=True,
                    drop_last=False,
                )
            elif test_dataset is None:
                test_dataset = test_loader.dataset

            # Evaluate model
            model.eval()
            all_metrics = {name: [] for name in metric_names}

            # Per-image tracking
            per_image_results = []
            image_counter = 0

            # Create new iterator for each model
            try:
                data_iter = iter(test_loader)
            except Exception as iter_error:
                raise RuntimeError(f"Failed to create data iterator: {iter_error}") from iter_error

            with torch.no_grad():
                batch_idx = 0
                while True:
                    try:
                        # Get next batch
                        try:
                            images, masks = next(data_iter)
                        except StopIteration:
                            break
                        except Exception as batch_error:
                            print(f"GPU {device_id}: Batch loading error: {batch_error}")
                            continue

                        images = images.to(device, non_blocking=True)
                        masks = masks.to(device, non_blocking=True)

                        with torch.cuda.amp.autocast():
                            outputs = model(images)

                            # Handle tuple outputs (from aux_params)
                            if isinstance(outputs, tuple):
                                outputs = outputs[0]

                            # Keep raw outputs for metrics
                            raw_outputs = outputs.squeeze(1) if len(outputs.shape) == 4 else outputs

                        if track_per_image:
                            # Calculate metrics for each image
                            for i in range(len(images)):
                                # Pass raw outputs to metrics calculation
                                single_raw_output = raw_outputs[i : i + 1]
                                single_mask = masks[i : i + 1]

                                # Calculate metrics
                                batch_metrics = calculate_metrics(single_raw_output, single_mask)

                                # Add to overall metrics
                                for key, value in batch_metrics.items():
                                    all_metrics[key].append(value)

                                # Per-image result
                                per_image_result = {
                                    "image_id": image_counter,
                                    "batch_idx": batch_idx,
                                    "image_in_batch": i,
                                    **batch_metrics,
                                }

                                # Add model info
                                per_image_result.update({
                                    "architecture": model_info["architecture"],
                                    "backbone": model_info["backbone"],
                                    "model_path": model_info["model_path"],
                                    "device_id": device_id,
                                    "num_params_M": num_params_M,
                                })

                                # Add additional metadata
                                for key in ["validation_fold", "config", "model_start_date", "model_end_date", "total_training_time_hour"]:
                                    if key in model_info:
                                        per_image_result[key] = model_info[key]

                                per_image_results.append(per_image_result)
                                image_counter += 1
                        else:
                            # Batch-level calculation
                            batch_metrics = calculate_metrics(raw_outputs, masks)
                            for key, value in batch_metrics.items():
                                all_metrics[key].append(value)

                        # Clean up batch tensors
                        del images, masks, outputs, raw_outputs

                        batch_idx += 1

                    except Exception as batch_error:
                        print(f"GPU {device_id}: Batch {batch_idx} failed: {batch_error}")
                        batch_idx += 1
                        continue

            # A run in which no batch produced metrics is a failure, not a
            # successful model with an IoU of 0.0
            if not any(all_metrics.values()):
                raise RuntimeError("No batch could be evaluated")

            # Calculate average metrics
            avg_metrics = {key: np.mean(values) if values else 0.0 for key, values in all_metrics.items()}

            # Return result dictionary
            result = model_info.copy()
            result.update({f"eval_test_{name}": avg_metrics[name] for name in metric_names})
            result.update({
                "num_params_M": num_params_M,
                "evaluation_status": "success",
                "device_id": device_id,
                "num_test_images": image_counter if track_per_image else len(test_dataset)
            })

            return result, per_image_results if track_per_image else []

    except Exception as e:
        print(f"GPU {device_id}: Model evaluation failed: {str(e)}")
        import traceback
        print(f"GPU {device_id}: Traceback: {traceback.format_exc()}")

        # Return failed result
        result = model_info.copy()
        result.update({f"eval_test_{name}": np.nan for name in metric_names})
        result.update({
            "num_params_M": np.nan,
            "evaluation_status": f"failed: {str(e)[:50]}",
            "device_id": device_id,
            "num_test_images": 0
        })

        return result, []

    finally:
        try:
            if model is not None:
                del model
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
                # device_id is always bound, even if the try block failed early
                torch.cuda.reset_peak_memory_stats(device_id)
        except Exception as cleanup_error:
            print(f"GPU {device_id}: Cleanup error: {cleanup_error}")

def evaluate_models_threading_with_per_image(df_models, test_paths, img_size, batch_size=4, num_gpus=4, track_per_image=True, timeout_per_model=300):
    """
    Multi-GPU model evaluation using threading.

    The models are split into contiguous, near-equal slices, one per GPU, and
    each non-empty slice is evaluated by worker_thread_safe in its own thread.

    Note: the timeout only stops waiting for results. Future.cancel() cannot
    stop a worker that is already running, and leaving the executor context
    still waits for running workers to finish.

    Parameters:
        df_models: DataFrame with model information
        test_paths: Test dataset paths
        img_size: Target image size
        batch_size: Batch size for inference
        num_gpus: Number of GPUs to use (must be >= 1)
        track_per_image: Whether to track per-image metrics
        timeout_per_model: Timeout per model in seconds

    Returns:
        Tuple of (model_results_df, per_image_results_df); two empty
        DataFrames when df_models is empty

    Raises:
        ValueError: If num_gpus is smaller than 1
    """
    # On Python < 3.11 this is a different class than the builtin TimeoutError
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    if num_gpus < 1:
        raise ValueError(f"num_gpus must be at least 1, got {num_gpus}")

    if len(df_models) == 0:
        print("No models to evaluate")
        return pd.DataFrame(), pd.DataFrame()

    print("Starting Multi-GPU Evaluation with Progress Tracking")
    print(f"Models: {len(df_models)} | GPUs: {num_gpus} | Batch Size: {batch_size}")
    print(f"Per-image tracking: {'Enabled' if track_per_image else 'Disabled'}")
    print(f"Timeout per model: {timeout_per_model}s")

    # Convert DataFrame to list of dictionaries
    all_models_list = []
    for idx, row in df_models.iterrows():
        model_dict = row.to_dict()
        model_dict['original_index'] = idx
        all_models_list.append(model_dict)

    # Split models across GPUs evenly
    models_per_gpu, remainder = divmod(len(all_models_list), num_gpus)

    gpu_assignments = []
    start_idx = 0

    for gpu_id in range(num_gpus):
        # Add extra model to first GPUs if remainder exists
        end_idx = start_idx + models_per_gpu + (1 if gpu_id < remainder else 0)

        gpu_models = all_models_list[start_idx:end_idx]
        gpu_assignments.append((gpu_id, gpu_models))
        print(f"GPU {gpu_id}: {len(gpu_models)} models")

        start_idx = end_idx

    # Create main progress bar
    main_pbar = tqdm(
        total=len(df_models),
        desc="Overall Progress",
        position=num_gpus,
        colour='white'
    )

    # Run threads
    start_time = time.time()
    all_model_results = []
    all_per_image_results = []

    # Calculate total timeout
    max_models_per_gpu = max(len(models) for _, models in gpu_assignments)
    total_timeout = max_models_per_gpu * timeout_per_model + 60  # 60s buffer

    print(f"Total timeout: {total_timeout}s ({total_timeout / 60:.1f} minutes)")
    print(f"\n{'='*60}")
    print("Starting GPU workers...")
    print(f"{'='*60}")

    with ThreadPoolExecutor(max_workers=num_gpus, thread_name_prefix="GPU-Worker") as executor:
        # Submit one task per GPU that actually has models assigned
        future_to_gpu = {}
        for gpu_id, models in gpu_assignments:
            if not models:
                continue
            future = executor.submit(
                worker_thread_safe,
                gpu_id,
                models,
                test_paths,
                img_size,
                batch_size,
                track_per_image
            )
            future_to_gpu[future] = (gpu_id, len(models))

        active_gpus = len(future_to_gpu)

        # Collect results
        completed_gpus = set()

        try:
            for future in as_completed(future_to_gpu, timeout=total_timeout):
                gpu_id, num_models = future_to_gpu[future]
                try:
                    model_results, per_image_results = future.result(timeout=30)
                    all_model_results.extend(model_results)
                    all_per_image_results.extend(per_image_results)
                    completed_gpus.add(gpu_id)

                    # Update main progress bar
                    success_so_far = sum(
                        1 for r in all_model_results if r.get('evaluation_status') == 'success'
                    )
                    main_pbar.update(num_models)
                    main_pbar.set_postfix({
                        'Completed_GPUs': f"{len(completed_gpus)}/{active_gpus}",
                        'Success_Rate': f"{success_so_far}/{len(all_model_results)}"
                    })

                    print(f"GPU {gpu_id} completed successfully ({len(model_results)} models)")

                except Exception as e:
                    print(f"GPU {gpu_id} failed: {e}")
                    main_pbar.update(num_models)

        except (FuturesTimeoutError, TimeoutError):
            print(f"Evaluation timed out after {total_timeout}s")

            # Cancel remaining futures (only effective for tasks not yet started)
            for future in future_to_gpu:
                if not future.done():
                    gpu_id, _ = future_to_gpu[future]
                    print(f"Cancelling GPU {gpu_id}")
                    future.cancel()

        finally:
            main_pbar.close()

    total_time = time.time() - start_time

    # Create DataFrames
    df_model_results = pd.DataFrame(all_model_results)
    df_per_image_results = pd.DataFrame(all_per_image_results) if track_per_image else pd.DataFrame()

    # Summary
    successful = sum(1 for r in all_model_results if r.get("evaluation_status") == "success")
    total_images = len(all_per_image_results) if track_per_image else 0

    print(f"\n{'='*60}")
    print("EVALUATION COMPLETE!")
    print(f"{'='*60}")
    print(f"Total Time: {total_time//60:.0f}m {total_time%60:.0f}s")
    print(f"Models Processed: {len(all_model_results)}")
    print(f"Successful Models: {successful}/{len(df_models)} ({100*successful/len(df_models):.1f}%)")
    if len(all_model_results) > 0:
        print(f"Model Processing Speed: {len(all_model_results)*3600/total_time:.1f} models/hour")
    print(f"Completed GPUs: {len(completed_gpus)}/{active_gpus}")

    if track_per_image and total_images > 0:
        print(f"Total Images Evaluated: {total_images:,}")
        print(f"Image Processing Speed: {total_images*3600/total_time:.0f} images/hour")

        # Show per-image statistics
        if successful > 0:
            successful_images = df_per_image_results[
                df_per_image_results['architecture'].isin(
                    df_model_results[df_model_results['evaluation_status'] == 'success']['architecture']
                )
            ]
            if len(successful_images) > 0:
                avg_image_iou = successful_images['iou'].mean()
                std_image_iou = successful_images['iou'].std()
                print(f"Average Image IoU: {avg_image_iou:.3f} ± {std_image_iou:.3f}")

    print(f"Model Results DataFrame shape: {df_model_results.shape}")
    if track_per_image:
        print(f"Per-Image Results DataFrame shape: {df_per_image_results.shape}")
    print(f"{'='*60}\n")

    return df_model_results, df_per_image_results

def save_results_with_per_image(df_model_results, df_per_image_results, output_path, filename_prefix="model_evaluation"):
    """
    Write the evaluation results to timestamped parquet files.

    The model-level table is always written; the per-image table is written
    only when it contains at least one row. The output directory is created
    when it does not exist yet.

    Parameters:
        df_model_results: DataFrame with model-level results
        df_per_image_results: DataFrame with per-image results
        output_path: Output directory path
        filename_prefix: Prefix for output filenames

    Returns:
        Tuple of (model_filepath, per_image_filepath); per_image_filepath is
        None when there were no per-image rows to save
    """
    from datetime import datetime

    # Both files of one call share the same timestamp
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Make sure the target directory is there
    os.makedirs(output_path, exist_ok=True)

    # Model-level table
    model_filepath = os.path.join(output_path, f"{filename_prefix}_models_{stamp}.parquet")
    df_model_results.to_parquet(model_filepath, index=False)
    print(f"Model results saved to: {model_filepath}")
    print(f"Saved {len(df_model_results)} model results")

    # Per-image table (optional)
    if len(df_per_image_results) == 0:
        print("No per-image results to save")
        return model_filepath, None

    per_image_filepath = os.path.join(output_path, f"{filename_prefix}_per_image_{stamp}.parquet")
    df_per_image_results.to_parquet(per_image_filepath, index=False)
    print(f"Per-image results saved to: {per_image_filepath}")
    print(f"Saved {len(df_per_image_results)} per-image results")

    return model_filepath, per_image_filepath

# Backward compatibility functions
def evaluate_models_threading(df_models, test_paths, img_size, batch_size=4, num_gpus=4):
    """
    Backward compatible entry point: run the multi-GPU evaluation with
    per-image tracking disabled and return only the model-level DataFrame.
    """
    both_tables = evaluate_models_threading_with_per_image(
        df_models,
        test_paths,
        img_size,
        batch_size,
        num_gpus,
        track_per_image=False,
    )
    # First element is the model-level table; the per-image table is dropped
    return both_tables[0]

def save_results_clean(df_results, output_path, filename_prefix="model_evaluation"):
    """
    Backward compatible save function: store only the model-level results
    and return the path of the written file.
    """
    # An empty per-image table makes the helper skip the second file
    no_per_image_rows = pd.DataFrame()
    written_paths = save_results_with_per_image(
        df_results, no_per_image_rows, output_path, filename_prefix
    )
    return written_paths[0]


In [None]:
if not MAJ_DATASET:
    # Reuse the results of an earlier run instead of evaluating again
    logger.info("Skipping dataset loading, using existing DataFrame")
    df_model_results = pd.read_parquet(SAVED_MODEL_RESULTS_PARQUET_PATH)
    df_per_image_results = pd.read_parquet(SAVED_PER_IMAGE_RESULTS_PARQUET_PATH)

else:
    test_paths = load_test_data(TEST_DATASET_FILE, DATASET_IMAGES_DIR, DATASET_MASKS_DIR)

    # Evaluate every model on the test set, keeping per-image metrics
    df_model_results, df_per_image_results = evaluate_models_threading_with_per_image(
        df_test_models,
        test_paths,
        IMG_SIZE,
        batch_size=2,
        num_gpus=4,
        track_per_image=True,
        timeout_per_model=25,
    )

    # Persist the model-level and the per-image table
    model_file, per_image_file = save_results_with_per_image(
        df_model_results, df_per_image_results, OUTPUT_PARQUET_PATH
    )

    print("Evaluation completed successfully!")

In [None]:
# Cell output: show the model-level results table
df_model_results

In [None]:
# Cell output: show the per-image results table
df_per_image_results

In [None]:
# Inspect how the config information appears in the results table
print("Original df_model_results columns related to config:")
config_cols = list(filter(lambda name: 'config' in name.lower(), df_model_results.columns))
print(config_cols)

# Exact-name membership checks for the two candidate column names
print("\n'config' in columns: {}".format('config' in df_model_results.columns))
print("'config_' in columns: {}".format('config_' in df_model_results.columns))

# First rows of the columns used by the cleaning steps
print("\nSample data:")
print(df_model_results.loc[:, ['config', 'validation_fold', 'architecture', 'eval_test_iou']].head())


### Data Cleaning

In [None]:
def filter_complete_trainings(df):
    """
    Filter to include only training runs with all 5 folds.

    A config is kept when the set of its validation_fold values is exactly
    {0, 1, 2, 3, 4}. Configs with more than 5 fold rows (several training
    sessions) are reported with a warning, and all their rows are kept if the
    fold set is complete. An empty input yields an empty result.

    Parameters:
        df: DataFrame with model results; must contain the columns
            'config' and 'validation_fold'

    Returns:
        DataFrame with only complete training runs (rows in their original
        order, index reset to 0..n-1)
    """
    expected_folds = {0, 1, 2, 3, 4}

    complete_configs = []
    multi_session_count = 0

    # One pass over the configs: count rows and compare the fold set
    for config, folds in df.groupby('config')['validation_fold']:
        if folds.count() > 5:
            multi_session_count += 1
        if set(folds.unique()) == expected_folds:
            complete_configs.append(config)

    # Check for configs with multiple training sessions
    if multi_session_count > 0:
        print(f"Warning: {multi_session_count} configs have more than 5 folds (multiple training sessions)")
        print("Using all available folds for these configs (will average across all sessions).")

    # Filter original dataframe
    df_complete = df[df['config'].isin(complete_configs)].reset_index(drop=True)

    print(f"Original configs: {df['config'].nunique()}")
    print(f"Complete configs (all 5 folds): {len(complete_configs)}")
    print(f"Filtered from {len(df)} to {len(df_complete)} rows")

    return df_complete


def show_incomplete_trainings(df):
    """Print every config whose sorted fold list is not exactly [0, 1, 2, 3, 4]."""
    required_folds = [0, 1, 2, 3, 4]

    # Collect (config, available folds) pairs for the incomplete configs
    incomplete = []
    for config, folds in df.groupby('config')['validation_fold']:
        available_folds = sorted(folds.unique())
        if available_folds != required_folds:
            incomplete.append((config, available_folds))

    if not incomplete:
        print("No incomplete trainings found")
        return

    print(f"Incomplete configs removed: {len(incomplete)}")
    for config, available_folds in incomplete:
        print(f"  {config}: folds {available_folds}")

def show_available_metrics(df_clean):
    """
    Show available metrics in the cleaned dataset.

    Columns ending in '_mean' are grouped into original test metrics
    ('test_' prefix), re-evaluation test metrics ('eval_test_' prefix) and
    validation metrics (name contains 'val_', 'validation_' or 'best_val').
    A column already classified as a test metric is not listed as a
    validation metric: without this, every 'eval_test_*' column would match
    because 'eval_' contains 'val_'. All remaining columns except 'config'
    and 'model_start_date' are listed as model characteristics.

    Parameters:
        df_clean: DataFrame whose column names are inspected
    """
    print("AVAILABLE METRICS IN CLEANED DATASET:")
    print("=" * 60)

    mean_cols = [col for col in df_clean.columns if col.endswith('_mean')]
    original_test_cols = [col for col in mean_cols if col.startswith('test_')]
    eval_test_cols = [col for col in mean_cols if col.startswith('eval_test_')]

    test_cols = set(original_test_cols) | set(eval_test_cols)
    val_markers = ('val_', 'validation_', 'best_val')
    val_cols = [
        col for col in mean_cols
        if col not in test_cols and any(marker in col.lower() for marker in val_markers)
    ]

    excluded = test_cols | set(val_cols) | {'config', 'model_start_date'}
    static_cols = [col for col in df_clean.columns if col not in excluded]

    sections = (
        ("Original Test Metrics (from JSON):", original_test_cols),
        ("Evaluation Test Metrics (from re-evaluation):", eval_test_cols),
        ("Validation Metrics:", val_cols),
        ("Model Characteristics:", static_cols),
    )

    # Empty sections are skipped entirely (no heading printed)
    for heading, cols in sections:
        if not cols:
            continue
        print(heading)
        for col in sorted(cols):
            print(f"  - {col}")

def _select_best_complete_folds(df_filtered, required_folds, max_reported=10):
    """Return the best-IoU row per (config, fold) for configs covering exactly the required folds."""
    complete_configs = []
    skipped = []
    for config, folds in df_filtered.groupby('config', sort=False)['validation_fold']:
        available = set(folds.unique().tolist())
        if available == required_folds:
            complete_configs.append(config)
        else:
            skipped.append((config, sorted(required_folds - available)))

    # Keep the log short when many configs are incomplete
    for config, missing in skipped[:max_reported]:
        print(f"  Skipping {config}: missing folds {missing}")
    if len(skipped) > max_reported:
        print(f"  ... and {len(skipped) - max_reported} more skipped configs")

    print(f"\nKept {len(complete_configs)} out of {len(complete_configs) + len(skipped)} configs "
          f"(all with complete {len(required_folds)}-fold CV)")

    # Sorting by IoU and dropping duplicates keeps the best evaluation per
    # (config, fold) without relying on the DataFrame index being unique.
    # mergesort is stable, so ties keep the first row in original order.
    candidates = df_filtered[df_filtered['config'].isin(complete_configs)]
    best_first = candidates.sort_values('eval_test_iou', ascending=False, kind='mergesort')
    best_rows = best_first.drop_duplicates(subset=['config', 'validation_fold'], keep='first')
    return best_rows.sort_values(['config', 'validation_fold']).reset_index(drop=True)


def _aggregate_complete_folds(df_complete):
    """Collapse the per-fold rows of df_complete into one row per config."""
    original_test_metrics = [col for col in df_complete.columns if col.startswith('test_')]
    eval_test_metrics = [col for col in df_complete.columns if col.startswith('eval_test_')]
    val_metrics = [col for col in df_complete.columns
                   if any(prefix in col.lower() for prefix in ['final_val_', 'best_val_'])]

    print(f"Found {len(original_test_metrics)} original test metrics")
    print(f"Found {len(eval_test_metrics)} evaluation test metrics")
    print(f"Found {len(val_metrics)} validation metrics")

    # Metrics are averaged across folds
    metric_cols = list(dict.fromkeys(original_test_metrics + eval_test_metrics + val_metrics))
    agg_dict = {metric: 'mean' for metric in metric_cols}

    # Model characteristics are expected to be identical across folds,
    # so the value of the first fold is taken
    static_cols = ['architecture', 'backbone', 'encoder_weights', 'img_size',
                   'num_classes', 'batch_size', 'learning_rate', 'epochs',
                   'patience', 'accumulation_steps', 'auto_batch_size',
                   'min_batch_size_search', 'max_batch_size_search', 'batch_size_test_steps']
    for col in static_cols:
        if col in df_complete.columns:
            agg_dict[col] = 'first'

    # Training time is summed over all folds
    if 'total_training_time_hour' in df_complete.columns:
        agg_dict['total_training_time_hour'] = 'sum'

    if 'num_params_M' in df_complete.columns:
        agg_dict['num_params_M'] = 'first'

    grouped = df_complete.groupby('config')
    df_clean = grouped.agg(agg_dict)

    # Both frames are indexed by config here, so the assignments align on it
    if 'model_start_date' in df_complete.columns:
        date_agg = grouped['model_start_date'].agg(['min', 'max', 'nunique'])
        df_clean['first_training_date'] = date_agg['min']
        df_clean['last_training_date'] = date_agg['max']
        df_clean['num_training_sessions'] = date_agg['nunique']

    df_clean = df_clean.reset_index()
    print(f"Aggregated to {len(df_clean)} unique configs")

    df_clean = df_clean.rename(columns={col: f"{col}_mean" for col in metric_cols})

    # Add config_ column for compatibility
    df_clean['config_'] = df_clean['config']
    return df_clean


def clean_cv_results(df, min_iou=0.2, n_folds=5):
    """
    Clean cross-validation results by aggregating across folds.

    Rows with ``eval_test_iou`` <= ``min_iou`` are discarded first. A config is
    kept only if the remaining rows cover exactly the folds ``0..n_folds-1``;
    for each kept (config, fold) pair the row with the highest
    ``eval_test_iou`` is used. Metric columns are averaged and get a ``_mean``
    suffix, static model columns take the first fold's value and
    ``total_training_time_hour`` is summed.

    Parameters:
        df: Raw DataFrame with all fold results; must contain ``config``,
            ``validation_fold`` and ``eval_test_iou``
        min_iou: Rows at or below this ``eval_test_iou`` are dropped
        n_folds: Number of folds a config needs to be considered complete

    Returns:
        DataFrame with one row per model configuration. When no config is
        complete, an empty DataFrame with ``config`` and ``config_`` columns.
    """
    required_folds = set(range(n_folds))

    # Filter low performance models
    df_filtered = df[df['eval_test_iou'] > min_iou].copy()

    print(f"Starting with {len(df_filtered)} rows from {df_filtered['config'].nunique()} configs")
    print(f"(Filtered out {len(df) - len(df_filtered)} rows with IoU <= {min_iou})")

    print("\nSelecting best evaluation per fold for each config...")
    df_complete = _select_best_complete_folds(df_filtered, required_folds)

    if df_complete.empty:
        # Nothing to aggregate; returning early avoids a KeyError further down
        print("No config has a complete set of folds; returning an empty result")
        return pd.DataFrame(columns=['config', 'config_'])

    print(f"Selected best evaluations: {len(df_complete)} rows from {df_complete['config'].nunique()} configs")

    # Sanity check: every kept config has exactly one row per required fold
    fold_check = df_complete.groupby('config')['validation_fold'].agg(['count', 'nunique'])
    if not (fold_check['count'] == n_folds).all():
        raise AssertionError(f"Some configs don't have exactly {n_folds} rows")
    if not (fold_check['nunique'] == n_folds).all():
        raise AssertionError(f"Some configs don't have {n_folds} unique folds")
    print(f"Verified: All configs have exactly {n_folds} unique folds")

    print("\nAggregating metrics by config...")
    df_clean = _aggregate_complete_folds(df_complete)

    # Print summary statistics
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    print(f"Total configs in final dataset: {len(df_clean)}")

    if 'eval_test_iou_mean' in df_clean.columns:
        iou_mean = df_clean['eval_test_iou_mean']
        print("\nPerformance statistics:")
        print(f"  Min IoU: {iou_mean.min():.3f}")
        print(f"  Max IoU: {iou_mean.max():.3f}")
        print(f"  Mean IoU: {iou_mean.mean():.3f}")
        print(f"  Std IoU: {iou_mean.std():.3f}")

        # Compare to the configs that survived the IoU filter
        dropped_configs = set(df_filtered['config'].unique()) - set(df_clean['config'])
        if dropped_configs:
            print(f"\nDropped {len(dropped_configs)} configs due to incomplete folds")
            print("Examples of dropped configs:")
            for config in sorted(dropped_configs)[:5]:
                config_data = df_filtered[df_filtered['config'] == config]
                available_folds = sorted(config_data['validation_fold'].unique())
                print(f"  - {config}: has folds {available_folds}")
            if len(dropped_configs) > 5:
                print(f"  ... and {len(dropped_configs) - 5} more")

    if 'num_training_sessions' in df_clean.columns:
        multi_session = df_clean[df_clean['num_training_sessions'] > 1]
        if len(multi_session) > 0:
            print(f"\n{len(multi_session)} configs had multiple training sessions")
            print("(We kept the best evaluation per fold across all sessions)")

    print(f"{'='*60}")

    return df_clean

def compare_original_vs_evaluation_metrics(df_clean):
    """
    Compare the IoU stored in the training JSON with the re-evaluated IoU.

    The IoU columns are located by name: ``test_*iou*_mean`` for the original
    value and ``eval_test_*iou*_mean`` for the re-evaluation. When several
    columns match a pattern, the last one in column order is used.

    Parameters:
        df_clean: Aggregated DataFrame with one row per config

    Returns:
        A copy of df_clean with an added ``iou_difference`` column
        (re-evaluation minus original), or df_clean itself when either IoU
        column is missing.
    """
    def _last_iou_column(prefix):
        hits = [col for col in df_clean.columns
                if col.startswith(prefix) and 'iou' in col and col.endswith('_mean')]
        return hits[-1] if hits else None

    original_iou = _last_iou_column('test_')
    eval_iou = _last_iou_column('eval_test_')

    # Guard clause: nothing to compare without both columns
    if not (original_iou and eval_iou):
        print("Cannot compare metrics - missing columns:")
        print(f"  Original IoU column: {original_iou}")
        print(f"  Evaluation IoU column: {eval_iou}")
        return df_clean

    print("COMPARISON: Original vs Re-evaluation Metrics")
    print("=" * 50)

    original_values = df_clean[original_iou]
    eval_values = df_clean[eval_iou]
    iou_diff = eval_values - original_values

    print(f"Original Test IoU (from JSON): {original_values.mean():.3f} ± {original_values.std():.3f}")
    print(f"Re-eval Test IoU: {eval_values.mean():.3f} ± {eval_values.std():.3f}")
    print(f"Average difference (re-eval - original): {iou_diff.mean():.3f} ± {iou_diff.std():.3f}")

    df_temp = df_clean.copy()
    df_temp['iou_difference'] = iou_diff

    # Prefer 'config', fall back to the compatibility alias 'config_'
    config_col = next((name for name in ('config', 'config_') if name in df_temp.columns), None)

    if config_col is None:
        print("\nWarning: No config column found for detailed comparison")
        return df_temp

    report_cols = [config_col, original_iou, eval_iou, 'iou_difference']

    print("\nLargest improvements in re-evaluation:")
    top_improvements = df_temp.nlargest(3, 'iou_difference')[report_cols]
    print(top_improvements.to_string(index=False))

    print("\nLargest drops in re-evaluation:")
    top_drops = df_temp.nsmallest(3, 'iou_difference')[report_cols]
    print(top_drops.to_string(index=False))

    return df_temp

def fix_column_names_for_graphics(df):
    """
    Return a copy of df that exposes ``config_`` whenever ``config`` exists.

    The input is never modified. An existing ``config_`` column is left
    untouched.
    """
    patched = df.copy()

    columns = patched.columns
    needs_alias = 'config' in columns and 'config_' not in columns
    if needs_alias:
        patched['config_'] = patched['config']

    return patched

def run_cv_analysis(df_clean, save_path=None, use_evaluation_metrics=True):
    """
    Run analysis on cleaned cross-validation data.

    The performance metric is the first available ``*iou*_mean`` column, tried
    in this order: re-evaluation test metric (``eval_test_`` prefix, only when
    ``use_evaluation_metrics`` is True), original test metric (``test_``
    prefix), validation metric (name contains ``val``), and finally the
    re-evaluation metric as a last resort. The chosen column is copied to
    ``test_iou`` so downstream code can rely on that name.

    Parameters:
        df_clean: Cleaned DataFrame with aggregated results
        save_path: Currently unused; accepted for interface compatibility
        use_evaluation_metrics: Whether to prefer re-evaluation metrics

    Returns:
        Copy of df_clean with a ``test_iou`` column, or None when no IoU
        column is found
    """
    iou_cols = [col for col in df_clean.columns if 'iou' in col and col.endswith('_mean')]
    available_eval_test = [col for col in iou_cols if col.startswith('eval_test_')]
    available_original_test = [col for col in iou_cols if col.startswith('test_')]

    # 'eval_test_...' contains 'val', so test columns are excluded explicitly;
    # otherwise a re-evaluation column could be picked up as a validation metric.
    test_like = set(available_eval_test) | set(available_original_test)
    available_val = [col for col in iou_cols if col not in test_like and 'val' in col.lower()]

    if use_evaluation_metrics and available_eval_test:
        performance_metric = available_eval_test[0]
        print(f"Using evaluation metric: {performance_metric}")
    elif available_original_test:
        performance_metric = available_original_test[0]
        print(f"Using original test metric: {performance_metric}")
    elif available_val:
        performance_metric = available_val[0]
        print(f"Using validation metric: {performance_metric}")
    elif available_eval_test:
        # Last resort: only re-evaluation metrics exist, so use them (correctly
        # labelled) rather than returning nothing.
        performance_metric = available_eval_test[0]
        print(f"Using evaluation metric: {performance_metric}")
    else:
        print("No IoU metrics found")
        return None

    # Create analysis copy with the compatibility column
    df_analysis = df_clean.copy()
    df_analysis['test_iou'] = df_analysis[performance_metric]

    # Analysis summary
    print(f"Running analysis on {len(df_analysis)} complete training runs")
    print(f"Performance range: {df_analysis['test_iou'].min():.3f} - {df_analysis['test_iou'].max():.3f}")
    print(f"Mean performance: {df_analysis['test_iou'].mean():.3f}")

    threshold = 0.5
    good_models = df_analysis[df_analysis['test_iou'] > threshold]
    print(f"Models above {threshold} IoU: {len(good_models)}")

    if len(good_models) > 0:
        print("\nDATA ANALYSIS SUMMARY:")
        print(f"Models above threshold: {len(good_models)}")
        print(f"Best performing model: {good_models['test_iou'].max():.3f} IoU")

        top_5 = good_models.nlargest(5, 'test_iou')
        print("\nTop 5 Models:")
        for i, (_, row) in enumerate(top_5.iterrows(), 1):
            config_name = row.get('config', f'Model {i}')
            architecture = row.get('architecture', 'Unknown')
            print(f"  {i}. {config_name} ({architecture}): {row['test_iou']:.3f} IoU")
    else:
        print("No models above threshold for analysis")

    return df_analysis

In [None]:
# Understand data structure
print("DIAGNOSTICS: Understanding the data structure")
print("="*60)

# Inspect one config to see how repeated training sessions and folds are laid out
sample_config = 'pan_regnety_080_imagenet'
sample_data = df_model_results[df_model_results['config'] == sample_config].sort_values(['model_start_date', 'validation_fold'])

print(f"\nSample config: {sample_config}")
print(f"Total rows: {len(sample_data)}")
print("\nData structure:")
print(sample_data[['config', 'validation_fold', 'model_start_date', 'eval_test_iou']].to_string())

print("\nUnique model_start_dates:")
print(sample_data['model_start_date'].value_counts())

# Clean CV results
df_model_results_clean = clean_cv_results(df_model_results)

print("\n" + "="*60)
print("CLEANED DATASET SUMMARY")
print("="*60)
total_configs = df_model_results['config'].nunique()
print(f"Total configs: {len(df_model_results_clean)}")
print(f"Original had: {total_configs} unique configs")
# Guard against an empty results frame (would divide by zero)
if total_configs > 0:
    print(f"Kept: {len(df_model_results_clean) / total_configs * 100:.1f}%")

# Run analysis
df_analysis = run_cv_analysis(df_model_results_clean, save_path=GRAPHICS_PATH)

# Show top models
print("\n" + "="*60)
print("TOP 10 MODELS (Best Sessions Only)")
print("="*60)
if 'eval_test_iou_mean' in df_model_results_clean.columns:
    # Descriptive columns are only present when the raw results contained them,
    # so show whichever of the preferred columns actually exist.
    preferred_cols = ['config', 'architecture', 'backbone', 'eval_test_iou_mean', 'num_params_M']
    display_cols = [col for col in preferred_cols if col in df_model_results_clean.columns]
    top_10 = df_model_results_clean.nlargest(10, 'eval_test_iou_mean')[display_cols]
    print(top_10.to_string(index=False))


## Evaluate YOLO Models on Test Set

In [None]:
def polygon_to_mask(polygon, img_width, img_height):
    """
    Rasterize a normalized polygon into a binary mask.

    Parameters:
        polygon: Flat sequence of normalized coordinates (x1, y1, x2, y2, ...)
        img_width: Image width in pixels
        img_height: Image height in pixels

    Returns:
        uint8 array of shape (img_height, img_width) with 1 inside the polygon
        and 0 elsewhere
    """
    # Pair the flat coordinate list up as (x, y) rows
    vertices = np.array(polygon).reshape(-1, 2)

    # Scale column 0 (x) by the width and column 1 (y) by the height
    for axis, extent in enumerate((img_width, img_height)):
        vertices[:, axis] *= extent

    # The int32 cast drops the fractional part of each pixel coordinate
    vertices = vertices.astype(np.int32)

    canvas = np.zeros((img_height, img_width), dtype=np.uint8)
    cv2.fillPoly(canvas, [vertices], 1)
    return canvas

def parse_yolo_segmentation(label_path, img_width, img_height):
    """
    Parse YOLO segmentation format label file.

    Each usable line is ``class_id x1 y1 x2 y2 ...`` with normalized
    coordinates. Lines with fewer than three points are ignored. Lines with an
    odd number of coordinates or non-numeric tokens are skipped with a printed
    message instead of raising, so one bad annotation does not abort the run.

    Parameters:
        label_path: Path to label file
        img_width: Image width
        img_height: Image height

    Returns:
        Tuple of (masks, classes) lists; both empty when the file is missing
    """
    masks = []
    classes = []

    if not os.path.exists(label_path):
        return masks, classes

    with open(label_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            parts = line.split()
            if len(parts) < 7:  # At least class + 3 points (6 coordinates)
                continue

            coords = parts[1:]
            if len(coords) % 2 != 0:
                # An unpaired coordinate cannot be reshaped into (x, y) points
                print(f"Skipping malformed polygon (odd coordinate count) in {label_path}, line {line_number}")
                continue

            try:
                class_id = int(parts[0])
                polygon = [float(value) for value in coords]
            except ValueError:
                print(f"Skipping non-numeric annotation in {label_path}, line {line_number}")
                continue

            masks.append(polygon_to_mask(polygon, img_width, img_height))
            classes.append(class_id)

    return masks, classes

def calculate_iou(mask1, mask2):
    """
    Compute the Intersection over Union of two masks.

    Non-zero entries count as foreground.

    Parameters:
        mask1: First binary mask
        mask2: Second binary mask

    Returns:
        IoU value; 0.0 when both masks are empty
    """
    overlap = np.logical_and(mask1, mask2).sum()
    combined = np.logical_or(mask1, mask2).sum()

    # An empty union means both masks are empty; report 0.0 rather than divide by zero
    return overlap / combined if combined != 0 else 0.0

def match_predictions_to_ground_truth(pred_masks, pred_classes, gt_masks, gt_classes, iou_threshold=0.5):
    """
    Greedily pair predictions with same-class ground truth objects by IoU.

    Predictions are processed in the order given. Each one claims the still
    unclaimed ground truth of its class with the highest IoU, provided that
    IoU is positive and at least ``iou_threshold``. A ground truth object is
    matched at most once.

    Parameters:
        pred_masks: List of predicted masks
        pred_classes: List of predicted classes
        gt_masks: List of ground truth masks
        gt_classes: List of ground truth classes
        iou_threshold: Minimum IoU for matching

    Returns:
        List of (pred_idx, gt_idx, iou) tuples
    """
    pairs = []
    claimed = set()

    for pred_idx, (p_mask, p_class) in enumerate(zip(pred_masks, pred_classes)):
        top_iou, top_gt = 0, -1

        for gt_idx, (g_mask, g_class) in enumerate(zip(gt_masks, gt_classes)):
            # Only unclaimed ground truth of the same class is eligible
            if gt_idx in claimed or p_class != g_class:
                continue

            overlap = calculate_iou(p_mask, g_mask)
            if overlap >= iou_threshold and overlap > top_iou:
                top_iou, top_gt = overlap, gt_idx

        if top_gt != -1:
            pairs.append((pred_idx, top_gt, top_iou))
            claimed.add(top_gt)

    return pairs

def _map_key(iou_thresh):
    """Build the metrics key for an IoU threshold, e.g. 0.5 -> 'map_50'."""
    # round() guards against float truncation (0.58 * 100 == 57.99999999999999)
    return f'map_{int(round(iou_thresh * 100))}'


def _constant_metrics(iou_thresholds, value, recall=None):
    """Return a metrics dict where every entry is ``value`` (recall may be overridden)."""
    metrics = {_map_key(t): value for t in iou_thresholds}
    metrics['precision'] = value
    metrics['recall'] = value if recall is None else recall
    metrics['f1_score'] = value
    metrics['accuracy'] = value
    metrics['iou'] = value
    metrics['mean_iou'] = value
    return metrics


def _average_precision_11pt(is_true_positive, num_gt):
    """Compute 11-point interpolated AP from per-prediction hit flags in ranked order."""
    precisions = []
    recalls = []
    tp = 0
    for rank, hit in enumerate(is_true_positive, start=1):
        if hit:
            tp += 1
        precisions.append(tp / rank)
        recalls.append(tp / num_gt if num_gt > 0 else 0)

    ap = 0.0
    for r in np.linspace(0, 1, 11):
        reachable = [p for p, rec in zip(precisions, recalls) if rec >= r]
        if reachable:
            ap += max(reachable) / 11
    return ap


def calculate_metrics_for_image(pred_masks, pred_classes, pred_scores, gt_masks, gt_classes, iou_thresholds):
    """
    Calculate metrics for a single image.

    Empty cases are resolved up front: no predictions and no ground truth
    scores 1.0 everywhere; predictions missing scores 0.0 everywhere; ground
    truth missing scores 0.0 except recall, which is 1.0.

    Otherwise, for each IoU threshold a per-image AP (11-point interpolation
    over predictions ranked by confidence) is stored under ``map_<threshold>``.
    Precision, recall, F1, accuracy (TP / (TP + FP + FN)) and the mean IoU are
    all computed from the matches at IoU 0.5, with predictions in the order
    given. ``mean_iou`` averages the IoU of those matches only, so each matched
    prediction counts once; ``iou`` is an alias of ``mean_iou``.

    Parameters:
        pred_masks: List of predicted masks
        pred_classes: List of predicted classes
        pred_scores: List of confidence scores
        gt_masks: List of ground truth masks
        gt_classes: List of ground truth classes
        iou_thresholds: List of IoU thresholds for mAP calculation

    Returns:
        Dictionary of metrics
    """
    # Handle empty predictions or ground truth
    if len(pred_masks) == 0 and len(gt_masks) == 0:
        # Perfect case - no predictions and no ground truth
        return _constant_metrics(iou_thresholds, 1.0)

    if len(pred_masks) == 0:
        # No predictions but there are ground truth objects
        return _constant_metrics(iou_thresholds, 0.0)

    if len(gt_masks) == 0:
        # Predictions but no ground truth: nothing could be missed
        return _constant_metrics(iou_thresholds, 0.0, recall=1.0)

    metrics = {}
    has_scores = len(pred_scores) > 0

    if has_scores:
        # The confidence ranking does not depend on the threshold; compute it once
        ranking = np.argsort(pred_scores)[::-1]
        ranked_masks = [pred_masks[i] for i in ranking]
        ranked_classes = [pred_classes[i] for i in ranking]

    for iou_thresh in iou_thresholds:
        if has_scores:
            ranked_matches = match_predictions_to_ground_truth(
                ranked_masks, ranked_classes, gt_masks, gt_classes, iou_thresh
            )
            matched_ranks = {match[0] for match in ranked_matches}
            hit_flags = [rank in matched_ranks for rank in range(len(ranked_masks))]
            metrics[_map_key(iou_thresh)] = _average_precision_11pt(hit_flags, len(gt_masks))
        else:
            # Without scores there is no ranking; fall back to plain precision
            unranked_matches = match_predictions_to_ground_truth(
                pred_masks, pred_classes, gt_masks, gt_classes, iou_thresh
            )
            metrics[_map_key(iou_thresh)] = len(unranked_matches) / len(pred_masks)

    # Calculate overall metrics at IoU 0.5
    matches = match_predictions_to_ground_truth(pred_masks, pred_classes, gt_masks, gt_classes, 0.5)

    true_positives = len(matches)
    false_positives = len(pred_masks) - true_positives
    false_negatives = len(gt_masks) - true_positives

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    metrics['precision'] = precision
    metrics['recall'] = recall
    metrics['f1_score'] = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Accuracy
    total_predictions = true_positives + false_positives + false_negatives
    metrics['accuracy'] = true_positives / total_predictions if total_predictions > 0 else 0

    # Mean IoU of the predictions matched at IoU 0.5
    metrics['mean_iou'] = np.mean([match[2] for match in matches]) if matches else 0.0
    metrics['iou'] = metrics['mean_iou']

    return metrics

def _collect_predictions(result, img_width, img_height):
    """Convert one inference result into (masks, classes, scores) lists at image resolution."""
    pred_masks = []
    pred_classes = []
    pred_scores = []

    if result.masks is None:
        return pred_masks, pred_classes, pred_scores

    masks_data = result.masks.data.cpu().numpy()
    boxes = result.boxes

    for i, raw_mask in enumerate(masks_data):
        # Resize mask to original image size
        # NOTE(review): a plain resize assumes masks.data spans exactly the
        # image area (no letterbox padding) - confirm for non-square inputs
        mask = cv2.resize(raw_mask, (img_width, img_height), interpolation=cv2.INTER_NEAREST)
        mask = (mask > 0.5).astype(np.uint8)

        pred_masks.append(mask)
        pred_classes.append(int(boxes.cls[i]))
        pred_scores.append(float(boxes.conf[i]))

    return pred_masks, pred_classes, pred_scores


def evaluate_model_on_test_set(model_path, test_images_dir, test_labels_dir):
    """
    Evaluate a single YOLO model on test dataset.

    Every ``*.jpg``, ``*.png`` and ``*.jpeg`` file in ``test_images_dir`` is
    run through the model and compared with the label file of the same stem
    in ``test_labels_dir``. Images are processed in sorted order so the
    per-image results are reproducible. An image that cannot be read is
    skipped; an image whose label parsing, inference or scoring raises is
    reported and skipped without aborting the remaining images.

    Parameters:
        model_path: Path to YOLO model weights
        test_images_dir: Directory containing test images
        test_labels_dir: Directory containing test labels

    Returns:
        Tuple of (per_image_results, overall_metrics), where overall_metrics is
        the unweighted mean of each per-image metric. (None, None) when the
        model cannot be loaded or no images are found.
    """
    # Load model
    try:
        model = YOLO(model_path)
    except Exception as e:
        print(f"Error loading model {model_path}: {e}")
        return None, None

    # Get all test images
    images_root = Path(test_images_dir)
    image_files = sorted(
        path
        for pattern in ("*.jpg", "*.png", "*.jpeg")
        for path in images_root.glob(pattern)
    )

    if len(image_files) == 0:
        print(f"No images found in {test_images_dir}")
        return None, None

    per_image_results = []
    all_metrics = defaultdict(list)

    # IoU thresholds for mAP calculation
    iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

    for img_path in tqdm(image_files, desc="Evaluating images"):
        # Read image
        img = cv2.imread(str(img_path))
        if img is None:
            continue

        img_height, img_width = img.shape[:2]

        # Get corresponding label
        label_path = os.path.join(test_labels_dir, img_path.stem + ".txt")

        try:
            # Ground truth parsing is inside the try so that one malformed
            # label file only costs this image, not the whole evaluation
            gt_masks, gt_classes = parse_yolo_segmentation(label_path, img_width, img_height)

            # Run inference
            results = model(img, verbose=False)
            pred_masks, pred_classes, pred_scores = _collect_predictions(results[0], img_width, img_height)

            # Calculate metrics for this image
            image_metrics = calculate_metrics_for_image(
                pred_masks, pred_classes, pred_scores,
                gt_masks, gt_classes,
                iou_thresholds
            )
        except Exception as e:
            print(f"Error processing {img_path}: {e}")
            continue

        # Store results
        per_image_results.append({'image_name': img_path.name, **image_metrics})

        # Accumulate for overall metrics
        for key, value in image_metrics.items():
            all_metrics[key].append(value)

    # Calculate overall test set metrics
    overall_metrics = {key: np.mean(values) for key, values in all_metrics.items()}

    return per_image_results, overall_metrics

In [None]:
if YOLO_EVALUATE:
    import traceback

    # Verify paths exist
    if not os.path.exists(INPUT_MODELS_BASE_DIR_PATH):
        print(f"ERROR: INPUT_MODELS_BASE_DIR_PATH does not exist: {INPUT_MODELS_BASE_DIR_PATH}")

    if not os.path.exists(TEST_DATASET_PATH):
        print(f"WARNING: TEST_DATASET_PATH does not exist: {TEST_DATASET_PATH}")
        print("Trying to find it relative to INPUT_MODELS_BASE_DIR_PATH...")
        alt_test_path = os.path.join(INPUT_MODELS_BASE_DIR_PATH, TEST_DATASET_PATH)
        if os.path.exists(alt_test_path):
            TEST_DATASET_PATH = alt_test_path
            print(f"Found test dataset at: {TEST_DATASET_PATH}")
        else:
            print("ERROR: Could not find test dataset")
            print("Please update TEST_DATASET_PATH in the script")

    # Find all model directories
    print(f"\nSearching for model directories in: {INPUT_MODELS_BASE_DIR_PATH}")
    # Listing a missing directory would raise right after the error message
    # above; fall back to an empty listing so the cell finishes with a summary.
    if os.path.isdir(INPUT_MODELS_BASE_DIR_PATH):
        all_dirs = os.listdir(INPUT_MODELS_BASE_DIR_PATH)
    else:
        all_dirs = []
    model_dirs = [d for d in all_dirs
                    if d.startswith("01_training_yolo") and os.path.isdir(os.path.join(INPUT_MODELS_BASE_DIR_PATH, d))]

    print(f"Found {len(model_dirs)} model directories starting with '01_training_yolo':")
    for d in model_dirs:
        print(f"  - {d}")

    if len(model_dirs) == 0:
        print("\nNo directories starting with '01_training_yolo' found.")
        print("Available directories:")
        for d in all_dirs:
            if os.path.isdir(os.path.join(INPUT_MODELS_BASE_DIR_PATH, d)):
                print(f"  - {d}")

    all_results = []
    models_found = 0

    # The test set location is fixed once the path checks above are done
    test_images_dir = os.path.join(TEST_DATASET_PATH, "images")
    test_labels_dir = os.path.join(TEST_DATASET_PATH, "labels")

    for model_dir in model_dirs:
        model_dir_path = os.path.join(INPUT_MODELS_BASE_DIR_PATH, model_dir)
        print(f"\nSearching in: {model_dir_path}")

        # Look for training_metrics.json files
        for root, _dirs, files in os.walk(model_dir_path):
            if "training_metrics.json" not in files:
                continue

            metrics_path = os.path.join(root, "training_metrics.json")
            print(f"Found training_metrics.json at: {metrics_path}")

            try:
                # Load existing metrics
                with open(metrics_path, 'r') as f:
                    existing_metrics = json.load(f)

                model_path = existing_metrics.get("model_path")
                if not model_path:
                    print(f"  No model path found in {metrics_path}")
                    continue

                # Construct full model path if relative
                if not os.path.isabs(model_path):
                    # Try multiple possible base paths, in order of preference
                    possible_paths = [
                        os.path.normpath(p) for p in (
                            os.path.join(INPUT_MODELS_BASE_DIR_PATH, model_path),
                            os.path.join(root, model_path),
                            os.path.join(root, "weights", "best.pt"),
                            os.path.join(root, "..", model_path),
                            os.path.join(root, "..", "..", model_path),
                        )
                    ]

                    resolved = next((p for p in possible_paths if os.path.exists(p)), None)
                    if resolved is None:
                        print("  Model not found at any of these locations:")
                        for p in possible_paths:
                            print(f"    - {p}")
                        continue
                    model_path = resolved

                if not os.path.exists(model_path):
                    print(f"  Model not found: {model_path}")
                    continue

                models_found += 1
                print(f"  Found model: {model_path}")
                print("  Evaluating model...")

                # Evaluate model
                if not os.path.exists(test_images_dir):
                    print(f"  ERROR: Test images directory not found: {test_images_dir}")
                    continue

                if not os.path.exists(test_labels_dir):
                    print(f"  ERROR: Test labels directory not found: {test_labels_dir}")
                    continue

                per_image_results, overall_metrics = evaluate_model_on_test_set(
                    model_path, test_images_dir, test_labels_dir
                )

                if overall_metrics:
                    # Create result entry
                    result = {
                        'model_name': os.path.basename(model_dir),
                        'model_path': model_path,
                        'fold': existing_metrics.get("fold", 0),
                        'training_time_minutes': existing_metrics.get("training_time_minutes", 0),
                        **overall_metrics
                    }
                    all_results.append(result)

                    # Save per-image results next to the training metrics
                    if per_image_results:
                        per_image_df = pd.DataFrame(per_image_results)
                        per_image_output_path = os.path.join(
                            root,
                            "test_evaluation_per_image.csv"
                        )
                        per_image_df.to_csv(per_image_output_path, index=False)
                        print(f"  Saved per-image results to: {per_image_output_path}")
                else:
                    print("  Failed to evaluate model")

            except Exception as e:
                # Best effort: report the failure and move on to the next model
                print(f"  Error processing {metrics_path}: {e}")
                traceback.print_exc()

    print(f"\nTotal models found: {models_found}")

    # Create overall results dataframe
    if all_results:
        results_df = pd.DataFrame(all_results)

        # Reorder columns
        column_order = [
            'model_name', 'model_path', 'fold', 'training_time_minutes',
            'iou', 'f1_score', 'accuracy', 'recall', 'precision',
            'map_50', 'map_55', 'map_60', 'map_65', 'map_70',
            'map_75', 'map_80', 'map_85', 'map_90', 'map_95',
            'mean_iou'
        ]

        # Ensure all columns exist
        for col in column_order:
            if col not in results_df.columns:
                results_df[col] = np.nan

        results_df = results_df[column_order]

        # Save overall results (relative to the current working directory)
        output_path = "yolo_models_test_evaluation_results.csv"
        results_df.to_csv(output_path, index=False)
        print(f"\nSaved overall results to: {output_path}")

        # Display summary
        print("\nEvaluation Summary:")
        print(results_df.to_string())
    else:
        print("\nNo models were successfully evaluated.")
        if models_found > 0:
            print("Models were found but evaluation failed. Check error messages above.")

### YOLO Data Cleaning

In [None]:
# Load the YOLO evaluation results (one row per evaluated model run).
yolo_results = pd.read_csv("yolo_models_test_evaluation_results.csv")

# Metrics that are averaged per model; the list order fixes the column order.
_yolo_mean_metrics = [
    'iou', 'f1_score', 'accuracy', 'recall', 'precision',
    'map_50', 'map_55', 'map_60', 'map_65', 'map_70',
    'map_75', 'map_80', 'map_85', 'map_90', 'map_95',
]

# Per model: count the rows, sum the training time, average every metric.
custom_agg = {
    'fold': 'count',
    'training_time_minutes': 'sum',
    **{metric: 'mean' for metric in _yolo_mean_metrics},
    'mean_iou': 'mean',
}
yolo_grouped_custom = (
    yolo_results
    .groupby('model_name')
    .agg(custom_agg)
    .reset_index()
    # mean_iou is not carried over to the combined results table
    .drop(columns=['mean_iou'], errors='ignore')
)

# Replace the training time in minutes by its value in hours (appended as last column).
yolo_grouped_custom['total_training_time_hour'] = yolo_grouped_custom.pop('training_time_minutes') / 60

# Parameter count (in millions) for each YOLO training configuration
yolo_params_m = {
    "01_training_yolo12n_20250623": 2.8,
    "01_training_yolo12s_20250623": 9.8,
    "01_training_yolo12m_20250623": 21.9,
    "01_training_yolo12l_20250623": 28.8,
    "01_training_yolo12x_20250623": 64.5,
}

# Models missing from yolo_params_m get NaN as parameter count.
# The backbone is the third '_'-separated token of the model name
# (empty string when the name has fewer than three tokens).
yolo_grouped_custom = yolo_grouped_custom.assign(
    num_params_M=yolo_grouped_custom['model_name'].map(yolo_params_m),
    architecture='YOLO',
    backbone=yolo_grouped_custom['model_name'].apply(
        lambda name: ''.join(name.split('_')[2:3])
    ),
)

# Rename columns for consistency: every metric becomes eval_test_<metric>_mean.
# NOTE(review): the plotting helpers in this notebook read a 'config_' column,
# while this frame exposes 'config' - confirm that the YOLO rows are meant to
# carry no 'config_' value.
_yolo_column_names = {
    'fold': 'num_training_sessions',
    'model_name': 'config',
    **{metric: f'eval_test_{metric}_mean' for metric in _yolo_mean_metrics},
}
yolo_grouped_custom = yolo_grouped_custom.rename(columns=_yolo_column_names)

# Keep only the models whose mean test IoU is above 0.2
_yolo_iou_above_floor = yolo_grouped_custom['eval_test_iou_mean'].gt(0.2)
yolo_grouped_custom = yolo_grouped_custom[_yolo_iou_above_floor]

display(yolo_grouped_custom)

## Combine SMP and YOLO Results

In [None]:
# Stack the SMP results and the aggregated YOLO results into one table;
# ignore_index=True renumbers the rows from 0.
df_model_results_clean_concat_yolo_smp = pd.concat(
    [df_model_results_clean, yolo_grouped_custom],
    ignore_index=True,
)

display(df_model_results_clean_concat_yolo_smp)

## Performance Visualizations

In [None]:
# Set global style parameters
def set_plot_style():
    """Apply the seaborn theme, palette and matplotlib rcParams shared by the plots."""
    sns.set_style("whitegrid")
    sns.set_palette("pastel")

    # Every text element uses 10 pt, except the bold axes title at 11 pt.
    text_settings = {
        key: 10
        for key in ('font.size', 'axes.labelsize', 'xtick.labelsize',
                    'ytick.labelsize', 'legend.fontsize')
    }
    text_settings['axes.titlesize'] = 11
    text_settings['axes.titleweight'] = 'bold'

    # Resolution used on screen and, separately, for files written by savefig.
    resolution_settings = {'figure.dpi': 150, 'savefig.dpi': 300}

    plt.rcParams.update({**text_settings, **resolution_settings})

### Architecture-Backbone Heatmap

In [None]:
def plot_architecture_backbone_heatmap(df_models, performance_metric='eval_test_iou_mean', save_path=None):
    """
    Create heatmap showing performance for architecture-backbone combinations.

    Each cell is the mean of ``performance_metric`` over the rows sharing an
    (architecture, backbone) pair; pairs without any value are masked out.
    Backbone names are displayed without their 'timm-', 'tu-' or 'tv-' prefix.

    Parameters:
        df_models: DataFrame with model results; needs the 'architecture',
            'backbone' and ``performance_metric`` columns
        performance_metric: Metric to visualize
        save_path: Directory in which the figure is saved (skipped when falsy)

    Returns:
        None. When a required column is missing or there is nothing to plot,
        a message is printed and no figure is created.
    """
    # Validate before touching matplotlib so a bad call never leaves an open figure.
    if 'architecture' not in df_models.columns or 'backbone' not in df_models.columns:
        print("Architecture or backbone columns not found")
        return
    if performance_metric not in df_models.columns:
        print(f"Metric column not found: {performance_metric}")
        return

    # Create pivot table with mean performance
    heatmap_data = df_models.pivot_table(
        values=performance_metric,
        index='architecture',
        columns='backbone',
        aggfunc='mean'
    )
    if heatmap_data.empty:
        print(f"No data available for metric: {performance_metric}")
        return

    # Sort architectures
    heatmap_data = heatmap_data.sort_index()

    # Clean up backbone names: strip each known prefix found at the start
    clean_columns = []
    for col in heatmap_data.columns:
        clean_name = col
        for prefix in ('timm-', 'tu-', 'tv-'):
            if clean_name.startswith(prefix):
                clean_name = clean_name[len(prefix):]
        clean_columns.append(clean_name)
    heatmap_data.columns = clean_columns

    # Two backbones can share a name once the prefix is gone. Sorting the
    # column axis keeps exactly one column per original backbone, whereas
    # selecting by a list of labels returns every match for each repeated
    # label and would duplicate those columns.
    heatmap_data = heatmap_data.sort_index(axis=1)

    # Create mask for NaN values
    mask = heatmap_data.isna()

    # Determine metric display name and graphic number
    metric_names = {
        'eval_test_iou_mean': ("IoU moyen 5 folds", "01"),
        'eval_test_map_50_mean': ("mAP@50 moyen 5 folds", "02"),
        'eval_test_map_75_mean': ("mAP@75 moyen 5 folds", "03"),
        'eval_test_map_95_mean': ("mAP@95 moyen 5 folds", "04"),
        'eval_test_accuracy_mean': ("Accuracy moyenne 5 folds", "05"),
        'eval_test_recall_mean': ("Recall moyen 5 folds", "06"),
        'eval_test_precision_mean': ("Precision moyenne 5 folds", "07"),
        'eval_test_f1_score_mean': ("F1-score moyen 5 folds", "08")
    }
    texte_performance, graphic_number = metric_names.get(performance_metric, (performance_metric, "99"))

    sns.set_style("whitegrid")
    _, ax = plt.subplots(1, 1, figsize=(12, 7.5))

    # Create heatmap on the axes created above (explicit instead of implicit)
    sns.heatmap(heatmap_data,
                annot=True,
                fmt=".2f",
                cmap='coolwarm',
                linewidth=.5,
                annot_kws={"fontsize": 9},
                mask=mask,
                ax=ax)

    # Customize plot
    ax.set_title('Heatmap Encodeur vs Décodeur - ' + texte_performance,
                 fontsize=11, fontweight='bold', pad=20)
    ax.set_xlabel('')
    ax.set_ylabel('')

    # Rotate labels
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='right', fontsize=10)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=10)

    ax.grid(False)

    # Remove spines
    for spine in ax.spines.values():
        spine.set_visible(False)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_01_architecture_backbone_heatmap_{graphic_number}_{performance_metric}.png",
                    dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()

# Draw one heatmap per metric: IoU, then mAP at the 50, 75 and 95 thresholds
for _heatmap_metric in (
    'eval_test_iou_mean',
    'eval_test_map_50_mean',
    'eval_test_map_75_mean',
    'eval_test_map_95_mean',
):
    plot_architecture_backbone_heatmap(
        df_model_results_clean_concat_yolo_smp,
        performance_metric=_heatmap_metric,
        save_path=GRAPHICS_PATH,
    )

### Top 10 Models by Performance

In [None]:
def plot_top_models_performance(df_models, performance_metric='eval_test_iou_mean', save_path=None):
    """
    Plot top 10 models by specified performance metric.

    Models are drawn as horizontal bars, best model at the top. Each bar is
    annotated with its value and with a label built from the cleaned
    'config_' name, the architecture and the parameter count, for whichever
    of these columns are present and not null.

    Parameters:
        df_models: DataFrame with model results
        performance_metric: Metric to rank models by
        save_path: Directory in which the figure is saved (skipped when falsy)

    Returns:
        None. When the metric column is missing or holds no value, a message
        is printed and no figure is created or saved.
    """
    # Validate first: the file name needs values that only exist when there is
    # something to plot, so bail out before creating a figure.
    if performance_metric not in df_models.columns:
        print(f"Metric column not found: {performance_metric}")
        return

    # Rows without a value for the metric cannot be ranked meaningfully.
    ranked_models = df_models.dropna(subset=[performance_metric])
    if ranked_models.empty:
        print(f"No data available for metric: {performance_metric}")
        return

    # Get top 10 models
    top_models = ranked_models.nlargest(10, performance_metric)

    # Axis label, title and graphic number based on metric
    metric_info = {
        'eval_test_iou_mean': ('Test IoU (moyenne des 5 folds)', 'Top 10 modèles par IoU', "01"),
        'eval_test_map_50_mean': ('Test mAP@0.5 (moyenne des 5 folds)', 'Top 10 modèles par mAP@0.5', "02"),
        'eval_test_map_75_mean': ('Test mAP@0.75 (moyenne des 5 folds)', 'Top 10 modèles par mAP@0.75', "03"),
        'eval_test_map_95_mean': ('Test mAP@0.95 (moyenne des 5 folds)', 'Top 10 modèles par mAP@0.95', "04"),
        'eval_test_accuracy_mean': ('Test Accuracy (moyenne des 5 folds)', 'Top 10 modèles par Accuracy', "05"),
        'eval_test_recall_mean': ('Test Recall (moyenne des 5 folds)', 'Top 10 modèles par Recall', "06"),
        'eval_test_precision_mean': ('Test Precision (moyenne des 5 folds)', 'Top 10 modèles par Precision', "07"),
        'eval_test_f1_score_mean': ('Test F1 Score (moyenne des 5 folds)', 'Top 10 modèles par F1 Score', "08")
    }
    x_label, title, graphic_number = metric_info.get(
        performance_metric, (performance_metric, 'Top 10 modèles', "99"))

    set_plot_style()
    _, ax = plt.subplots(1, 1, figsize=(6.5, 4.5))

    # Descending y positions put the best model on the top row
    y_pos = range(len(top_models) - 1, -1, -1)
    bars = ax.barh(y_pos, top_models[performance_metric], alpha=0.7)

    # Add value labels just past the end of each bar
    for bar, perf_val in zip(bars, top_models[performance_metric]):
        ax.text(bar.get_width()*1.005, bar.get_y() + bar.get_height()/2,
                f'{perf_val:.3f}', ha='left', va='center', fontsize=9)

    # Add more space for labels
    ax.set_xlim(left=0, right=ax.get_xlim()[1]*1.05)

    # Create model labels
    model_labels = []
    for _, row in top_models.iterrows():
        label_parts = []

        # Add config name without its first '_'-separated token and '_imagenet'
        if 'config_' in row and pd.notna(row['config_']):
            config_name = str(row['config_'])
            if '_' in config_name:
                config_name = config_name.split('_', 1)[1]
            label_parts.append(config_name.replace('_imagenet', ''))

        # Add architecture
        if 'architecture' in row and pd.notna(row['architecture']):
            label_parts.append(f"({row['architecture']})")

        # Add parameter count
        if 'num_params_M' in row and pd.notna(row['num_params_M']):
            label_parts.append(f"[{row['num_params_M']:.0f}M]")

        # If nothing could be added, fall back to the row index
        if not label_parts:
            label_parts.append(f"Model {row.name}")

        model_labels.append(" ".join(label_parts))

    # Place labels inside bars, slightly right of the axis origin
    ax.set_yticks(y_pos)
    label_x = ax.get_xlim()[1]*0.01
    for bar, label in zip(bars, model_labels):
        ax.text(label_x, bar.get_y() + bar.get_height()/2,
                label, ha='left', va='center', fontsize=9, color='black', zorder=10)

    ax.set_yticklabels([])
    ax.tick_params(axis='y', pad=-5)

    ax.set_xlabel(x_label, fontsize=10)
    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_02_top_models_performance_{graphic_number}_{performance_metric}.png", dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

# Rank the models once per metric: IoU, then mAP at the 50, 75 and 95 thresholds
for _top_models_metric in (
    'eval_test_iou_mean',
    'eval_test_map_50_mean',
    'eval_test_map_75_mean',
    'eval_test_map_95_mean',
):
    plot_top_models_performance(
        df_model_results_clean_concat_yolo_smp,
        performance_metric=_top_models_metric,
        save_path=GRAPHICS_PATH,
    )


### Top 10 Decoders

In [None]:
def plot_architecture_performance(df_models, performance_metric='eval_test_iou_mean', save_path=None):
    """
    Plot performance by decoder architecture (top 10).

    The metric is averaged over all rows of each architecture; architectures
    without any value are ignored. The ten best averages are drawn as
    horizontal bars, best at the top, with the architecture name inside the bar.

    Parameters:
        df_models: DataFrame with model results
        performance_metric: Metric to visualize
        save_path: Directory in which the figure is saved (skipped when falsy)

    Returns:
        None. When a required column is missing or there is nothing to plot,
        a message is printed and no figure is created or saved.
    """
    # Validate first: the file name needs values that only exist when there is
    # something to plot, so bail out before creating a figure.
    if 'architecture' not in df_models.columns:
        print("Architecture column not found")
        return
    if performance_metric not in df_models.columns:
        print(f"Metric column not found: {performance_metric}")
        return

    # Calculate performance by architecture ('count' only counts non-null values)
    architecture_performance = df_models.groupby('architecture')[performance_metric].agg(['mean', 'count']).reset_index()
    architecture_performance = architecture_performance[architecture_performance['count'] >= 1]
    architecture_performance = architecture_performance.sort_values('mean', ascending=False).head(10)
    if architecture_performance.empty:
        print(f"No data available for metric: {performance_metric}")
        return

    # Axis label, title and graphic number based on metric
    metric_info = {
        'eval_test_iou_mean': ('Test IoU (moyenne des 5 folds de tous les encodeurs)', 'Top 10 décodeurs par IoU', "01"),
        'eval_test_map_50_mean': ('Test mAP@0.5 (moyenne des 5 folds de tous les encodeurs)', 'Top 10 décodeurs par mAP@0.5', "02"),
        'eval_test_map_75_mean': ('Test mAP@0.75 (moyenne des 5 folds de tous les encodeurs)', 'Top 10 décodeurs par mAP@0.75', "03"),
        'eval_test_map_95_mean': ('Test mAP@0.95 (moyenne des 5 folds de tous les encodeurs)', 'Top 10 décodeurs par mAP@0.95', "04"),
        'eval_test_accuracy_mean': ('Test Accuracy (moyenne des 5 folds de tous les encodeurs)', 'Top 10 décodeurs par Accuracy', "05"),
        'eval_test_recall_mean': ('Test Recall (moyenne des 5 folds de tous les encodeurs)', 'Top 10 décodeurs par Recall', "06"),
        'eval_test_precision_mean': ('Test Precision (moyenne des 5 folds de tous les encodeurs)', 'Top 10 décodeurs par Precision', "07"),
        'eval_test_f1_score_mean': ('Test F1 Score (moyenne des 5 folds de tous les encodeurs)', 'Top 10 décodeurs par F1 Score', "08")
    }
    x_label, title, graphic_number = metric_info.get(
        performance_metric, (performance_metric, 'Top 10 décodeurs', "99"))

    set_plot_style()
    _, ax = plt.subplots(1, 1, figsize=(6.5, 4.5))

    # Descending y positions put the best architecture on the top row
    y_pos = range(len(architecture_performance) - 1, -1, -1)
    bars = ax.barh(y_pos, architecture_performance['mean'], alpha=0.7)

    # Add value labels just past the end of each bar
    for bar, mean_val in zip(bars, architecture_performance['mean']):
        ax.text(bar.get_width()*1.005, bar.get_y() + bar.get_height()/2,
                f'{mean_val:.3f}', ha='left', va='center', fontsize=9)

    ax.set_xlim(left=0, right=ax.get_xlim()[1]*1.05)
    ax.set_yticks(y_pos)

    # Place architecture names inside bars, slightly right of the axis origin
    label_x = ax.get_xlim()[1]*0.01
    for bar, architecture_name in zip(bars, architecture_performance['architecture']):
        ax.text(label_x, bar.get_y() + bar.get_height()/2,
                architecture_name, ha='left', va='center', fontsize=9, color='black', zorder=10)

    ax.set_yticklabels([])
    ax.tick_params(axis='y', pad=-5)

    ax.set_xlabel(x_label, fontsize=10)
    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_03_plot_architecture_performance_{graphic_number}_{performance_metric}.png", dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

# Compare decoders once per metric: IoU, then mAP at the 50, 75 and 95 thresholds
for _decoder_metric in (
    'eval_test_iou_mean',
    'eval_test_map_50_mean',
    'eval_test_map_75_mean',
    'eval_test_map_95_mean',
):
    plot_architecture_performance(
        df_model_results_clean_concat_yolo_smp,
        performance_metric=_decoder_metric,
        save_path=GRAPHICS_PATH,
    )

### Top 10 Encoders

In [None]:
def plot_backbone_performance(df_models, performance_metric='eval_test_iou_mean', save_path=None):
    """
    Plot performance by encoder backbone (top 10).

    The metric is averaged over all rows of each backbone; backbones without
    any value are ignored. The ten best averages are drawn as horizontal bars,
    best at the top, with the backbone name inside the bar.

    Parameters:
        df_models: DataFrame with model results
        performance_metric: Metric to visualize
        save_path: Directory in which the figure is saved (skipped when falsy)

    Returns:
        None. When a required column is missing or there is nothing to plot,
        a message is printed and no figure is created or saved.
    """
    # Validate first: the file name needs values that only exist when there is
    # something to plot, so bail out before creating a figure.
    if 'backbone' not in df_models.columns:
        print("Backbone column not found")
        return
    if performance_metric not in df_models.columns:
        print(f"Metric column not found: {performance_metric}")
        return

    # Calculate performance by backbone ('count' only counts non-null values)
    backbone_performance = df_models.groupby('backbone')[performance_metric].agg(['mean', 'count']).reset_index()
    backbone_performance = backbone_performance[backbone_performance['count'] >= 1]
    backbone_performance = backbone_performance.sort_values('mean', ascending=False).head(10)
    if backbone_performance.empty:
        print(f"No data available for metric: {performance_metric}")
        return

    # Axis label, title and graphic number based on metric
    metric_info = {
        'eval_test_iou_mean': ('Test IoU (moyenne des 5 folds de tous les décodeurs)', 'Top 10 encodeurs par IoU', "01"),
        'eval_test_map_50_mean': ('Test mAP@0.5 (moyenne des 5 folds de tous les décodeurs)', 'Top 10 encodeurs par mAP@0.5', "02"),
        'eval_test_map_75_mean': ('Test mAP@0.75 (moyenne des 5 folds de tous les décodeurs)', 'Top 10 encodeurs par mAP@0.75', "03"),
        'eval_test_map_95_mean': ('Test mAP@0.95 (moyenne des 5 folds de tous les décodeurs)', 'Top 10 encodeurs par mAP@0.95', "04"),
        'eval_test_accuracy_mean': ('Test Accuracy (moyenne des 5 folds de tous les décodeurs)', 'Top 10 encodeurs par Accuracy', "05"),
        'eval_test_recall_mean': ('Test Recall (moyenne des 5 folds de tous les décodeurs)', 'Top 10 encodeurs par Recall', "06"),
        'eval_test_precision_mean': ('Test Precision (moyenne des 5 folds de tous les décodeurs)', 'Top 10 encodeurs par Precision', "07"),
        'eval_test_f1_score_mean': ('Test F1 Score (moyenne des 5 folds de tous les décodeurs)', 'Top 10 encodeurs par F1 Score', "08")
    }
    x_label, title, graphic_number = metric_info.get(
        performance_metric, (performance_metric, 'Top 10 encodeurs', "99"))

    set_plot_style()
    _, ax = plt.subplots(1, 1, figsize=(6.5, 4.5))

    # Descending y positions put the best backbone on the top row
    y_pos = range(len(backbone_performance) - 1, -1, -1)
    bars = ax.barh(y_pos, backbone_performance['mean'], alpha=0.7)

    # Add value labels just past the end of each bar
    for bar, mean_val in zip(bars, backbone_performance['mean']):
        ax.text(bar.get_width()*1.005, bar.get_y() + bar.get_height()/2,
                f'{mean_val:.3f}', ha='left', va='center', fontsize=9)

    ax.set_xlim(left=0, right=ax.get_xlim()[1]*1.05)
    ax.set_yticks(y_pos)

    # Place backbone names inside bars, slightly right of the axis origin
    label_x = ax.get_xlim()[1]*0.01
    for bar, backbone_name in zip(bars, backbone_performance['backbone']):
        ax.text(label_x, bar.get_y() + bar.get_height()/2,
                backbone_name, ha='left', va='center', fontsize=9, color='black', zorder=10)

    ax.set_yticklabels([])
    ax.tick_params(axis='y', pad=-5)

    ax.set_xlabel(x_label, fontsize=10)
    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_04_backbone_performance_{graphic_number}_{performance_metric}.png", dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

# Compare encoders once per metric: IoU, then mAP at the 50, 75 and 95 thresholds
for _encoder_metric in (
    'eval_test_iou_mean',
    'eval_test_map_50_mean',
    'eval_test_map_75_mean',
    'eval_test_map_95_mean',
):
    plot_backbone_performance(
        df_model_results_clean_concat_yolo_smp,
        performance_metric=_encoder_metric,
        save_path=GRAPHICS_PATH,
    )

### Top 3 Models by Size Category

In [None]:
def plot_models_by_size_category(df_models, performance_metric='eval_test_iou_mean', save_path=None):
    """
    Plot top 3 models in each parameter count size category.

    Models are binned by 'num_params_M' (0-10M, 10-25M, 25-50M, 50-75M,
    75-100M, 100M+). Every bin owns three fixed rows on the y-axis, so a bin
    with fewer than three models simply leaves rows empty. Rows with a missing
    value in any of the columns used here are ignored.

    Parameters:
        df_models: DataFrame with model results; needs the 'num_params_M',
            'config_', 'architecture' and ``performance_metric`` columns
        performance_metric: Metric to visualize
        save_path: Directory in which the figure is saved (skipped when falsy)

    Returns:
        None. When a required column is missing or there is nothing to plot,
        a message is printed and no figure is created or saved.
    """
    # Validate first: the file name needs values that only exist when there is
    # something to plot, so bail out before creating a figure.
    required_columns = ['num_params_M', performance_metric, 'config_', 'architecture']
    missing_columns = [col for col in required_columns if col not in df_models.columns]
    if missing_columns:
        print(f"Required columns not found: {missing_columns}")
        return

    # Filter out missing values
    size_data = df_models[required_columns].dropna()
    if size_data.empty:
        print(f"No data available for metric: {performance_metric}")
        return

    # Define size bins (lower bound inclusive, upper bound exclusive)
    size_bins = [(0, 10), (10, 25), (25, 50), (50, 75), (75, 100), (100, float('inf'))]
    bin_labels = ['0-10M', '10-25M', '25-50M', '50-75M', '75-100M', '100M+']

    # Pastel colors for each size category
    pastel_colors = ['#FFB3BA', '#FFDFBA', '#FFFFBA', '#BAFFC9', '#BAE1FF', '#DCC9FF']

    # Fixed positioning parameters
    models_per_bin = 3
    bin_spacing = 0.5

    # First row of each bin; bins are stacked bottom-up with a gap in between
    bin_start = {
        label: index * (models_per_bin + bin_spacing)
        for index, label in enumerate(bin_labels)
    }

    all_models = []
    for (min_size, max_size), label, color in zip(size_bins, bin_labels, pastel_colors):
        # Filter models in this size range (the last bin has no upper bound)
        in_bin = size_data['num_params_M'] >= min_size
        if max_size != float('inf'):
            in_bin = in_bin & (size_data['num_params_M'] < max_size)

        # Get top models in this bin, reversed so the best one gets the highest row
        top_in_bin = size_data[in_bin].nlargest(models_per_bin, performance_metric)
        for rank, (_, model) in enumerate(top_in_bin.iloc[::-1].iterrows()):
            # Clean config name: drop its first '_'-separated token and '_imagenet'
            config_name = str(model['config_'])
            if '_' in config_name:
                config_name = config_name.split('_', 1)[1]
            config_name = config_name.replace('_imagenet', '')

            all_models.append({
                'performance': model[performance_metric],
                'label': f"{config_name} ({model['architecture']}) [{model['num_params_M']:.0f}M]",
                'color': color,
                'y_pos': bin_start[label] + rank
            })

    if not all_models:
        print(f"No data available for metric: {performance_metric}")
        return

    # Axis label, title and graphic number based on metric
    metric_info = {
        'eval_test_iou_mean': ('Test IoU (moyenne des 5 folds)', 'Top 3 modèles par IoU par taille', "01"),
        'eval_test_map_50_mean': ('Test mAP@0.5 (moyenne des 5 folds)', 'Top 3 modèles par mAP@0.5 par taille', "02"),
        'eval_test_map_75_mean': ('Test mAP@0.75 (moyenne des 5 folds)', 'Top 3 modèles par mAP@0.75 par taille', "03"),
        'eval_test_map_95_mean': ('Test mAP@0.95 (moyenne des 5 folds)', 'Top 3 modèles par mAP@0.95 par taille', "04"),
        'eval_test_accuracy_mean': ('Test Accuracy (moyenne des 5 folds)', 'Top 3 modèles par Accuracy par taille', "05"),
        'eval_test_recall_mean': ('Test Recall (moyenne des 5 folds)', 'Top 3 modèles par Recall par taille', "06"),
        'eval_test_precision_mean': ('Test Precision (moyenne des 5 folds)', 'Top 3 modèles par Precision par taille', "07"),
        'eval_test_f1_score_mean': ('Test F1 Score (moyenne des 5 folds)', 'Top 3 modèles par F1 Score par taille', "08")
    }
    x_label, title, graphic_number = metric_info.get(
        performance_metric, (performance_metric, 'Top 3 modèles par taille', "99"))

    set_plot_style()
    _, ax = plt.subplots(1, 1, figsize=(6.5, 4.5))

    # Draw every bar first: the x-axis range keeps changing while bars are
    # added, so labels positioned from it must wait until it is final.
    for model in all_models:
        ax.barh(model['y_pos'], model['performance'],
                color=model['color'], alpha=0.8, height=0.8)

    label_x = ax.get_xlim()[1]*0.01
    for model in all_models:
        # Add performance value label just past the end of the bar
        ax.text(model['performance']*1.005, model['y_pos'],
                f"{model['performance']:.3f}",
                ha='left', va='center', fontsize=9, color='black')

        # Add model label inside bar
        ax.text(label_x, model['y_pos'], model['label'],
                ha='left', va='center', fontsize=9,
                color='black', zorder=10)

    # One tick per bin, centred on the bin's rows
    ax.set_yticks([bin_start[label] + (models_per_bin - 1) / 2 for label in bin_labels])
    ax.set_yticklabels(bin_labels, fontsize=10)

    ax.set_xlabel(x_label, fontsize=10)
    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')

    # Add more space for labels
    ax.set_xlim(left=0, right=ax.get_xlim()[1]*1.05)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_05_models_by_size_category_{graphic_number}_{performance_metric}.png", dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

# Plot the size categories once per metric: IoU, then mAP at the 50, 75 and 95 thresholds
for _size_category_metric in (
    'eval_test_iou_mean',
    'eval_test_map_50_mean',
    'eval_test_map_75_mean',
    'eval_test_map_95_mean',
):
    plot_models_by_size_category(
        df_model_results_clean_concat_yolo_smp,
        performance_metric=_size_category_metric,
        save_path=GRAPHICS_PATH,
    )

### Box Plots by Architecture

In [None]:
def plot_architecture_boxplot(df_models, performance_metric='eval_test_iou_mean', save_path=None):
    """
    Create box plot showing performance distribution by architecture.

    One horizontal box is drawn per architecture from the rows that have both
    an architecture and a value for the metric.

    Parameters:
        df_models: DataFrame with model results
        performance_metric: Metric to visualize
        save_path: Directory in which the figure is saved (skipped when falsy)

    Returns:
        None. When a required column is missing or there is nothing to plot,
        a message is printed and no figure is created or saved.
    """
    # Validate first: the file name needs values that only exist when there is
    # something to plot, so bail out before creating a figure.
    if 'architecture' not in df_models.columns:
        print("Architecture column not found")
        return
    if performance_metric not in df_models.columns:
        print(f"Metric column not found: {performance_metric}")
        return

    plot_data = df_models[[performance_metric, 'architecture']].dropna()
    if plot_data.empty:
        print(f"No data available for metric: {performance_metric}")
        return

    # Axis label, title and graphic number based on metric
    metric_info = {
        'eval_test_iou_mean': ('Test IoU (moyenne des 5 folds de tous les encodeurs)', 'Test IoU par décodeur', "01"),
        'eval_test_map_50_mean': ('Test mAP@0.5 (moyenne des 5 folds de tous les encodeurs)', 'Test mAP@0.5 par décodeur', "02"),
        'eval_test_map_75_mean': ('Test mAP@0.75 (moyenne des 5 folds de tous les encodeurs)', 'Test mAP@0.75 par décodeur', "03"),
        'eval_test_map_95_mean': ('Test mAP@0.95 (moyenne des 5 folds de tous les encodeurs)', 'Test mAP@0.95 par décodeur', "04"),
        'eval_test_accuracy_mean': ('Test Accuracy (moyenne des 5 folds de tous les encodeurs)', 'Test Accuracy par décodeur', "05"),
        'eval_test_recall_mean': ('Test Recall (moyenne des 5 folds de tous les encodeurs)', 'Test Recall par décodeur', "06"),
        'eval_test_precision_mean': ('Test Precision (moyenne des 5 folds de tous les encodeurs)', 'Test Precision par décodeur', "07"),
        'eval_test_f1_score_mean': ('Test F1 Score (moyenne des 5 folds de tous les encodeurs)', 'Test F1 Score par décodeur', "08")
    }
    x_label, title, graphic_number = metric_info.get(
        performance_metric, (performance_metric, 'Performance par décodeur', "99"))

    set_plot_style()
    _, ax = plt.subplots(1, 1, figsize=(6.5, 4.5))

    sns.boxplot(data=plot_data, y='architecture', x=performance_metric, ax=ax)

    ax.set_xlabel(x_label, fontsize=10)
    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.set_ylabel('', fontsize=10)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_06_architecture_boxplot_{graphic_number}_{performance_metric}.png", dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

# Draw the decoder box plots once per metric: IoU, then mAP at the 50, 75 and 95 thresholds
for _boxplot_metric in (
    'eval_test_iou_mean',
    'eval_test_map_50_mean',
    'eval_test_map_75_mean',
    'eval_test_map_95_mean',
):
    plot_architecture_boxplot(
        df_model_results_clean_concat_yolo_smp,
        performance_metric=_boxplot_metric,
        save_path=GRAPHICS_PATH,
    )

### Box Plots by Backbone

In [None]:
def plot_backbone_boxplot(df_models, performance_metric='eval_test_iou_mean', save_path=None):
    """
    Create box plot showing performance distribution by backbone.

    One horizontal box is drawn per backbone, built from the rows of
    `df_models` that have both a backbone and a value for the metric.
    If no such row exists the (empty) figure is still shown and saved.

    Parameters:
        df_models: DataFrame with model results; reads the 'backbone'
            column and the `performance_metric` column
        performance_metric: Metric to visualize
        save_path: Directory in which the PNG is written (skipped when falsy)

    Returns:
        None. Prints a message and returns early when `df_models` has no
        'backbone' column.
    """
    set_plot_style()

    if 'backbone' not in df_models.columns:
        print("Backbone column not found")
        return

    # Axis label, title and file-name number for each supported metric.
    # Resolved before plotting so that `graphic_number` is always bound when
    # the figure is saved, even if no row survives the dropna() below.
    metric_info = {
        'eval_test_iou_mean': ('Test IoU (moyenne des 5 folds)', 'Test IoU par encodeur', "01"),
        'eval_test_map_50_mean': ('Test mAP@0.5 (moyenne des 5 folds)', 'Test mAP@0.5 par encodeur', "02"),
        'eval_test_map_75_mean': ('Test mAP@0.75 (moyenne des 5 folds)', 'Test mAP@0.75 par encodeur', "03"),
        'eval_test_map_95_mean': ('Test mAP@0.95 (moyenne des 5 folds)', 'Test mAP@0.95 par encodeur', "04"),
        'eval_test_accuracy_mean': ('Test Accuracy (moyenne des 5 folds)', 'Test Accuracy par encodeur', "05"),
        'eval_test_recall_mean': ('Test Recall (moyenne des 5 folds)', 'Test Recall par encodeur', "06"),
        'eval_test_precision_mean': ('Test Precision (moyenne des 5 folds)', 'Test Precision par encodeur', "07"),
        'eval_test_f1_score_mean': ('Test F1 Score (moyenne des 5 folds)', 'Test F1 Score par encodeur', "08")
    }
    x_label, title, graphic_number = metric_info.get(
        performance_metric, (performance_metric, 'Performance par encodeur', "99"))

    fig, ax = plt.subplots(1, 1, figsize=(6.5, 4.5))

    plot_data = df_models[[performance_metric, 'backbone']].dropna()
    if len(plot_data) > 0:
        sns.boxplot(data=plot_data, y='backbone', x=performance_metric, ax=ax)

        ax.set_xlabel(x_label, fontsize=10)
        ax.set_title(title, fontsize=11, fontweight='bold')
        ax.set_ylabel('', fontsize=10)
        ax.tick_params(axis='y', labelsize=9)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_07_backbone_boxplot_{graphic_number}_{performance_metric}.png", dpi=300, bbox_inches='tight')
    plt.show()

# Render and save the per-backbone box plot for each headline test metric
for _metric in ('eval_test_iou_mean',
                'eval_test_map_50_mean',
                'eval_test_map_75_mean',
                'eval_test_map_95_mean'):
    plot_backbone_boxplot(
        df_model_results_clean_concat_yolo_smp,
        performance_metric=_metric,
        save_path=GRAPHICS_PATH,
    )

### Pareto Frontier - Performance vs Training Time

In [None]:
def plot_performance_vs_training_time_pareto(df_models, performance_metric='eval_test_iou_mean', save_path=None):
    """
    Plot performance vs training time with Pareto frontier.

    Every model is drawn as a faint point, one labelled series per
    architecture. Models that no other model beats (at least as good on
    both axes and strictly better on one, where better means a higher
    metric or a shorter training time) are highlighted in red, joined by a
    line, numbered on the plot and listed in a second legend.

    Parameters:
        df_models: DataFrame with model results. Reads the columns
            `performance_metric`, 'total_training_time_hour',
            'architecture', 'config_' and, when present, 'backbone'; rows
            with a missing value in any of them are ignored.
        performance_metric: Metric to visualize
        save_path: Directory in which the PNG is written (skipped when falsy)

    Notes:
        When the time/architecture columns are missing or no complete row
        remains, an empty figure is shown (and saved if `save_path` is set).
    """
    set_plot_style()

    # Axis label, title and file-name number for each supported metric.
    # Resolved before plotting so that `graphic_number` is always bound when
    # the figure is saved, even if nothing could be drawn.
    metric_info = {
        'eval_test_iou_mean': ('Test IoU', "IoU vs Temps d'entraînement - Modèles optimaux", "01"),
        'eval_test_map_50_mean': ('Test mAP@0.5', "mAP@0.5 vs Temps d'entraînement - Modèles optimaux", "02"),
        'eval_test_map_75_mean': ('Test mAP@0.75', "mAP@0.75 vs Temps d'entraînement - Modèles optimaux", "03"),
        'eval_test_map_95_mean': ('Test mAP@0.95', "mAP@0.95 vs Temps d'entraînement - Modèles optimaux", "04"),
        'eval_test_accuracy_mean': ('Test Accuracy', "Accuracy vs Temps d'entraînement - Modèles optimaux", "05"),
        'eval_test_recall_mean': ('Test Recall', "Recall vs Temps d'entraînement - Modèles optimaux", "06"),
        'eval_test_precision_mean': ('Test Precision', "Precision vs Temps d'entraînement - Modèles optimaux", "07"),
        'eval_test_f1_score_mean': ('Test F1 Score', "F1 Score vs Temps d'entraînement - Modèles optimaux", "08")
    }
    y_label, title, graphic_number = metric_info.get(
        performance_metric, (performance_metric, 'Performance vs Temps', "99"))

    time_col = 'total_training_time_hour'
    fig, ax = plt.subplots(1, 1, figsize=(6.5, 5.5))

    if time_col in df_models.columns and 'architecture' in df_models.columns:
        # 'backbone' is not plotted; when present it only takes part in the
        # dropna() filter below
        required_cols = [performance_metric, time_col, 'architecture', 'config_']
        if 'backbone' in df_models.columns:
            required_cols.append('backbone')
        scatter_data = df_models[required_cols].dropna()

        if len(scatter_data) > 0:
            # Plot all points with low alpha, one labelled series per architecture
            for arch in scatter_data['architecture'].unique():
                arch_data = scatter_data[scatter_data['architecture'] == arch]
                ax.scatter(arch_data[time_col],
                           arch_data[performance_metric],
                           label=arch, alpha=0.3, s=60)

            # Find Pareto frontier. The pairwise comparison is positional,
            # so rows that happen to share an index label are still compared
            # with each other and each row is selected at most once.
            perf = scatter_data[performance_metric].to_numpy(dtype=float)
            hours = scatter_data[time_col].to_numpy(dtype=float)
            # Entry [j, i] asks: does row j dominate row i?
            no_worse = (perf[:, None] >= perf[None, :]) & (hours[:, None] <= hours[None, :])
            strictly_better = (perf[:, None] > perf[None, :]) | (hours[:, None] < hours[None, :])
            is_dominated = (no_worse & strictly_better).any(axis=0)

            pareto_data = scatter_data[~is_dominated].sort_values(time_col)

            # Plot Pareto frontier line
            ax.plot(pareto_data[time_col],
                    pareto_data[performance_metric],
                    'r-', linewidth=2, alpha=0.7)

            # Highlight Pareto points
            ax.scatter(pareto_data[time_col],
                       pareto_data[performance_metric],
                       color='red', s=100, zorder=5, edgecolors='black', linewidth=0.5, alpha=0.8)

            # Number each frontier model on the plot and collect the
            # matching "number. name (decoder)" lines for the legend
            model_legend_text = []
            for i, (_, row) in enumerate(pareto_data.iterrows(), 1):
                # Drop everything up to the first underscore and the
                # '_imagenet' marker from the config name
                config_name = str(row['config_']).split('_', 1)[-1].replace('_imagenet', '')
                decoder = row['architecture']

                # Alternate the label left/right of the point
                offset_x = -10 if i % 2 == 1 else 10
                ax.annotate(str(i),
                            xy=(row[time_col], row[performance_metric]),
                            xytext=(offset_x, -5), textcoords='offset points',
                            fontsize=8,
                            ha='center', va='center',
                            color='black', zorder=10)

                model_legend_text.append(f"{i}. {config_name} ({decoder})")

            # Set labels and title
            ax.set_xlabel("Temps d'entraînement moyen des 5 folds\nde tous les encodeurs (heures)", fontsize=10)
            ax.set_ylabel(y_label, fontsize=10)
            ax.set_title(title, fontsize=11, fontweight='bold')

            # Architecture legend. It is re-added as a plain artist because
            # the next ax.legend() call would otherwise replace it.
            handles, labels = ax.get_legend_handles_labels()
            legend1 = ax.legend(handles, labels,
                                bbox_to_anchor=(1.05, 1), loc='upper left',
                                fontsize=8, framealpha=0.9)
            ax.add_artist(legend1)

            # Model numbers legend: invisible handles, text only
            if model_legend_text:
                from matplotlib.patches import Rectangle
                extra = Rectangle((0, 0), 1, 1, fc="w", fill=False, edgecolor='none', linewidth=0)

                ax.legend([extra] * len(model_legend_text), model_legend_text,
                          title='Modèles optimaux', title_fontsize=9,
                          bbox_to_anchor=(1.05, 0.00), loc='lower left',
                          fontsize=8, framealpha=0.9,
                          handlelength=0, handletextpad=0)

            ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_08_performance_vs_training_time_pareto_{graphic_number}_{performance_metric}.png",
                    dpi=300, bbox_inches='tight')
    plt.show()

# Render and save the performance-vs-training-time Pareto plot for each headline test metric
for _metric in ('eval_test_iou_mean',
                'eval_test_map_50_mean',
                'eval_test_map_75_mean',
                'eval_test_map_95_mean'):
    plot_performance_vs_training_time_pareto(
        df_model_results_clean_concat_yolo_smp,
        performance_metric=_metric,
        save_path=GRAPHICS_PATH,
    )

### Pareto Frontier - Performance vs Parameters

In [None]:
def plot_performance_vs_parameters_pareto(df_models, performance_metric='eval_test_iou_mean', save_path=None):
    """
    Plot performance vs number of parameters with Pareto frontier.

    Every model is drawn as a faint point, one labelled series per
    architecture. Models that no other model beats (at least as good on
    both axes and strictly better on one, where better means a higher
    metric or fewer parameters) are highlighted in red, joined by a line,
    numbered on the plot and listed in a second legend.

    Parameters:
        df_models: DataFrame with model results. Reads the columns
            `performance_metric`, 'num_params_M', 'architecture',
            'config_' and, when present, 'backbone'; rows with a missing
            value in any of them are ignored.
        performance_metric: Metric to visualize
        save_path: Directory in which the PNG is written (skipped when falsy)

    Notes:
        When the parameter/architecture columns are missing or no complete
        row remains, an empty figure is shown (and saved if `save_path` is
        set).
    """
    set_plot_style()

    # Axis label, title, file-name number and vertical anchor of the model
    # legend for each supported metric. Resolved before plotting so that
    # `graphic_number` is always bound when the figure is saved, even if
    # nothing could be drawn.
    metric_info = {
        'eval_test_iou_mean': ('Test IoU', "IoU vs Nombre de paramètres - Modèles optimaux", "01", 0.00),
        'eval_test_map_50_mean': ('Test mAP@0.5', "mAP@0.5 vs Nombre de paramètres - Modèles optimaux", "02", 0.00),
        'eval_test_map_75_mean': ('Test mAP@0.75', "mAP@0.75 vs Nombre de paramètres - Modèles optimaux", "03", 0.00),
        'eval_test_map_95_mean': ('Test mAP@0.95', "mAP@0.95 vs Nombre de paramètres - Modèles optimaux", "04", 0.00),
        'eval_test_accuracy_mean': ('Test Accuracy', "Accuracy vs Nombre de paramètres - Modèles optimaux", "05", 0.00),
        'eval_test_recall_mean': ('Test Recall', "Recall vs Nombre de paramètres - Modèles optimaux", "06", 0.00),
        'eval_test_precision_mean': ('Test Precision', "Precision vs Nombre de paramètres - Modèles optimaux", "07", 0.00),
        'eval_test_f1_score_mean': ('Test F1 Score', "F1 Score vs Nombre de paramètres - Modèles optimaux", "08", 0.00)
    }
    y_label, title, graphic_number, legend_y = metric_info.get(
        performance_metric, (performance_metric, 'Performance vs Paramètres', "99", 0.00))

    params_col = 'num_params_M'
    fig, ax = plt.subplots(1, 1, figsize=(6.5, 5.5))

    if params_col in df_models.columns and 'architecture' in df_models.columns:
        # 'backbone' is not plotted; when present it only takes part in the
        # dropna() filter below
        required_cols = [performance_metric, params_col, 'architecture', 'config_']
        if 'backbone' in df_models.columns:
            required_cols.append('backbone')
        scatter_data = df_models[required_cols].dropna()

        if len(scatter_data) > 0:
            # Plot all points with low alpha, one labelled series per architecture
            for arch in scatter_data['architecture'].unique():
                arch_data = scatter_data[scatter_data['architecture'] == arch]
                ax.scatter(arch_data[params_col],
                           arch_data[performance_metric],
                           label=arch, alpha=0.3, s=60)

            # Find Pareto frontier. The pairwise comparison is positional,
            # so rows that happen to share an index label are still compared
            # with each other and each row is selected at most once.
            perf = scatter_data[performance_metric].to_numpy(dtype=float)
            n_params = scatter_data[params_col].to_numpy(dtype=float)
            # Entry [j, i] asks: does row j dominate row i?
            no_worse = (perf[:, None] >= perf[None, :]) & (n_params[:, None] <= n_params[None, :])
            strictly_better = (perf[:, None] > perf[None, :]) | (n_params[:, None] < n_params[None, :])
            is_dominated = (no_worse & strictly_better).any(axis=0)

            pareto_data = scatter_data[~is_dominated].sort_values(params_col)

            # Plot Pareto frontier line
            ax.plot(pareto_data[params_col],
                    pareto_data[performance_metric],
                    'r-', linewidth=2, alpha=0.7)

            # Highlight Pareto points
            ax.scatter(pareto_data[params_col],
                       pareto_data[performance_metric],
                       color='red', s=100, zorder=5, edgecolors='black', linewidth=0.5, alpha=0.8)

            # Number each frontier model on the plot and collect the
            # matching "number. name (decoder)" lines for the legend
            model_legend_text = []
            for i, (_, row) in enumerate(pareto_data.iterrows(), 1):
                # Drop everything up to the first underscore and the
                # '_imagenet' marker from the config name
                config_name = str(row['config_']).split('_', 1)[-1].replace('_imagenet', '')
                decoder = row['architecture']

                # Alternate the label left/right of the point
                offset_x = -10 if i % 2 == 1 else 10
                ax.annotate(str(i),
                            xy=(row[params_col], row[performance_metric]),
                            xytext=(offset_x, -5), textcoords='offset points',
                            fontsize=8,
                            ha='center', va='center',
                            color='black', zorder=10)

                model_legend_text.append(f"{i}. {config_name} ({decoder})")

            # Set labels and title
            ax.set_xlabel("Nombre de paramètres (millions)", fontsize=10)
            ax.set_ylabel(y_label, fontsize=10)
            ax.set_title(title, fontsize=11, fontweight='bold')

            # Architecture legend. It is re-added as a plain artist because
            # the next ax.legend() call would otherwise replace it.
            handles, labels = ax.get_legend_handles_labels()
            legend1 = ax.legend(handles, labels,
                                bbox_to_anchor=(1.05, 1), loc='upper left',
                                fontsize=8, framealpha=0.9)
            ax.add_artist(legend1)

            # Model numbers legend: invisible handles, text only
            if model_legend_text:
                from matplotlib.patches import Rectangle
                extra = Rectangle((0, 0), 1, 1, fc="w", fill=False, edgecolor='none', linewidth=0)

                ax.legend([extra] * len(model_legend_text), model_legend_text,
                          title='Modèles optimaux', title_fontsize=9,
                          bbox_to_anchor=(1.05, legend_y), loc='lower left',
                          fontsize=8, framealpha=0.9,
                          handlelength=0, handletextpad=0)

            ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_09_performance_vs_parameters_pareto_{graphic_number}_{performance_metric}.png",
                    dpi=300, bbox_inches='tight')
    plt.show()

# Render and save the performance-vs-parameters Pareto plot for each headline test metric
for _metric in ('eval_test_iou_mean',
                'eval_test_map_50_mean',
                'eval_test_map_75_mean',
                'eval_test_map_95_mean'):
    plot_performance_vs_parameters_pareto(
        df_model_results_clean_concat_yolo_smp,
        performance_metric=_metric,
        save_path=GRAPHICS_PATH,
    )

### Pareto Frontier - IoU vs Other Metrics

In [None]:
def plot_iou_vs_metric_pareto(df_models, second_metric='eval_test_f1_score_mean', save_path=None):
    """
    Plot IoU vs another metric with Pareto frontier.

    Every model is drawn as a faint point, one labelled series per
    architecture. Models that no other model beats (at least as good on
    both metrics and strictly better on one; higher is better for both)
    are highlighted in red, joined by a line, numbered on the plot and
    listed in a second legend together with their parameter count.

    Parameters:
        df_models: DataFrame with model results. Reads the columns
            'eval_test_iou_mean', `second_metric`, 'architecture',
            'config_' and 'num_params_M'; rows with a missing value in any
            of them are ignored.
        second_metric: Second metric to compare against IoU
        save_path: Directory in which the PNG is written (skipped when falsy)

    Notes:
        When the IoU/second-metric/architecture columns are missing or no
        complete row remains, an empty figure is shown (and saved if
        `save_path` is set).
    """
    set_plot_style()

    # Y-axis label, title and file-name number for each supported second
    # metric. Resolved before plotting so that `graphic_number` is always
    # bound when the figure is saved, even if nothing could be drawn.
    metric_info = {
        'eval_test_f1_score_mean': ('Test F1-Score (moyenne des 5 folds)', "IoU vs F1-Score - Modèles optimaux", "01"),
        'eval_test_map_50_mean': ('Test mAP@0.5 (moyenne des 5 folds)', "IoU vs mAP@0.5 - Modèles optimaux", "02"),
        'eval_test_map_75_mean': ('Test mAP@0.75 (moyenne des 5 folds)', "IoU vs mAP@0.75 - Modèles optimaux", "03"),
        'eval_test_map_95_mean': ('Test mAP@0.95 (moyenne des 5 folds)', "IoU vs mAP@0.95 - Modèles optimaux", "04"),
        'eval_test_accuracy_mean': ('Test Accuracy (moyenne des 5 folds)', "IoU vs Accuracy - Modèles optimaux", "05"),
        'eval_test_recall_mean': ('Test Recall (moyenne des 5 folds)', "IoU vs Recall - Modèles optimaux", "06"),
        'eval_test_precision_mean': ('Test Precision (moyenne des 5 folds)', "IoU vs Precision - Modèles optimaux", "07")
    }
    y_label, title, graphic_number = metric_info.get(
        second_metric, (second_metric, f'IoU vs {second_metric}', "99"))

    fig, ax = plt.subplots(1, 1, figsize=(6.5, 4.5))

    iou_metric = 'eval_test_iou_mean'

    if iou_metric in df_models.columns and second_metric in df_models.columns and 'architecture' in df_models.columns:
        required_cols = [iou_metric, second_metric, 'architecture', 'config_', 'num_params_M']
        scatter_data = df_models[required_cols].dropna()

        if len(scatter_data) > 0:
            # Plot all points with low alpha, one labelled series per architecture
            for arch in scatter_data['architecture'].unique():
                arch_data = scatter_data[scatter_data['architecture'] == arch]
                ax.scatter(arch_data[iou_metric],
                           arch_data[second_metric],
                           label=arch, alpha=0.3, s=60)

            # Find Pareto frontier. The pairwise comparison is positional,
            # so rows that happen to share an index label are still compared
            # with each other and each row is selected at most once.
            iou_values = scatter_data[iou_metric].to_numpy(dtype=float)
            other_values = scatter_data[second_metric].to_numpy(dtype=float)
            # Entry [j, i] asks: does row j dominate row i?
            no_worse = ((iou_values[:, None] >= iou_values[None, :])
                        & (other_values[:, None] >= other_values[None, :]))
            strictly_better = ((iou_values[:, None] > iou_values[None, :])
                               | (other_values[:, None] > other_values[None, :]))
            is_dominated = (no_worse & strictly_better).any(axis=0)

            pareto_data = scatter_data[~is_dominated].sort_values(iou_metric)

            # Plot Pareto frontier line
            ax.plot(pareto_data[iou_metric],
                    pareto_data[second_metric],
                    'r-', linewidth=2, alpha=0.7)

            # Highlight Pareto points
            ax.scatter(pareto_data[iou_metric],
                       pareto_data[second_metric],
                       color='red', s=100, zorder=5, edgecolors='black', linewidth=0.5, alpha=0.8)

            # Number each frontier model on the plot and collect the
            # matching "number. name (decoder) [params]" lines for the legend
            model_legend_text = []
            for i, (_, row) in enumerate(pareto_data.iterrows(), 1):
                # Drop everything up to the first underscore and the
                # '_imagenet' marker from the config name
                config_name = str(row['config_']).split('_', 1)[-1].replace('_imagenet', '')
                decoder = row['architecture']
                params = row['num_params_M']

                # Alternate the label left/right of the point
                offset_x = -10 if i % 2 == 1 else 10
                ax.annotate(str(i),
                            xy=(row[iou_metric], row[second_metric]),
                            xytext=(offset_x, -5), textcoords='offset points',
                            fontsize=8,
                            ha='center', va='center',
                            color='black', zorder=10)

                model_legend_text.append(f"{i}. {config_name} ({decoder}) [{params:.0f}M]")

            # Set labels and title
            ax.set_xlabel("Test IoU (moyenne des 5 folds)", fontsize=10)
            ax.set_ylabel(y_label, fontsize=10)
            ax.set_title(title, fontsize=11, fontweight='bold')

            # Add diagonal reference line (y = x) for comparable metrics
            if second_metric in ['eval_test_f1_score_mean', 'eval_test_accuracy_mean']:
                min_val = min(ax.get_xlim()[0], ax.get_ylim()[0])
                max_val = max(ax.get_xlim()[1], ax.get_ylim()[1])
                ax.plot([min_val, max_val], [min_val, max_val], 'k--', alpha=0.3, linewidth=1)

            # Architecture legend. It is re-added as a plain artist because
            # the next ax.legend() call would otherwise replace it.
            handles, labels = ax.get_legend_handles_labels()
            legend1 = ax.legend(handles, labels,
                                bbox_to_anchor=(1.05, 1), loc='upper left',
                                fontsize=8, framealpha=0.9)
            ax.add_artist(legend1)

            # Model numbers legend: invisible handles, text only
            if model_legend_text:
                from matplotlib.patches import Rectangle
                extra = Rectangle((0, 0), 1, 1, fc="w", fill=False, edgecolor='none', linewidth=0)

                ax.legend([extra] * len(model_legend_text), model_legend_text,
                          title='Modèles optimaux', title_fontsize=9,
                          bbox_to_anchor=(1.05, 0.00), loc='lower left',
                          fontsize=8, framealpha=0.9,
                          handlelength=0, handletextpad=0)

            ax.grid(True, alpha=0.3)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_10_{graphic_number}_iou_vs_{second_metric}.png",
                    dpi=300, bbox_inches='tight')
    plt.show()

# Render and save one IoU-vs-metric Pareto plot per companion metric
for _second_metric in ('eval_test_f1_score_mean',
                       'eval_test_map_50_mean',
                       'eval_test_map_75_mean',
                       'eval_test_map_95_mean',
                       'eval_test_accuracy_mean',
                       'eval_test_recall_mean',
                       'eval_test_precision_mean'):
    plot_iou_vs_metric_pareto(
        df_model_results_clean_concat_yolo_smp,
        second_metric=_second_metric,
        save_path=GRAPHICS_PATH,
    )

### Training Time Distribution

In [None]:
def plot_training_time_distribution(df_models, save_path=None):
    """
    Draw a histogram of total training times using 2-hour-wide bins.

    The mean and median are marked with dashed vertical lines and a small
    text box reports the model count and the min/max training time.

    Parameters:
        df_models: DataFrame with model results; reads the
            'total_training_time_hour' column (missing values are ignored)
        save_path: Directory in which the PNG is written (skipped when falsy)
    """
    set_plot_style()

    fig, ax = plt.subplots(1, 1, figsize=(6.5, 4.5))

    has_times = 'total_training_time_hour' in df_models.columns
    valid_times = df_models['total_training_time_hour'].dropna() if has_times else []

    if len(valid_times) > 0:
        # Bin edges on even hours that enclose the whole observed range
        lower_edge = int(np.floor(valid_times.min() / 2) * 2)
        upper_edge = int(np.ceil(valid_times.max() / 2) * 2)
        requested_edges = np.arange(lower_edge, upper_edge + 2, 2)

        # Keep the edges returned by hist(); they drive the x ticks below
        _, bins, _ = ax.hist(valid_times, bins=requested_edges, alpha=0.7,
                             edgecolor='white', linewidth=1)

        # Dashed markers for the mean (red) and the median (green)
        markers = (
            (valid_times.mean(), 'red', 'Moyenne'),
            (valid_times.median(), 'green', 'Médiane'),
        )
        for value, colour, caption in markers:
            ax.axvline(value, color=colour, linestyle='--', linewidth=2, alpha=0.5,
                       label=f'{caption}: {value:.1f}h')

        # Titles and axis labels
        ax.set_title("Distribution du temps d'entraînement", fontsize=11, fontweight='bold')
        ax.set_xlabel("Temps d'entraînement des 5 folds (heures)", fontsize=10)
        ax.set_ylabel('Nombre de modèles', fontsize=10)

        # One tick per bin edge, shown as whole hours
        ax.set_xticks(bins)
        ax.set_xticklabels([f'{int(edge)}' for edge in bins], fontsize=9)

        # Stronger horizontal grid, fainter vertical grid
        ax.grid(True, alpha=0.3, axis='y')
        ax.grid(True, alpha=0.1, axis='x')

        ax.legend(fontsize=9, loc='upper right')

        # Summary box, positioned in axes-fraction coordinates
        stats_text = '\n'.join([
            f'Total: {len(valid_times)} modèles',
            f'Min: {valid_times.min():.1f}h',
            f'Max: {valid_times.max():.1f}h',
        ])
        box_style = dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.5,
                         edgecolor='lightgray', zorder=10)
        ax.text(0.787, 0.835, stats_text, transform=ax.transAxes,
                fontsize=9, verticalalignment='top',
                bbox=box_style)

    plt.tight_layout()
    if save_path:
        plt.savefig(f"{save_path}/ch4_11_training_time_dist_09.png", dpi=300, bbox_inches='tight')
    plt.show()

# Histogram of training times, built from df_model_results_clean
plot_training_time_distribution(
    df_model_results_clean,
    save_path=GRAPHICS_PATH,
)

## Performance Analysis
### Top 10 Models by IoU

In [None]:
# Ten highest-ranked models by mean test IoU, printed as a LaTeX table
_top_10_columns = [
    "config",
    "eval_test_iou_mean",
    "eval_test_f1_score_mean",
    "eval_test_map_50_mean",
    "eval_test_map_75_mean",
    "eval_test_map_95_mean",
    "num_params_M",
    "total_training_time_hour",
]
_ranked_by_iou = df_model_results_clean_concat_yolo_smp.sort_values('eval_test_iou_mean', ascending=False)
top_10_iou = _ranked_by_iou.head(10)[_top_10_columns].reset_index(drop=True)

print(top_10_iou.to_latex())

### Overall Performance Statistics

In [None]:
# Overall statistics across all models: plain means first, then LaTeX table rows
metrics = ['eval_test_iou_mean', 'eval_test_f1_score_mean', 'eval_test_map_50_mean',
           'eval_test_map_75_mean', 'eval_test_map_95_mean', 'eval_test_accuracy_mean',
           'eval_test_recall_mean', 'eval_test_precision_mean']


def _latex_stats_row(row_label, values):
    """Format one LaTeX row: label & mean \\pm std & median & min & max \\\\."""
    return (f"{row_label} & {values.mean():.3f} \\pm {values.std():.3f} & "
            f"{values.median():.3f} & {values.min():.3f} & {values.max():.3f} \\\\")


print("Mean performance of all models:")
for metric in metrics:
    # Metrics missing from the results frame are skipped
    if metric not in df_model_results_clean_concat_yolo_smp.columns:
        continue
    mean_val = df_model_results_clean_concat_yolo_smp[metric].mean()
    print(f"{metric.replace('eval_test_', '').replace('_mean', '')}: {mean_val:.3f}")

print("\nDetailed statistics (LaTeX format):")
for metric in metrics:
    if metric not in df_model_results_clean_concat_yolo_smp.columns:
        continue
    data = df_model_results_clean_concat_yolo_smp[metric]
    metric_name = metric.replace('eval_test_', '').replace('_mean', '').replace('_', ' ').title()
    print(_latex_stats_row(metric_name, data))

# Parameter count and training time rows (these columns are required)
for _row_label, _column in (("Nombre de paramètres", 'num_params_M'),
                            ("Temps d'entraînement (heures)", 'total_training_time_hour')):
    print(_latex_stats_row(_row_label, df_model_results_clean_concat_yolo_smp[_column]))

### Performance by Architecture

In [None]:
# IoU summary statistics per architecture, shown as the cell output
df_model_results_clean_concat_yolo_smp.groupby('architecture').agg(
    {'eval_test_iou_mean': ['mean', 'std', 'median', 'min', 'max']}
).reset_index()

In [None]:
# IoU summary statistics per architecture, exported as a LaTeX table
_iou_by_architecture = df_model_results_clean_concat_yolo_smp.groupby('architecture').agg(
    {'eval_test_iou_mean': ['mean', 'std', 'median', 'min', 'max']}
).reset_index()

print(_iou_by_architecture.to_latex(
    index=False,
    caption="Statistiques IoU par décodeur",
    label="tab:statistique_par_decodeur_iou",
    float_format="%.3f",
))

In [None]:
# mAP@0.95 summary statistics per architecture, exported as a LaTeX table
_map95_by_architecture = df_model_results_clean_concat_yolo_smp.groupby('architecture').agg(
    {'eval_test_map_95_mean': ['mean', 'std', 'median', 'min', 'max']}
).reset_index()

print(_map95_by_architecture.to_latex(
    index=False,
    caption="Statistiques mAP@0.95 par décodeur",
    label="tab:statistique_par_decodeur_map95",
    float_format="%.3f",
    column_format="lccccccccc",
))

### Performance by Backbone

In [None]:
# IoU summary statistics per backbone (encoder), exported as a LaTeX table.
# The caption names the encoder, in line with the groupby key and the label.
print(df_model_results_clean_concat_yolo_smp.groupby('backbone').agg({
    'eval_test_iou_mean': ['mean', 'std', 'median', 'min', 'max'],
}).reset_index().to_latex(index=False, 
    caption="Statistiques IoU par encodeur",
    label="tab:statistique_par_encodeur_iou",
    float_format="%.3f",))

In [None]:
# mAP@0.95 summary statistics per backbone (encoder), exported as a LaTeX table.
# The caption names the encoder, in line with the groupby key and the label.
print(df_model_results_clean_concat_yolo_smp.groupby('backbone').agg({
    'eval_test_map_95_mean': ['mean', 'std', 'median', 'min', 'max'],
}).reset_index().to_latex(index=False, 
    caption="Statistiques mAP@0.95 par encodeur",
    label="tab:statistique_par_encodeur_map95",
    float_format="%.3f",))

### Pareto Optimal Models Analysis

In [None]:
# Hand-picked configurations to inspect in detail
pareto_models = ['linknet_timm_efficientnet_b5_imagenet', 
                 'segformer_regnety_032_imagenet',
                 'unetplusplus_efficientnet_b3_imagenet']

# Columns shown for each selected configuration
_pareto_report_columns = ["config", "eval_test_iou_mean", "eval_test_f1_score_mean",
                          "eval_test_map_50_mean", "eval_test_map_75_mean", "eval_test_map_95_mean",
                          "num_params_M", "total_training_time_hour"]

for model_config in pareto_models:
    _is_selected = df_model_results_clean_concat_yolo_smp["config"] == model_config
    model_data = df_model_results_clean_concat_yolo_smp[_is_selected]
    # Configurations absent from the results are skipped silently
    if model_data.empty:
        continue
    print(f"\n{model_config}:")
    print(model_data[_pareto_report_columns])

### Multi-Threshold mAP Analysis

In [None]:
# Mean mAP per architecture at each IoU threshold (in percent)
_map_thresholds = (50, 65, 75, 85, 90, 95)

df_grouped = df_model_results_clean_concat_yolo_smp.groupby('architecture').agg(
    {f'eval_test_map_{_t}_mean': ['mean'] for _t in _map_thresholds}
).reset_index()

# Replace the two-level column index with flat names
df_grouped.columns = ['architecture'] + [f'map_{_t}_mean' for _t in _map_thresholds]

# Relative loss, in percent, between the loosest and the strictest threshold
_map_50 = df_grouped['map_50_mean']
_map_95 = df_grouped['map_95_mean']
df_grouped['percentage_drop_50_to_95'] = ((_map_50 - _map_95) / _map_50) * 100

print("Multi-threshold mAP Analysis:")
print(df_grouped.to_latex(
    index=False,
    caption="Performances moyennes mAP à différents seuils",
    label="tab:performance_moyenne_map_different_seuils",
    float_format="%.3f",
))

## Qualitative Analysis on Test Dataset

In [None]:
# Show df_model_results_clean as the cell output
df_model_results_clean

In [None]:
import os
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from datetime import datetime
import json
import gc

class LaTeXVisualizationSystem:
    """
    Visualization system for segmentation results with ensemble support.
    
    Parameters:
        img_size: Target image size (width, height)
        device: CUDA device to use
    """
    
    def __init__(self, img_size=(256, 256), device='cuda:0'):
        self.img_size = img_size
        self.device = device
        self.overlay_alpha = 0.5
        self.mask_color_gt = [0, 0, 255]      # Blue for ground truth
        self.mask_color_pred = [255, 0, 0]    # Red for prediction
        self.mask_color_ensemble = [0, 255, 0] # Green for ensemble
        
    def load_model_for_inference(self, model_info, device_id=0):
        """
        Build a model from its stored configuration and prepare it for inference.

        Parameters:
            model_info: Mapping with "architecture", "backbone" and "model_path";
                "encoder_weights" and "num_classes" are optional (num_classes
                falls back to 1)
            device_id: Index of the CUDA device the model is moved to

        Returns:
            Tuple of (model, device), or (None, None) when any step fails;
            the error is printed instead of being raised
        """
        try:
            import logging

            target_device = torch.device(f'cuda:{device_id}')

            # Only the fields needed to rebuild the network and find its weights
            config = {
                "architecture": model_info["architecture"],
                "backbone": model_info["backbone"],
                "encoder_weights": model_info.get("encoder_weights"),
                "num_classes": int(model_info.get("num_classes", 1)),
                "model_path": model_info["model_path"],
            }

            # create_model returns a pair; the second element is not needed here
            model, _unused = create_model(config)

            # Give the weight loader a stdlib logger that only lets CRITICAL through
            quiet_logger = logging.getLogger('silent')
            quiet_logger.setLevel(logging.CRITICAL)

            weights_loaded = corriger_state_dict(model, config["model_path"], quiet_logger)
            if not weights_loaded:
                raise Exception("Failed to load model weights")

            model = model.to(target_device)
            model.eval()

            return model, target_device

        except Exception as e:
            # Best effort by design: callers test for None and skip the model
            print(f"Error loading model: {e}")
            return None, None
    
    def predict_batch(self, model, image_paths, device, batch_size=8):
        """
        Run one model over a list of images, batch_size images at a time.

        Parameters:
            model: PyTorch model, called directly on the batch tensor
            image_paths: List of image paths
            device: Torch device the batch tensor is moved to
            batch_size: Number of images per forward pass

        Returns:
            List with one dictionary per image, in input order:
            'image_array' (RGB image resized to self.img_size, float32 in [0, 1]),
            'pred_mask' (sigmoid of the model output) and
            'pred_binary' (pred_mask > 0.5 as uint8)
        """
        results = []

        for start in range(0, len(image_paths), batch_size):
            # Load as RGB, resize, scale to [0, 1]
            batch_images = [
                np.array(
                    Image.open(path).convert("RGB").resize(self.img_size, Image.Resampling.BILINEAR)
                ).astype(np.float32) / 255.0
                for path in image_paths[start:start + batch_size]
            ]

            # Stack to (N, H, W, C), then reorder to the (N, C, H, W) layout
            batch_tensor = torch.from_numpy(np.stack(batch_images)).permute(0, 3, 1, 2)
            batch_tensor = batch_tensor.to(device)

            with torch.no_grad(), torch.cuda.amp.autocast():
                outputs = model(batch_tensor)
                # Some models return a tuple; only its first element is used
                if isinstance(outputs, tuple):
                    outputs = outputs[0]

                # squeeze(1) assumes a single output channel - TODO confirm for num_classes > 1
                pred_masks = torch.sigmoid(outputs).squeeze(1).cpu().numpy()

            pred_binaries = (pred_masks > 0.5).astype(np.uint8)

            results.extend(
                {
                    'image_array': image_array,
                    'pred_mask': pred_mask,
                    'pred_binary': pred_binary
                }
                for image_array, pred_mask, pred_binary in zip(batch_images, pred_masks, pred_binaries)
            )

            # Drop the tensor references before the next batch is built
            del batch_tensor, outputs

        return results
    
    def ensemble_predict_batch(self, model_infos, image_paths, device_id=0, batch_size=8):
        """
        Generate ensemble predictions using sequential model loading.

        Models are loaded one at a time so that only one of them occupies the
        GPU. Their per-pixel sigmoid probabilities are summed and then divided
        by the number of models that actually loaded, so a model that fails to
        load neither breaks the accumulation nor drags the average down.

        Parameters:
            model_infos: List of model info dictionaries
            image_paths: List of image paths
            device_id: GPU device ID
            batch_size: Batch size for inference

        Returns:
            List with one dictionary per image, in input order:
            'image_array' (RGB image resized to self.img_size, float32 in [0, 1]),
            'pred_mask' (float32 probabilities averaged over the loaded models),
            'pred_binary' (pred_mask > 0.5 as uint8) and 'ensemble' (always True)

        Raises:
            RuntimeError: If none of the models could be loaded
        """
        num_models = len(model_infos)

        print(f"Ensemble prediction with {num_models} models...")

        image_arrays = None  # Preprocessed inputs, kept from the first model that loads
        prob_sums = None     # Running per-image sum of probabilities
        models_used = 0      # Models that contributed to prob_sums

        # Process each model sequentially
        for model_idx, model_info in enumerate(tqdm(model_infos, desc="Processing models")):
            model, device = self.load_model_for_inference(model_info, device_id)
            if model is None:
                print(f"Skipping model {model_idx+1}: failed to load")
                continue

            # Keyed on "nothing stored yet" rather than model_idx == 0, so the
            # ensemble still initialises when the first model is skipped
            keep_images = image_arrays is None
            model_images = []
            model_probs = []

            for start in range(0, len(image_paths), batch_size):
                # Load and preprocess batch
                batch_images = []
                for image_path in image_paths[start:start + batch_size]:
                    image = Image.open(image_path).convert("RGB")
                    image_resized = image.resize(self.img_size, Image.Resampling.BILINEAR)
                    batch_images.append(np.array(image_resized).astype(np.float32) / 255.0)

                # Stack to (N, H, W, C), then reorder to (N, C, H, W)
                batch_tensor = torch.from_numpy(np.stack(batch_images)).permute(0, 3, 1, 2)
                batch_tensor = batch_tensor.to(device)

                with torch.no_grad():
                    with torch.cuda.amp.autocast():
                        outputs = model(batch_tensor)
                        if isinstance(outputs, tuple):
                            outputs = outputs[0]

                        # autocast may yield reduced-precision outputs; convert to
                        # float32 so the sum over models keeps its precision.
                        # squeeze(1) assumes a single output channel - TODO confirm
                        pred_probs = torch.sigmoid(outputs).squeeze(1).float().cpu().numpy()

                model_probs.extend(pred_probs)
                if keep_images:
                    model_images.extend(batch_images)

                # Clean up batch
                del batch_tensor, outputs

            if keep_images:
                image_arrays = model_images
                prob_sums = model_probs
            else:
                for running_sum, pred_prob in zip(prob_sums, model_probs):
                    running_sum += pred_prob
            models_used += 1

            # Free memory before loading next model
            del model
            torch.cuda.empty_cache()
            gc.collect()

        if models_used == 0:
            raise RuntimeError("Ensemble prediction failed: no model could be loaded")
        if models_used < num_models:
            print(f"Ensemble averaged over {models_used}/{num_models} models")

        # Average probabilities and create binary masks
        final_predictions = []
        for image_array, prob_sum in zip(image_arrays, prob_sums):
            avg_prob = prob_sum / models_used
            final_predictions.append({
                'image_array': image_array,
                'pred_mask': avg_prob,
                'pred_binary': (avg_prob > 0.5).astype(np.uint8),
                'ensemble': True
            })

        return final_predictions
    
    def load_ground_truth(self, mask_path):
        """
        Read a mask file and return it as a 0/1 uint8 array at self.img_size.

        The mask is converted to 8-bit greyscale if needed and resized with
        nearest-neighbour sampling. When its maximum exceeds 1, pixels above 127
        become foreground; otherwise any non-zero pixel is foreground.
        """
        gt_image = Image.open(mask_path)
        gt_image = gt_image if gt_image.mode == "L" else gt_image.convert("L")

        pixels = np.array(gt_image.resize(self.img_size, Image.Resampling.NEAREST))

        # 0-255 style masks are cut at mid-grey, 0/1 style masks at zero
        cutoff = 127 if pixels.max() > 1 else 0
        return (pixels > cutoff).astype(np.uint8)
    
    def calculate_iou(self, pred_binary, gt_mask):
        """
        Return the intersection-over-union of two masks for a single image.

        Non-zero entries count as foreground. 1e-6 is added to the union to
        avoid a division by zero, so two empty masks score 0.
        """
        overlap_pixels = np.logical_and(pred_binary, gt_mask).sum()
        covered_pixels = np.logical_or(pred_binary, gt_mask).sum()
        return overlap_pixels / (covered_pixels + 1e-6)
    
    def save_individual_images(self, image_array, gt_mask, pred_binary, output_dir, image_name, is_ensemble=False):
        """
        Write the image, both masks and both tinted overlays as PNG files.

        Five files named "<image_name>_{original,gt,pred,overlay_gt,overlay_pred}.png"
        are saved in output_dir, which is created if missing. The prediction
        overlay uses the ensemble tint when is_ensemble is true, the single-model
        tint otherwise.
        """
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        def to_uint8(values):
            # Scale by 255 and truncate to 8-bit
            return (values * 255).astype(np.uint8)

        def tinted(mask, rgb):
            # Blend the tint into a copy of the image wherever the mask is set
            blended = image_array.copy()
            selected = mask > 0
            for channel, level in enumerate(rgb):
                blended[selected, channel] = (1 - self.overlay_alpha) * blended[selected, channel] + \
                                             self.overlay_alpha * (level / 255.0)
            return blended

        # Plain image and the two masks
        Image.fromarray(to_uint8(image_array)).save(target_dir / f"{image_name}_original.png")
        Image.fromarray(to_uint8(gt_mask), mode='L').save(target_dir / f"{image_name}_gt.png")
        Image.fromarray(to_uint8(pred_binary), mode='L').save(target_dir / f"{image_name}_pred.png")

        # Ground truth overlay
        gt_overlay = tinted(gt_mask, self.mask_color_gt)
        Image.fromarray(to_uint8(gt_overlay)).save(target_dir / f"{image_name}_overlay_gt.png")

        # Prediction overlay
        if is_ensemble:
            pred_tint = self.mask_color_ensemble
        else:
            pred_tint = self.mask_color_pred
        pred_overlay = tinted(pred_binary, pred_tint)
        Image.fromarray(to_uint8(pred_overlay)).save(target_dir / f"{image_name}_overlay_pred.png")
    
    def _score_and_save_predictions(self, predictions, test_paths, model_output_dir, model_name,
                                    summary_extra, report_label, is_ensemble, save_best_worst):
        """Score predictions against ground truth and write images, summary.json and image_metrics.csv."""
        all_ious = []
        image_metrics = []

        print("Saving visualizations...")
        for idx, (image_path, mask_path, pred_data) in enumerate(
            tqdm(zip(test_paths['images'], test_paths['masks'], predictions),
                 total=len(test_paths['images']), desc="Saving images")):

            gt_mask = self.load_ground_truth(mask_path)

            iou = self.calculate_iou(pred_data['pred_binary'], gt_mask)
            all_ious.append(iou)

            image_name = Path(image_path).stem
            self.save_individual_images(
                pred_data['image_array'],
                gt_mask,
                pred_data['pred_binary'],
                model_output_dir / "all_images",
                image_name,
                is_ensemble=is_ensemble
            )

            image_metrics.append({
                'image_idx': idx,
                'image_name': image_name,
                'image_path': image_path,
                'iou': iou
            })

        # Best/worst export and summary.json are only produced for more than 10 images
        if save_best_worst and len(image_metrics) > 10:
            sorted_metrics = sorted(image_metrics, key=lambda x: x['iou'])

            worst_5 = sorted_metrics[:5]
            # Reversed so that rank 1 is the highest IoU, mirroring worst_1 = lowest IoU
            best_5 = sorted_metrics[-5:][::-1]

            print("\nSaving best and worst cases...")

            for prefix, cases in (("worst", worst_5), ("best", best_5)):
                case_dir = model_output_dir / f"{prefix}_cases"
                for rank, metric in enumerate(cases, start=1):
                    idx = metric['image_idx']
                    pred_data = predictions[idx]
                    gt_mask = self.load_ground_truth(test_paths['masks'][idx])

                    self.save_individual_images(
                        pred_data['image_array'],
                        gt_mask,
                        pred_data['pred_binary'],
                        case_dir,
                        f"{prefix}_{rank}_iou{metric['iou']:.3f}_{metric['image_name']}",
                        is_ensemble=is_ensemble
                    )

            # summary_extra sits right after 'model_name' to keep the key order of the file
            summary = {
                'model_name': model_name,
                **summary_extra,
                'total_images': len(test_paths['images']),
                'mean_iou': float(np.mean(all_ious)),
                'std_iou': float(np.std(all_ious)),
                'min_iou': float(np.min(all_ious)),
                'max_iou': float(np.max(all_ious)),
                'worst_5_cases': worst_5,
                'best_5_cases': best_5
            }

            with open(model_output_dir / 'summary.json', 'w') as f:
                json.dump(summary, f, indent=2, default=str)

            print(f"\n{report_label}: {model_name}")
            print(f"Mean IoU: {summary['mean_iou']:.3f} ± {summary['std_iou']:.3f}")
            print(f"IoU range: [{summary['min_iou']:.3f}, {summary['max_iou']:.3f}]")
            print("\nWorst 5 cases:")
            for case in worst_5:
                print(f"  - {case['image_name']}: IoU = {case['iou']:.3f}")
            print("\nBest 5 cases:")
            for case in best_5:
                print(f"  - {case['image_name']}: IoU = {case['iou']:.3f}")

        # Per-image metrics are always written, whatever the dataset size
        metrics_df = pd.DataFrame(image_metrics)
        metrics_df.to_csv(model_output_dir / 'image_metrics.csv', index=False)

        return {
            'model_name': model_name,
            'output_dir': str(model_output_dir),
            'mean_iou': np.mean(all_ious),
            'image_metrics': image_metrics
        }

    def process_ensemble_models(self, model_infos, test_paths, output_base_dir, 
                               device_id=0, save_best_worst=True):
        """
        Process test images using ensemble of models.

        Outputs go to "<output_base_dir>/ensemble_<N>_models": every image under
        all_images/, image_metrics.csv, and - when save_best_worst is set and
        there are more than 10 images - worst_cases/, best_cases/ and
        summary.json. In both rankings rank 1 is the most extreme case.

        Parameters:
            model_infos: List of model info dictionaries
            test_paths: Dictionary with parallel 'images' and 'masks' path lists
            output_base_dir: Output directory
            device_id: GPU device ID
            save_best_worst: Whether to save best/worst cases

        Returns:
            Dictionary with 'model_name', 'output_dir', 'mean_iou' and 'image_metrics'
        """
        ensemble_name = f"ensemble_{len(model_infos)}_models"
        print(f"\nProcessing ensemble of {len(model_infos)} models")

        # Create output directory
        model_output_dir = Path(output_base_dir) / ensemble_name
        model_output_dir.mkdir(parents=True, exist_ok=True)

        print("Generating ensemble predictions...")
        predictions = self.ensemble_predict_batch(model_infos, test_paths['images'], device_id, batch_size=8)

        return self._score_and_save_predictions(
            predictions,
            test_paths,
            model_output_dir,
            ensemble_name,
            summary_extra={
                'ensemble_models': [f"{m['architecture']}_{m['backbone']}" for m in model_infos]
            },
            report_label="Ensemble",
            is_ensemble=True,
            save_best_worst=save_best_worst
        )

    def process_all_images_for_model(self, model_info, test_paths, output_base_dir, 
                                   device_id=3, save_best_worst=True):
        """
        Process all test images for a single model.

        Writes the same set of outputs as process_ensemble_models, under
        "<output_base_dir>/<architecture>_<backbone>".

        Parameters:
            model_info: Model configuration dictionary
            test_paths: Dictionary with parallel 'images' and 'masks' path lists
            output_base_dir: Output directory
            device_id: GPU device ID (NOTE: defaults to 3 here, unlike the other
                methods of this class, which default to 0)
            save_best_worst: Whether to save best/worst cases

        Returns:
            Dictionary with 'model_name', 'output_dir', 'mean_iou' and
            'image_metrics', or None when the model could not be loaded
        """
        model_name = f"{model_info['architecture']}_{model_info['backbone']}"
        print(f"\nProcessing model: {model_name}")

        # Create output directory
        model_output_dir = Path(output_base_dir) / model_name
        model_output_dir.mkdir(parents=True, exist_ok=True)

        # Load model
        model, device = self.load_model_for_inference(model_info, device_id)
        if model is None:
            print(f"Failed to load model: {model_name}")
            return None

        print("Generating predictions...")
        try:
            predictions = self.predict_batch(model, test_paths['images'], device, batch_size=8)
        finally:
            # The model is only needed for inference: release it before the
            # file-writing phase, and also when inference itself fails
            del model
            torch.cuda.empty_cache()
            gc.collect()

        return self._score_and_save_predictions(
            predictions,
            test_paths,
            model_output_dir,
            model_name,
            summary_extra={'model_info': model_info},
            report_label="Model",
            is_ensemble=False,
            save_best_worst=save_best_worst
        )
    
    def inference_on_unseen_images(self, model_infos, image_paths, output_dir, 
                                ensemble_name="ensemble", device_id=0, batch_size=8):
        """
        Predict with an ensemble on images that have no ground truth and save the results.

        A run directory "<output_dir>/<ensemble_name>_<timestamp>" is created.
        Each image gets its own sub-directory holding the resized original, the
        binary mask, a tinted overlay, a confidence heatmap and a confidence
        overlay. 'inference_summary.json' and 'index.html' are written at the
        top of the run directory.

        Parameters:
            model_infos: List of model info dictionaries
            image_paths: List of paths to images for inference
            output_dir: Directory to save results
            ensemble_name: Name for this ensemble run
            device_id: GPU device ID
            batch_size: Batch size for inference

        Returns:
            The summary dictionary that is also written to inference_summary.json
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        run_dir = Path(output_dir) / f"{ensemble_name}_{timestamp}"
        run_dir.mkdir(parents=True, exist_ok=True)

        print(f"\nRunning inference on {len(image_paths)} unseen images")
        print(f"Output directory: {run_dir}")

        predictions = self.ensemble_predict_batch(
            model_infos,
            image_paths,
            device_id=device_id,
            batch_size=batch_size
        )

        def as_image(values, **fromarray_kwargs):
            # Scale by 255, truncate to 8-bit and wrap as a PIL image
            return Image.fromarray((values * 255).astype(np.uint8), **fromarray_kwargs)

        results = []
        print("\nSaving predictions and visualizations...")

        progress = tqdm(zip(image_paths, predictions), total=len(image_paths), desc="Processing images")
        for image_path, pred_data in progress:
            image_name = Path(image_path).stem
            image_dir = run_dir / image_name
            image_dir.mkdir(exist_ok=True)

            rgb = pred_data['image_array']
            binary_mask = pred_data['pred_binary']
            confidence = pred_data['pred_mask']

            # Resized input image
            original_path = image_dir / f"{image_name}_original.png"
            as_image(rgb).save(original_path)

            # Thresholded prediction
            mask_path = image_dir / f"{image_name}_mask.png"
            as_image(binary_mask, mode='L').save(mask_path)

            # Prediction tinted over the image
            overlay_path = image_dir / f"{image_name}_overlay.png"
            tinted = self.create_prediction_overlay(
                rgb,
                binary_mask,
                color=[0, 255, 0]  # Green for ensemble
            )
            as_image(tinted).save(overlay_path)

            # Probability map on its own
            heatmap_path = image_dir / f"{image_name}_confidence.png"
            self.save_confidence_heatmap(confidence, heatmap_path)

            # Probability map blended over the image
            conf_overlay_path = image_dir / f"{image_name}_confidence_overlay.png"
            as_image(self.create_confidence_overlay(rgb, confidence)).save(conf_overlay_path)

            output_files = {
                'original': str(original_path),
                'mask': str(mask_path),
                'overlay': str(overlay_path),
                'confidence': str(heatmap_path),
                'confidence_overlay': str(conf_overlay_path)
            }
            confidence_stats = {
                'mean': float(confidence.mean()),
                'std': float(confidence.std()),
                'min': float(confidence.min()),
                'max': float(confidence.max()),
                'pixels_above_50': int((confidence > 0.5).sum()),
                'pixels_above_90': int((confidence > 0.9).sum())
            }
            results.append({
                'image_name': image_name,
                'original_path': str(image_path),
                'outputs': output_files,
                'confidence_stats': confidence_stats
            })

        summary = {
            'timestamp': timestamp,
            'ensemble_name': ensemble_name,
            'num_models': len(model_infos),
            'num_images': len(image_paths),
            'output_directory': str(run_dir),
            'model_architectures': [f"{m['architecture']}_{m['backbone']}" for m in model_infos],
            'results': results
        }

        with open(run_dir / 'inference_summary.json', 'w') as f:
            json.dump(summary, f, indent=2)

        # Browsable overview of the run
        self.create_html_index(results, run_dir)

        print("\nInference complete!")
        print(f"Results saved to: {run_dir}")
        print(f"View results: {run_dir / 'index.html'}")

        return summary

    def create_prediction_overlay(self, image_array, binary_mask, color=(0, 255, 0), alpha=0.5):
        """
        Create overlay of prediction on original image.

        Parameters:
            image_array: Image indexed as [row, column, channel]; it is copied,
                never modified (assumed float in [0, 1], since the tint is
                scaled by 1/255 - TODO confirm)
            binary_mask: Mask whose non-zero pixels receive the tint
            color: Tint as a sequence of 0-255 channel values; the default is
                an immutable tuple so it cannot be altered between calls
            alpha: Weight of the tint; the image keeps (1 - alpha)

        Returns:
            Copy of image_array with masked pixels blended towards color
        """
        overlay = image_array.copy()
        mask_indices = binary_mask > 0

        for c, color_val in enumerate(color):
            overlay[mask_indices, c] = (1 - alpha) * overlay[mask_indices, c] + \
                                    alpha * (color_val / 255.0)

        return overlay

    def save_confidence_heatmap(self, confidence_map, save_path):
        """
        Save confidence/probability map as heatmap.

        The map is drawn with the 'hot' colormap on a fixed 0-1 colour scale,
        with a colorbar and a title, and written to save_path at 150 dpi. The
        figure is closed even when drawing or saving fails, so repeated calls
        do not accumulate open figures.

        Parameters:
            confidence_map: 2D array of values to display
            save_path: Destination file for the rendered figure
        """
        fig = plt.figure(figsize=(8, 8))
        try:
            plt.imshow(confidence_map, cmap='hot', vmin=0, vmax=1)
            plt.colorbar(label='Confidence')
            plt.title('Prediction Confidence Heatmap')
            plt.axis('off')
            plt.tight_layout()
            plt.savefig(save_path, dpi=150, bbox_inches='tight')
        finally:
            # Close this figure explicitly rather than whichever one is current
            plt.close(fig)

    def create_confidence_overlay(self, image_array, confidence_map, alpha=0.6):
        """
        Overlay confidence heatmap on original image.

        Parameters:
            image_array: Image to blend with (assumed H x W x 3 in [0, 1], to
                match the three colour channels taken from the colormap -
                TODO confirm)
            confidence_map: 2D array mapped through the 'hot' colormap
            alpha: Weight of the heatmap; the image keeps (1 - alpha)

        Returns:
            Blended image clipped to [0, 1]
        """
        # matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in
        # 3.9; pyplot.get_cmap is the lookup that exists in old and new releases
        colormap = plt.get_cmap('hot')

        # The colormap returns RGBA; drop the alpha channel
        confidence_rgb = colormap(confidence_map)[:, :, :3]

        # Blend with original image
        overlay = (1 - alpha) * image_array + alpha * confidence_rgb
        overlay = np.clip(overlay, 0, 1)

        return overlay

    def create_html_index(self, results, output_dir):
        """
        Create HTML index page for viewing results.

        Writes 'index.html' into output_dir with one card per result showing
        four thumbnails and the confidence statistics. The file is written as
        UTF-8 and declares that charset, so the "±" sign renders correctly
        regardless of the platform default encoding. Image names are
        HTML-escaped where displayed and URL-quoted inside the image links.

        Parameters:
            results: List of dictionaries with 'image_name' and
                'confidence_stats' (keys 'mean', 'std', 'min', 'max',
                'pixels_above_50', 'pixels_above_90')
            output_dir: Directory that receives index.html; image links are
                relative to it, in the form "<name>/<name>_<kind>.png"
        """
        from html import escape
        from urllib.parse import quote

        # Collected in a list and joined once instead of growing a string
        page_parts = ["""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <title>Ensemble Inference Results</title>
            <style>
                body { font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }
                .container { max-width: 1400px; margin: 0 auto; }
                .image-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; }
                .image-card { background: white; border-radius: 8px; padding: 15px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
                .image-name { font-weight: bold; margin-bottom: 10px; color: #333; }
                .thumbnails { display: grid; grid-template-columns: repeat(2, 1fr); gap: 10px; margin-bottom: 10px; }
                .thumbnail { width: 100%; cursor: pointer; border: 2px solid transparent; transition: border-color 0.3s; }
                .thumbnail:hover { border-color: #007bff; }
                .stats { font-size: 12px; color: #666; background: #f8f8f8; padding: 10px; border-radius: 4px; }
                .stat-row { margin: 2px 0; }
                h1 { color: #333; text-align: center; }
                .summary { background: white; padding: 20px; border-radius: 8px; margin-bottom: 30px; }
            </style>
        </head>
        <body>
            <div class="container">
                <h1>Ensemble Inference Results</h1>
                <div class="summary">
                    <h3>Summary</h3>
                    <p><strong>Total Images:</strong> """ + str(len(results)) + """</p>
                    <p><strong>Timestamp:</strong> """ + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + """</p>
                </div>
                <div class="image-grid">
        """]

        for result in results:
            name = result['image_name']
            stats = result['confidence_stats']

            # File names may contain characters that are special in HTML or in
            # URLs: escape the visible label, percent-encode the link prefix
            label = escape(name)
            src_prefix = quote(f"{name}/{name}")

            page_parts.append(f"""
                <div class="image-card">
                    <div class="image-name">{label}</div>
                    <div class="thumbnails">
                        <img class="thumbnail" src="{src_prefix}_original.png" alt="Original" title="Original">
                        <img class="thumbnail" src="{src_prefix}_overlay.png" alt="Prediction Overlay" title="Prediction Overlay">
                        <img class="thumbnail" src="{src_prefix}_confidence.png" alt="Confidence Heatmap" title="Confidence Heatmap">
                        <img class="thumbnail" src="{src_prefix}_confidence_overlay.png" alt="Confidence Overlay" title="Confidence Overlay">
                    </div>
                    <div class="stats">
                        <div class="stat-row"><strong>Confidence Stats:</strong></div>
                        <div class="stat-row">Mean: {stats['mean']:.3f} ± {stats['std']:.3f}</div>
                        <div class="stat-row">Range: [{stats['min']:.3f}, {stats['max']:.3f}]</div>
                        <div class="stat-row">Pixels &gt;50%: {stats['pixels_above_50']:,}</div>
                        <div class="stat-row">Pixels &gt;90%: {stats['pixels_above_90']:,}</div>
                    </div>
                </div>
            """)

        page_parts.append("""
                </div>
            </div>
            <script>
                // Make images clickable to view full size
                document.querySelectorAll('.thumbnail').forEach(img => {
                    img.addEventListener('click', () => {
                        window.open(img.src, '_blank');
                    });
                });
            </script>
        </body>
        </html>
        """)

        # Explicit encoding: the page contains non-ASCII text
        with open(Path(output_dir) / 'index.html', 'w', encoding='utf-8') as f:
            f.write("".join(page_parts))

# Convenience function for running inference
def run_inference_on_new_images(
    viz_system,
    df_model_results,
    unseen_image_paths,
    output_dir,
    models_to_use=None,
    n_folds=5,
    device_id=0,
):
    """
    Run inference using pre-defined k-fold ensembles on new images.

    For every (architecture, backbone) pair, the first matching row of
    ``df_model_results`` is collected for each fold in ``range(n_folds)``.
    The collected rows are passed as one ensemble to
    ``viz_system.inference_on_unseen_images``. Pairs without any matching
    row are reported and skipped.

    Parameters:
        viz_system: LaTeXVisualizationSystem instance
        df_model_results: DataFrame with 'architecture', 'backbone' and
            'validation_fold' columns
        unseen_image_paths: List of paths to new images
        output_dir: Directory to save results
        models_to_use: Optional iterable of (architecture, backbone) tuples.
            Defaults to the six models used for the k-fold ensembles.
        n_folds: Number of validation folds to look up (default 5)
        device_id: Device index forwarded to the visualization system
            (default 0)
    """
    if models_to_use is None:
        # Define the same models as in k-fold ensemble
        models_to_use = [
            ('unet', 'tu-mambaout_small'),
            ('upernet', 'tu-efficientnetv2_rw_s.ra2_in1k'),
            ('segformer', 'tu-mambaout_base'),
            ('linknet', 'timm-efficientnet-b5'),
            ('unetplusplus', 'tu-efficientnetv2_rw_s.ra2_in1k'),
            ('segformer', 'tu-regnety_080.ra3_in1k'),
        ]

    banner = '=' * 60

    # Process each model ensemble
    for arch, backbone in models_to_use:
        model_name = f"{arch}_{backbone}"
        print(f"\n{banner}")
        print(f"Processing ensemble: {model_name}")
        print(f"{banner}")

        # The architecture/backbone filter does not depend on the fold,
        # so compute it once per model instead of once per fold.
        same_model = (
            (df_model_results['architecture'] == arch)
            & (df_model_results['backbone'] == backbone)
        )

        # Get all k-fold models for this architecture-backbone combination
        model_infos = []
        for fold in range(n_folds):
            fold_models = df_model_results[
                same_model & (df_model_results['validation_fold'] == fold)
            ]
            if not fold_models.empty:
                # Only the first matching row per fold joins the ensemble
                model_infos.append(fold_models.iloc[0].to_dict())

        if not model_infos:
            print(f"No models found for {model_name}")
            continue

        print(f"Found {len(model_infos)} fold models")

        # Run inference
        viz_system.inference_on_unseen_images(
            model_infos,
            unseen_image_paths,
            output_dir,
            ensemble_name=model_name,
            device_id=device_id,
        )


In [None]:
if EVALUER_ENSEMBLE:
    # When MAJ_DATASET is falsy the evaluation frames and the test paths are
    # read back from disk; otherwise they are expected to exist already
    # (presumably from earlier cells -- verify).
    if not MAJ_DATASET:
        logger.info("Loading existing results...")
        df_model_results = pd.read_parquet(SAVED_MODEL_RESULTS_PARQUET_PATH)
        df_per_image_results = pd.read_parquet(SAVED_PER_IMAGE_RESULTS_PARQUET_PATH)

        # Keep one row per (architecture, backbone, fold): sorting by test IoU
        # first means the surviving row is the one with the highest score.
        df_model_results = df_model_results.sort_values('eval_test_iou', ascending=False)
        df_model_results = df_model_results.drop_duplicates(
            subset=['architecture', 'backbone', 'validation_fold'],
            keep='first',
        )
        df_model_results = df_model_results.reset_index(drop=True)

        # Load test paths
        test_paths = load_test_data(TEST_DATASET_FILE, DATASET_IMAGES_DIR, DATASET_MASKS_DIR)

    # Initialize visualization system
    viz_system = LaTeXVisualizationSystem(img_size=IMG_SIZE)

    # Simple function to get model indices by name and fold
    def get_model_indices(df, architecture, backbone, fold=None):
        """
        Get positional row indices by architecture, backbone, and optionally fold.

        The returned integers are row positions (suitable for ``df.iloc``),
        not index labels. The two coincide for a default RangeIndex, but
        positions stay correct when the frame carries any other index.

        Parameters:
            df: DataFrame with 'architecture', 'backbone' and
                'validation_fold' columns
            architecture (str): Architecture name to match
            backbone (str): Backbone name to match
            fold: Validation fold to match, or None to match every fold

        Returns:
            list: Positions of the matching rows, in frame order
        """
        mask = (df['architecture'] == architecture) & (df['backbone'] == backbone)
        if fold is not None:
            mask = mask & (df['validation_fold'] == fold)
        return np.flatnonzero(mask.to_numpy()).tolist()

    # Define the models to ensemble
    models_to_ensemble = [
        ('unet', 'tu-mambaout_small'),                          # top1 mAP@0.95
        ('upernet', 'tu-efficientnetv2_rw_s.ra2_in1k'),        # top2 mAP@0.95
        ('segformer', 'tu-mambaout_base'),                     # top1 mAP@0.75
        ('linknet', 'timm-efficientnet-b5'),                   # top1 mAP@0.5
        ('unetplusplus', 'tu-efficientnetv2_rw_s.ra2_in1k'),   # top2 mAP@0.5
        ('segformer', 'tu-regnety_080.ra3_in1k'),              # top2 IoU
    ]

    # Dictionary to store ensemble results
    ensemble_results = {}
    individual_fold_results = {}

    # Process each model separately
    print("\n" + "=" * 60)
    print("Creating K-Fold Ensembles for Each Model")
    print("=" * 60)

    for arch, backbone in models_to_ensemble:
        model_name = f"{arch}_{backbone}"
        print(f"\n{'='*40}")
        print(f"Processing: {model_name}")
        print(f"{'='*40}")

        # Get all folds for this model
        fold_indices = []
        fold_performances = []

        for fold in range(5):  # 5-fold CV
            indices = get_model_indices(df_model_results, arch, backbone, fold)
            if not indices:
                continue

            fold_indices.extend(indices)
            # Get individual fold performance (first matching row of the fold)
            fold_data = df_model_results.iloc[indices[0]]
            fold_performances.append({
                'fold': fold,
                'iou': fold_data['eval_test_iou'],
                'f1': fold_data['eval_test_f1_score'],
                'map50': fold_data['eval_test_map_50']
            })
            print(f"  Fold {fold}: IoU = {fold_data['eval_test_iou']:.4f}")

        if not fold_indices:
            print(f"  No models found for {model_name}")
            continue

        # Store individual fold results
        individual_fold_results[model_name] = fold_performances

        # Calculate statistics
        ious = [f['iou'] for f in fold_performances]
        mean_iou = np.mean(ious)
        std_iou = np.std(ious)

        print("\n  Individual Fold Statistics:")
        print(f"     Mean IoU: {mean_iou:.4f} ± {std_iou:.4f}")
        print(f"     Best fold: {max(ious):.4f}")
        print(f"     Worst fold: {min(ious):.4f}")

        # Create ensemble for this model across all folds
        print(f"\n  Creating ensemble with {len(fold_indices)} folds...")

        # Get model infos for ensemble
        model_infos = [df_model_results.iloc[idx].to_dict() for idx in fold_indices]

        # Create unique folder for each model
        model_specific_dir = VIZ_OUTPUT_DIR / "kfold_ensembles" / model_name

        # Use process_ensemble_models with model-specific directory
        ensemble_result = viz_system.process_ensemble_models(
            model_infos,
            test_paths,
            model_specific_dir,
            device_id=0
        )

        # Extract results. Everything stays None unless the ensemble run
        # returned a result (and, for the scores, a mean IoU).
        actual_output = None
        ensemble_iou = None
        improvement = None
        improvement_best = None

        if ensemble_result:
            # Get the actual output path
            # NOTE(review): assumes process_ensemble_models writes into
            # 'ensemble_5_models' even when fewer than 5 folds were found -- confirm.
            actual_output = model_specific_dir / 'ensemble_5_models'

            # Get ensemble IoU directly from result
            ensemble_iou = ensemble_result.get('mean_iou')

            # Compare against None explicitly: a legitimate IoU of 0.0 is
            # falsy and must not be reported as a missing result.
            if ensemble_iou is not None:
                best_fold_iou = max(ious)
                improvement = ((ensemble_iou - mean_iou) / mean_iou) * 100
                improvement_best = ((ensemble_iou - best_fold_iou) / best_fold_iou) * 100

                print("\n  Ensemble Results:")
                print(f"     Ensemble IoU: {ensemble_iou:.4f}")
                print(f"     Improvement over mean: {improvement:.2f}%")
                print(f"     Improvement over best fold: {improvement_best:.2f}%")

        # Store ensemble result
        ensemble_results[model_name] = {
            'output_path': str(actual_output) if actual_output else None,
            'fold_indices': fold_indices,
            'num_folds': len(fold_indices),
            'individual_fold_performances': fold_performances,
            'fold_mean_iou': mean_iou,
            'fold_std_iou': std_iou,
            'ensemble_result': ensemble_result,
            'ensemble_iou': ensemble_iou,
            'improvement_percent': improvement,
            'improvement_best_percent': improvement_best
        }

    # Create Summary Report
    print("\n" + "=" * 60)
    print("Creating K-Fold Ensemble Report")
    print("=" * 60)

    # Summary statistics: only ensembles that produced an IoU count as successful
    successful_ensembles = [r for r in ensemble_results.values() if r.get('ensemble_iou') is not None]
    all_improvements = [r['improvement_percent'] for r in successful_ensembles if r.get('improvement_percent') is not None]

    print("\nSUMMARY STATISTICS:")
    print(f"Models processed: {len(ensemble_results)}")
    print(f"Successful ensembles: {len(successful_ensembles)}")

    if all_improvements:
        print("\nImprovement over mean fold:")
        print(f"  - Average: {np.mean(all_improvements):.2f}%")
        print(f"  - Best: {max(all_improvements):.2f}%")
        print(f"  - Worst: {min(all_improvements):.2f}%")

    # Detailed results
    print("\n" + "-" * 60)
    print("DETAILED RESULTS BY MODEL:")
    print("-" * 60)

    for model_name, result in ensemble_results.items():
        if result.get('ensemble_iou') is None:
            continue

        best_fold_iou = max(fold['iou'] for fold in result['individual_fold_performances'])

        print(f"\n{model_name}:")
        print(f"  Folds: {result['num_folds']}")
        print(f"  Mean fold IoU: {result['fold_mean_iou']:.4f} ± {result['fold_std_iou']:.4f}")
        print(f"  Best fold IoU: {best_fold_iou:.4f}")
        print(f"  Ensemble IoU: {result['ensemble_iou']:.4f}")
        # The '+' format flag prints the real sign; a literal '+' prefix would
        # render a negative improvement as '+-1.23%'.
        print(f"  Improvement: {result['improvement_percent']:+.2f}% (vs mean), {result['improvement_best_percent']:+.2f}% (vs best)")
        print(f"  Output: {result['output_path']}")

    # Create Detailed Report
    # Markdown renders lines indented by four or more spaces as a code block,
    # so the report is assembled from explicit lines that all start at column 0
    # rather than from an indented triple-quoted template.
    report_lines = [
        "# K-Fold Ensemble Analysis Report",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "## Executive Summary",
        "This report analyzes the performance improvement achieved by ensembling models across their k-fold variants.",
        "",
        "## Dataset Information",
        f"- Total test images: {len(test_paths['images'])}",
        "- Number of folds: 5",
        f"- Models evaluated: {len(models_to_ensemble)}",
        "",
        "## Detailed Results by Model",
        "",
    ]

    # One section per model whose ensemble produced an IoU
    for model_name, result in ensemble_results.items():
        if result.get('ensemble_iou') is None:
            continue

        fold_ious = [fold['iou'] for fold in result['individual_fold_performances']]
        report_lines.extend([
            f"### {model_name}",
            f"- **Number of folds ensembled**: {result['num_folds']}",
            f"- **Individual fold IoUs**: {[f'{iou:.4f}' for iou in fold_ious]}",
            f"- **Mean fold IoU**: {result['fold_mean_iou']:.4f} ± {result['fold_std_iou']:.4f}",
            f"- **Best single fold**: {max(fold_ious):.4f}",
            f"- **Ensemble IoU**: {result['ensemble_iou']:.4f}",
            f"- **Improvement over mean**: {result['improvement_percent']:.2f}%",
            f"- **Output directory**: `{result.get('output_path') or 'Not generated'}`",
            "",
        ])

    # Summary statistics
    if all_improvements:
        report_lines.extend([
            "## Overall Statistics",
            "",
            f"- **Average improvement**: {np.mean(all_improvements):.2f}%",
            f"- **Best improvement**: {max(all_improvements):.2f}%",
            f"- **Worst improvement**: {min(all_improvements):.2f}%",
            "",
        ])

    # The last entry is always an empty string, so the text ends with a newline
    report_content = "\n".join(report_lines)

    # The report contains '±', so fix the encoding instead of relying on the
    # platform default.
    with open(VIZ_OUTPUT_DIR / 'KFOLD_ENSEMBLE_REPORT.md', 'w', encoding='utf-8') as f:
        f.write(report_content)

    print(f"\nReport saved to: {VIZ_OUTPUT_DIR / 'KFOLD_ENSEMBLE_REPORT.md'}")

In [None]:
if EVALUER_ENSEMBLE:
    # Create LaTeX Table
    latex_table = """\\begin{table}[h]
    \\centering
    \\caption{K-Fold Ensemble Performance Comparison}
    \\label{tab:kfold_ensemble}
    \\begin{tabular}{|l|c|c|c|c|c|}
    \\hline
    \\textbf{Model} & \\textbf{Folds} & \\textbf{Mean Fold IoU} & \\textbf{Best Fold} & \\textbf{Ensemble IoU} & \\textbf{Improvement} \\\\
    \\hline
    """

    # Longest model name (in raw characters) shown before it is abbreviated
    max_name_length = 35

    # One table row per model whose ensemble produced an IoU
    for model_name, result in ensemble_results.items():
        if result.get('ensemble_iou') is None:
            continue

        # Truncate the raw name first and escape afterwards: slicing an
        # already-escaped string could cut a '\_' in half and leave a
        # dangling backslash in the table.
        if len(model_name) > max_name_length:
            display_name = model_name[:max_name_length] + '...'
        else:
            display_name = model_name
        model_name_tex = display_name.replace('_', '\\_')

        fold_ious = [fold['iou'] for fold in result['individual_fold_performances']]

        # '$\pm$' is plain ASCII and independent of the document's input
        # encoding; the '+' format flag prints the real sign of the change.
        latex_table += (
            f"{model_name_tex} & {result['num_folds']} & "
            f"{result['fold_mean_iou']:.3f} $\\pm$ {result['fold_std_iou']:.3f} & "
            f"{max(fold_ious):.3f} & "
            f"\\textbf{{{result['ensemble_iou']:.3f}}} & "
            f"{result['improvement_percent']:+.1f}\\% \\\\\n"
        )

    latex_table += """\\hline
    \\end{tabular}
    \\end{table}"""

    with open(VIZ_OUTPUT_DIR / 'kfold_ensemble_table.tex', 'w', encoding='utf-8') as f:
        f.write(latex_table)

    print(f"LaTeX table saved to: {VIZ_OUTPUT_DIR / 'kfold_ensemble_table.tex'}")

In [None]:
if EVALUER_ENSEMBLE:
    # Configuration for LaTeX figure generation
    
    def sanitize_label(text):
        """
        Build a LaTeX-safe label from arbitrary text.

        Every occurrence of 'timm-' and 'tu-' is dropped, all characters
        other than ASCII letters and digits are discarded, and the result
        is lower-cased.

        Parameters:
            text (str): Input text to sanitize

        Returns:
            str: LaTeX-compatible label string
        """
        # Drop the backbone library markers wherever they occur
        for marker in ('timm-', 'tu-'):
            text = text.replace(marker, '')
        # Collect only the alphanumeric characters that remain
        kept_characters = re.findall(r'[a-zA-Z0-9]', text)
        return ''.join(kept_characters).lower()

    def escape_latex(text):
        """
        Escape special LaTeX characters in text.

        All ten LaTeX special characters are handled: ``_ % & # $ { }`` get
        a backslash prefix, while ``\\``, ``~`` and ``^`` are replaced by
        their text-mode commands. The replacement happens in a single pass,
        so backslashes and braces introduced by one replacement are never
        escaped a second time.

        Parameters:
            text (str): Input text containing LaTeX special characters

        Returns:
            str: Text with escaped special characters
        """
        replacements = {
            '\\': r'\textbackslash{}',
            '_': r'\_',
            '%': r'\%',
            '&': r'\&',
            '#': r'\#',
            '$': r'\$',
            '{': r'\{',
            '}': r'\}',
            '~': r'\textasciitilde{}',
            '^': r'\textasciicircum{}',
        }
        return text.translate(str.maketrans(replacements))

    def format_model_name(model_name):
        """
        Format model name for improved display in LaTeX.

        The name is split at its first underscore into architecture and
        backbone. Every 'timm-' / 'tu-' occurrence is removed from the
        backbone. Known architectures are mapped (case-insensitively) to
        their conventional spelling; any other architecture is capitalized.

        Parameters:
            model_name (str): Combined model name (architecture_backbone)

        Returns:
            tuple: (formatted_architecture, formatted_backbone); when the
                name contains no underscore, ``(model_name, '')``
        """
        arch, separator, backbone = model_name.partition('_')
        if not separator:
            return model_name, ''

        # Clean up common prefixes
        backbone = backbone.replace('timm-', '').replace('tu-', '')

        # Conventional spellings; 'unet' is listed so it renders as 'UNet',
        # consistent with 'UNet++', instead of the plain-capitalized 'Unet'.
        display_names = {
            'unet': 'UNet',
            'unetplusplus': 'UNet++',
            'upernet': 'UPerNet',
            'segformer': 'SegFormer',
            'linknet': 'LinkNet',
            'fpn': 'FPN',
            'pspnet': 'PSPNet',
            'pan': 'PAN',
            'manet': 'MAnet',
            'deeplabv3': 'DeepLabV3',
            'deeplabv3plus': 'DeepLabV3+',
        }
        return display_names.get(arch.lower(), arch.capitalize()), backbone

    def get_image_files(directory, case_type='best', case_number=1):
        """
        Retrieve all image files for a specific test case.

        Looks for files named ``{case_type}_{case_number}_iou*_{kind}.png``
        where kind is one of original, gt, pred, overlay_gt, overlay_pred.
        A file belonging to a more specific kind is never returned for a
        shorter one (``*_overlay_gt.png`` also matches the ``*_gt.png``
        glob, so it is filtered out of the 'gt' candidates). When several
        files remain for one kind, the first in sorted order is used so the
        choice does not depend on directory listing order.

        Parameters:
            directory (Path): Directory containing case images
            case_type (str): Type of case ('best' or 'worst')
            case_number (int): Case index number

        Returns:
            tuple: (files_dict, iou_score)
                - files_dict: Dictionary mapping image types to file paths
                  (only the types that were found)
                - iou_score: IoU parsed from the first found filename, or
                  None when nothing was found or no IoU could be parsed
        """
        suffixes = {
            'original': '_original.png',
            'gt': '_gt.png',
            'pred': '_pred.png',
            'overlay_gt': '_overlay_gt.png',
            'overlay_pred': '_overlay_pred.png',
        }
        prefix = f'{case_type}_{case_number}_iou'

        files = {}
        for key, suffix in suffixes.items():
            # Longer suffixes that end with this one would be matched by the
            # same glob pattern and must be excluded.
            shadowing = tuple(
                other for other in suffixes.values()
                if other != suffix and other.endswith(suffix)
            )
            candidates = sorted(
                path for path in directory.glob(f'{prefix}*{suffix}')
                if not path.name.endswith(shadowing)
            )
            if candidates:
                files[key] = candidates[0]

        # Extract IoU from filename
        iou = None
        if files:
            first_file = next(iter(files.values()))
            match = re.search(r'iou([\d.]+)', first_file.name)
            if match:
                iou = float(match.group(1))

        return files, iou

    def load_summary_data(model_output_path):
        """
        Read ``summary.json`` from a model output directory.

        Parameters:
            model_output_path (Path): Path to model output directory

        Returns:
            dict: Parsed summary data, or None when the file does not exist
        """
        summary_path = Path(model_output_path) / 'summary.json'

        # Nothing to load when the summary file is absent
        if not summary_path.exists():
            return None

        with open(summary_path, 'r') as summary_file:
            summary_data = json.load(summary_file)
        return summary_data

    # Initialize LaTeX document structure
    latex_figures = r"""% LaTeX code for K-Fold Ensemble Visualization Figures
% Add this to your preamble:
% \usepackage{graphicx}
% \usepackage{subcaption}
% \usepackage{float}

% IMPORTANT: Set the graphics path to your kfold_ensembles directory
% \graphicspath{{./path/to/kfold_ensembles/}}

"""

    # Track all models for comparison figure
    all_models_best = []

    # Process each model
    for model_name, result in ensemble_results.items():
        if result.get('output_path') is None:
            continue
        
        # Extract path from stored result
        model_dir = Path(result['output_path'])
        
        # Check if pointing to ensemble_5_models subdirectory
        if model_dir.name == 'ensemble_5_models':
            model_dir = model_dir.parent
        
        # Ensemble results location
        ensemble_dir = model_dir / 'ensemble_5_models'
        
        if not ensemble_dir.exists():
            print(f"Warning: Ensemble directory not found for {model_name}: {ensemble_dir}")
            continue
        
        # Format model name for display
        arch_display, backbone_display = format_model_name(model_name)
        model_name_tex = escape_latex(model_name)
        arch_tex = escape_latex(arch_display)
        backbone_tex = escape_latex(backbone_display)
        model_label = sanitize_label(model_name)
        
        # Load summary for additional info
        summary = load_summary_data(ensemble_dir)
        
        # Get ensemble IoU from result
        ensemble_iou = result.get('ensemble_iou', 0)
        
        latex_figures += f"""
% Model: {arch_display} with {backbone_display}
% Ensemble IoU: {ensemble_iou:.4f}

"""
        
        # Process best cases
        best_cases_dir = ensemble_dir / "best_cases"
        if best_cases_dir.exists():
            # Get best case 1
            best_files, best_iou = get_image_files(best_cases_dir, 'best', 1)
            
            if best_files and all(k in best_files for k in ['original', 'gt', 'pred', 'overlay_pred']):
                # Store for comparison figure
                all_models_best.append({
                    'model_name': model_name,
                    'arch': arch_display,
                    'backbone': backbone_display,
                    'ensemble_iou': ensemble_iou,
                    'best_iou': best_iou,
                    'overlay_file': best_files['overlay_pred'].name,
                    'model_dir': model_dir.name
                })
                
                # Create figure for best case
                latex_figures += f"""% Best performing segmentation for {arch_tex}
\\begin{{figure}}[htbp]
\\centering
\\begin{{subfigure}}{{0.24\\textwidth}}
    \\includegraphics[width=\\textwidth]{{{model_dir.name}/ensemble_5_models/best_cases/{best_files['original'].name}}}
    \\caption{{Input Image}}
\\end{{subfigure}}
\\hfill
\\begin{{subfigure}}{{0.24\\textwidth}}
    \\includegraphics[width=\\textwidth]{{{model_dir.name}/ensemble_5_models/best_cases/{best_files['gt'].name}}}
    \\caption{{Ground Truth}}
\\end{{subfigure}}
\\hfill
\\begin{{subfigure}}{{0.24\\textwidth}}
    \\includegraphics[width=\\textwidth]{{{model_dir.name}/ensemble_5_models/best_cases/{best_files['pred'].name}}}
    \\caption{{Prediction}}
\\end{{subfigure}}
\\hfill
\\begin{{subfigure}}{{0.24\\textwidth}}
    \\includegraphics[width=\\textwidth]{{{model_dir.name}/ensemble_5_models/best_cases/{best_files['overlay_pred'].name}}}
    \\caption{{Overlay}}
\\end{{subfigure}}
\\caption{{Best segmentation result for {arch_tex} with {backbone_tex} backbone. K-fold ensemble of 5 models achieves IoU = {best_iou:.3f} on this image (ensemble mean IoU = {ensemble_iou:.3f}).}}
\\label{{fig:{model_label}best}}
\\end{{figure}}

"""
        
        # Process worst cases
        worst_cases_dir = ensemble_dir / "worst_cases"
        if worst_cases_dir.exists():
            # Get worst case 1
            worst_files, worst_iou = get_image_files(worst_cases_dir, 'worst', 1)
            
            if worst_files and all(k in worst_files for k in ['original', 'gt', 'pred', 'overlay_pred']):
                latex_figures += f"""% Challenging case for {arch_tex}
\\begin{{figure}}[htbp]
\\centering
\\begin{{subfigure}}{{0.24\\textwidth}}
    \\includegraphics[width=\\textwidth]{{{model_dir.name}/ensemble_5_models/worst_cases/{worst_files['original'].name}}}
    \\caption{{Input Image}}
\\end{{subfigure}}
\\hfill
\\begin{{subfigure}}{{0.24\\textwidth}}
    \\includegraphics[width=\\textwidth]{{{model_dir.name}/ensemble_5_models/worst_cases/{worst_files['gt'].name}}}
    \\caption{{Ground Truth}}
\\end{{subfigure}}
\\hfill
\\begin{{subfigure}}{{0.24\\textwidth}}
    \\includegraphics[width=\\textwidth]{{{model_dir.name}/ensemble_5_models/worst_cases/{worst_files['pred'].name}}}
    \\caption{{Prediction}}
\\end{{subfigure}}
\\hfill
\\begin{{subfigure}}{{0.24\\textwidth}}
    \\includegraphics[width=\\textwidth]{{{model_dir.name}/ensemble_5_models/worst_cases/{worst_files['overlay_pred'].name}}}
    \\caption{{Overlay}}
\\end{{subfigure}}
\\caption{{Most challenging case for {arch_tex} with {backbone_tex} backbone. This example shows the limitations with IoU = {worst_iou:.3f}.}}
\\label{{fig:{model_label}worst}}
\\end{{figure}}

"""

    # Create comparison figure with all models' best cases
    if all_models_best:
        # Highest ensemble IoU first
        all_models_best.sort(key=lambda entry: entry['ensemble_iou'], reverse=True)

        latex_figures += (
            "\n"
            "% Comparison of Best Cases Across All K-Fold Ensembles\n"
            "\n"
            "\\begin{figure}[htbp]\n"
            "\\centering\n"
        )

        # Use top 6 models, laid out three per row
        models_to_show = all_models_best[:6]
        last_position = len(models_to_show) - 1

        for position, entry in enumerate(models_to_show):
            # Vertical gap before every row except the first
            if position > 0 and position % 3 == 0:
                latex_figures += "\n\\vspace{0.5em}\n"

            image_path = f"{entry['model_dir']}/ensemble_5_models/best_cases/{entry['overlay_file']}"
            panel_caption = f"{entry['arch']} (IoU={entry['ensemble_iou']:.3f})"
            latex_figures += (
                "\\begin{subfigure}{0.32\\textwidth}\n"
                f"    \\includegraphics[width=\\textwidth]{{{image_path}}}\n"
                f"    \\caption{{{panel_caption}}}\n"
                "\\end{subfigure}"
            )

            # Separator after every subfigure but the last: a plain newline
            # at the end of a row, horizontal fill inside a row
            if position < last_position:
                ends_row = (position + 1) % 3 == 0
                latex_figures += "\n" if ends_row else "\n\\hfill"

        latex_figures += (
            "\n"
            "\\caption{Comparison of best segmentation results across different architectures using k-fold ensembles. Green overlays show the ensemble predictions. Models are ordered by mean ensemble IoU performance.}\n"
            "\\label{fig:kfoldcomparison}\n"
            "\\end{figure}\n"
            "\n"
        )

    # Add section for individual model comparisons
    latex_figures += """
% Individual Model Performance Across Multiple Cases

"""

    # Select best performing model to show multiple cases
    if all_models_best:
        best_model = all_models_best[0]
        model_dir = Path(VIZ_OUTPUT_DIR) / 'kfold_ensembles' / best_model['model_dir']
        ensemble_dir = model_dir / 'ensemble_5_models'
        best_cases_dir = ensemble_dir / 'best_cases'
        
        if best_cases_dir.exists():
            # Get top 3 best cases for this model
            latex_figures += f"""% Multiple successful cases for best model: {best_model['arch']}
\\begin{{figure}}[htbp]
\\centering
"""
            
            for case_num in range(1, 4):  # Cases 1, 2, 3
                files, iou = get_image_files(best_cases_dir, 'best', case_num)
                if files and 'overlay_pred' in files:
                    latex_figures += f"""\\begin{{subfigure}}{{0.32\\textwidth}}
    \\includegraphics[width=\\textwidth]{{{best_model['model_dir']}/ensemble_5_models/best_cases/{files['overlay_pred'].name}}}
    \\caption{{Case {case_num}: IoU={iou:.3f}}}
\\end{{subfigure}}"""
                    
                    if case_num < 3:
                        latex_figures += "\n\\hfill"
            
            latex_figures += f"""
\\caption{{Top 3 segmentation results for {best_model['arch']} with {best_model['backbone']} backbone, demonstrating consistent high performance across different dermoscopy images.}}
\\label{{fig:bestmodelmultiple}}
\\end{{figure}}

"""

    # Add usage instructions (LaTeX comment lines appended to the figure file)
    usage_lines = [
        "",
        "% USAGE INSTRUCTIONS",
        "%",
        "% 1. Copy the kfold_ensembles directory to your LaTeX project:",
        f"%    cp -r {VIZ_OUTPUT_DIR / 'kfold_ensembles'} ./figures/",
        "%",
        "% 2. In your LaTeX document preamble, add:",
        "%    \\graphicspath{{./figures/kfold_ensembles/}}",
        "%",
        "% 3. Include the figures in your document:",
        "%    \\input{kfold_ensemble_figures_improved.tex}",
        "%",
        "% 4. Or copy individual figure environments to your document",
        "",
        "",
    ]
    latex_figures += "\n".join(usage_lines)

    # Save the improved LaTeX file
    output_file = VIZ_OUTPUT_DIR / 'kfold_ensemble_figures_improved.tex'
    with open(output_file, mode='w') as tex_file:
        tex_file.write(latex_figures)

    print(f"\nImproved LaTeX figure snippets saved to: {output_file}")

    # Create summary of generated figures
    summary_text = f"""LaTeX Figure Generation Summary
==============================

Total models processed: {len(all_models_best)}
Figures generated per model: 2 (best case + worst case)
Comparison figures: 2

Models included (ordered by performance):
"""

    for i, model in enumerate(all_models_best, 1):
        summary_text += f"{i}. {model['arch']} with {model['backbone']} - Ensemble IoU: {model['ensemble_iou']:.4f}\n"

    summary_text += f"""
Output files:
- LaTeX figures: {output_file}
- This summary: {VIZ_OUTPUT_DIR / 'latex_generation_summary.txt'}

To use in your LaTeX document:
1. Copy the kfold_ensembles folder to your LaTeX project
2. Add \\graphicspath{{./figures/kfold_ensembles/}} to your preamble
3. Include the generated .tex file or copy individual figures
"""

    with open(VIZ_OUTPUT_DIR / 'latex_generation_summary.txt', 'w') as f:
        f.write(summary_text)

    print("\nSummary of generated figures:")
    print(summary_text)

    # Save results to JSON
    results_summary = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'models_evaluated': len(models_to_ensemble),
        'successful_ensembles': len(successful_ensembles),
        'average_improvement': float(np.mean(all_improvements)) if all_improvements else None,
        'ensemble_results': ensemble_results
    }

    # default=str stringifies any value json cannot serialize natively
    with open(VIZ_OUTPUT_DIR / 'kfold_ensemble_results.json', 'w') as f:
        json.dump(results_summary, f, indent=2, default=str)

    print("\nK-Fold ensemble analysis complete")
    print(f"Results saved to: {VIZ_OUTPUT_DIR / 'kfold_ensembles/'}")
    print(f"Summary saved to: {VIZ_OUTPUT_DIR / 'kfold_ensemble_results.json'}")
    print("LaTeX outputs:")
    print(f"   - Table: {VIZ_OUTPUT_DIR / 'kfold_ensemble_table.tex'}")
    # Report the name the figure file is actually written under
    print(f"   - Figures: {VIZ_OUTPUT_DIR / 'kfold_ensemble_figures_improved.tex'}")
    print(f"   - Report: {VIZ_OUTPUT_DIR / 'KFOLD_ENSEMBLE_REPORT.md'}")

## Neighborhood Analysis
### Image Pre-processing and Building Mask Application

In [None]:
# Input data configuration: the six tiles to mask, all from one directory
_TILE_DIR = "data/notebook_04/geotiff/tile_1024_split"
images_to_process = [
    f"{_TILE_DIR}/{tile_name}"
    for tile_name in (
        "24991118_tile_10_7_18eac4.tif",
        "24991118_tile_10_8_2931b8.tif",
        "24991118_tile_11_7_4db66f.tif",
        "24991118_tile_11_8_6b34a7.tif",
        "24991118_tile_12_7_7ce219.tif",
        "24991118_tile_12_8_83485c.tif",
    )
]

# Output directory (created together with any missing parents)
output_dir = QUARTIER_OUTPUT_DIR / "masked_images"
output_dir.mkdir(parents=True, exist_ok=True)

# Load building polygons
gdf_toitures = gpd.read_file("data/SITG/CAD_BATIMENT_HORSOL_TOIT_2024-11-03.gpkg")
building_count = len(gdf_toitures)
print(f"Loaded {building_count} building polygons")

# Process each image
for image_path in images_to_process:
    print(f"\nProcessing: {image_path}")
    
    with rasterio.open(image_path) as src:
        print(f"  Original - Bands: {src.count}, Dtype: {src.dtypes[0]}")
        print(f"  Original - Size: {src.width} x {src.height}")
        print(f"  Original - Data range: {src.read(1).min()} to {src.read(1).max()}")
        
        # Check original color interpretation
        original_colorinterp = src.colorinterp
        print(f"  Original color interpretation: {original_colorinterp}")
        
        # Get profile for output
        profile = src.profile.copy()
        
        # Reproject polygons if needed
        if gdf_toitures.crs != src.crs:
            gdf_reproj = gdf_toitures.to_crs(src.crs)
        else:
            gdf_reproj = gdf_toitures
        
        # Find intersecting buildings
        image_bbox = box(*src.bounds)
        intersecting = gdf_reproj[gdf_reproj.intersects(image_bbox)]
        
        if len(intersecting) == 0:
            print(f"  Warning: No buildings found in {image_path}")
            continue
        
        print(f"  Found {len(intersecting)} buildings")
        
        # Apply mask to keep buildings
        geometries = intersecting.geometry.values
        masked_data, masked_transform = mask(
            src, 
            geometries, 
            crop=False,
            invert=False,    # Keep inside buildings
            nodata=None,     # Don't set nodata to 0
            filled=True
        )
        
        print(f"  Masked data shape: {masked_data.shape}")
        print(f"  Masked data range: {masked_data.min()} to {masked_data.max()}")
        
        # Calculate coverage
        non_zero_pixels = np.count_nonzero(masked_data)
        total_pixels = masked_data.shape[1] * masked_data.shape[2]
        coverage_pct = (non_zero_pixels / total_pixels) * 100
        print(f"  Coverage: {coverage_pct:.2f}%")
        
        # Update profile maintaining original properties
        profile.update({
            'driver': 'GTiff',
            'compress': 'lzw',
            'tiled': True,
            'count': src.count,           # Keep original band count
            'dtype': src.dtypes[0],       # Keep original data type
            'width': masked_data.shape[2], # Update dimensions
            'height': masked_data.shape[1],
            'transform': masked_transform,  # Use masked transform
        })
        
        # Only set nodata if original had it
        if src.nodata is not None:
            profile['nodata'] = src.nodata
        
        # Save masked image
        base_name = Path(image_path).stem
        output_path = os.path.join(output_dir, f"{base_name}_masked.tif")
        
        with rasterio.open(output_path, 'w', **profile) as dst:
            # Write all bands
            dst.write(masked_data)
            
            # Set proper color interpretation for RGB images
            if src.count >= 3:
                # Set color interpretation to RGB
                dst.colorinterp = [ColorInterp.red, ColorInterp.green, ColorInterp.blue]
                if src.count == 4:
                    dst.colorinterp = [ColorInterp.red, ColorInterp.green, ColorInterp.blue, ColorInterp.alpha]
            else:
                # Copy original color interpretation
                dst.colorinterp = original_colorinterp
            
            # Copy over any additional metadata tags
            dst.update_tags(**src.tags())
            
            # Add custom metadata about masking
            dst.update_tags(
                PROCESSING='Building mask applied',
                MASK_COVERAGE=f'{coverage_pct:.2f}%',
                BUILDINGS_COUNT=str(len(intersecting))
            )
        
        print(f"  Saved: {output_path}")
        
        # Verify saved file
        with rasterio.open(output_path) as verify:
            print(f"  Verification - Bands: {verify.count}, Dtype: {verify.dtypes[0]}")
            print(f"  Verification - Color interp: {verify.colorinterp}")
            print(f"  Verification - Data range: {verify.read(1).min()} to {verify.read(1).max()}")
        
        # Create color preview for RGB images
        if src.count >= 3:
            print(f"  Creating color preview...")
            
            # Create RGB composite for display using percentile stretch
            rgb_display = np.zeros((masked_data.shape[1], masked_data.shape[2], 3), dtype=np.uint8)
            
            for i in range(3):
                band_data = masked_data[i].astype(np.float32)
                
                # Only process non-zero pixels (building areas)
                non_zero_mask = band_data > 0
                
                if np.any(non_zero_mask):
                    # Calculate percentiles only on non-zero values
                    non_zero_values = band_data[non_zero_mask]
                    p2, p98 = np.percentile(non_zero_values, [2, 98])
                    
                    # Stretch to 0-255 range
                    stretched = np.clip((band_data - p2) / (p98 - p2) * 255, 0, 255)
                    rgb_display[:, :, i] = stretched.astype(np.uint8)
            
            # Display comparison
            fig, axes = plt.subplots(1, 2, figsize=(12, 6))
            
            # Original RGB using same percentile stretch
            original_rgb = np.zeros((src.height, src.width, 3), dtype=np.uint8)
            original_data = src.read()
            
            for i in range(3):
                band_data = original_data[i].astype(np.float32)
                p2, p98 = np.percentile(band_data, [2, 98])
                stretched = np.clip((band_data - p2) / (p98 - p2) * 255, 0, 255)
                original_rgb[:, :, i] = stretched.astype(np.uint8)
            
            axes[0].imshow(original_rgb)
            axes[0].set_title('Original Color Image')
            axes[0].axis('off')
            
            axes[1].imshow(rgb_display)
            axes[1].set_title('Masked Color Image (Buildings Only)')
            axes[1].axis('off')
            
            plt.tight_layout()
            plt.show()

print(f"\nProcessing complete. Color masked images saved to: {output_dir}")

# Re-open each expected output from disk and report its basic properties.
# An input may have no output file (the processing loop skips images with no
# intersecting buildings), so absence is reported rather than raised.
print("\nFinal verification of all output files:")
for image_path in images_to_process:
    base_name = Path(image_path).stem
    output_path = os.path.join(output_dir, f"{base_name}_masked.tif")

    # Guard clause: nothing to inspect when the file was never written
    if not os.path.exists(output_path):
        print(f"  {base_name}_masked.tif: NOT FOUND")
        continue

    with rasterio.open(output_path) as check:
        print(f"  {base_name}_masked.tif:")
        print(f"     - Bands: {check.count} ({check.dtypes[0]})")
        print(f"     - Color interp: {check.colorinterp}")
        print(f"     - Size: {check.width} x {check.height}")
        print(f"     - CRS: {check.crs}")
        # Reads every band into memory to count the non-zero values
        print(f"     - Non-zero pixels: {np.count_nonzero(check.read()):,}")

print("\nNote: These files can be opened in QGIS/ArcGIS for full color display")

### Model Inference on Masked Images

In [None]:
import os
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from datetime import datetime
import json
import gc
import matplotlib.cm as cm

def run_inference_on_new_images(
    viz_system,
    df_model_results,
    unseen_image_paths,
    output_dir,
    *,
    models_to_use=None,
    n_folds=5,
    device_id=0,
):
    """
    Run k-fold ensemble inference on new images, one ensemble per model pair.

    For each (architecture, backbone) pair, the first row of df_model_results
    matching that pair is collected for every fold index in range(n_folds).
    The collected rows (converted to dicts) are passed to
    viz_system.inference_on_unseen_images together with the image paths and
    the output directory. Pairs without any matching row are skipped after
    printing a warning.

    Parameters:
        viz_system: Object exposing inference_on_unseen_images(...)
            (a LaTeXVisualizationSystem instance in this notebook)
        df_model_results: DataFrame with 'architecture', 'backbone' and
            'validation_fold' columns
        unseen_image_paths: List of paths to new images
        output_dir: Directory to save results
        models_to_use: Optional iterable of (architecture, backbone) pairs;
            when None, the six default pairs listed in the body are used
        n_folds: Number of fold indices (0 .. n_folds - 1) to look up
        device_id: Device index forwarded to inference_on_unseen_images
    """

    # Default ensembles, used when the caller does not supply its own list
    if models_to_use is None:
        models_to_use = [
            ('unet', 'tu-mambaout_small'),
            ('upernet', 'tu-efficientnetv2_rw_s.ra2_in1k'),
            ('segformer', 'tu-mambaout_base'),
            ('linknet', 'timm-efficientnet-b5'),
            ('unetplusplus', 'tu-efficientnetv2_rw_s.ra2_in1k'),
            ('segformer', 'tu-regnety_080.ra3_in1k'),
        ]

    # Process each model ensemble
    for arch, backbone in models_to_use:
        model_name = f"{arch}_{backbone}"
        print(f"\n{'='*60}")
        print(f"Processing ensemble: {model_name}")
        print(f"{'='*60}")

        # Narrow to this architecture-backbone pair once; the per-fold lookup
        # below then only has to compare the fold column.
        pair_rows = df_model_results[
            (df_model_results['architecture'] == arch)
            & (df_model_results['backbone'] == backbone)
        ]

        # Take the first matching row per fold, in ascending fold order.
        # Row order in df_model_results therefore decides which model wins
        # when a fold has several candidates.
        model_infos = []
        for fold in range(n_folds):
            fold_rows = pair_rows[pair_rows['validation_fold'] == fold]
            if not fold_rows.empty:
                model_infos.append(fold_rows.iloc[0].to_dict())

        if not model_infos:
            print(f"Warning: No models found for {model_name}")
            continue

        print(f"Found {len(model_infos)} fold models")

        # Run inference
        viz_system.inference_on_unseen_images(
            model_infos,
            unseen_image_paths,
            output_dir,
            ensemble_name=model_name,
            device_id=device_id
        )

In [None]:
# Masked tiles to run the ensembles on, resolved against QUARTIER_OUTPUT_DIR
unseen_images = [
    QUARTIER_OUTPUT_DIR / relative_path
    for relative_path in (
        "masked_images/24991118_tile_10_7_18eac4_masked.tif",
        "masked_images/24991118_tile_10_8_2931b8_masked.tif",
        "masked_images/24991118_tile_11_7_4db66f_masked.tif",
        "masked_images/24991118_tile_11_8_6b34a7_masked.tif",
        "masked_images/24991118_tile_12_7_7ce219_masked.tif",
        "masked_images/24991118_tile_12_8_83485c_masked.tif",
    )
]

# Reload the previously saved evaluation tables
logger.info("Loading existing results...")
df_model_results = pd.read_parquet(SAVED_MODEL_RESULTS_PARQUET_PATH)
df_per_image_results = pd.read_parquet(SAVED_PER_IMAGE_RESULTS_PARQUET_PATH)

# Keep a single row per (architecture, backbone, validation_fold). Sorting by
# eval_test_iou in descending order first means keep='first' retains the row
# with the highest test IoU in each group.
df_model_results = df_model_results.sort_values('eval_test_iou', ascending=False)
df_model_results = df_model_results.drop_duplicates(
    subset=['architecture', 'backbone', 'validation_fold'],
    keep='first',
)
df_model_results = df_model_results.reset_index(drop=True)

# Test-set paths (not used by the inference call below)
test_paths = load_test_data(TEST_DATASET_FILE, DATASET_IMAGES_DIR, DATASET_MASKS_DIR)

# Visualization system that performs the ensemble inference
viz_system = LaTeXVisualizationSystem(img_size=IMG_SIZE)

# Run every configured ensemble over the masked tiles
run_inference_on_new_images(
    viz_system=viz_system,
    df_model_results=df_model_results,
    unseen_image_paths=unseen_images,
    output_dir=QUARTIER_OUTPUT_DIR,
)