# Postprocessing annotations

## Imports

In [1]:
import datetime
import os
import random
import shutil
import tempfile
import traceback
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import geopandas as gpd
from shapely.geometry import Polygon, mapping

import rasterio
import rasterio.mask
from rasterio.crs import CRS
from rasterio.windows import from_bounds, Window

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor


## Variables

In [2]:
# Timestamp of this run; it makes the processed-dataset folder name unique.
todays_date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Inputs: raw annotated export and the CSV describing the sampled tiles.
DATASET_ANNOTATED_BRUT_PATH = "datasets/supervisely/341575_free_space_rooftop_geneva_20250511_binary_mask"
DATASET_TILES_INFORMATION_CSV_PATH = "data/notebook_06/dataset_20250405-193125/PNG_dataset_roboflow_20250405-193143/sampled_tiles.csv"

# Coordinate reference system expected for every raster and geometry (EPSG:2056).
CORRECT_CRS = CRS.from_epsg(2056)
EPSG_SUISSE = "EPSG:2056"

# Merged rooftop polygons
CAD_BATIMENT_HORSOL_TOIT_MERGE_PARQUET_PATH = "data/notebook_04/parquet/04_02_merged_rooftops_poly.parquet"

# Parquet outputs
VERIFICATION_OUTPUT_PARQUET_PATH = "data/notebook_06/parquet/06b_01_verification.parquet"
DATASET_OUTPUT_PARQUET_PATH = "data/notebook_06/parquet/06b_02_dataset_processed.parquet"
DATASET_FINAL_OUTPUT_PARQUET_PATH = "data/notebook_06/parquet/06b_03_dataset_final.parquet"

# Processed dataset: one timestamped folder per run with images / masks / checks inside.
DATASET_PROCESSED_NAME = f"dataset_processed_{todays_date}"
DATASET_PROCESSED_PATH = f"datasets/supervisely/{DATASET_PROCESSED_NAME}"
DATASET_OUTPUT_IMG_PATH = f"{DATASET_PROCESSED_PATH}/images"
DATASET_OUTPUT_MASKS_PATH = f"{DATASET_PROCESSED_PATH}/masks"
DATASET_OUTPUT_CHECKS_PATH = f"{DATASET_PROCESSED_PATH}/check_dataset"

# Buffer used for overlaps
BUFFER_DISTANCE = 0  # in metres
OVERLAP_POSITIONS = ['top', 'right', 'top-left', 'top-right']

# Create the output tree; no exist_ok, so an already existing folder raises.
for output_directory in (
    DATASET_PROCESSED_PATH,
    DATASET_OUTPUT_IMG_PATH,
    DATASET_OUTPUT_MASKS_PATH,
    DATASET_OUTPUT_CHECKS_PATH,
):
    os.makedirs(output_directory)

#! Regenerate the tiles in tile_1024_split from the 1.4 GB SITG GeoTIFFs. Takes about 1 hour.
#! Useful if the 1024 tiles were corrupted or deleted by mistake.
REGENERATE_TILE_1024_SPLIT_FROM_SITG = False
REGENERATE_TILE_1024_SPLIT_FROM_SITG_NUM_PROCESSES = 2
REGENERATE_TILE_1024_SPLIT_FROM_SITG_NUM_THREADS = 2
REGENERATE_TILE_1024_SPLIT_FROM_SITG_COMBINED_METADATA_PARQUET = "data/notebook_04/geotiff/tile_1024_split_old_20250519-120028/combined_metadata.parquet"
REGENERATE_TILE_1024_SPLIT_FROM_SITG_OUTPUT_DIR_TILE_1024 = "data/notebook_04/geotiff/tile_1024_split"
REGENERATE_TILE_1024_SPLIT_FROM_SITG_OUTPUT_DIR_TEMP_1280 = 'data/notebook_04/geotiff/tile_1280_split'

## Régénérer tile_1024_split si problème

Optionnel : à exécuter uniquement dans le cas où le dossier tile_1024_split a été modifié par erreur.

In [None]:
# Process a single tile
def process_tile(row, output_dir, debug_dir):
    """
    Cut one tile out of its source GeoTIFF and write it as a new GeoTIFF.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'buffered_bounds'
            (a 4-tuple or its "(minx, miny, maxx, maxy)" string form), 'tile_path'
            (only its basename is used for the output name) and 'geotiff_path'.
            Its ``name`` attribute is echoed back in the result.
        output_dir: Directory in which the tile is written.
        debug_dir: Unused here; kept so the signature matches the callers.

    Returns:
        tuple: (row.name, True, None) on success, otherwise
        (row.name, False, "Error: <message>\\n<traceback>").
    """
    try:
        raw_bounds = row['buffered_bounds']
        if isinstance(raw_bounds, str):
            # Textual form "(a, b, c, d)": drop blanks and parentheses, then parse floats.
            compact = raw_bounds.replace(' ', '')
            raw_bounds = tuple(float(part) for part in compact.strip('()').split(','))
        min_x, min_y, max_x, max_y = raw_bounds

        destination = os.path.join(output_dir, os.path.basename(row['tile_path']))

        with rasterio.open(row['geotiff_path']) as src:
            exact_window = from_bounds(min_x, min_y, max_x, max_y, src.transform)
            # Snap the fractional window to whole pixels.
            window = rasterio.windows.Window(
                col_off=int(round(exact_window.col_off)),
                row_off=int(round(exact_window.row_off)),
                width=int(round(exact_window.width)),
                height=int(round(exact_window.height))
            )
            fits_in_raster = (
                window.col_off >= 0
                and window.row_off >= 0
                and window.col_off + window.width <= src.width
                and window.row_off + window.height <= src.height
            )
            if not fits_in_raster:
                # Keep only the part of the window that lies inside the source raster.
                full_extent = rasterio.windows.Window(0, 0, src.width, src.height)
                window = window.intersection(full_extent)

            pixels = src.read(window=window)

            overrides = {
                'height': window.height,
                'width': window.width,
                'transform': rasterio.windows.transform(window, src.transform),
                'crs': CORRECT_CRS,
                'driver': 'GTiff',
                'compress': None,
                'predictor': 1,
                'tiled': False,
                'interleave': 'band',
                'bigtiff': True,
                'dtype': src.dtypes[0],
            }
            profile = src.profile.copy()
            profile.update(overrides)

            with rasterio.open(destination, 'w', **profile) as dst:
                dst.write(pixels)
        return (row.name, True, None)
    except Exception as exc:
        # Any failure is reported to the caller instead of being raised.
        trace = traceback.format_exc()
        return (row.name, False, f"Error: {str(exc)}\n{trace}")

# Copy a single file
def copy_file(args):
    """
    Copy one file (data and metadata) and report the outcome.

    Args:
        args: (source_path, destination_path) pair, packed in a single argument so the
            function can be used with ``executor.map``.

    Returns:
        tuple: (True, source_path) on success, (False, error_message) on failure.
    """
    source_path, target_path = args
    try:
        shutil.copy2(source_path, target_path)
    except Exception as exc:
        return (False, f"Error copying {source_path} to {target_path}: {str(exc)}")
    return (True, source_path)

# Process a group of tiles
def process_geotiffs(chunk_df, output_dir, debug_dir):
    """
    Run ``process_tile`` on every row of a DataFrame chunk.

    Args:
        chunk_df: DataFrame whose rows describe the tiles to produce.
        output_dir: Directory passed through to ``process_tile``.
        debug_dir: Directory passed through to ``process_tile``.

    Returns:
        list: One ``process_tile`` result per row, in row order.
    """
    return [
        process_tile(tile_row, output_dir, debug_dir)
        for _, tile_row in chunk_df.iterrows()
    ]

if REGENERATE_TILE_1024_SPLIT_FROM_SITG:
    # Overall flow:
    #   1. regenerate every tile listed in the metadata parquet into a temporary directory,
    #   2. copy all other files of the current tile directory next to them,
    #   3. verify the file counts, then swap the directories (the old one is kept under a
    #      timestamped name).
    warnings.filterwarnings("ignore", category=rasterio.errors.NotGeoreferencedWarning)
    df = pd.read_parquet(REGENERATE_TILE_1024_SPLIT_FROM_SITG_COMBINED_METADATA_PARQUET)
    os.makedirs(REGENERATE_TILE_1024_SPLIT_FROM_SITG_OUTPUT_DIR_TEMP_1280, exist_ok=True)
    debug_dir = os.path.join(REGENERATE_TILE_1024_SPLIT_FROM_SITG_OUTPUT_DIR_TEMP_1280, 'debug')
    os.makedirs(debug_dir, exist_ok=True)

    # Group by source file so that each worker handles all tiles of one GeoTIFF
    grouped = df.groupby('geotiff_path')
    group_dfs = [group for _, group in grouped]

    print(f"Processing {len(df)} tiles from {len(group_dfs)} source GeoTIFFs using {REGENERATE_TILE_1024_SPLIT_FROM_SITG_NUM_PROCESSES} processes")

    # Parallel processing
    with ProcessPoolExecutor(max_workers=REGENERATE_TILE_1024_SPLIT_FROM_SITG_NUM_PROCESSES) as executor:
        futures = [executor.submit(process_geotiffs, group_df, REGENERATE_TILE_1024_SPLIT_FROM_SITG_OUTPUT_DIR_TEMP_1280, debug_dir)
                  for group_df in group_dfs]
        all_results = []
        for future in tqdm(futures, total=len(futures), desc="Processing GeoTIFF groups"):
            all_results.extend(future.result())

    # Log results: one text file per failed row in the debug directory
    success_count = 0
    error_count = 0
    for idx, success, error_msg in all_results:
        if success:
            success_count += 1
        else:
            error_count += 1
            error_info_path = os.path.join(debug_dir, f"error_row_{idx}.txt")
            with open(error_info_path, 'w') as f:
                f.write(error_msg)
    print(f"Tile processing complete: {success_count} successful, {error_count} errors")

    src_dir = REGENERATE_TILE_1024_SPLIT_FROM_SITG_OUTPUT_DIR_TILE_1024
    dst_dir = REGENERATE_TILE_1024_SPLIT_FROM_SITG_OUTPUT_DIR_TEMP_1280

    # File names of the tiles that were just regenerated
    processed_tif_files = set(df['tile_path'].map(os.path.basename))

    files_to_copy = []
    replaced_count = 0  # old root-level tiles superseded by a regenerated one (not copied)
    print("\nScanning directory structure...")
    for root, dirs, files in os.walk(src_dir):
        rel_path = os.path.relpath(root, src_dir)
        if rel_path != '.':
            dst_root = os.path.join(dst_dir, rel_path)
            os.makedirs(dst_root, exist_ok=True)
            print(f"Created directory: {rel_path}")
        for file in files:
            if rel_path == '.' and file.endswith('.tif') and file in processed_tif_files:
                replaced_count += 1
                continue
            src_file = os.path.join(root, file)
            dst_file = os.path.join(dst_dir, rel_path, file) if rel_path != '.' else os.path.join(dst_dir, file)
            files_to_copy.append((src_file, dst_file))

    # Parallel file copy
    print(f"\nCopying {len(files_to_copy)} additional files using {REGENERATE_TILE_1024_SPLIT_FROM_SITG_NUM_THREADS} threads...")
    with ThreadPoolExecutor(max_workers=REGENERATE_TILE_1024_SPLIT_FROM_SITG_NUM_THREADS) as executor:
        results = list(tqdm(executor.map(copy_file, files_to_copy), total=len(files_to_copy), desc="Copying files"))

    # Check copy errors
    copy_errors = [result for result in results if not result[0]]
    if copy_errors:
        print(f"Warning: {len(copy_errors)} files failed to copy:")
        for _, error in copy_errors[:10]:
            print(f"  {error}")
        if len(copy_errors) > 10:
            print(f"  ... and {len(copy_errors) - 10} more errors")

    # File count verification
    def count_files(directory, ignored_top_level_dir=None):
        """Count files under `directory`, optionally skipping one top-level sub-directory."""
        count = 0
        for root, dirs, files in os.walk(directory):
            if ignored_top_level_dir is not None and root == directory:
                # Pruning `dirs` in place stops os.walk from descending into it.
                dirs[:] = [d for d in dirs if d != ignored_top_level_dir]
            count += len(files)
        return count

    # The 'debug' folder only holds error logs, so it is left out of both counts.
    old_count = count_files(src_dir, 'debug')
    new_count = count_files(dst_dir, 'debug')

    print(f"\nTotal files in old directory (including subdirectories, excluding 'debug'): {old_count}")
    print(f"Total files in new directory (including subdirectories, excluding 'debug'): {new_count}")

    # Every regenerated tile that replaces an existing one leaves the count unchanged; only
    # tiles that were missing from the old directory add to it. A failed tile or a failed
    # copy therefore shows up as a mismatch.
    expected_diff = len(processed_tif_files) - replaced_count
    actual_diff = new_count - old_count

    print(f"Expected difference (regenerated TIF files missing from the old directory): {expected_diff}")
    print(f"Actual difference: {actual_diff}")

    if actual_diff != expected_diff:
        print("WARNING: File count doesn't match expectations!")
        proceed = input("Do you want to proceed with the renaming? (y/n): ")
        if proceed.lower() != 'y':
            print("Operation aborted")
            # Raise instead of calling exit(): in a notebook exit() shuts the kernel down.
            raise RuntimeError("Operation aborted before renaming the tile directories")

    # Rename old and new folders
    todaysdate = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    old_folder_new_name = f"{src_dir}_old_{todaysdate}"
    print(f"\nRenaming old folder '{src_dir}' to '{old_folder_new_name}'")
    os.rename(src_dir, old_folder_new_name)
    print(f"Renaming new folder '{dst_dir}' to '{src_dir}'")
    os.rename(dst_dir, src_dir)

    print("\nProcessing complete!")


## Charger données

### Dataset avant annotation

In [4]:
# Load the tile table and discard any geometry columns stored in the CSV.
gdf_dataset = gpd.read_file(DATASET_TILES_INFORMATION_CSV_PATH)
for stale_geometry_column in ("geometry", "geometry_x"):
    if stale_geometry_column in gdf_dataset.columns:
        gdf_dataset = gdf_dataset.drop(columns=[stale_geometry_column])

In [5]:
# Show the container type and the first rows of the tile table.
print(type(gdf_dataset))
gdf_dataset.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,tile_id,globalid,sia_cat,altitude_min,altitude_max,date_leve,tile_path,tile_bounds,SHAPE__Area,dominant_class,area_bin
0,10_12_8d26a8,"['72BC0BCB-C609-49C3-9EBB-DAEE02CBEDDD', 'FFE7...","['I habitat collectif', 'I habitat collectif',...",489.9533333333333,490.2443589743589,2009-06-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/25031...,"(2503614.4, 1119436.8, 2503665.6, 1119488.0)",199.5486314409892,I habitat collectif,0-200
1,0_6_9201e5,"['F24DC253-B282-4519-A142-07A8FFB9BE07', '4B14...","['I habitat collectif', 'IX industrie']",400.53,400.53,2019-06-08 02:00:00,data/notebook_04/geotiff/tile_1024_split/25051...,"(2505307.2, 1123948.8, 2505358.4, 1124000.0)",179.86172888887037,I habitat collectif,0-200
2,18_4_b35ef3,"['D2047155-7D5C-4CC1-844B-3B672C640CD4', 'CF36...","['I habitat collectif', 'I habitat collectif',...",449.9333333333333,451.15,2005-08-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/25081...,"(2508204.8, 1121027.2, 2508256.0, 1121078.4)",53.2306893459604,I habitat collectif,0-200
3,1_3_14c592,"['79D87CAC-FD37-4E23-8E22-A2C21CFF9CB8', 'B67A...","['I habitat collectif', 'II habitat individuel...",428.7775,429.1725,2016-03-01 01:00:00,data/notebook_04/geotiff/tile_1024_split/24991...,"(2499153.6, 1113897.6, 2499204.8, 1113948.8)",123.19650865057967,I habitat collectif,0-200
4,15_17_5212bb,['8C0ECDB2-A52C-4264-B8F0-B6A0CF4265EC'],['I habitat collectif'],441.73,441.73,2005-08-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/24971...,"(2497870.4, 1118180.8, 2497921.6, 1118232.0)",149.40822829361062,I habitat collectif,0-200


In [6]:
# tile_id must be unique in gdf_dataset
duplicated_tile_rows = gdf_dataset[gdf_dataset.duplicated(subset=["tile_id"])]
assert len(duplicated_tile_rows) == 0, f"gdf_dataset has duplicates in tile_id: {duplicated_tile_rows}"

### Dataset annoté

In [7]:
# Locate the exported dataset folder: the sub-folder whose name starts with "dataset".
# Sorted so the choice is deterministic if several folders match.
dataset_dir_candidates = sorted(f for f in os.listdir(DATASET_ANNOTATED_BRUT_PATH) if f.startswith("dataset"))
assert dataset_dir_candidates, f"No folder starting with 'dataset' found in {DATASET_ANNOTATED_BRUT_PATH}"
dataset_dir_path = os.path.join(DATASET_ANNOTATED_BRUT_PATH, dataset_dir_candidates[0])

# img
dataset_img_path = os.path.join(dataset_dir_path, "img")
assert os.path.exists(dataset_img_path), f"Path does not exist: {dataset_img_path}"
print(f"dataset_img_path: {dataset_img_path}")
print(f"Number of files: {len(os.listdir(dataset_img_path))}")


# masks
dataset_masks_path = os.path.join(dataset_dir_path, "masks_machine")
assert os.path.exists(dataset_masks_path), f"Path does not exist: {dataset_masks_path}"
print(f"dataset_masks_path: {dataset_masks_path}")
print(f"Number of files: {len(os.listdir(dataset_masks_path))}")

# Check: there must be exactly as many images as masks
assert len(os.listdir(dataset_img_path)) == len(os.listdir(dataset_masks_path)), f"Number of files in {dataset_img_path} is not equal to number of files in {dataset_masks_path}"

dataset_masks_path: datasets/supervisely/341575_free_space_rooftop_geneva_20250511_binary_mask/dataset 2025-04-07 17-26-34/img
Number of files: 539
dataset_masks_path: datasets/supervisely/341575_free_space_rooftop_geneva_20250511_binary_mask/dataset 2025-04-07 17-26-34/masks_machine
Number of files: 539


In [None]:
# Build sorted lists of mask and image file paths.
# os.listdir returns entries in arbitrary order, so the two listings must not be paired
# by position; sorting only makes the row order reproducible.
dataset_original_masks_path_list = sorted(os.path.join(dataset_masks_path, f) for f in os.listdir(dataset_masks_path))
dataset_original_img_path_list = sorted(os.path.join(dataset_img_path, f) for f in os.listdir(dataset_img_path))

# Pair each mask with the image that has the same file stem (text before the first dot).
# NOTE(review): assumes a mask and its image share the same stem in the export — confirm.
# A mask without a matching image gets None and is caught by the null check below.
img_path_by_stem = {os.path.basename(p).split(".")[0]: p for p in dataset_original_img_path_list}
mask_stems = [os.path.basename(p).split(".")[0] for p in dataset_original_masks_path_list]

# Create annotation dataframe
df_annotations = pd.DataFrame(
    {
        "tile_id": mask_stems,
        "original_mask_path_png": dataset_original_masks_path_list,
        "original_img_path_png": [img_path_by_stem.get(stem) for stem in mask_stems],
    }
)

# Extract tile_id suffix after second underscore
df_annotations["tile_id"] = df_annotations["tile_id"].apply(lambda x: "_".join(x.split("_")[2:]))

# Check for duplicates
assert(len(df_annotations[df_annotations.duplicated(subset=["tile_id"])]) == 0), f"df_annotations has duplicates in tile_id: {df_annotations[df_annotations.duplicated(subset=['tile_id'])]}"
assert(len(df_annotations[df_annotations.duplicated(subset=["original_mask_path_png"])]) == 0), f"df_annotations has duplicates in original_mask_path_png: {df_annotations[df_annotations.duplicated(subset=['original_mask_path_png'])]}"
assert(len(df_annotations[df_annotations.duplicated(subset=["original_img_path_png"])]) == 0), f"df_annotations has duplicates in original_img_path_png: {df_annotations[df_annotations.duplicated(subset=['original_img_path_png'])]}"

# Check for missing values
assert(df_annotations["tile_id"].notnull().all()), f"df_annotations has null values in tile_id: {df_annotations[df_annotations['tile_id'].isnull()]}"
assert(df_annotations["original_mask_path_png"].notnull().all()), f"df_annotations has null values in original_mask_path_png: {df_annotations[df_annotations['original_mask_path_png'].isnull()]}"
assert(df_annotations["original_img_path_png"].notnull().all()), f"df_annotations has null values in original_img_path_png: {df_annotations[df_annotations['original_img_path_png'].isnull()]}"

# Check row count consistency
assert(len(df_annotations["tile_id"]) == len(gdf_dataset["tile_id"])), f"len(df_annotations['tile_id']) is not equal to len(gdf_dataset['tile_id']): {len(df_annotations['tile_id'])} != {len(gdf_dataset['tile_id'])}"

display(df_annotations)

Unnamed: 0,tile_id,original_mask_path_png,original_img_path_png
0,3_10_d297df,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
1,19_8_067ade,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
2,4_9_e48187,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
3,5_9_4ffdcf,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
4,10_6_eecedc,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
...,...,...,...
534,10_1_e9591b,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
535,11_18_ac7623,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
536,16_8_46ad78,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
537,10_13_55bd29,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...


In [9]:
# Attach the annotation paths to each tile; tiles without annotation keep NaN paths.
gdf_dataset = gdf_dataset.merge(df_annotations, on="tile_id", how="left")

In [None]:
# Check for duplicates: (column to test, label used in the assertion message)
for checked_column, message_label in (
    ("tile_id", "tile_id"),
    ("original_img_path_png", "img_path"),
    ("original_mask_path_png", "original_mask_path_png"),
):
    duplicated_rows = gdf_dataset[gdf_dataset.duplicated(subset=[checked_column])]
    assert len(duplicated_rows) == 0, f"gdf_dataset has duplicates in {message_label}: {duplicated_rows}"

# Check for missing values
for checked_column in ("tile_id", "original_img_path_png", "original_mask_path_png"):
    assert gdf_dataset[checked_column].notnull().all(), f"gdf_dataset has null values in {checked_column}: {gdf_dataset[gdf_dataset[checked_column].isnull()]}"

# Check row count consistency between the merged table and the annotations
merged_row_count = len(gdf_dataset["tile_id"])
annotation_row_count = len(df_annotations["tile_id"])
assert merged_row_count == annotation_row_count, f"len(gdf_dataset['tile_id']) is not equal to len(df_annotations['tile_id']): {merged_row_count} != {annotation_row_count}"

In [11]:
# Preview the tile table now that the annotation path columns have been merged in.
gdf_dataset.head()

Unnamed: 0,tile_id,globalid,sia_cat,altitude_min,altitude_max,date_leve,tile_path,tile_bounds,SHAPE__Area,dominant_class,area_bin,original_mask_path_png,original_img_path_png
0,10_12_8d26a8,"['72BC0BCB-C609-49C3-9EBB-DAEE02CBEDDD', 'FFE7...","['I habitat collectif', 'I habitat collectif',...",489.9533333333333,490.2443589743589,2009-06-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/25031...,"(2503614.4, 1119436.8, 2503665.6, 1119488.0)",199.5486314409892,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
1,0_6_9201e5,"['F24DC253-B282-4519-A142-07A8FFB9BE07', '4B14...","['I habitat collectif', 'IX industrie']",400.53,400.53,2019-06-08 02:00:00,data/notebook_04/geotiff/tile_1024_split/25051...,"(2505307.2, 1123948.8, 2505358.4, 1124000.0)",179.86172888887037,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
2,18_4_b35ef3,"['D2047155-7D5C-4CC1-844B-3B672C640CD4', 'CF36...","['I habitat collectif', 'I habitat collectif',...",449.9333333333333,451.15,2005-08-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/25081...,"(2508204.8, 1121027.2, 2508256.0, 1121078.4)",53.2306893459604,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
3,1_3_14c592,"['79D87CAC-FD37-4E23-8E22-A2C21CFF9CB8', 'B67A...","['I habitat collectif', 'II habitat individuel...",428.7775,429.1725,2016-03-01 01:00:00,data/notebook_04/geotiff/tile_1024_split/24991...,"(2499153.6, 1113897.6, 2499204.8, 1113948.8)",123.19650865057967,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...
4,15_17_5212bb,['8C0ECDB2-A52C-4264-B8F0-B6A0CF4265EC'],['I habitat collectif'],441.73,441.73,2005-08-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/24971...,"(2497870.4, 1118180.8, 2497921.6, 1118232.0)",149.40822829361062,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...


### CAD_BATIMENT_HORSOL_TOIT_MERGE

In [12]:
# Load the merged rooftop polygons from parquet, then show the container type and first rows.
gdf_cad_batiment_horsol = gpd.read_parquet(CAD_BATIMENT_HORSOL_TOIT_MERGE_PARQUET_PATH)
print(type(gdf_cad_batiment_horsol))
gdf_cad_batiment_horsol.head()

<class 'geopandas.geodataframe.GeoDataFrame'>


Unnamed: 0,geometry,objectid,egid,altitude_min,altitude_max,date_leve,SHAPE__Length,SHAPE__Area,globalid
0,"POLYGON ((2486492.692 1110581.039, 2486492.687...",48192,295076435.0,381.94,382.34,2005-08-01 02:00:00,51.65274923353816,124.87250692348236,{C22AB52C-0F0B-4FAA-9E75-4243A665B461}
1,"POLYGON ((2486457.102 1110749.186, 2486460.004...",48235,295091310.0,368.07,368.56,2005-08-01 02:00:00,31.015893027041194,54.70481326736387,{61AB5825-8EDB-4F5C-92E3-A8F3CB3B3637}
2,"POLYGON ((2486286.792 1110867.393, 2486290.749...",48204,295077439.0,339.15,339.15,2005-08-01 02:00:00,14.954717776916645,13.691946764255684,{FF2FD4B9-EE43-4622-8B1E-F18E65820C6F}
3,"POLYGON ((2486627.502 1110880.096, 2486627.462...",48225 | 48224 | 48223 | 48226 | 48129 | 48142 ...,295077673.0 | 1004238.0,360.04 | 367.89 | 367.37 | 364.7 | 367.36 | 36...,361.58 | 370.86 | 368.07 | 368.06,2005-08-01 02:00:00,16.51163203622832 | 16.65983583426023 | 16.657...,11.7960687648941 | 11.800539859749684 | 11.793...,{4925DA6C-D32C-4D40-A376-8715F1678EED} | {81C7...
4,"POLYGON ((2486688.491 1111040.638, 2486700.898...",48233,295091089.0,348.75,348.75,2005-08-01 02:00:00,38.97868110167111,87.73404090611139,{7EEE17BC-9A10-4AE7-906A-8F2021CEFB07}


### Compléter données

In [None]:
# Extract polygon geometry from a GeoTIFF file
def get_geometry_from_tiff(tiff_path, crs=CORRECT_CRS):
    """
    Build the rectangular footprint of a GeoTIFF from its affine transform and size.

    Args:
        tiff_path: Path of the GeoTIFF to open.
        crs: CRS the file is expected to use; a mismatch only prints a warning.

    Returns:
        shapely Polygon covering the raster extent, or None when the footprint has no
        area or when any error occurs (the error is printed, not raised).
    """
    try:
        with rasterio.open(tiff_path) as dataset:
            affine = dataset.transform
            n_rows, n_cols = dataset.shape

            if dataset.crs != crs:
                print(f"Warning: CRS mismatch in {tiff_path}. Found {dataset.crs}, expected {crs}")

            # Origin corner plus pixel size times pixel count gives the opposite corner.
            x_start = affine.c
            y_start = affine.f
            y_end = y_start + n_rows * affine.e
            x_end = x_start + n_cols * affine.a

            footprint = Polygon([
                (x_start, y_end),
                (x_end, y_end),
                (x_end, y_start),
                (x_start, y_start),
                (x_start, y_end),
            ])

            if not footprint.is_valid:
                print(f"Warning: Invalid polygon from {tiff_path}")
                # buffer(0) is used here to repair the invalid polygon.
                footprint = footprint.buffer(0)

            if footprint.area <= 0:
                print(f"Warning: Zero-area polygon from {tiff_path}")
                return None

            return footprint
    except Exception as exc:
        print(f"Error processing {tiff_path}: {exc}")
        return None

# Get image width and height
def get_image_dimensions(image_path):
    """
    Read the pixel size of an image file.

    Args:
        image_path: Path of the image to inspect.

    Returns:
        tuple: (width, height), or (None, None) when the file does not exist or
        cannot be read (read errors are printed, not raised).
    """
    try:
        if not os.path.exists(image_path):
            return None, None
        with Image.open(image_path) as picture:
            pixel_width, pixel_height = picture.size
            return pixel_width, pixel_height
    except Exception as exc:
        print(f"Error opening {image_path}: {exc}")
        return None, None

def create_geodataframe_from_tiffs(df):
    """
    Turn a table of GeoTIFF paths into a GeoDataFrame of raster footprints.

    Rows whose file is missing (a message is printed) or whose footprint cannot be
    computed are left out of the result.

    Args:
        df: DataFrame with a 'tile_path' column.

    Returns:
        GeoDataFrame holding the kept rows plus a geometry column, in EPSG_SUISSE.
    """
    kept_indices = []
    kept_footprints = []

    for row_index, tiff_path in df['tile_path'].items():
        if not os.path.exists(tiff_path):
            print(f"File not found: {tiff_path}")
            continue
        footprint = get_geometry_from_tiff(tiff_path)
        if footprint is None:
            continue
        kept_footprints.append(footprint)
        kept_indices.append(row_index)

    print(f"Processed {len(kept_footprints)} of {len(df)} files")

    return gpd.GeoDataFrame(
        df.loc[kept_indices].copy(),
        geometry=kept_footprints,
        crs=EPSG_SUISSE
    )

# Replace the plain table by a GeoDataFrame carrying one footprint per tile.
gdf_dataset = create_geodataframe_from_tiffs(gdf_dataset)

# Read (width, height) for every tile, then split the pairs into two columns.
image_dimensions = gdf_dataset['tile_path'].apply(get_image_dimensions)
tile_widths, tile_heights = zip(*image_dimensions)
gdf_dataset['image_width'] = tile_widths
gdf_dataset['image_height'] = tile_heights

Processing GeoTIFF files...
Successfully processed 539 out of 539 files


In [14]:
# Preview the table with the new geometry, image_width and image_height columns.
gdf_dataset.head()

Unnamed: 0,tile_id,globalid,sia_cat,altitude_min,altitude_max,date_leve,tile_path,tile_bounds,SHAPE__Area,dominant_class,area_bin,original_mask_path_png,original_img_path_png,geometry,image_width,image_height
0,10_12_8d26a8,"['72BC0BCB-C609-49C3-9EBB-DAEE02CBEDDD', 'FFE7...","['I habitat collectif', 'I habitat collectif',...",489.9533333333333,490.2443589743589,2009-06-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/25031...,"(2503614.4, 1119436.8, 2503665.6, 1119488.0)",199.5486314409892,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...,"POLYGON ((2503608 1119430.4, 2503672 1119430.4...",1280,1280
1,0_6_9201e5,"['F24DC253-B282-4519-A142-07A8FFB9BE07', '4B14...","['I habitat collectif', 'IX industrie']",400.53,400.53,2019-06-08 02:00:00,data/notebook_04/geotiff/tile_1024_split/25051...,"(2505307.2, 1123948.8, 2505358.4, 1124000.0)",179.86172888887037,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...,"POLYGON ((2505300.8 1123936, 2505364.8 1123936...",1280,1280
2,18_4_b35ef3,"['D2047155-7D5C-4CC1-844B-3B672C640CD4', 'CF36...","['I habitat collectif', 'I habitat collectif',...",449.9333333333333,451.15,2005-08-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/25081...,"(2508204.8, 1121027.2, 2508256.0, 1121078.4)",53.2306893459604,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...,"POLYGON ((2508198.4 1121020.8, 2508262.4 11210...",1280,1280
3,1_3_14c592,"['79D87CAC-FD37-4E23-8E22-A2C21CFF9CB8', 'B67A...","['I habitat collectif', 'II habitat individuel...",428.7775,429.1725,2016-03-01 01:00:00,data/notebook_04/geotiff/tile_1024_split/24991...,"(2499153.6, 1113897.6, 2499204.8, 1113948.8)",123.19650865057967,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...,"POLYGON ((2499147.2 1113891.2, 2499211.2 11138...",1280,1280
4,15_17_5212bb,['8C0ECDB2-A52C-4264-B8F0-B6A0CF4265EC'],['I habitat collectif'],441.73,441.73,2005-08-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/24971...,"(2497870.4, 1118180.8, 2497921.6, 1118232.0)",149.40822829361062,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...,"POLYGON ((2497864 1118174.4, 2497928 1118174.4...",1280,1280


In [15]:
# Both dimension columns must be fully populated (no NaN / None).
for dimension_column in ("image_width", "image_height"):
    assert gdf_dataset[dimension_column].notnull().all(), f"gdf_dataset has null values in {dimension_column}: {gdf_dataset[gdf_dataset[dimension_column].isnull()]}"

In [16]:
# Show the container type again after the conversion, with the first two rows.
print(type(gdf_dataset))
gdf_dataset.head(2)

<class 'geopandas.geodataframe.GeoDataFrame'>


Unnamed: 0,tile_id,globalid,sia_cat,altitude_min,altitude_max,date_leve,tile_path,tile_bounds,SHAPE__Area,dominant_class,area_bin,original_mask_path_png,original_img_path_png,geometry,image_width,image_height
0,10_12_8d26a8,"['72BC0BCB-C609-49C3-9EBB-DAEE02CBEDDD', 'FFE7...","['I habitat collectif', 'I habitat collectif',...",489.9533333333333,490.2443589743589,2009-06-01 02:00:00,data/notebook_04/geotiff/tile_1024_split/25031...,"(2503614.4, 1119436.8, 2503665.6, 1119488.0)",199.5486314409892,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...,"POLYGON ((2503608 1119430.4, 2503672 1119430.4...",1280,1280
1,0_6_9201e5,"['F24DC253-B282-4519-A142-07A8FFB9BE07', '4B14...","['I habitat collectif', 'IX industrie']",400.53,400.53,2019-06-08 02:00:00,data/notebook_04/geotiff/tile_1024_split/25051...,"(2505307.2, 1123948.8, 2505358.4, 1124000.0)",179.86172888887037,I habitat collectif,0-200,datasets/supervisely/341575_free_space_rooftop...,datasets/supervisely/341575_free_space_rooftop...,"POLYGON ((2505300.8 1123936, 2505364.8 1123936...",1280,1280


## Postprocessing images dataset



### Clip geotiff

In [None]:
def _print_dimension_mismatch_summary(mismatch_dims):
    """
    Print how many masks did not match their GeoTIFF's size and the most common sizes involved.

    Args:
        mismatch_dims: dict mapping tile_id to a (geotiff_dims, mask_dims) pair of
            "WIDTHxHEIGHT" strings.
    """
    # Local import: only this report needs Counter.
    from collections import Counter

    print("\nDimension mismatch summary:")
    print(f"  - {len(mismatch_dims)} files with dimension mismatches")
    if not mismatch_dims:
        return

    geotiff_counter = Counter(tiff_dims for tiff_dims, _ in mismatch_dims.values())
    mask_counter = Counter(mask_dims for _, mask_dims in mismatch_dims.values())

    print("\nMost common GeoTIFF dimensions:")
    for dims, count in geotiff_counter.most_common(3):
        print(f"  - {dims}: {count} files")

    print("\nMost common mask dimensions:")
    for dims, count in mask_counter.most_common(3):
        print(f"  - {dims}: {count} files")


def clip_geotiff_and_png_masks(gdf_dataset, gdf_buildings, output_img_dir, output_mask_dir, convert_masks_to_geotiff=True):
    """
    Clips GeoTIFFs and corresponding PNG masks using building polygons.

    For every row of gdf_dataset, the pixels that lie outside the building polygons
    intersecting the row's geometry are set to 0, both in the GeoTIFF and in its
    annotation mask. The clipped image is written to output_img_dir under the GeoTIFF's
    own file name; the clipped mask is written to output_mask_dir, either as a GeoTIFF
    that reuses the image's metadata (same stem, '.tif') or unchanged in format.

    A tile whose mask file is missing is still processed (image only). A tile whose mask
    cannot be read or does not have the GeoTIFF's size is reported as skipped; its clipped
    image has already been written at that point and is left on disk.

    Args:
        gdf_dataset: GeoDataFrame with 'tile_path', 'original_mask_path_png', 'tile_id'
            and a geometry per row
        gdf_buildings: GeoDataFrame with building polygons
        output_img_dir: Output directory for clipped GeoTIFFs
        output_mask_dir: Output directory for clipped masks
        convert_masks_to_geotiff: Convert PNG masks to GeoTIFF if True

    Returns:
        tuple: (processed_tile_ids, skipped_tiles) where processed_tile_ids is a list of
        tile_ids and skipped_tiles maps each skipped tile_id to the reason. Both are
        always returned, also when nothing was skipped.
    """
    successful_img_count = 0
    successful_mask_count = 0
    error_count = 0

    processed_tile_ids = []
    skipped_tile_ids = []
    skipped_reasons = {}
    mismatch_dims = {}  # tile_id -> (geotiff "WxH", mask "WxH")

    for _, row in tqdm(gdf_dataset.iterrows(), total=len(gdf_dataset), desc="Clipping files"):
        tiff_path = row['tile_path']
        mask_path = row.get('original_mask_path_png')
        tile_id = row['tile_id']

        if pd.isna(tiff_path):
            skipped_tile_ids.append(tile_id)
            skipped_reasons[tile_id] = "Missing tiff_path"
            continue

        if not os.path.exists(tiff_path):
            skipped_tile_ids.append(tile_id)
            skipped_reasons[tile_id] = f"TIFF file not found: {tiff_path}"
            continue

        output_img_path = os.path.join(output_img_dir, os.path.basename(tiff_path))

        if pd.isna(mask_path) or not os.path.exists(mask_path):
            # The tile is not skipped for this: only its image will be clipped.
            mask_path = None
            output_mask_path = None
            skipped_reasons[tile_id] = "Missing or invalid mask_path"
        elif convert_masks_to_geotiff:
            mask_basename = os.path.splitext(os.path.basename(mask_path))[0] + '.tif'
            output_mask_path = os.path.join(output_mask_dir, mask_basename)
        else:
            output_mask_path = os.path.join(output_mask_dir, os.path.basename(mask_path))

        try:
            tile_geom = row.geometry
            if tile_geom is None:
                skipped_tile_ids.append(tile_id)
                skipped_reasons[tile_id] = "Missing geometry"
                continue

            buildings_in_tile = gdf_buildings[gdf_buildings.intersects(tile_geom)]
            if len(buildings_in_tile) == 0:
                skipped_tile_ids.append(tile_id)
                skipped_reasons[tile_id] = "No intersecting buildings found"
                continue

            with rasterio.open(tiff_path) as src:
                src_meta = src.meta.copy()
                original_height, original_width = src.height, src.width
                shapes = [mapping(geom) for geom in buildings_in_tile.geometry]

                # Rasterize the footprints directly (True inside a building). Deriving the
                # mask from "band 1 == 0" of a masked read would also flag pixels whose
                # real value is 0.
                inside_buildings, _, _ = rasterio.mask.raster_geometry_mask(
                    src,
                    shapes,
                    all_touched=True,
                    invert=True,
                )
                binary_mask = inside_buildings.astype(np.uint8)

                masked_img = src.read()
                masked_img[:, ~inside_buildings] = 0

                with rasterio.open(output_img_path, 'w', **src_meta.copy()) as dest:
                    dest.write(masked_img)

                successful_img_count += 1

                if mask_path is None:
                    processed_tile_ids.append(tile_id)
                    continue

                try:
                    with Image.open(mask_path) as mask_img:
                        mask_array = np.array(mask_img)

                    if mask_array.shape[:2] != (original_height, original_width):
                        tiff_dims = f"{original_width}x{original_height}"
                        mask_dims = f"{mask_array.shape[1]}x{mask_array.shape[0]}"
                        print(f"Warning: Mask dimensions don't match GeoTIFF for {os.path.basename(tiff_path)}")
                        print(f"  - GeoTIFF dimensions: {tiff_dims}")
                        print(f"  - Mask dimensions: {mask_dims}")
                        # Register the tile as skipped so that it shows up in the report.
                        skipped_tile_ids.append(tile_id)
                        skipped_reasons[tile_id] = f"Mask dimensions don't match GeoTIFF: GeoTIFF={tiff_dims}, Mask={mask_dims}"
                        mismatch_dims[tile_id] = (tiff_dims, mask_dims)
                        continue

                    # Keep the annotation only where a building is present.
                    if mask_array.ndim == 3:
                        mask_array = mask_array * binary_mask[:, :, np.newaxis]
                    else:
                        mask_array = mask_array * binary_mask

                    if convert_masks_to_geotiff:
                        band_count = mask_array.shape[2] if mask_array.ndim == 3 else 1
                        mask_meta = src_meta.copy()
                        mask_meta.update(dtype=mask_array.dtype, count=band_count, nodata=0)
                        with rasterio.open(output_mask_path, 'w', **mask_meta) as dest:
                            if mask_array.ndim == 3:
                                # PIL arrays are (rows, cols, channels); write one band per channel.
                                for band in range(band_count):
                                    dest.write(mask_array[:, :, band], band + 1)
                            else:
                                dest.write(mask_array, 1)
                    else:
                        Image.fromarray(mask_array).save(output_mask_path)

                    successful_mask_count += 1
                    processed_tile_ids.append(tile_id)

                except Exception as e:
                    error_count += 1
                    skipped_tile_ids.append(tile_id)
                    skipped_reasons[tile_id] = f"Error processing mask: {str(e)}"
                    print(f"Error processing mask {mask_path}: {e}")

        except Exception as e:
            error_count += 1
            skipped_tile_ids.append(tile_id)
            skipped_reasons[tile_id] = f"Error: {str(e)}"
            print(f"Error processing {tiff_path}: {e}")

    # De-duplicate while keeping the order in which tiles were encountered.
    processed_tile_ids = list(dict.fromkeys(processed_tile_ids))
    skipped_tile_ids = list(dict.fromkeys(skipped_tile_ids))
    overlap = set(processed_tile_ids) & set(skipped_tile_ids)

    print(f"Processed {len(gdf_dataset)} files")
    print(f"- {successful_img_count} GeoTIFFs")
    print(f"- {successful_mask_count} masks")
    if convert_masks_to_geotiff:
        print(f"- {successful_mask_count} PNG masks converted to GeoTIFF")
    print(f"- {error_count} errors")
    print(f"- {len(processed_tile_ids)} tiles processed")
    print(f"- {len(skipped_tile_ids)} tiles skipped")

    if overlap:
        print(f"Warning: {len(overlap)} tile_ids in both processed and skipped lists.")

    if skipped_tile_ids:
        _print_dimension_mismatch_summary(mismatch_dims)

    # Returned unconditionally: callers unpack two values even when nothing was skipped.
    skipped_tiles = {tile_id: skipped_reasons[tile_id] for tile_id in skipped_tile_ids}
    return processed_tile_ids, skipped_tiles

# Clip every annotated tile to the building footprints; images and masks are written
# to the processed-dataset folders, masks being converted to GeoTIFF.
processed_tile_ids, skipped_tiles = clip_geotiff_and_png_masks(
    gdf_dataset=gdf_dataset,
    gdf_buildings=gdf_cad_batiment_horsol,
    output_img_dir=DATASET_OUTPUT_IMG_PATH,
    output_mask_dir=DATASET_OUTPUT_MASKS_PATH,
    convert_masks_to_geotiff=True,
)

print(f"Processed tile_ids: {len(processed_tile_ids)}")
print(f"Skipped tile_ids: {len(skipped_tiles)}")


Clipping files:   0%|          | 0/539 [00:00<?, ?it/s]

Completed processing 539 files:
- Successfully processed 538 GeoTIFFs
- Successfully processed 538 masks
- Converted 538 PNG masks to GeoTIFF format
- Encountered 0 errors
- Successfully processed tiles: 538
- Skipped tiles: 1

Dimension mismatch summary:
  - Total files with dimension mismatches: 0
Processed tile_ids: 538
Skipped tile_ids: 1


### Ajouter path au gdf

In [18]:
# Inventory of the files written by the clipping step (sorted: os.listdir returns
# names in arbitrary order).
dataset_processed_masks_path_list = sorted(
    os.path.join(DATASET_OUTPUT_MASKS_PATH, f) for f in os.listdir(DATASET_OUTPUT_MASKS_PATH)
)
dataset_processed_img_path_list = sorted(
    os.path.join(DATASET_OUTPUT_IMG_PATH, f) for f in os.listdir(DATASET_OUTPUT_IMG_PATH)
)

# Pair each tile's image and mask explicitly instead of matching the two directory
# listings by position: the listings are unordered, and the folders can hold different
# numbers of files because an image is written before its mask is validated.
# The output names follow the rule used by clip_geotiff_and_png_masks with
# convert_masks_to_geotiff=True: the image keeps the source GeoTIFF's file name and the
# mask keeps the source PNG's stem with a '.tif' extension.
processed_records = []
for tile_id, tile_path, mask_png_path in zip(
    gdf_dataset["tile_id"], gdf_dataset["tile_path"], gdf_dataset["original_mask_path_png"]
):
    if pd.isna(tile_path) or pd.isna(mask_png_path):
        continue

    processed_img_path = os.path.join(DATASET_OUTPUT_IMG_PATH, os.path.basename(tile_path))
    mask_stem = os.path.splitext(os.path.basename(mask_png_path))[0]
    processed_mask_path = os.path.join(DATASET_OUTPUT_MASKS_PATH, mask_stem + ".tif")

    # Keep only tiles for which both outputs really exist on disk.
    if os.path.exists(processed_img_path) and os.path.exists(processed_mask_path):
        processed_records.append(
            {
                "tile_id": tile_id,
                "processed_mask_path_tif": processed_mask_path,
                "processed_img_path_tif": processed_img_path,
            }
        )

df_processed = pd.DataFrame(
    processed_records,
    columns=["tile_id", "processed_mask_path_tif", "processed_img_path_tif"],
)

In [19]:
# Every tile_id and every processed image / mask path may appear only once in df_processed.
for unique_col in ("tile_id", "processed_img_path_tif", "processed_mask_path_tif"):
    repeated_rows = df_processed[df_processed.duplicated(subset=[unique_col])]
    assert len(repeated_rows) == 0, f"df_processed has duplicates in {unique_col}: {repeated_rows}"

# None of these three columns may contain a missing value.
for required_col in ("tile_id", "processed_img_path_tif", "processed_mask_path_tif"):
    rows_with_null = df_processed[df_processed[required_col].isnull()]
    assert df_processed[required_col].isnull().sum() == 0, f"df_processed has null in {required_col}: {rows_with_null}"

display(df_processed.head())

Unnamed: 0,tile_id,processed_mask_path_tif,processed_img_path_tif
0,10_12_8d26a8,datasets/supervisely/dataset_processed_2025052...,datasets/supervisely/dataset_processed_2025052...
1,0_6_9201e5,datasets/supervisely/dataset_processed_2025052...,datasets/supervisely/dataset_processed_2025052...
2,18_4_b35ef3,datasets/supervisely/dataset_processed_2025052...,datasets/supervisely/dataset_processed_2025052...
3,1_3_14c592,datasets/supervisely/dataset_processed_2025052...,datasets/supervisely/dataset_processed_2025052...
4,15_17_5212bb,datasets/supervisely/dataset_processed_2025052...,datasets/supervisely/dataset_processed_2025052...


In [20]:
# Attach the processed image / mask paths to each tile (tiles without processed files
# keep NaN in those columns).
# Columns left by a previous run of this cell are dropped first, so that re-running it
# does not create suffixed duplicates ('..._x' / '..._y') of the path columns.
processed_path_columns = [col for col in df_processed.columns if col != "tile_id"]
gdf_dataset = gdf_dataset.drop(columns=processed_path_columns, errors="ignore").merge(
    df_processed,
    how="left",
    on="tile_id",
    validate="many_to_one",  # fail loudly if df_processed ever holds a tile_id twice
)


### Gérer les chevauchements entre tuiles

#### Déterminer le coin de chevauchement

In [None]:
def determine_relative_position(geom1, geom2, tolerance=0.5):
    """
    Label where geom2 lies as seen from geom1, based on their bounding-box centres.

    The vertical label is "bottom" when geom1's centre has the larger Y (geom2 sits
    lower on a north-up map) and "top" when it has the smaller Y; the horizontal label
    is "left" when geom1's centre has the larger X and "right" when it has the smaller X.
    An axis gets no label when the centres differ by at most `tolerance` times the
    average extent of the two bounding boxes along that axis.

    Args:
        geom1: Reference geometry (needs .bounds, .area and .intersection).
        geom2: Geometry to locate with respect to geom1.
        tolerance: Fraction of the average width / height under which the two centres
            count as aligned on that axis.

    Returns:
        str: "top", "bottom", "left", "right", a "<vertical>-<horizontal>" combination,
        or, when both axes are aligned, "substantial-overlap" if the intersection covers
        more than 90% of the smaller geometry and "center" otherwise.
    """
    west1, south1, east1, north1 = geom1.bounds
    west2, south2, east2, north2 = geom2.bounds

    # Offsets of geom1's centre from geom2's centre (X grows eastward, Y northward).
    dx = (west1 + east1) / 2 - (west2 + east2) / 2
    dy = (south1 + north1) / 2 - (south2 + north2) / 2

    # Alignment thresholds scale with the average bounding-box size.
    max_aligned_dx = tolerance * (((east1 - west1) + (east2 - west2)) / 2)
    max_aligned_dy = tolerance * (((north1 - south1) + (north2 - south2)) / 2)

    shared_area = geom1.intersection(geom2).area

    vertical = None if abs(dy) <= max_aligned_dy else ("bottom" if dy > 0 else "top")
    horizontal = None if abs(dx) <= max_aligned_dx else ("left" if dx > 0 else "right")

    smallest = min(geom1.area, geom2.area)
    shared_percent = (shared_area / smallest) * 100 if smallest > 0 else 0

    labels = [label for label in (vertical, horizontal) if label]
    if labels:
        return "-".join(labels)

    # Centres aligned on both axes: tell near-identical footprints from partial ones.
    if shared_percent > 90:
        return "substantial-overlap"
    return "center"

def get_opposite_position(position):
    """
    Return the mirrored label of a relative position.

    Directional labels are swapped with their opposite (e.g. 'top-left' becomes
    'bottom-right'); 'center', 'substantial-overlap' and any label that is not known
    are returned unchanged.
    """
    mirrored_pairs = (
        ('top', 'bottom'),
        ('left', 'right'),
        ('top-left', 'bottom-right'),
        ('top-right', 'bottom-left'),
    )

    opposites = {}
    for label, mirror in mirrored_pairs:
        opposites[label] = mirror
        opposites[mirror] = label

    # Labels without a direction are their own opposite.
    for symmetric_label in ('center', 'substantial-overlap'):
        opposites[symmetric_label] = symmetric_label

    return opposites.get(position, position)

def check_geotiffs_overlap(geom1, geom2, min_overlap_area=0.0):
    """
    Describe the overlap between two geometries.

    Args:
        geom1: First geometry.
        geom2: Second geometry.
        min_overlap_area: The intersection area must be strictly greater than this
            value for the pair to count as overlapping.

    Returns:
        dict with the keys 'overlaps', 'overlap_area', 'relative_position',
        'overlap_percentage_1' and 'overlap_percentage_2' (intersection area as a
        percentage of geom1's and geom2's area). When the pair does not overlap enough,
        'overlaps' is False, the numeric fields are 0.0 and 'relative_position' is None.
    """
    not_overlapping = {
        'overlaps': False,
        'overlap_area': 0.0,
        'relative_position': None,
        'overlap_percentage_1': 0.0,
        'overlap_percentage_2': 0.0
    }

    if not geom1.intersects(geom2):
        return not_overlapping

    shared_area = geom1.intersection(geom2).area
    if not shared_area > min_overlap_area:
        return not_overlapping

    return {
        'overlaps': True,
        'overlap_area': shared_area,
        'relative_position': determine_relative_position(geom1, geom2),
        'overlap_percentage_1': (shared_area / geom1.area) * 100,
        'overlap_percentage_2': (shared_area / geom2.area) * 100
    }

def check_overlaps_in_dataframe(gdf, min_overlap_area=1.0, include_symmetric=False, buffer_distance=0.01):
    """
    Find the pairs of rows of a GeoDataFrame whose geometries overlap.

    Each row is compared with the later rows that the spatial index proposes for its
    bounding box. A pair is reported when the (optionally buffered) geometries intersect
    over an area strictly greater than min_overlap_area. Rows whose geometry is None or
    invalid are ignored with a warning. Any exception raised during the scan, including
    the ones for a wrong input type or an empty frame, is printed with its traceback and
    the pairs collected so far are returned.

    Args:
        gdf: GeoDataFrame with 'geometry' and 'tile_id' columns.
        min_overlap_area: Minimum intersection area (exclusive) of a reported pair.
        include_symmetric: If True, each pair (i, j) is followed by its mirror (j, i)
            carrying the opposite relative position and swapped percentages.
        buffer_distance: When greater than 0, both geometries are buffered by this
            distance before being compared.

    Returns:
        DataFrame with the columns tile_id1, tile_id2, index1, index2, overlap_area,
        relative_position, overlap_percentage_1, overlap_percentage_2 and buffered;
        index1 / index2 are positional row indices. Empty when no pair is found.
    """
    pair_records = []
    row_count = len(gdf)

    def build_record(first_id, second_id, first_pos, second_pos, position, first_pct, second_pct, area):
        # One output row; 'buffered' tells whether buffered geometries were compared.
        return {
            'tile_id1': first_id,
            'tile_id2': second_id,
            'index1': first_pos,
            'index2': second_pos,
            'overlap_area': area,
            'relative_position': position,
            'overlap_percentage_1': first_pct,
            'overlap_percentage_2': second_pct,
            'buffered': buffer_distance > 0
        }

    try:
        if not isinstance(gdf, gpd.GeoDataFrame):
            raise TypeError("Input must be a GeoDataFrame")

        if row_count == 0:
            raise ValueError("GeoDataFrame is empty")

        print("Creating spatial index...")
        spatial_index = gdf.sindex

        print(f"Checking overlaps among {row_count} geometries...")
        with tqdm(total=row_count, desc="Checking overlaps") as progress:
            for i in range(row_count):
                current_row = gdf.iloc[i]
                current_geom = current_row['geometry']
                current_tile = current_row['tile_id']

                if current_geom is None or not current_geom.is_valid:
                    print(f"Warning: Skipping invalid geometry for {current_tile}")
                    progress.update(1)
                    continue

                # Only later rows are kept so that each unordered pair is visited once
                # (this also drops the row itself).
                candidates = list(spatial_index.intersection(current_geom.bounds))
                later_candidates = [j for j in candidates if j > i]

                for j in later_candidates:
                    other_row = gdf.iloc[j]
                    other_geom = other_row['geometry']
                    other_tile = other_row['tile_id']

                    if other_geom is None or not other_geom.is_valid:
                        print(f"Warning: Skipping invalid geometry for {other_tile}")
                        continue

                    if buffer_distance > 0:
                        shape_a = current_geom.buffer(buffer_distance)
                        shape_b = other_geom.buffer(buffer_distance)
                    else:
                        shape_a = current_geom
                        shape_b = other_geom

                    if not shape_a.intersects(shape_b):
                        continue

                    shared = shape_a.intersection(shape_b)
                    if shared.is_empty or not shared.area > min_overlap_area:
                        continue

                    info = check_geotiffs_overlap(shape_a, shape_b, min_overlap_area)
                    if not info['overlaps']:
                        continue

                    pair_records.append(build_record(
                        current_tile, other_tile, i, j,
                        info['relative_position'],
                        info['overlap_percentage_1'], info['overlap_percentage_2'],
                        info['overlap_area'],
                    ))

                    if include_symmetric:
                        mirrored_position = get_opposite_position(info['relative_position'])
                        pair_records.append(build_record(
                            other_tile, current_tile, j, i,
                            mirrored_position,
                            info['overlap_percentage_2'], info['overlap_percentage_1'],
                            info['overlap_area'],
                        ))

                progress.update(1)

    except Exception as e:
        print(f"Error during overlap check: {e}")
        traceback.print_exc()

    if not pair_records:
        print("No overlapping pairs found")
        return pd.DataFrame(columns=['tile_id1', 'tile_id2', 'index1', 'index2',
                                    'overlap_area', 'relative_position',
                                    'overlap_percentage_1', 'overlap_percentage_2',
                                    'buffered'])

    overlap_df = pd.DataFrame(pair_records)
    print(f"Found {len(overlap_df)} overlapping pairs")

    print("\nOverlap positions before filtering:")
    for pos, count in overlap_df['relative_position'].value_counts().items():
        print(f"  {pos}: {count}")

    if 'buffered' in overlap_df.columns:
        buffered_count = overlap_df['buffered'].sum()
        print(f"\nOverlaps using buffered geometries: {buffered_count} ({(buffered_count/len(overlap_df))*100:.1f}%)")

    return overlap_df


# Detect the overlapping tile pairs; every pair is listed in both directions.
overlap_df = check_overlaps_in_dataframe(
    gdf_dataset,
    min_overlap_area=1.0,
    include_symmetric=True,
    buffer_distance=BUFFER_DISTANCE,
)

# List the distinct relative positions present in the result.
print("\nUnique position types found:")
unique_positions = sorted(overlap_df['relative_position'].unique())
for pos in unique_positions:
    print(f"  {pos}")


Creating spatial index...
Checking overlaps among 539 geometries...


Checking overlaps:   0%|          | 0/539 [00:00<?, ?it/s]

Found 194 overlapping pairs

Overlap positions before filtering:
  right: 34
  left: 34
  top: 28
  bottom: 28
  top-right: 19
  bottom-left: 19
  top-left: 16
  bottom-right: 16

Overlaps using buffered geometries: 0 (0.0%)

Unique position types found:
  bottom
  bottom-left
  bottom-right
  left
  right
  top
  top-left
  top-right


#### Mettre en background si chevauchement haut ou à droite

In [None]:
def _clamp_window_to_raster(window, raster_width, raster_height):
    """
    Convert a floating-point raster window into integer pixel offsets and sizes that
    lie inside a raster of raster_width x raster_height pixels.

    Returns:
        tuple: (row_off, col_off, height, width) as ints.

    Raises:
        ValueError: If the window contains NaN values ("Invalid window coordinates") or
            covers no pixel of the raster ("Invalid window dimensions").
    """
    if (np.isnan(window.col_off) or np.isnan(window.row_off) or
            np.isnan(window.width) or np.isnan(window.height)):
        raise ValueError("Invalid window coordinates")

    raw_col_off = int(window.col_off)
    raw_row_off = int(window.row_off)
    col_off = max(0, raw_col_off)
    row_off = max(0, raw_row_off)

    # When the window starts before the first column / row, the part cut off by the
    # clamp is removed from the size as well; otherwise the blanked area would reach
    # further into the raster than the overlap does.
    width = int(np.ceil(window.width)) - (col_off - raw_col_off)
    height = int(np.ceil(window.height)) - (row_off - raw_row_off)
    width = min(width, raster_width - col_off)
    height = min(height, raster_height - row_off)

    if width <= 0 or height <= 0:
        raise ValueError("Invalid window dimensions")

    return row_off, col_off, height, width


def remove_overlap_in_geotiffs(overlap_df, gdf_dataset, overlap_positions=None, overwrite=True, buffer_distance=0.01):
    """
    Removes overlapping regions in GeoTIFF files by setting pixel values to 0 in the specified overlap areas.
    Handles both image and mask files. Overlap regions are determined by spatial intersection and relative position.

    For each selected overlap, one of the two tiles is chosen from the relative position
    (tile 1 for positions containing 'right', or 'bottom' without 'left'; tile 2 for
    positions containing 'left', or 'top' without 'right') and the bounding box of the
    intersection is zeroed in all bands of its image and of its mask.

    Args:
        overlap_df: DataFrame with overlap information between tiles ('index1', 'index2',
            'relative_position', 'overlap_area', 'overlap_percentage_1',
            'overlap_percentage_2' and optionally 'buffered').
        gdf_dataset: GeoDataFrame containing GeoTIFF paths ('processed_img_path_tif',
            'processed_mask_path_tif'), 'tile_id' and geometries; 'index1' / 'index2'
            are positional indices into it.
        overlap_positions: List of relative positions to process (default: ['right', 'top', 'top-right', 'bottom-right']).
            Matching is by substring: an entry selects every position that contains it,
            so 'right' also selects 'top-right' and 'bottom-right'.
        overwrite: If True, modifies files in place. If False, writes to new files
            named 'overlap_fixed_<name>' next to the originals.
        buffer_distance: Buffer distance for geometry intersection (should match overlap detection).

    Returns:
        DataFrame with details about processed files (one row per rewritten file);
        empty when nothing was processed. Failures are printed, not returned.
    """
    # Set default positions if not provided
    if overlap_positions is None:
        overlap_positions = ['right', 'top', 'top-right', 'bottom-right']

    processed_files = []
    failed_files = []

    # Exit if there are no overlaps to process
    if len(overlap_df) == 0:
        print("No overlaps to process")
        return pd.DataFrame()

    # Filter overlaps by specified positions (substring match, see docstring)
    filtered_df = overlap_df.copy()
    if overlap_positions:
        position_filter = filtered_df['relative_position'].apply(
            lambda pos: any(p in pos for p in overlap_positions)
        )
        filtered_df = filtered_df[position_filter]
        print(f"Processing {len(filtered_df)} out of {len(overlap_df)} overlaps that match position criteria")

    if len(filtered_df) == 0:
        print("No overlaps match the specified positions")
        return pd.DataFrame()

    path_columns = [("image", 'processed_img_path_tif'), ("mask", 'processed_mask_path_tif')]

    # Process each overlap entry
    for _, row in tqdm(filtered_df.iterrows(), total=len(filtered_df), desc="Processing overlaps"):
        position = row['relative_position']
        tile1 = gdf_dataset.iloc[row['index1']]
        tile2 = gdf_dataset.iloc[row['index2']]

        geom1 = tile1['geometry']
        geom2 = tile2['geometry']

        # Use buffered geometries if the overlap was detected on buffered geometries
        if row.get('buffered', True):
            intersection = geom1.buffer(buffer_distance).intersection(geom2.buffer(buffer_distance))
        else:
            intersection = geom1.intersection(geom2)

        # Skip if intersection is empty or invalid
        if intersection.is_empty or intersection.area <= 0:
            failed_files.append({
                'file_path': f"{tile1['processed_img_path_tif']} / {tile2['processed_img_path_tif']}",
                'file_type': "both",
                'position': position,
                'error': "Empty intersection"
            })
            continue

        # Decide which tile to modify based on overlap position
        if 'right' in position and 'left' not in position:
            modify_idx1 = True
        elif 'left' in position and 'right' not in position:
            modify_idx1 = False
        elif 'top' in position and 'bottom' not in position:
            modify_idx1 = False
        elif 'bottom' in position and 'top' not in position:
            modify_idx1 = True
        elif 'center' in position or 'substantial' in position:
            modify_idx1 = geom1.area <= geom2.area
        else:
            modify_idx1 = row['overlap_percentage_1'] <= row['overlap_percentage_2']

        target_tile, other_tile = (tile1, tile2) if modify_idx1 else (tile2, tile1)
        tile_id = target_tile['tile_id']

        # Process both image and mask files
        for file_type, path_column in path_columns:
            file_to_modify = target_tile[path_column]

            # A tile that was skipped during clipping has no processed path (NaN / None);
            # test for that first because os.path.exists() raises TypeError on a float.
            if pd.isna(file_to_modify) or not os.path.exists(file_to_modify):
                failed_files.append({
                    'file_path': file_to_modify,
                    'file_type': file_type,
                    'position': position,
                    'error': "File does not exist"
                })
                continue

            # Determine where to write: a sibling temp file when overwriting (swapped in
            # afterwards), otherwise a new 'overlap_fixed_' file.
            output_dir = os.path.dirname(file_to_modify)
            base_name = os.path.basename(file_to_modify)
            temp_file = os.path.join(output_dir, f"temp_{base_name}") if overwrite else None
            write_path = temp_file if overwrite else os.path.join(output_dir, f"overlap_fixed_{base_name}")

            try:
                with rasterio.open(file_to_modify) as src:
                    data = src.read()
                    minx, miny, maxx, maxy = intersection.bounds
                    window = from_bounds(minx, miny, maxx, maxy, src.transform)

                    try:
                        row_off, col_off, height, width = _clamp_window_to_raster(
                            window, src.width, src.height
                        )
                    except ValueError as window_error:
                        failed_files.append({
                            'file_path': file_to_modify,
                            'file_type': file_type,
                            'position': position,
                            'error': str(window_error)
                        })
                        continue

                    # Set overlapping region pixels to 0 in every band
                    data[:, row_off:row_off + height, col_off:col_off + width] = 0

                    profile = src.profile

                # Write modified data to file
                with rasterio.open(write_path, 'w', **profile) as dst:
                    dst.write(data)

                # Replace the original in a single step, so that it is never missing
                # from disk if the process is interrupted.
                if overwrite:
                    os.replace(temp_file, file_to_modify)

                processed_files.append({
                    'file_path': file_to_modify,
                    'file_type': file_type,
                    'position': position,
                    'overlap_with': other_tile[path_column],
                    'overlap_area': row['overlap_area'],
                    'modified_pixels': width * height,
                    'tile_id': tile_id
                })

            except Exception as e:
                error_msg = str(e)
                print(f"Error processing {file_type} file {file_to_modify}: {error_msg}")
                if overwrite and temp_file and os.path.exists(temp_file):
                    os.remove(temp_file)
                failed_files.append({
                    'file_path': file_to_modify,
                    'file_type': file_type,
                    'position': position,
                    'error': error_msg
                })

    # Summarize results
    if processed_files:
        results_df = pd.DataFrame(processed_files)
        print(f"Successfully processed {len(results_df)} files")
        if failed_files:
            failed_df = pd.DataFrame(failed_files)
            print(f"Failed to process {len(failed_df)} files")
            print("First few failures:")
            print(failed_df.head())
        return results_df

    if failed_files:
        failed_df = pd.DataFrame(failed_files)
        print(f"Failed to process all {len(failed_df)} files")
        print("First few failures:")
        print(failed_df.head())
    print("No files were processed successfully")
    return pd.DataFrame()

# Blank the overlapping area in one tile of every selected pair (files are rewritten in place).
results = remove_overlap_in_geotiffs(
    overlap_df,
    gdf_dataset,
    overlap_positions=OVERLAP_POSITIONS,
    buffer_distance=BUFFER_DISTANCE,
)

# Report how many image and mask files were rewritten, with a sample of the log.
if results.shape[0] > 0:
    file_type_counts = results['file_type'].value_counts()
    print("\nSummary of processed files:")
    print(f"Total modified files: {len(results)}")
    print("\nFiles by type:")
    print(file_type_counts)
    print("\nSample of processed files:")
    display(results.head())

Processing 113 out of 194 overlaps that match position criteria


Processing overlaps:   0%|          | 0/113 [00:00<?, ?it/s]

Successfully processed 226 files

Summary of processed files:
Total modified files: 226

Files by type:
file_type
image    113
mask     113
Name: count, dtype: int64

Sample of processed files:


Unnamed: 0,file_path,file_type,position,overlap_with,overlap_area,modified_pixels,tile_id
0,datasets/supervisely/dataset_processed_2025052...,image,top,datasets/supervisely/dataset_processed_2025052...,819.2,327680,14_6_cc7d0d
1,datasets/supervisely/dataset_processed_2025052...,mask,top,datasets/supervisely/dataset_processed_2025052...,819.2,327680,14_6_cc7d0d
2,datasets/supervisely/dataset_processed_2025052...,image,top-right,datasets/supervisely/dataset_processed_2025052...,163.84,65536,15_6_7a6487
3,datasets/supervisely/dataset_processed_2025052...,mask,top-right,datasets/supervisely/dataset_processed_2025052...,163.84,65536,15_6_7a6487
4,datasets/supervisely/dataset_processed_2025052...,image,top-right,datasets/supervisely/dataset_processed_2025052...,163.84,65536,17_6_8a7785


#### Vérification

In [None]:
def _overlap_zero_stats(tiff_path, bounds):
    """
    Count the background (0) pixels of band 1 of a GeoTIFF inside a bounding box.

    Args:
        tiff_path: Path of the GeoTIFF to inspect.
        bounds: (minx, miny, maxx, maxy) of the region, in the coordinates
            expected by the raster's transform.

    Returns:
        tuple: (failure, zero_count, zero_percentage). `failure` is None on
        success, 'invalid_window' when the computed window contains NaN, and
        'invalid_dimensions' when the region covers no pixel of the raster.
    """
    minx, miny, maxx, maxy = bounds
    with rasterio.open(tiff_path) as src:
        window = from_bounds(minx, miny, maxx, maxy, src.transform)

        if (np.isnan(window.col_off) or np.isnan(window.row_off) or
                np.isnan(window.width) or np.isnan(window.height)):
            return 'invalid_window', 0, 0.0

        # Clamp BOTH edges of the window to the raster. Clamping only the origin
        # would keep the full width/height when the offset is negative and read
        # pixels that lie outside the requested region.
        col_start = max(0, int(window.col_off))
        row_start = max(0, int(window.row_off))
        col_stop = min(src.width, int(np.ceil(window.col_off + window.width)))
        row_stop = min(src.height, int(np.ceil(window.row_off + window.height)))

        if col_stop <= col_start or row_stop <= row_start:
            return 'invalid_dimensions', 0, 0.0

        data = src.read(1, window=((row_start, row_stop), (col_start, col_stop)))

    zero_count = int(np.sum(data == 0))
    return None, zero_count, (zero_count / data.size) * 100


def verify_overlap_corrections(overlap_df, gdf_dataset, buffer_distance=0.01):
    """
    Checks if overlapping regions in GeoTIFF files have been corrected by verifying
    if one or both tiles contain background values (0) in the overlapping area.
    Handles buffered geometries as needed.

    Args:
        overlap_df: DataFrame with overlap information. Each row must provide
            'index1', 'index2' (positional indices into gdf_dataset),
            'relative_position', 'tile_id1', 'tile_id2' and 'overlap_area'.
            An optional 'buffered' value (default True) controls whether the
            geometries are buffered before being intersected.
        gdf_dataset: GeoDataFrame with GeoTIFF paths ('processed_img_path_tif')
            and geometries ('geometry').
        buffer_distance: Buffer distance for geometry, should match the value used in overlap detection.

    Returns:
        DataFrame with verification results for each overlapping pair. The
        'status' column is one of 'both_have_zeros', 'one_has_zeros',
        'no_zeros' (pair checked), 'invalid_geometry', 'empty_intersection',
        'invalid_window_file<N>', 'invalid_dimensions_file<N>' or
        'error: <message>' (pair skipped). An empty DataFrame is returned when
        there is nothing to verify.
    """
    if len(overlap_df) == 0:
        print("No overlaps to verify")
        return pd.DataFrame()

    checked_statuses = ('both_have_zeros', 'one_has_zeros', 'no_zeros')
    verification_results = []
    skipped_pairs = 0

    print(f"Verifying {len(overlap_df)} overlapping pairs...")
    for _, row in tqdm(overlap_df.iterrows(), total=len(overlap_df), desc="Verifying overlaps"):
        index1 = row['index1']
        index2 = row['index2']
        position = row['relative_position']

        tiff_path1 = gdf_dataset.iloc[index1]['processed_img_path_tif']
        tiff_path2 = gdf_dataset.iloc[index2]['processed_img_path_tif']

        geom1 = gdf_dataset.iloc[index1]['geometry']
        geom2 = gdf_dataset.iloc[index2]['geometry']

        result = {
            'tile_id1': row['tile_id1'],
            'tile_id2': row['tile_id2'],
            'position': position,
            'overlap_area': row['overlap_area'],
            'file1_has_zeros': False,
            'file2_has_zeros': False,
            'file1_zero_percentage': 0.0,
            'file2_zero_percentage': 0.0,
            'both_have_zeros': False,
            'either_has_zeros': False,
            'avg_zero_percentage': 0.0,
            'status': 'unchecked'
        }

        try:
            if geom1 is None or not geom1.is_valid or geom2 is None or not geom2.is_valid:
                result['status'] = 'invalid_geometry'
            else:
                # Apply buffer if required
                if row.get('buffered', True):
                    geom1 = geom1.buffer(buffer_distance)
                    geom2 = geom2.buffer(buffer_distance)

                intersection = geom1.intersection(geom2)

                if intersection.is_empty or intersection.area <= 0:
                    result['status'] = 'empty_intersection'
                else:
                    # Inspect both files in order; stop at the first one whose
                    # window cannot be read so the status names that file.
                    for file_number, tiff_path in ((1, tiff_path1), (2, tiff_path2)):
                        failure, zero_count, zero_percentage = _overlap_zero_stats(
                            tiff_path, intersection.bounds
                        )
                        if failure is not None:
                            result['status'] = f"{failure}_file{file_number}"
                            break
                        result[f'file{file_number}_has_zeros'] = zero_count > 0
                        result[f'file{file_number}_zero_percentage'] = zero_percentage
                    else:
                        # Both files were read: compute summary metrics for the pair
                        result['both_have_zeros'] = result['file1_has_zeros'] and result['file2_has_zeros']
                        result['either_has_zeros'] = result['file1_has_zeros'] or result['file2_has_zeros']
                        result['avg_zero_percentage'] = (
                            result['file1_zero_percentage'] + result['file2_zero_percentage']
                        ) / 2

                        if result['both_have_zeros']:
                            result['status'] = 'both_have_zeros'
                        elif result['either_has_zeros']:
                            result['status'] = 'one_has_zeros'
                        else:
                            result['status'] = 'no_zeros'

        except Exception as e:
            # Keep going: one unreadable pair must not abort the whole verification
            result['status'] = f"error: {str(e)}"

        if result['status'] not in checked_statuses:
            skipped_pairs += 1
        verification_results.append(result)

    df_verification = pd.DataFrame(verification_results)

    # Print summary statistics
    status_counts = df_verification['status'].value_counts()
    print("\nVerification results:")
    for status, count in status_counts.items():
        print(f"  {status}: {count} pairs ({count/len(df_verification)*100:.1f}%)")

    # Show statistics for valid results
    valid_df = df_verification[df_verification['status'].isin(checked_statuses)]

    if len(valid_df) > 0:
        both_zeros_count = valid_df['both_have_zeros'].sum()
        either_zeros_count = valid_df['either_has_zeros'].sum()

        print(f"\n  Pairs where both tiles have zeros in overlap: {both_zeros_count} ({both_zeros_count/len(valid_df)*100:.1f}%)")
        print(f"  Pairs where at least one tile has zeros in overlap: {either_zeros_count} ({either_zeros_count/len(valid_df)*100:.1f}%)")

        avg_zero_pct = valid_df['avg_zero_percentage'].mean()
        print(f"  Average percentage of zeros in overlap areas: {avg_zero_pct:.1f}%")

        # Warn if any pairs have no background pixels in overlap
        failed_verification = valid_df[valid_df['status'] == 'no_zeros']
        if len(failed_verification) > 0:
            print(f"\nWARNING: {len(failed_verification)} pairs have no background pixels in overlap regions!")
            print("\nSample of problematic pairs:")
            display(failed_verification.head(5))

    print(f"\nSkipped {skipped_pairs} pairs due to geometry or window issues")

    return df_verification


In [None]:
# Verify, pair by pair, that the overlap regions of the processed GeoTIFFs
# contain background (0) pixels.
df_verification = verify_overlap_corrections(
    overlap_df,
    gdf_dataset,
    buffer_distance=BUFFER_DISTANCE,
)

if len(df_verification) > 0:
    # Columns shown in both reports below
    overlap_report_columns = [
        'tile_id1', 'tile_id2', 'position',
        'file1_zero_percentage', 'file2_zero_percentage', 'status',
    ]

    # Pairs whose average share of zeros in the overlap is below 10%
    low_zeros = df_verification[df_verification['avg_zero_percentage'].lt(10)]
    if len(low_zeros) > 0:
        print("\nPairs with low zero percentage (<10%):")
        display(low_zeros[overlap_report_columns])

    # Pairs where each of the two files is more than 90% zeros in the overlap
    high_zeros = df_verification[
        df_verification['file1_zero_percentage'].gt(90)
        & df_verification['file2_zero_percentage'].gt(90)
    ]
    if len(high_zeros) > 0:
        print("\nPairs where both files have high zero percentage (>90%):")
        display(high_zeros[overlap_report_columns])

Verifying 194 overlapping pairs...


Verifying overlaps:   0%|          | 0/194 [00:00<?, ?it/s]


Verification results:
  both_have_zeros: 180 pairs (92.8%)
  one_has_zeros: 14 pairs (7.2%)

  Pairs where both tiles have zeros in overlap: 180 (92.8%)
  Pairs where at least one tile has zeros in overlap: 194 (100.0%)
  Average percentage of zeros in overlap areas: 85.2%

Skipped 0 pairs due to geometry or window issues

Pairs where both files have high zero percentage (>90%):


Unnamed: 0,tile_id1,tile_id2,position,file1_zero_percentage,file2_zero_percentage,status
0,15_6_7a6487,14_6_cc7d0d,top,100.0,100.0,both_have_zeros
1,14_6_cc7d0d,15_6_7a6487,bottom,100.0,100.0,both_have_zeros
2,15_6_7a6487,14_7_b86707,top-right,100.0,100.0,both_have_zeros
3,14_7_b86707,15_6_7a6487,bottom-left,100.0,100.0,both_have_zeros
4,17_6_8a7785,16_7_68c3da,top-right,100.0,100.0,both_have_zeros
...,...,...,...,...,...,...
171,3_4_b3ad65,2_5_d310ba,top-right,100.0,100.0,both_have_zeros
172,2_5_d310ba,3_6_9b5376,bottom-right,100.0,100.0,both_have_zeros
173,3_6_9b5376,2_5_d310ba,top-left,100.0,100.0,both_have_zeros
186,3_5_677ebf,4_4_6684a3,bottom-left,100.0,100.0,both_have_zeros


In [None]:
def visualize_overlap_corrections(overlap_df, df_verification, gdf_dataset, dataset_output_checks_path, zero_threshold=99.9):
    """
    Visualize overlap corrections between GeoTIFF files, including mask overlays.

    For every row of `df_verification`, the two image GeoTIFFs are opened, the
    intersection of their bounds is located in each raster and a 2x3 figure is
    saved as PNG in a timestamped sub-directory of `dataset_output_checks_path`.
    The top row shows both tiles with the overlap highlighted and the two
    overlap crops side by side; the bottom row shows the mask overlays when
    both mask files exist. Per-pair statistics are written to
    'overlap_analysis_results.csv' (and 'mask_issues.csv' when both masks are
    non-zero on the same overlap pixels).

    Args:
        overlap_df: DataFrame with overlap information (not read by this
            function; kept so existing callers keep working).
        df_verification: DataFrame with verification results. Each row must
            provide 'tile_id1' and 'tile_id2'; 'position',
            'file1_zero_percentage' and 'file2_zero_percentage' are optional.
        gdf_dataset: GeoDataFrame with GeoTIFF paths and geometries. Must have
            'tile_id' and 'processed_img_path_tif' columns;
            'processed_mask_path_tif' is optional.
        dataset_output_checks_path: Directory to save visualizations.
        zero_threshold: Threshold (%) to consider a region as background (default 99.9).

    Returns:
        dict: Statistics about the visualizations, with keys 'successful',
        'failed', 'mask_issues', 'output_dir' and 'results_path' (None when
        `df_verification` is empty).
    """

    # Create output directory with timestamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    output_dir = os.path.join(dataset_output_checks_path, f"overlap_check_{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    # Counters for results
    successful = 0
    failed = 0
    mask_issues = 0
    results_path = None

    def format_percentage(value):
        """Format a percentage for a title; values that are not numbers render as 'N/A'."""
        try:
            return f"{value:.1f}%"
        except (TypeError, ValueError):
            return "N/A"

    def get_safe_window_data(src, intersection_bounds):
        """
        Extract window data from raster, ensuring dimensions are valid.

        The window is rounded to whole pixels and forced to lie inside the
        raster with a size of at least 1x1. Returns (band-1 data, window).
        """
        minx, miny, maxx, maxy = intersection_bounds
        window = from_bounds(minx, miny, maxx, maxy, src.transform)
        col_off = max(0, min(int(round(window.col_off)), src.width - 1))
        row_off = max(0, min(int(round(window.row_off)), src.height - 1))
        width = max(1, min(int(round(window.width)), src.width - col_off))
        height = max(1, min(int(round(window.height)), src.height - row_off))
        safe_window = Window(col_off, row_off, width, height)
        data = src.read(1, window=safe_window)
        return data, safe_window

    def show_with_overlay(ax, image, channel, alpha, title):
        """Draw a grayscale image with a single-colour RGBA overlay (0=red, 2=blue) on top."""
        ax.imshow(image, cmap='gray')
        overlay = np.zeros((*image.shape, 4))
        overlay[..., channel] = 1
        overlay[..., 3] = alpha
        ax.imshow(overlay)
        ax.set_title(title)
        ax.axis('off')

    def visualize_pair(row, output_path):
        """
        Render and save the figure for one pair; adds analysis fields to `row`.

        Always returns a (success, has_mask_conflict) tuple so the caller can
        unpack it on every path, including early exits and errors.
        """
        tile_id1 = row['tile_id1']
        tile_id2 = row['tile_id2']
        fig = None

        try:
            # Find indices for the tiles
            idx1 = gdf_dataset[gdf_dataset['tile_id'] == tile_id1].index[0]
            idx2 = gdf_dataset[gdf_dataset['tile_id'] == tile_id2].index[0]

            # Get file paths
            tiff_path1 = gdf_dataset.loc[idx1, 'processed_img_path_tif']
            tiff_path2 = gdf_dataset.loc[idx2, 'processed_img_path_tif']

            # Check file existence
            if not os.path.exists(tiff_path1) or not os.path.exists(tiff_path2):
                print(f"Files not found for {tile_id1} and {tile_id2}")
                return False, False

            # Check for mask files
            has_masks = False
            if 'processed_mask_path_tif' in gdf_dataset.columns:
                mask_path1 = gdf_dataset.loc[idx1, 'processed_mask_path_tif']
                mask_path2 = gdf_dataset.loc[idx2, 'processed_mask_path_tif']
                has_masks = (os.path.exists(mask_path1) and os.path.exists(mask_path2))
                if not has_masks:
                    print(f"Warning: Mask files not found for {tile_id1} and/or {tile_id2}")
            else:
                print("Warning: 'processed_mask_path_tif' column not found in dataset, masks will not be visualized")

            # Open raster files and analyze data
            with rasterio.open(tiff_path1) as src1, rasterio.open(tiff_path2) as src2:
                bounds1 = src1.bounds
                bounds2 = src2.bounds

                # Calculate intersection of bounds
                intersection = (
                    max(bounds1.left, bounds2.left),
                    max(bounds1.bottom, bounds2.bottom),
                    min(bounds1.right, bounds2.right),
                    min(bounds1.top, bounds2.top)
                )

                # Check for valid intersection
                if intersection[2] <= intersection[0] or intersection[3] <= intersection[1]:
                    print(f"No valid intersection for {tile_id1} and {tile_id2}")
                    return False, False

                # Extract overlap data using safe window extraction
                data1, window1 = get_safe_window_data(src1, intersection)
                data2, window2 = get_safe_window_data(src2, intersection)

                # Read full images for background
                full_data1 = src1.read(1)
                full_data2 = src2.read(1)

                # Create masks for overlap regions
                overlap_mask1 = np.zeros_like(full_data1, dtype=bool)
                overlap_mask1[window1.row_off:window1.row_off+window1.height,
                              window1.col_off:window1.col_off+window1.width] = True

                overlap_mask2 = np.zeros_like(full_data2, dtype=bool)
                overlap_mask2[window2.row_off:window2.row_off+window2.height,
                              window2.col_off:window2.col_off+window2.width] = True

                # Initialize mask variables
                has_mask_conflict = False
                mask_conflict_percentage = 0
                mask_data1 = None
                mask_data2 = None
                mask_overlap1 = None
                mask_overlap2 = None

                if has_masks:
                    try:
                        with rasterio.open(mask_path1) as mask_src1, rasterio.open(mask_path2) as mask_src2:
                            mask_data1 = mask_src1.read(1)
                            mask_data2 = mask_src2.read(1)
                            mask_overlap1 = mask_src1.read(1, window=window1)
                            mask_overlap2 = mask_src2.read(1, window=window2)

                            # Handle shape mismatches by cropping everything to the common size
                            if mask_overlap1.shape != mask_overlap2.shape:
                                print(f"Mask shape mismatch for {tile_id1} and {tile_id2}: {mask_overlap1.shape} vs {mask_overlap2.shape}")
                                min_height = min(mask_overlap1.shape[0], mask_overlap2.shape[0])
                                min_width = min(mask_overlap1.shape[1], mask_overlap2.shape[1])
                                mask_overlap1 = mask_overlap1[:min_height, :min_width]
                                mask_overlap2 = mask_overlap2[:min_height, :min_width]
                                data1 = data1[:min_height, :min_width]
                                data2 = data2[:min_height, :min_width]

                            # A conflict is a pixel that is non-zero in both masks of the overlap
                            if mask_overlap1.shape == mask_overlap2.shape and mask_overlap1.size > 0:
                                mask_conflict = np.logical_and(mask_overlap1 > 0, mask_overlap2 > 0)
                                has_mask_conflict = bool(np.any(mask_conflict))
                                mask_conflict_percentage = np.sum(mask_conflict) / mask_conflict.size * 100

                    except Exception as e:
                        print(f"Error reading mask files for {tile_id1} and {tile_id2}: {str(e)}")
                        has_masks = False
                        has_mask_conflict = False
                        mask_conflict_percentage = 0

                # Ensure image data has matching dimensions
                if data1.shape != data2.shape:
                    min_height = min(data1.shape[0], data2.shape[0])
                    min_width = min(data1.shape[1], data2.shape[1])
                    data1 = data1[:min_height, :min_width]
                    data2 = data2[:min_height, :min_width]

                overlap_height, overlap_width = data1.shape[0], data1.shape[1]

                # Create visualization figure (closed in the `finally` below, even on error)
                fig, axs = plt.subplots(2, 3, figsize=(18, 12))

                # Row 1: Image analysis

                # Both tiles with their overlap region highlighted
                show_with_overlay(
                    axs[0, 0], full_data1, 0, np.where(overlap_mask1, 0.4, 0),
                    f"Tile {tile_id1}\nZero %: {format_percentage(row.get('file1_zero_percentage', 'N/A'))}",
                )
                show_with_overlay(
                    axs[0, 1], full_data2, 2, np.where(overlap_mask2, 0.4, 0),
                    f"Tile {tile_id2}\nZero %: {format_percentage(row.get('file2_zero_percentage', 'N/A'))}",
                )

                # Composite image of overlap regions (tile 1 crop left, tile 2 crop right)
                if data1.size > 0 and data2.size > 0:
                    composite = np.zeros((overlap_height, overlap_width * 2))
                    composite[:, :overlap_width] = data1
                    composite[:, overlap_width:] = data2
                    axs[0, 2].imshow(composite, cmap='gray')
                    axs[0, 2].axvline(x=overlap_width, color='r', linestyle='--')

                    # Calculate zero percentage in overlap
                    zeros1 = np.sum(data1 == 0) / data1.size * 100
                    zeros2 = np.sum(data2 == 0) / data2.size * 100

                    # Determine content status
                    if zeros1 >= zero_threshold and zeros2 >= zero_threshold:
                        content_status = "both_background"
                    else:
                        content_status = "partial_image"

                    axs[0, 2].set_title(f"Overlap Comparison\nPosition: {row.get('position', 'N/A')}, Status: {content_status}")
                    axs[0, 2].text(overlap_width * 0.5, overlap_height * 0.9,
                                   f"{zeros1:.1f}% zeros", ha='center', color='white',
                                   bbox=dict(facecolor='red', alpha=0.7))
                    axs[0, 2].text(overlap_width * 1.5, overlap_height * 0.9,
                                   f"{zeros2:.1f}% zeros", ha='center', color='white',
                                   bbox=dict(facecolor='blue', alpha=0.7))
                else:
                    axs[0, 2].text(0.5, 0.5, "No overlap data available",
                                   ha='center', va='center', fontsize=12)
                    content_status = "no_data"
                    zeros1 = zeros2 = 0

                axs[0, 2].axis('off')

                # Row 2: Mask analysis

                if has_masks and mask_data1 is not None and mask_data2 is not None:
                    # Both tiles with their mask overlaid
                    show_with_overlay(
                        axs[1, 0], full_data1, 0, np.where(mask_data1 > 0, 0.5, 0),
                        f"Tile {tile_id1} with mask overlay",
                    )
                    show_with_overlay(
                        axs[1, 1], full_data2, 2, np.where(mask_data2 > 0, 0.5, 0),
                        f"Tile {tile_id2} with mask overlay",
                    )

                    # Composite of overlap region with masks
                    if (mask_overlap1 is not None and mask_overlap2 is not None and
                            data1.size > 0 and data2.size > 0):

                        # Grayscale background: each crop normalised by its own maximum
                        mask_composite = np.zeros((overlap_height, overlap_width * 2, 4))
                        max1 = np.max(data1)
                        max2 = np.max(data2)
                        for c in range(3):
                            if max1 > 0:
                                mask_composite[:, :overlap_width, c] = data1 / max1
                            if max2 > 0:
                                mask_composite[:, overlap_width:, c] = data2 / max2
                        mask_composite[..., 3] = 1.0

                        if mask_overlap1.shape == data1.shape and mask_overlap2.shape == data2.shape:
                            mask_overlay_left = np.zeros((data1.shape[0], data1.shape[1], 4))
                            mask_overlay_left[..., 0] = 1.0
                            mask_overlay_left[..., 3] = np.where(mask_overlap1 > 0, 0.5, 0)
                            mask_overlay_right = np.zeros((data2.shape[0], data2.shape[1], 4))
                            mask_overlay_right[..., 2] = 1.0
                            mask_overlay_right[..., 3] = np.where(mask_overlap2 > 0, 0.5, 0)
                            axs[1, 2].imshow(mask_composite)
                            # Pad each overlay with transparent pixels so it covers only its own half
                            axs[1, 2].imshow(np.pad(mask_overlay_left, ((0, 0), (0, overlap_width), (0, 0)), 'constant'))
                            axs[1, 2].imshow(np.pad(mask_overlay_right, ((0, 0), (overlap_width, 0), (0, 0)), 'constant'))
                            axs[1, 2].axvline(x=overlap_width, color='yellow', linestyle='--')
                            if has_mask_conflict:
                                title = f"Mask Overlap Comparison\nWarning: {mask_conflict_percentage:.1f}% mask conflict!"
                            else:
                                title = "Mask Overlap Comparison\nNo mask conflicts"
                            axs[1, 2].set_title(title)
                            axs[1, 2].axis('off')
                        else:
                            axs[1, 2].text(0.5, 0.5, "Mask-image dimension mismatch",
                                           ha='center', va='center', fontsize=12)
                            axs[1, 2].axis('off')
                    else:
                        axs[1, 2].text(0.5, 0.5, "Unable to process mask overlaps",
                                       ha='center', va='center', fontsize=12)
                        axs[1, 2].axis('off')

                else:
                    # No masks available
                    for i in range(3):
                        axs[1, i].text(0.5, 0.5, "No mask files found",
                                       ha='center', va='center', fontsize=12)
                        axs[1, i].axis('off')

                # Add main title and write the figure to disk
                fig.suptitle(f"Overlap Analysis: {tile_id1} and {tile_id2}", fontsize=16, y=0.98)
                fig.tight_layout()
                fig.subplots_adjust(top=0.92)
                fig.savefig(output_path, dpi=150, bbox_inches='tight')

            # Update row with analysis info
            row['content_status'] = content_status
            row['zeros1'] = zeros1
            row['zeros2'] = zeros2

            if has_masks:
                row['has_masks'] = True
                row['has_mask_conflict'] = has_mask_conflict
                row['mask_conflict_percentage'] = mask_conflict_percentage
            else:
                row['has_masks'] = False

            return True, bool(has_masks and has_mask_conflict)

        except Exception as e:
            print(f"Error visualizing pair {tile_id1} and {tile_id2}: {str(e)}")
            traceback.print_exc()
            return False, False

        finally:
            # Release the figure on every path so a failing pair does not leak it
            if fig is not None:
                plt.close(fig)

    # Process each verified pair
    if len(df_verification) > 0:
        print(f"Processing {len(df_verification)} verified pairs...")
        processed_rows = []
        for idx, row in tqdm(df_verification.iterrows(), total=len(df_verification)):
            tile_id1 = row['tile_id1']
            tile_id2 = row['tile_id2']
            position = row.get('position', 'unknown')
            row_copy = row.copy()
            filename = f"{tile_id1}_{tile_id2}_{position}.png"
            output_path = os.path.join(output_dir, filename)
            success, has_mask_issue = visualize_pair(row_copy, output_path)
            if success:
                successful += 1
                if has_mask_issue:
                    mask_issues += 1
                processed_rows.append(row_copy)
            else:
                failed += 1

        # Build the result table once instead of concatenating inside the loop
        results_df = pd.DataFrame(processed_rows).reset_index(drop=True)

        # Save results to CSV
        results_path = os.path.join(output_dir, "overlap_analysis_results.csv")
        results_df.to_csv(results_path, index=False)
        print(f"Saved results to {results_path}")

        # Save mask issues to separate CSV if any
        if mask_issues > 0 and 'has_mask_conflict' in results_df.columns:
            mask_issues_df = results_df[results_df['has_mask_conflict'].eq(True)]
            mask_issues_path = os.path.join(output_dir, "mask_issues.csv")
            mask_issues_df.to_csv(mask_issues_path, index=False)
            print(f"Found {mask_issues} tile pairs with mask issues. Saved to {mask_issues_path}")

    print(f"Visualization complete. Created {successful} visualizations in {output_dir}")
    print(f"- Successful visualizations: {successful}")
    print(f"- Failed visualizations: {failed}")
    print(f"- Pairs with mask issues: {mask_issues}")

    return {
        "successful": successful,
        "failed": failed,
        "mask_issues": mask_issues,
        "output_dir": output_dir,
        "results_path": results_path
    }


In [None]:
# Render one diagnostic figure per verified pair and collect the run statistics.
visualization_results = visualize_overlap_corrections(
    overlap_df=overlap_df,
    df_verification=df_verification,
    gdf_dataset=gdf_dataset,
    dataset_output_checks_path=DATASET_OUTPUT_CHECKS_PATH,
)

# Print the summary, one "label: value" line per statistic
for _label, _key in (
    ("Results saved to directory", "output_dir"),
    ("Successful visualizations", "successful"),
    ("Failed visualizations", "failed"),
    ("Tiles with mask issues", "mask_issues"),
):
    print(f"{_label}: {visualization_results[_key]}")

Processing 194 verified pairs...


  0%|          | 0/194 [00:00<?, ?it/s]

Saved results to datasets/supervisely/dataset_processed_20250523-173715/check_dataset/overlap_check_20250523-173751/overlap_analysis_results.csv
Visualization complete. Created 194 visualizations in datasets/supervisely/dataset_processed_20250523-173715/check_dataset/overlap_check_20250523-173751
- Successful visualizations: 194
- Failed visualizations: 0
- Pairs with mask issues: 0
Results saved to directory: datasets/supervisely/dataset_processed_20250523-173715/check_dataset/overlap_check_20250523-173751
Successful visualizations: 194
Failed visualizations: 0
Tiles with mask issues: 0


### Padding images pour 1280x1280

In [None]:
def _pad_bands_to_canvas(data, target_size, start_x, start_y):
    """Place a (bands, rows, cols) array on a zero-filled canvas of `target_size`.

    Args:
        data (np.ndarray): Raster data shaped (bands, rows, cols).
        target_size (tuple): Canvas size as (width, height).
        start_x (int): Column at which the left edge of `data` is placed.
        start_y (int): Row at which the top edge of `data` is placed.

    Returns:
        np.ndarray: Array shaped (bands, height, width) with the dtype of `data`.

    Raises:
        ValueError: If `data` does not fit inside the canvas at the given offset.
    """
    bands, rows, cols = data.shape
    canvas = np.zeros((bands, target_size[1], target_size[0]), dtype=data.dtype)
    canvas[:, start_y:start_y + rows, start_x:start_x + cols] = data
    return canvas


def _write_geotiff(path, data, meta):
    """Write `data` to `path` as a raster described by the rasterio `meta` dict."""
    with rasterio.open(path, 'w', **meta) as dst:
        dst.write(data)


def _write_via_temp_file(dest_path, suffix, write_fn):
    """Run `write_fn(tmp_path)` on a temp file, then move it to `dest_path`.

    The temp file is removed if writing or moving fails, so a failed pair
    never leaves stray files behind or a half-written destination.
    """
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp_path = tmp.name
    try:
        write_fn(tmp_path)
        shutil.move(tmp_path, dest_path)
    finally:
        # After a successful move the temp path no longer exists.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def standardize_image_dimensions(img_dir, mask_dir, target_size=(1280, 1280), overwrite=True):
    """
    Pads images and masks to a target size with identical padding for each pair.

    Each image smaller than `target_size` is centred on a zero-filled canvas and its
    geotransform is shifted so that the original pixels keep their coordinates. The
    matching mask (same base name) receives the same offsets and, for GeoTIFF masks,
    the same geotransform as the image. Both padded arrays are built in memory before
    anything is written, so a failure while reading the mask leaves the pair untouched.

    Args:
        img_dir (str): Directory containing GeoTIFF images ('.tif' / '.tiff').
        mask_dir (str): Directory containing the masks (GeoTIFF, or PNG as a fallback
            when no GeoTIFF mask with the same base name exists).
        target_size (tuple): Desired (width, height) for output images and masks.
        overwrite (bool): If True, overwrite original files. If False, write new files
            with a '_padded' suffix next to the originals.

    Returns:
        list: List of dictionaries for each modified file pair:
            {
                'img_file': path to the written image file,
                'mask_file': path to the written mask file,
                'from_size': original (width, height),
                'to_size': new (width, height),
                'padding': (start_x, start_y, pad_width, pad_height)
            }
    """
    target_w, target_h = target_size
    raster_exts = ('.tif', '.tiff')

    img_files = [f for f in os.listdir(img_dir) if f.endswith(raster_exts)]

    # Map mask base names to file names. PNG masks are registered first so that a
    # GeoTIFF mask with the same base name takes precedence.
    mask_names = os.listdir(mask_dir)
    mask_map = {}
    for mask_file in mask_names:
        if mask_file.lower().endswith('.png'):
            mask_map[os.path.splitext(mask_file)[0]] = mask_file
    for mask_file in mask_names:
        if mask_file.endswith(raster_exts):
            mask_map[os.path.splitext(mask_file)[0]] = mask_file

    total_images = len(img_files)
    resized_pairs = 0
    errors = 0
    skipped = 0
    already_at_target = 0
    modified_files = []

    print(f"Processing {total_images} images to ensure {target_w}x{target_h} dimensions...")

    for img_filename in tqdm(img_files, desc="Standardizing images"):
        try:
            img_path = os.path.join(img_dir, img_filename)
            img_basename, img_ext = os.path.splitext(img_filename)
            mask_filename = mask_map.get(img_basename)

            if not mask_filename:
                print(f"Warning: No matching mask found for {img_filename}. Skipping.")
                skipped += 1
                continue

            mask_path = os.path.join(mask_dir, mask_filename)
            if not os.path.exists(mask_path):
                print(f"Warning: Mask file {mask_path} not found. Skipping pair.")
                skipped += 1
                continue

            # Read the image once; the handle is closed before any file is replaced.
            with rasterio.open(img_path) as src:
                height, width = src.height, src.width

                if (width, height) == (target_w, target_h):
                    already_at_target += 1
                    continue

                # Keep the signed difference: clamping it to zero would hide
                # images that are larger than the target.
                pad_width = target_w - width
                pad_height = target_h - height
                if pad_width < 0 or pad_height < 0:
                    print(f"Warning: {img_filename} is larger than target size. Skipping pair.")
                    skipped += 1
                    continue

                start_x = pad_width // 2
                start_y = pad_height // 2

                padded_data = _pad_bands_to_canvas(src.read(), target_size, start_x, start_y)

                # Move the raster origin by the padding offset, expressed in pixel
                # space, so the original pixels keep their world coordinates
                # (this also holds for transforms with rotation terms).
                new_transform = src.transform * rasterio.Affine.translation(-start_x, -start_y)

                img_meta = src.meta.copy()

            img_meta.update({
                'height': target_h,
                'width': target_w,
                'transform': new_transform
            })

            # Prepare the padded mask before touching any file on disk.
            mask_is_png = mask_filename.lower().endswith('.png')
            if mask_is_png:
                with Image.open(mask_path) as mask_img:
                    mask_width, mask_height = mask_img.size
                    if (mask_width, mask_height) != (width, height):
                        print(f"Warning: Dimensions mismatch between {img_filename} and {mask_filename}. Using GeoTIFF dimensions.")
                    padded_mask = Image.new(mask_img.mode, (target_w, target_h), 0)
                    padded_mask.paste(mask_img, (start_x, start_y))
            else:
                with rasterio.open(mask_path) as mask_src:
                    mask_height, mask_width = mask_src.height, mask_src.width
                    if (mask_width, mask_height) != (width, height):
                        print(f"Warning: Dimensions mismatch between {img_filename} and {mask_filename}. Using image dimensions.")
                    padded_mask = _pad_bands_to_canvas(mask_src.read(), target_size, start_x, start_y)
                    mask_meta = mask_src.meta.copy()
                # The mask takes the image's shifted transform so both stay aligned.
                mask_meta.update({
                    'height': target_h,
                    'width': target_w,
                    'transform': new_transform
                })

            if overwrite:
                out_img_path, out_mask_path = img_path, mask_path
            else:
                mask_basename, mask_ext = os.path.splitext(mask_filename)
                out_img_path = os.path.join(img_dir, f"{img_basename}_padded{img_ext}")
                out_mask_path = os.path.join(mask_dir, f"{mask_basename}_padded{mask_ext}")

            _write_via_temp_file(
                out_img_path, '.tif',
                lambda tmp_path: _write_geotiff(tmp_path, padded_data, img_meta)
            )
            if mask_is_png:
                _write_via_temp_file(out_mask_path, '.png', padded_mask.save)
            else:
                _write_via_temp_file(
                    out_mask_path, '.tif',
                    lambda tmp_path: _write_geotiff(tmp_path, padded_mask, mask_meta)
                )

            modified_files.append({
                'img_file': out_img_path,
                'mask_file': out_mask_path,
                'from_size': (width, height),
                'to_size': (target_w, target_h),
                'padding': (start_x, start_y, pad_width, pad_height)
            })
            resized_pairs += 1

        except Exception as e:
            # Best effort: report the failing pair and carry on with the next one.
            errors += 1
            print(f"Error processing {img_filename}: {e}")

    outcome = "overwritten" if overwrite else "saved with '_padded' suffix"
    print("Standardization complete:")
    print(f"- Total images processed: {total_images}")
    print(f"- Image/mask pairs resized and {outcome}: {resized_pairs}")
    print(f"- Pairs skipped: {skipped}")
    print(f"- Errors encountered: {errors}")
    print(f"- Images already at target size: {already_at_target}")

    return modified_files

# Pad every undersized image/mask pair in the processed dataset, then list a few examples
modified_files = standardize_image_dimensions(
    target_size=(1280, 1280),
    mask_dir=DATASET_OUTPUT_MASKS_PATH,
    img_dir=DATASET_OUTPUT_IMG_PATH,
)

if not modified_files:
    print("\nNo files were modified.")
else:
    print(f"\nModified {len(modified_files)} file pairs. First 5 examples:")
    first_examples = modified_files[:5]
    for i, file_info in enumerate(first_examples):
        print(f"{i+1}. {os.path.basename(file_info['img_file'])}: {file_info['from_size']} -> {file_info['to_size']}")


Processing 538 images to ensure 1280x1280 dimensions...


Standardizing images:   0%|          | 0/538 [00:00<?, ?it/s]

Standardization complete:
- Total images processed: 538
- Image/mask pairs resized and overwritten: 42
- Pairs skipped: 0
- Errors encountered: 0
- Images already at target size: 496

Modified 42 file pairs. First 5 examples:
1. 25011116_tile_13_19_d8b3fa.tif: (672, 1280) -> (1280, 1280)
2. 25001117_tile_19_7_894243.tif: (1280, 672) -> (1280, 1280)
3. 24971119_tile_9_19_924944.tif: (672, 1280) -> (1280, 1280)
4. 25061123_tile_19_5_1c6023.tif: (1280, 672) -> (1280, 1280)
5. 24911110_tile_0_19_c476a4.tif: (672, 1280) -> (1280, 1280)


In [None]:
def verify_padding(processed_img_dir, processed_mask_dir, output_dir, show_images=False, modified_files=None,
                   modified_sample_count=5, unmodified_sample_count=5):
    """
    Verifies padding consistency for all GeoTIFF and mask pairs.

    Every image in `processed_img_dir` is compared with the mask of the same base
    name in `processed_mask_dir` (extensions tried in order: .tif, .tiff, .png, .PNG)
    and the pair counts as verified when both have the same width and height.
    A random sample of modified and unmodified pairs is then rendered
    (image, mask, overlay) and saved to `output_dir`.

    Args:
        processed_img_dir (str): Directory containing processed GeoTIFF images.
        processed_mask_dir (str): Directory containing processed mask files.
        output_dir (str): Directory to save verification results and visualizations.
        show_images (bool): If True, displays images inline. Figures are always saved
            and closed afterwards.
        modified_files (list): List of dictionaries with information about modified
            files; the keys 'img_file', 'from_size' and 'padding' are read here.
        modified_sample_count (int): Number of modified samples to visualize.
        unmodified_sample_count (int): Number of unmodified samples to visualize.

    Returns:
        dict: Summary of verification results with the keys 'total_files',
            'verified_files', 'dimension_mismatches', 'missing_masks',
            'modified_files_count', 'unmodified_files_count' and 'read_errors'.
            A summary (all counters at zero) is also returned when the image
            directory contains no GeoTIFF, so callers can always iterate over it.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # List all GeoTIFF files in the image directory
    all_tiff_files = [f for f in os.listdir(processed_img_dir) if f.endswith(('.tif', '.tiff'))]

    # Lookup: image file name -> padding record produced by the padding step
    modified_info = {os.path.basename(info['img_file']): info for info in (modified_files or [])}

    # Separate files into modified and unmodified groups
    modified_tiff_files = [f for f in all_tiff_files if f in modified_info]
    unmodified_tiff_files = [f for f in all_tiff_files if f not in modified_info]

    summary = {
        'total_files': len(all_tiff_files),
        'verified_files': 0,
        'dimension_mismatches': 0,
        'missing_masks': 0,
        'modified_files_count': len(modified_tiff_files),
        'unmodified_files_count': len(unmodified_tiff_files),
        'read_errors': 0
    }

    if not all_tiff_files:
        print("No GeoTIFF files found for verification.")
        return summary

    print(f"Found {len(modified_tiff_files)} modified files and {len(unmodified_tiff_files)} unmodified files")
    print(f"Verifying all {summary['total_files']} image-mask pairs...")

    def find_mask_filename(img_basename):
        """Return the first existing mask file name for `img_basename`, or None."""
        for ext in ('.tif', '.tiff', '.png', '.PNG'):
            candidate_mask = img_basename + ext
            if os.path.exists(os.path.join(processed_mask_dir, candidate_mask)):
                return candidate_mask
        return None

    def rescale_to_unit(values):
        """Min-max scale an array to [0, 1]; a constant array maps to all zeros."""
        low, high = values.min(), values.max()
        if high > low:
            # Work in float so integer rasters cannot overflow on the subtraction.
            return (values.astype(float) - float(low)) / (float(high) - float(low))
        return np.zeros(values.shape)

    # Check dimensions for all image-mask pairs
    for img_filename in tqdm(all_tiff_files, desc="Verifying files"):
        img_basename = os.path.splitext(img_filename)[0]
        mask_filename = find_mask_filename(img_basename)

        if not mask_filename:
            print(f"No matching mask found for {img_filename}.")
            summary['missing_masks'] += 1
            continue

        img_path = os.path.join(processed_img_dir, img_filename)
        mask_path = os.path.join(processed_mask_dir, mask_filename)

        try:
            with rasterio.open(img_path) as src:
                geotiff_width, geotiff_height = src.width, src.height

            if mask_filename.lower().endswith(('.tif', '.tiff')):
                with rasterio.open(mask_path) as mask_src:
                    mask_width, mask_height = mask_src.width, mask_src.height
            else:
                with Image.open(mask_path) as mask_img:
                    mask_width, mask_height = mask_img.size
        except Exception as e:
            # Count unreadable pairs so the summary totals add up.
            print(f"Error verifying {img_filename}: {e}")
            summary['read_errors'] += 1
            continue

        if (geotiff_height, geotiff_width) != (mask_height, mask_width):
            print(f"Dimension mismatch for {img_basename}: GeoTIFF {geotiff_width}x{geotiff_height}, "
                  f"Mask {mask_width}x{mask_height}")
            summary['dimension_mismatches'] += 1
        else:
            summary['verified_files'] += 1

    # Print summary of verification
    print("\nVerification Summary:")
    print(f"- Total files checked: {summary['total_files']}")
    print(f"- Successfully verified pairs: {summary['verified_files']}")
    print(f"- Dimension mismatches: {summary['dimension_mismatches']}")
    print(f"- Missing masks: {summary['missing_masks']}")
    print(f"- Read errors: {summary['read_errors']}")

    def visualize_sample(img_filename, sample_type):
        """
        Visualizes a single image-mask pair, showing the image, mask, and overlay.
        For modified files, outlines the area occupied by the original image.
        Saves the visualization to the output directory.

        Args:
            img_filename (str): Filename of the image to visualize.
            sample_type (str): Label for the sample type ("Modified" or "Unmodified").

        Returns:
            bool: True if visualization was successful, False otherwise.
        """
        img_basename = os.path.splitext(img_filename)[0]
        mask_filename = find_mask_filename(img_basename)

        if not mask_filename:
            print(f"No mask found for {img_filename}")
            return False

        img_path = os.path.join(processed_img_dir, img_filename)
        mask_path = os.path.join(processed_mask_dir, mask_filename)

        fig = None
        try:
            # Only the first band of the image is displayed
            with rasterio.open(img_path) as src:
                geotiff_data = src.read(1)

            if mask_filename.lower().endswith(('.tif', '.tiff')):
                with rasterio.open(mask_path) as mask_src:
                    mask_data = mask_src.read(1)
            else:
                with Image.open(mask_path) as mask_img:
                    mask_data = np.array(mask_img)
                if len(mask_data.shape) == 3:
                    mask_data = mask_data[:, :, 0]

            # Report the size actually read from disk instead of a fixed value
            current_height, current_width = geotiff_data.shape
            padding_info = modified_info.get(img_filename)
            if padding_info is not None:
                original_size = padding_info['from_size']
                dimension_text = (f"Original: {original_size[0]}×{original_size[1]} → "
                                  f"Current: {current_width}×{current_height}")
            else:
                dimension_text = f"Original: {current_width}×{current_height} (no change needed)"

            # Create visualization with three subplots
            fig, axes = plt.subplots(1, 3, figsize=(18, 6))

            axes[0].imshow(geotiff_data, cmap='gray')
            axes[0].set_title(f"GeoTIFF: {img_filename}")
            axes[0].axis('off')

            axes[1].imshow(mask_data, cmap='gray')
            axes[1].set_title(f"Mask: {mask_filename}")
            axes[1].axis('off')

            # Overlay for alignment check: image in red, mask in blue
            overlay = np.zeros((geotiff_data.shape[0], geotiff_data.shape[1], 3))
            overlay[:, :, 0] = rescale_to_unit(geotiff_data)
            overlay[:, :, 2] = rescale_to_unit(mask_data)

            axes[2].imshow(overlay)
            axes[2].set_title("Overlay (purple shows alignment)")
            axes[2].axis('off')

            # Outline the original (pre-padding) image area on every panel
            if padding_info is not None:
                from matplotlib.patches import Rectangle

                start_x, start_y = padding_info['padding'][0], padding_info['padding'][1]
                width, height = padding_info['from_size']
                rect_style = dict(linewidth=2, edgecolor='yellow', facecolor='none', linestyle='--')
                # A patch belongs to a single axes, so each panel gets its own instance.
                for ax in axes:
                    ax.add_patch(Rectangle((start_x, start_y), width, height, **rect_style))

            fig.suptitle(f"{sample_type} Sample: {img_basename}\n{dimension_text}", fontsize=16)
            fig.tight_layout()

            save_path = os.path.join(output_dir, f"{sample_type.lower()}_sample_{img_basename}.png")
            fig.savefig(save_path, dpi=150, bbox_inches='tight')

            if show_images:
                plt.show()

            print(f"Visualization for {img_basename} ({sample_type}):")
            print(f"- GeoTIFF dimensions: {geotiff_data.shape}")
            print(f"- Mask dimensions: {mask_data.shape}")
            print(f"- {dimension_text}")
            print(f"- Verification image saved to: {save_path}")
            print("-" * 50)

            return True

        except Exception as e:
            print(f"Error visualizing {img_filename}: {e}")
            return False

        finally:
            # Release the figure on success and on failure alike.
            if fig is not None:
                plt.close(fig)

    # Visualize a random sample of each group (sampling is not seeded here)
    sample_plan = (
        ("Modified", modified_tiff_files, modified_sample_count),
        ("Unmodified", unmodified_tiff_files, unmodified_sample_count),
    )
    for sample_type, candidates, requested in sample_plan:
        if requested > 0 and candidates:
            sample_size = min(requested, len(candidates))
            print(f"\nGenerating visualizations for {sample_size} {sample_type.lower()} samples...")

            chosen = random.sample(candidates, sample_size)
            successful_visualizations = sum(
                1 for img_filename in chosen if visualize_sample(img_filename, sample_type)
            )

            print(f"Successfully created {successful_visualizations} {sample_type.lower()} sample visualizations")

    return summary

# Check that every image/mask pair has matching dimensions and save a few
# sample overlays (5 padded pairs, 3 untouched pairs) for visual inspection.
verification_results = verify_padding(
    processed_img_dir=DATASET_OUTPUT_IMG_PATH,
    processed_mask_dir=DATASET_OUTPUT_MASKS_PATH,
    output_dir=DATASET_OUTPUT_CHECKS_PATH + "/padding_verification",
    modified_files=modified_files,
    modified_sample_count=5,
    unmodified_sample_count=3,
    show_images=False
)

print("\nFinal Verification Results:")
# Guard against a missing summary (e.g. None when no GeoTIFF was found) so that
# this cell does not fail with AttributeError on `.items()`.
for key, value in (verification_results or {}).items():
    print(f"   {key}: {value}")

Found 42 modified files and 496 unmodified files
Verifying all 538 image-mask pairs...


Verifying files:   0%|          | 0/538 [00:00<?, ?it/s]


Verification Summary:
- Total files checked: 538
- Successfully verified pairs: 538
- Dimension mismatches: 0
- Missing masks: 0

Generating visualizations for 5 modified samples...
Visualization for 24911110_tile_0_19_c476a4 (Modified):
- GeoTIFF dimensions: (1280, 1280)
- Mask dimensions: (1280, 1280)
- Original: 672×1280 → Current: 1280×1280
- Verification image saved to: datasets/supervisely/dataset_processed_20250523-173715/check_dataset/padding_verification/modified_sample_24911110_tile_0_19_c476a4.png
--------------------------------------------------
Visualization for 25001117_tile_8_19_d29b79 (Modified):
- GeoTIFF dimensions: (1280, 1280)
- Mask dimensions: (1280, 1280)
- Original: 672×1280 → Current: 1280×1280
- Verification image saved to: datasets/supervisely/dataset_processed_20250523-173715/check_dataset/padding_verification/modified_sample_25001117_tile_8_19_d29b79.png
--------------------------------------------------
Visualization for 24971119_tile_19_9_e55938 (Modifi

## Validation des données

In [None]:
# Banner for the cleanup stage: all-zero files are deleted and the dataset updated below
print("PRE-VERIFICATION CLEANUP: REMOVING 100% ZERO FILES", "=" * 60, sep="\n")

def remove_zero_files_and_records(gdf_dataset, dataset_output_img_path, dataset_output_masks_path):
    """
    Remove image/mask pairs whose image is 100% zeros, and drop their dataframe records.

    A record is removed when its image contains only zeros in every band, or when
    its image or mask exists but cannot be read. For removed records, the image and
    mask files that exist on disk are deleted. An all-zero mask alone does not cause
    removal. Paths that are missing or do not exist on disk are not inspected.

    Args:
        gdf_dataset: The GeoDataFrame with file paths; the columns 'tile_id',
            'processed_img_path_tif' and 'processed_mask_path_tif' are read.
            The input frame itself is not modified.
        dataset_output_img_path: Path to processed images directory. Not used by
            this function (file paths come from the dataframe); kept so existing
            callers keep working.
        dataset_output_masks_path: Path to processed masks directory. Not used by
            this function; kept so existing callers keep working.

    Returns:
        Cleaned GeoDataFrame with the removed records dropped and the index reset
        when at least one record was removed.
    """
    def is_on_disk(path):
        """True when `path` is set and points to an existing file."""
        return pd.notna(path) and os.path.exists(path)

    def is_all_zero(path):
        """True when every band of the raster at `path` contains only zeros."""
        with rasterio.open(path) as src:
            # Stop at the first band holding data, so a normal image costs one band read.
            for band_index in range(1, src.count + 1):
                if src.read(band_index).any():
                    return False
        return True

    print(f"Starting with {len(gdf_dataset)} records")
    gdf_cleaned = gdf_dataset.copy()
    zero_image_tiles = []
    unreadable_tiles = []
    files_to_delete = []
    records_to_remove = []
    print("\nScanning for 100% zero files...")

    for idx, row in gdf_cleaned.iterrows():
        img_path = row.get('processed_img_path_tif')
        mask_path = row.get('processed_mask_path_tif')
        tile_id = row['tile_id']
        img_on_disk = is_on_disk(img_path)
        mask_on_disk = is_on_disk(mask_path)
        img_all_zero = False
        mask_all_zero = False
        read_error = False

        # Check if image is all zeros
        if img_on_disk:
            try:
                img_all_zero = is_all_zero(img_path)
            except Exception as e:
                print(f"   Error reading image {tile_id}: {e}")
                read_error = True
            else:
                if img_all_zero:
                    print(f"   Image is 100% zeros: {tile_id}")

        # Check if mask is all zeros
        if mask_on_disk:
            try:
                mask_all_zero = is_all_zero(mask_path)
            except Exception as e:
                print(f"   Error reading mask {tile_id}: {e}")
                read_error = True
            else:
                if mask_all_zero:
                    print(f"   Mask is 100% zeros: {tile_id}")

        if img_all_zero or read_error:
            records_to_remove.append(idx)
            if img_all_zero:
                zero_image_tiles.append(tile_id)
                if mask_all_zero:
                    print(f"   Marked for removal: {tile_id} (both image and mask are 100% zeros)")
                else:
                    print(f"   Marked for removal: {tile_id} (image is 100% zeros, removing both)")
            else:
                # Reported separately so the zero-image count stays accurate.
                unreadable_tiles.append(tile_id)
                print(f"   Marked for removal: {tile_id} (file read errors)")
            if img_on_disk:
                files_to_delete.append(img_path)
            if mask_on_disk:
                files_to_delete.append(mask_path)
        elif mask_all_zero:
            print(f"   Keeping: {tile_id} (image has content, mask is 100% zeros - acceptable)")
        else:
            print(f"   Keeping: {tile_id} (both image and mask have content)")

    print(f"\nFound {len(zero_image_tiles)} records where image is 100% zeros")
    if zero_image_tiles:
        print(f"   Records to remove: {zero_image_tiles}")
        print("   Removing pairs only when image is 100% zeros (mask state doesn't matter)")
    if unreadable_tiles:
        print(f"Found {len(unreadable_tiles)} records with unreadable files")
        print(f"   Records to remove: {unreadable_tiles}")

    # Delete files from disk
    print(f"\nDeleting {len(files_to_delete)} files from disk...")
    deleted_count = 0
    delete_errors = 0
    for file_path in files_to_delete:
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
                deleted_count += 1
                print(f"   Deleted: {os.path.basename(file_path)}")
            else:
                print(f"   File already missing: {os.path.basename(file_path)}")
        except Exception as e:
            print(f"   Error deleting {os.path.basename(file_path)}: {e}")
            delete_errors += 1

    print(f"   Successfully deleted {deleted_count} files")
    if delete_errors > 0:
        print(f"   Failed to delete {delete_errors} files")

    # Remove records from dataframe
    if records_to_remove:
        print(f"\nRemoving {len(records_to_remove)} records from dataframe...")
        gdf_cleaned = gdf_cleaned.drop(records_to_remove)
        gdf_cleaned = gdf_cleaned.reset_index(drop=True)
        print(f"   Dataframe now has {len(gdf_cleaned)} records")
    else:
        print("\nNo records to remove from dataframe")

    # Cleanup summary
    print("\nCleanup summary:")
    print(f"   Original records: {len(gdf_dataset)}")
    print(f"   Records removed: {len(records_to_remove)}")
    print(f"   Final records: {len(gdf_cleaned)}")
    print(f"   Files deleted: {deleted_count}")

    return gdf_cleaned

# Drop every record whose image or mask file is not present on disk
print("Filtering out records with missing files")
gdf_dataset_filtered = gdf_dataset.copy()
missing_file_records = [
    idx
    for idx, row in gdf_dataset_filtered.iterrows()
    if not all(
        pd.notna(path) and os.path.exists(path)
        for path in (row.get('processed_img_path_tif'), row.get('processed_mask_path_tif'))
    )
]

if not missing_file_records:
    print("No records with missing files found")
else:
    gdf_dataset_filtered = gdf_dataset_filtered.drop(missing_file_records)
    print(f"Removed {len(missing_file_records)} records with missing files")

print(f"After filtering missing files: {len(gdf_dataset_filtered)} records")

# Delete all-zero image/mask pairs from disk and from the dataframe
print("\nRemoving 100% zero files and records")
gdf_dataset_cleaned = remove_zero_files_and_records(
    dataset_output_masks_path=DATASET_OUTPUT_MASKS_PATH,
    dataset_output_img_path=DATASET_OUTPUT_IMG_PATH,
    gdf_dataset=gdf_dataset_filtered,
)

# Announce the final verification pass and the number of records it will receive
print("\nRunning final verification on cleaned dataset")
print(f"Records going into verification: {len(gdf_dataset_cleaned)}")

def final_verification_checks_no_zeros(gdf_dataset, dataset_output_img_path, dataset_output_masks_path):
    """
    Perform final verification checks on the dataset.
    Zero-content checking is skipped as it was already done in preprocessing.

    The input frame is left untouched: all checks run on a copy that gains a
    'validation_processing' column holding 'ok' or 'ko' for every record.

    Checks, in order:
      1. every referenced image / mask file exists on disk;
      2. a random sample of up to 10 records has 1280x1280 images and masks;
      3. 'tile_id', 'processed_img_path_tif' and 'processed_mask_path_tif'
         are present and contain no nulls;
      4. geometries (only when a 'geometry' column exists) are non-null and valid;
      5. both output directories exist and contain every referenced file name;
      6. no row is entirely null and the frame uses less than 1000 MB;
      7. images with byte-identical pixel data are flagged as duplicates.

    Checks 1-6 abort with an AssertionError on failure. Check 7, and images
    that cannot be read during it, only mark the affected records as 'ko'.

    Args:
        gdf_dataset: DataFrame / GeoDataFrame with one record per tile.
        dataset_output_img_path: Directory expected to hold the image files.
        dataset_output_masks_path: Directory expected to hold the mask files.

    Returns:
        A copy of ``gdf_dataset`` with the extra 'validation_processing' column.

    Raises:
        AssertionError: If the dataset is empty or any of checks 1-6 fails.
    """
    # Local import: only the duplicate check needs hashlib, and the notebook's
    # import cell does not provide it.
    import hashlib

    print("RUNNING FINAL VERIFICATION CHECKS (NO ZERO CHECK)")
    print("=" * 50)
    # Fail fast: none of the checks below is meaningful without records
    assert len(gdf_dataset) > 0, "Dataset is empty"
    gdf_dataset = gdf_dataset.copy()
    gdf_dataset['validation_processing'] = 'ok'
    print(f"\nInitialized validation_processing column for {len(gdf_dataset)} records")

    # Verify all referenced files exist
    print("\nVerifying all referenced files exist...")
    missing_images = []
    missing_masks = []
    for idx, row in gdf_dataset.iterrows():
        img_path = row.get('processed_img_path_tif')
        if pd.notna(img_path) and not os.path.exists(img_path):
            missing_images.append(row['tile_id'])
            gdf_dataset.loc[idx, 'validation_processing'] = 'ko'
        mask_path = row.get('processed_mask_path_tif')
        if pd.notna(mask_path) and not os.path.exists(mask_path):
            missing_masks.append(row['tile_id'])
            gdf_dataset.loc[idx, 'validation_processing'] = 'ko'
    assert len(missing_images) == 0, f"Missing image files for tiles: {missing_images[:10]}"
    assert len(missing_masks) == 0, f"Missing mask files for tiles: {missing_masks[:10]}"
    print(f"   All {len(gdf_dataset)} image and mask files exist")

    # Check file dimensions
    print("\nVerifying file dimensions are 1280x1280...")
    target_size = (1280, 1280)
    dimension_errors = []
    sample_size = min(10, len(gdf_dataset))
    # np.random.choice yields row POSITIONS, not index labels. Reads and writes
    # below are both positional, so a non-contiguous index (rows dropped
    # upstream) can neither flag the wrong record nor append a new one.
    status_col = gdf_dataset.columns.get_loc('validation_processing')
    sample_positions = np.random.choice(len(gdf_dataset), sample_size, replace=False)
    for position in sample_positions:
        row = gdf_dataset.iloc[position]
        tile_id = row['tile_id']
        img_path = row.get('processed_img_path_tif')
        if pd.notna(img_path):
            with rasterio.open(img_path) as src:
                if (src.width, src.height) != target_size:
                    dimension_errors.append(f"Image {tile_id}: {src.width}x{src.height}")
                    gdf_dataset.iloc[position, status_col] = 'ko'
        mask_path = row.get('processed_mask_path_tif')
        if pd.notna(mask_path):
            with rasterio.open(mask_path) as mask_src:
                if (mask_src.width, mask_src.height) != target_size:
                    dimension_errors.append(f"Mask {tile_id}: {mask_src.width}x{mask_src.height}")
                    gdf_dataset.iloc[position, status_col] = 'ko'
    assert len(dimension_errors) == 0, f"Dimension errors found: {dimension_errors}"
    print(f"   Sample check: All {sample_size} files have correct 1280x1280 dimensions")

    # Validate critical columns
    print("\nValidating critical columns...")
    required_columns = ['tile_id', 'processed_img_path_tif', 'processed_mask_path_tif']
    for col in required_columns:
        assert col in gdf_dataset.columns, f"Required column missing: {col}"
        null_count = gdf_dataset[col].isnull().sum()
        assert null_count == 0, f"Found {null_count} null values in required column: {col}"
    print("   All required columns present with no null values")

    # Validate geometries
    print("\nValidating geometries...")
    if 'geometry' in gdf_dataset.columns:
        null_geoms = gdf_dataset['geometry'].isnull().sum()
        assert null_geoms == 0, f"Found {null_geoms} null geometries"
        invalid_geoms = []
        for idx, row in gdf_dataset.iterrows():
            if not row['geometry'].is_valid:
                invalid_geoms.append(row['tile_id'])
                gdf_dataset.loc[idx, 'validation_processing'] = 'ko'
        assert len(invalid_geoms) == 0, f"Invalid geometries found for tiles: {invalid_geoms[:10]}"
        print(f"   All {len(gdf_dataset)} geometries are valid")

    # Verify directory structure
    print("\nVerifying directory structure...")
    assert os.path.exists(dataset_output_img_path), f"Image directory does not exist: {dataset_output_img_path}"
    assert os.path.exists(dataset_output_masks_path), f"Mask directory does not exist: {dataset_output_masks_path}"
    # Both path columns were asserted present above, so direct column access is safe
    expected_img_files = {
        os.path.basename(path) for path in gdf_dataset['processed_img_path_tif'] if pd.notna(path)
    }
    expected_mask_files = {
        os.path.basename(path) for path in gdf_dataset['processed_mask_path_tif'] if pd.notna(path)
    }
    tif_suffixes = ('.tif', '.tiff')
    actual_img_files = {f for f in os.listdir(dataset_output_img_path) if f.endswith(tif_suffixes)}
    actual_mask_files = {f for f in os.listdir(dataset_output_masks_path) if f.endswith(tif_suffixes)}
    missing_expected_imgs = expected_img_files - actual_img_files
    missing_expected_masks = expected_mask_files - actual_mask_files
    assert len(missing_expected_imgs) == 0, f"Expected image files missing: {list(missing_expected_imgs)[:5]}"
    assert len(missing_expected_masks) == 0, f"Expected mask files missing: {list(missing_expected_masks)[:5]}"
    print(f"   Directory structure correct: {len(expected_img_files)} expected images, {len(expected_mask_files)} expected masks")

    # Check basic data integrity
    print("\nChecking basic data integrity...")
    empty_rows = gdf_dataset.isnull().all(axis=1).sum()
    assert empty_rows == 0, f"Found {empty_rows} completely empty rows"
    memory_mb = gdf_dataset.memory_usage(deep=True).sum() / 1024**2
    assert memory_mb < 1000, f"Dataset unusually large: {memory_mb:.1f} MB"
    print(f"   Dataset integrity OK: {len(gdf_dataset)} records, {memory_mb:.1f} MB")

    # Zero content check skipped (already done)
    print("\nZero content check: SKIPPED (already done in preprocessing)")

    # Check for duplicate images
    print("\nChecking for duplicate images...")
    # content digest -> [(index label, tile_id), ...] for every image with that content.
    # A SHA-256 digest is used instead of the built-in hash() so that two different
    # images cannot realistically be reported as duplicates of each other.
    records_by_digest = {}
    for idx, row in gdf_dataset.iterrows():
        img_path = row.get('processed_img_path_tif')
        tile_id = row['tile_id']
        if pd.notna(img_path) and os.path.exists(img_path):
            try:
                with rasterio.open(img_path) as src:
                    img_digest = hashlib.sha256(src.read().tobytes()).hexdigest()
            except Exception as e:
                # Best effort: an unreadable image is flagged, not fatal
                print(f"   Error processing image {tile_id}: {e}")
                gdf_dataset.loc[idx, 'validation_processing'] = 'ko'
                continue
            records_by_digest.setdefault(img_digest, []).append((idx, tile_id))

    duplicate_members = [members for members in records_by_digest.values() if len(members) > 1]
    duplicate_groups = [[tile_id for _, tile_id in members] for members in duplicate_members]
    duplicate_count = 0
    for members in duplicate_members:
        for idx, _ in members:
            # Flag by index label so every member is marked, even if tile_ids repeat
            gdf_dataset.loc[idx, 'validation_processing'] = 'ko'
            duplicate_count += 1
    print(f"   Found {len(duplicate_groups)} duplicate groups affecting {duplicate_count} files")
    if duplicate_groups:
        print(f"   Example duplicate group: {duplicate_groups[0]}")

    # Validation summary
    print("\nValidation summary...")
    validation_counts = gdf_dataset['validation_processing'].value_counts()
    ok_count = validation_counts.get('ok', 0)
    ko_count = validation_counts.get('ko', 0)
    print(f"   Records marked 'ok': {ok_count}")
    print(f"   Records marked 'ko': {ko_count}")
    print(f"   Success rate: {ok_count/len(gdf_dataset)*100:.1f}%")
    print("\n" + "=" * 50)
    # Only claim success when nothing was flagged; duplicates and unreadable
    # images reach this point without raising.
    if ko_count == 0:
        print("ALL VERIFICATION CHECKS PASSED")
    else:
        print(f"VERIFICATION FINISHED WITH {ko_count} RECORDS MARKED 'ko'")
    print(f"Dataset validated: {ok_count} OK, {ko_count} KO")
    if ko_count == 0:
        print("Dataset is ready for saving")
    else:
        print("Dataset is NOT ready for saving: fix or drop the 'ko' records first")
    print("=" * 50)
    return gdf_dataset

# Validate the cleaned dataset; the returned copy carries the per-record verdict
gdf_dataset_final = final_verification_checks_no_zeros(
    gdf_dataset=gdf_dataset_cleaned,
    dataset_output_img_path=DATASET_OUTPUT_IMG_PATH,
    dataset_output_masks_path=DATASET_OUTPUT_MASKS_PATH,
)

# Report how many records ended up 'ok' / 'ko'
results_banner = "=" * 60
validation_status = gdf_dataset_final['validation_processing']
print("\n" + results_banner)
print("FINAL RESULTS AFTER CLEANUP AND VERIFICATION")
print(results_banner)
print("Final validation counts:")
print(validation_status.value_counts())
print(f"\nDataset ready for saving: {len(gdf_dataset_final)} total records")
print(f"High-quality records: {validation_status.eq('ok').sum()}")

# Hard stop: nothing flagged 'ko' may reach the save step
remaining_ko = validation_status.eq('ko').sum()
assert remaining_ko == 0, "There are still 'ko' records in the dataset"

PRE-VERIFICATION CLEANUP: REMOVING 100% ZERO FILES
STEP 1: Filter out records with missing files
Removed 1 records with missing files
After filtering missing files: 538 records

STEP 2: Remove 100% zero files and records
Starting with 538 records

1. Scanning for 100% zero files...
   KEEPING: 10_12_8d26a8 (both image and mask have content)
   KEEPING: 0_6_9201e5 (both image and mask have content)
   KEEPING: 18_4_b35ef3 (both image and mask have content)
   Mask is 100% zeros: 1_3_14c592
   KEEPING: 1_3_14c592 (image has content, mask is 100% zeros - acceptable)
   Mask is 100% zeros: 15_17_5212bb
   KEEPING: 15_17_5212bb (image has content, mask is 100% zeros - acceptable)
   Mask is 100% zeros: 18_16_c9d875
   KEEPING: 18_16_c9d875 (image has content, mask is 100% zeros - acceptable)
   KEEPING: 4_3_5bdc96 (both image and mask have content)
   KEEPING: 4_8_a4faea (both image and mask have content)
   KEEPING: 13_19_d8b3fa (both image and mask have content)
   KEEPING: 3_3_878a8d (bo

## Sauvegarder

In [None]:
# Save verification and dataset results to parquet files for future use.
# to_parquet does not create missing parent folders, and the configuration cell
# only creates the dataset folders, so make sure each destination exists first.
for parquet_path in (
    VERIFICATION_OUTPUT_PARQUET_PATH,
    DATASET_OUTPUT_PARQUET_PATH,
    DATASET_FINAL_OUTPUT_PARQUET_PATH,
):
    parquet_dir = os.path.dirname(parquet_path)
    if parquet_dir:  # a bare file name has no parent folder to create
        os.makedirs(parquet_dir, exist_ok=True)

df_verification.to_parquet(VERIFICATION_OUTPUT_PARQUET_PATH, index=False)
gdf_dataset.to_parquet(DATASET_OUTPUT_PARQUET_PATH, index=False)
gdf_dataset_final.to_parquet(DATASET_FINAL_OUTPUT_PARQUET_PATH, index=False)