# GeoTIFF Tiling and Building Detection

**Objective:** Process SITG orthophotos to generate tiles containing building footprints.

**Workflow:**
1. Load GeoTIFF orthophotos and building footprint data
2. Split each large GeoTIFF into fixed-size tiles (1024 px by default), each saved with a surrounding pixel buffer
3. Detect buildings within each tile using spatial indexing
4. Write per-GeoTIFF tile metadata (Parquet) for tiles containing buildings, then merge it into one combined metadata file

## Imports

In [None]:
import os
import uuid
import rasterio
import geopandas as gpd
import pandas as pd
from rasterio.windows import Window
from shapely.geometry import box
from loguru import logger
from tqdm.auto import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import cairosvg

from PIL import Image

## Configuration

In [None]:
# Output image of the process diagram rendered in the "Process Visualization" cell
VISUALISATION_04B_01_PNG_PATH = "data/notebook_04/graphics/04b_01_visualisation_04b.png"

# Input data: directory of orthophoto GeoTIFFs and the building/roof GeoPackage
GEOTIFF_ORTHO2019_PATH = "data/SITG/ortho2019"
GPKG_TO_MERGE_PATH = "data/SITG/CAD_BATIMENT_HORSOL_TOIT_2024-11-03.gpkg"

# Output directory for tiles and metadata (its files are deleted at the start of a batch run)
TILE_1024_FOLDER_PATH = "data/notebook_04/geotiff/tile_1024_split"

# Tile edge length in pixels, and the pixel buffer added around each tile (1/8 of a tile)
TILE_SIZE = 1024
TILE_BUFFER = TILE_SIZE // 8

LOG_FILE = "data/notebook_04/log/04b_split_geotiff.log"

# Leave one core free for the notebook itself. os.cpu_count() may return None,
# and on a single-core host "count - 1" would be 0, which ProcessPoolExecutor
# rejects, so the value is clamped to at least one worker.
CPU_COUNT = max(1, (os.cpu_count() or 1) - 1)

In [None]:
# Validate that the configured paths exist before doing any work.
# Explicit exceptions are used instead of `assert` so the check still runs
# under `python -O` and the error names the path that is missing.
for _required_path in (GPKG_TO_MERGE_PATH, GEOTIFF_ORTHO2019_PATH, TILE_1024_FOLDER_PATH):
    if not os.path.exists(_required_path):
        raise FileNotFoundError(f"Required path does not exist: {_required_path}")

## Logger Setup

In [None]:
def setup_logger(log_file=None, log_level="INFO"):
    """
    Reset loguru and attach the console (and optionally file) sinks.

    Console records are routed through ``tqdm.write`` so that log lines do
    not break active progress bars. When a log file is given, a second sink
    is added that rotates at 10 MB, zips rotated files and keeps them for
    one week.

    Parameters:
        log_file (str): Path to log file. If None, only console logging is enabled
        log_level (str): Logging level (DEBUG, INFO, WARNING, ERROR)
    """
    console_format = "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {function}:{line} - {message}"

    # Start from a clean slate: drop the handlers registered so far
    logger.remove()

    # Console sink (colorized, tqdm-friendly)
    console_sink = {
        "sink": lambda msg: tqdm.write(msg, end=""),
        "colorize": True,
        "format": console_format,
        "level": log_level,
    }
    logger.add(**console_sink)

    # Optional file sink with rotation, compression and retention
    if log_file:
        file_sink = {
            "sink": log_file,
            "format": file_format,
            "level": log_level,
            "rotation": "10 MB",
            "compression": "zip",
            "retention": "1 week",
        }
        logger.add(**file_sink)

    logger.info("Logger initialized")

## Process Visualization

In [None]:
# SVG source of the process diagram (inputs -> tiling pipeline -> outputs).
# Fixes: the source-raster label now reads 20000x20000 like the input box, and
# the disabled "Quadrillage" arrow label is a real XML comment (a leading "#"
# does not comment anything out inside XML).
svg = """<?xml version="1.0" encoding="UTF-8"?>
<svg viewBox="0 0 800 600" xmlns="http://www.w3.org/2000/svg">
  <!-- Background -->
  <rect x="0" y="0" width="800" height="600" fill="#f8f9fa" rx="6" ry="6"/>
  
  <!-- Title -->
  <text x="400" y="30" font-family="Arial" font-size="22" font-weight="bold" text-anchor="middle" fill="#333">GeoTIFF Building Detection and Tiling Process</text>
  
  <!-- Input Files Section -->
  <rect x="50" y="60" width="700" height="80" rx="10" ry="10" fill="#e3f2fd" stroke="#2196f3" stroke-width="2"/>
  <text x="400" y="85" font-family="Arial" font-size="18" font-weight="bold" text-anchor="middle" fill="#0d47a1">Input Data</text>
  
  <!-- Input Icons -->
  <rect x="100" y="95" width="250" height="30" rx="5" ry="5" fill="#bbdefb" stroke="#1976d2" stroke-width="1"/>
  <text x="225" y="115" font-family="Arial" font-size="14" text-anchor="middle" fill="#0d47a1">GeoTIFF SITG 20000x20000</text>
  
  <rect x="450" y="95" width="250" height="30" rx="5" ry="5" fill="#bbdefb" stroke="#1976d2" stroke-width="1"/>
  <text x="575" y="115" font-family="Arial" font-size="14" text-anchor="middle" fill="#0d47a1">GPKG SITG Toitures</text>
  
  <!-- Process Flow -->
  <rect x="50" y="160" width="700" height="310" rx="10" ry="10" fill="#e8f5e9" stroke="#4caf50" stroke-width="2"/>
  <text x="400" y="185" font-family="Arial" font-size="18" font-weight="bold" text-anchor="middle" fill="#1b5e20">Processing Pipeline</text>

  <!-- Main Image Processing -->
  <rect x="70" y="200" width="320" height="250" rx="8" ry="8" fill="#fff" stroke="#2e7d32" stroke-width="1"/>
  <text x="230" y="225" font-family="Arial" font-size="16" font-weight="bold" text-anchor="middle" fill="#1b5e20">GeoTIFF 20000x20000</text>
  
  <!-- Main Image Representation -->
  <rect x="100" y="240" width="140" height="140" fill="#c8e6c9" stroke="#2e7d32" stroke-width="1"/>
  <line x1="100" y1="240" x2="240" y2="240" stroke="#2e7d32" stroke-width="1"/>
  <line x1="100" y1="270" x2="240" y2="270" stroke="#2e7d32" stroke-width="1"/>
  <line x1="100" y1="300" x2="240" y2="300" stroke="#2e7d32" stroke-width="1"/>
  <line x1="100" y1="330" x2="240" y2="330" stroke="#2e7d32" stroke-width="1"/>
  <line x1="100" y1="240" x2="100" y2="380" stroke="#2e7d32" stroke-width="1"/>
  <line x1="135" y1="240" x2="135" y2="380" stroke="#2e7d32" stroke-width="1"/>
  <line x1="170" y1="240" x2="170" y2="380" stroke="#2e7d32" stroke-width="1"/>
  <line x1="205" y1="240" x2="205" y2="380" stroke="#2e7d32" stroke-width="1"/>
  <line x1="240" y1="240" x2="240" y2="380" stroke="#2e7d32" stroke-width="1"/>
  
  <!-- Building Polygons -->
  <polygon points="125,260 145,250 165,265 155,280 135,275" fill="#f44336" fill-opacity="0.5" stroke="#d32f2f" stroke-width="1.5"/>
  <polygon points="205,320 220,310 230,330 210,340" fill="#f44336" fill-opacity="0.5" stroke="#d32f2f" stroke-width="1.5"/>
  <polygon points="110,340 130,330 140,350 120,360" fill="#f44336" fill-opacity="0.5" stroke="#d32f2f" stroke-width="1.5"/>
  
  <!-- Tiling Process -->
  <text x="170" y="395" font-family="Arial" font-size="14" text-anchor="middle" fill="#1b5e20">GeoTIFF + GPKG Toitures</text>
  
  <!-- Tiling Result -->
  <rect x="410" y="200" width="320" height="250" rx="8" ry="8" fill="#fff" stroke="#2e7d32" stroke-width="1"/>
  <text x="570" y="225" font-family="Arial" font-size="16" font-weight="bold" text-anchor="middle" fill="#1b5e20">Quadrillage 1024x1024</text>
  
  <!-- Tile Representations -->
  <rect x="430" y="240" width="90" height="90" fill="#c8e6c9" stroke="#2e7d32" stroke-width="2"/>
  <polygon points="470,260 485,250 500,265 490,280 475,275" fill="#f44336" fill-opacity="0.5" stroke="#d32f2f" stroke-width="1.5"/>
  <rect x="430" y="240" width="90" height="90" fill="none" stroke="#2e7d32" stroke-width="1" stroke-dasharray="4,2"/>
  
  <rect x="530" y="240" width="90" height="90" fill="#e3f2fd" stroke="#2196f3" stroke-width="1" fill-opacity="0.3"/>
  
  <rect x="430" y="340" width="90" height="90" fill="#e3f2fd" stroke="#2196f3" stroke-width="1" fill-opacity="0.3"/>
  
  <rect x="530" y="340" width="90" height="90" fill="#c8e6c9" stroke="#2e7d32" stroke-width="2"/>
  <polygon points="570,360 590,355 600,375 580,380" fill="#f44336" fill-opacity="0.5" stroke="#d32f2f" stroke-width="1.5"/>
  <rect x="530" y="340" width="90" height="90" fill="none" stroke="#2e7d32" stroke-width="1" stroke-dasharray="4,2"/>
  
  <!-- Buffer Illustration -->
  <rect x="420" y="230" width="110" height="110" fill="none" stroke="#ff9800" stroke-width="1.5" stroke-dasharray="5,3"/>
  <rect x="520" y="330" width="110" height="110" fill="none" stroke="#ff9800" stroke-width="1.5" stroke-dasharray="5,3"/>
  
  <text x="670" y="390" font-family="Arial" font-size="14" text-anchor="middle" fill="#1b5e20">Buffer 1m</text>
  
  <!-- Process Arrows -->
  <path d="M 250 310 L 400 310" stroke="#2e7d32" stroke-width="2" fill="none" marker-end="url(#arrowhead)"/>
  <!-- <text x="325" y="300" font-family="Arial" font-size="14" text-anchor="middle" fill="#1b5e20">Quadrillage</text> -->
  
  <!-- Arrow Marker -->
  <defs>
    <marker id="arrowhead" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto">
      <polygon points="0 0, 10 3.5, 0 7" fill="#2e7d32"/>
    </marker>
  </defs>
  
  <!-- Output Section -->
  <rect x="50" y="490" width="700" height="80" rx="10" ry="10" fill="#fff3e0" stroke="#ff9800" stroke-width="2"/>
  <text x="400" y="515" font-family="Arial" font-size="18" font-weight="bold" text-anchor="middle" fill="#e65100">Output Data</text>
  
  <!-- Output Icons -->
  <rect x="100" y="525" width="170" height="30" rx="5" ry="5" fill="#ffe0b2" stroke="#f57c00" stroke-width="1"/>
  <text x="185" y="545" font-family="Arial" font-size="14" text-anchor="middle" fill="#e65100">Tiled GeoTIFF Images</text>
  
  <rect x="315" y="525" width="170" height="30" rx="5" ry="5" fill="#ffe0b2" stroke="#f57c00" stroke-width="1"/>
  <text x="400" y="545" font-family="Arial" font-size="14" text-anchor="middle" fill="#e65100">Tile Metadata (Parquet)</text>
  
  <rect x="530" y="525" width="170" height="30" rx="5" ry="5" fill="#ffe0b2" stroke="#f57c00" stroke-width="1"/>
  <text x="615" y="545" font-family="Arial" font-size="14" text-anchor="middle" fill="#e65100">Combined Metadata</text>
  
  <!-- Final Arrow -->
  <path d="M 400 470 L 400 490" stroke="#e65100" stroke-width="2" fill="none" marker-end="url(#arrowhead2)"/>
  
  <!-- Arrow Marker -->
  <defs>
    <marker id="arrowhead2" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto">
      <polygon points="0 0, 10 3.5, 0 7" fill="#e65100"/>
    </marker>
  </defs>

</svg>
"""

# Generate and display process visualization.
# The target directory is created first so a fresh checkout does not fail
# when cairosvg tries to write the PNG.
_png_dir = os.path.dirname(VISUALISATION_04B_01_PNG_PATH)
if _png_dir:
    os.makedirs(_png_dir, exist_ok=True)
cairosvg.svg2png(bytestring=svg, write_to=VISUALISATION_04B_01_PNG_PATH)
display(Image.open(VISUALISATION_04B_01_PNG_PATH))

## Core Processing Functions

In [None]:
def process_geotiff_with_buildings(geotiff_path, gpkg_path, output_dir, tile_size=1024, buffer_size=256):
    """
    Process a GeoTIFF file to create tiles containing building footprints.

    Walks the raster on a regular ``tile_size`` grid. For every grid cell whose
    buffered extent intersects at least one building from the GeoPackage, the
    cell plus its pixel buffer (clipped to the raster) is written as a GeoTIFF
    into ``output_dir`` and one metadata row per intersecting building is
    recorded. Grid cells at the right/bottom edge that are smaller than half a
    tile in either dimension are skipped.

    Parameters:
        geotiff_path (str): Path to input GeoTIFF file
        gpkg_path (str): Path to GeoPackage containing building geometries
        output_dir (str): Directory for output tiles and metadata
        tile_size (int): Size of output tiles in pixels (default: 1024)
        buffer_size (int): Buffer around tiles in pixels (default: 256)

    Returns:
        str: Path to the ``<geotiff name>_metadata.parquet`` file written to
        ``output_dir``, or None if no building intersects the raster or no
        tile with buildings was produced
    """
    # Loop-invariant base name: prefixes every tile file and the metadata file
    original_name = os.path.splitext(os.path.basename(geotiff_path))[0]

    # Load building geometries with original geometry preserved
    buildings_gdf = gpd.read_file(gpkg_path)
    # Copy taken before any reprojection below; this copy is what gets exported as WKT.
    # NOTE(review): to_crs() presumably only reprojects the active geometry column,
    # so this copy would stay in the GeoPackage's own CRS — confirm this is intended.
    buildings_gdf['original_geometry'] = buildings_gdf.geometry

    with rasterio.open(geotiff_path) as src:
        # Extract georeferencing information (EPSG:2056 is assumed when the raster has no CRS)
        crs = src.crs if src.crs is not None else "EPSG:2056"
        transform = src.transform
        height, width = src.height, src.width

        # Raster footprint, grown by 1 CRS unit so buildings touching the edge are kept
        bounds = src.bounds
        geotiff_bbox = box(bounds.left, bounds.bottom, bounds.right, bounds.top)
        buffered_geotiff_bbox = geotiff_bbox.buffer(1)

        # Ensure consistent CRS
        if buildings_gdf.crs != crs:
            buildings_gdf = buildings_gdf.to_crs(crs)

        # Find buildings within image extent
        buildings_in_image = buildings_gdf[buildings_gdf.intersects(buffered_geotiff_bbox)].copy()

        if buildings_in_image.empty:
            logger.warning(f"No buildings found in {geotiff_path}")
            return None

        # Spatial index for efficient per-tile querying (None triggers the brute-force fallback)
        spatial_index = getattr(buildings_in_image, 'sindex', None)

        # Calculate tile grid dimensions (ceil division so partial edge cells are visited)
        n_tiles_height = (height + tile_size - 1) // tile_size
        n_tiles_width = (width + tile_size - 1) // tile_size

        # Pixel buffer expressed in CRS units via the pixel width (loop-invariant)
        search_distance = buffer_size * transform.a

        # Initialize metadata storage
        tiles_metadata = []

        # The context manager closes the progress bar even if a tile raises
        with tqdm(total=n_tiles_height * n_tiles_width,
                  desc=f"Processing {os.path.basename(geotiff_path)}",
                  unit="tile") as pbar:
            for i in range(n_tiles_height):
                for j in range(n_tiles_width):
                    pbar.update(1)

                    # Calculate tile boundaries
                    row_start = i * tile_size
                    col_start = j * tile_size
                    actual_height = min(tile_size, height - row_start)
                    actual_width = min(tile_size, width - col_start)

                    # Skip edge tiles smaller than half a tile in either dimension
                    if actual_height < tile_size // 2 or actual_width < tile_size // 2:
                        continue

                    # Define tile extent
                    tile_window = Window(col_start, row_start, actual_width, actual_height)
                    tile_bounds = rasterio.windows.bounds(tile_window, transform)
                    tile_bbox = box(*tile_bounds)

                    # Expand search area with buffer
                    buffered_tile_bbox = tile_bbox.buffer(search_distance)

                    # Find intersecting buildings: coarse bounding-box query, then exact test
                    if spatial_index is not None:
                        possible_matches_index = list(spatial_index.intersection(buffered_tile_bbox.bounds))
                        possible_matches = buildings_in_image.iloc[possible_matches_index]
                        buildings_in_tile = possible_matches[possible_matches.intersects(buffered_tile_bbox)]
                    else:
                        buildings_in_tile = buildings_in_image[buildings_in_image.intersects(buffered_tile_bbox)]

                    if buildings_in_tile.empty:
                        continue

                    # Generate unique tile identifier (grid position plus a random suffix)
                    tile_id = f"{i}_{j}_{uuid.uuid4().hex[:6]}"
                    tile_name = f"{original_name}_tile_{tile_id}"

                    # Calculate buffered window bounds, clipped to the raster.
                    # NOTE(review): when the start is clamped to 0 (first row/column) the
                    # extent is still tile + 2 * buffer, so the far side gets a double
                    # buffer. Kept as-is because downstream code may rely on the tile
                    # size — confirm whether a symmetric buffer is wanted.
                    buffered_row_start = max(0, row_start - buffer_size)
                    buffered_col_start = max(0, col_start - buffer_size)
                    buffered_height = min(height - buffered_row_start, actual_height + 2 * buffer_size)
                    buffered_width = min(width - buffered_col_start, actual_width + 2 * buffer_size)

                    # Read tile data with buffer
                    buffered_window = Window(buffered_col_start, buffered_row_start, buffered_width, buffered_height)
                    tile_data = src.read(window=buffered_window)

                    # Save tile as GeoTIFF, georeferenced to the buffered window
                    tile_path = os.path.join(output_dir, f"{tile_name}.tif")
                    tile_transform = rasterio.windows.transform(buffered_window, transform)

                    profile = src.profile.copy()
                    profile.update({
                        'height': buffered_height,
                        'width': buffered_width,
                        'transform': tile_transform,
                        'driver': 'GTiff'
                    })

                    with rasterio.open(tile_path, 'w', **profile) as dst:
                        dst.write(tile_data)

                    # Store one metadata row per building in the tile
                    for _, building in buildings_in_tile.iterrows():
                        metadata = {
                            'geotiff_path': geotiff_path,
                            'tile_path': tile_path,
                            'tile_id': tile_id,
                            'tile_row': i,
                            'tile_col': j,
                            'building_geometry': building.original_geometry.wkt,
                            'tile_bounds': str(tile_bounds),
                            'buffered_bounds': str(rasterio.windows.bounds(buffered_window, transform)),
                            'tile_size': tile_size,
                            'buffer_size': buffer_size,
                            'tile_pixel_size': (actual_width, actual_height)
                        }

                        # Include all building attributes, prefixed to avoid clashing with tile keys
                        for col in building.index:
                            if col not in ['geometry', 'original_geometry']:
                                metadata[f'CAD_BTT_HS_TOIT_{col}'] = building[col]

                        tiles_metadata.append(metadata)

        if not tiles_metadata:
            logger.warning(f"No valid tiles with buildings found in {geotiff_path}")
            return None

        # Save metadata to parquet
        df = pd.DataFrame(tiles_metadata)
        parquet_path = os.path.join(output_dir, f"{original_name}_metadata.parquet")
        df.to_parquet(parquet_path)

        logger.success(f"Processed {len(df)} building tiles from {geotiff_path}")
        return parquet_path

def process_file(args):
    """
    Run the tiling of one GeoTIFF inside a worker process.

    Takes a single tuple so it can be submitted directly to an executor.
    Any exception raised while processing is logged and converted into a
    ``None`` result, so one failing file does not abort the whole batch.

    Parameters:
        args (tuple): Arguments tuple containing:
            - geotiff_file: Filename to process
            - geotiff_dir: Directory containing GeoTIFF
            - gpkg_path: Path to building GeoPackage
            - output_dir: Output directory
            - tile_size: Tile size in pixels
            - buffer_size: Buffer size in pixels

    Returns:
        str: Path to generated metadata file, None if processing failed
    """
    (geotiff_file, source_dir, buildings_path,
     target_dir, tile_px, buffer_px) = args

    outcome = None
    try:
        full_path = os.path.join(source_dir, geotiff_file)
        logger.info(f"Processing {geotiff_file}")
        outcome = process_geotiff_with_buildings(
            full_path,
            buildings_path,
            target_dir,
            tile_size=tile_px,
            buffer_size=buffer_px,
        )
    except Exception as e:
        # Best effort: report the failure and let the batch continue
        logger.error(f"Failed on {geotiff_file}: {str(e)}")
        outcome = None
    return outcome

def batch_process_geotiffs(geotiff_dir, gpkg_path, output_dir, log_file=None, 
                         tile_size=1024, buffer_size=256, cpu_count=4):
    """
    Process multiple GeoTIFF files in parallel.

    Coordinates parallel processing of all GeoTIFF files in a directory,
    generating tiles and combining metadata into a single output file.

    WARNING: every regular file directly inside ``output_dir`` is deleted
    before processing starts. For that reason the function refuses to run
    when ``output_dir`` and ``geotiff_dir`` resolve to the same directory.

    Parameters:
        geotiff_dir (str): Directory containing GeoTIFF files
        gpkg_path (str): Path to GeoPackage with building footprints
        output_dir (str): Directory for output tiles and metadata
        log_file (str): Path to log file (optional)
        tile_size (int): Size of tiles in pixels (default: 1024)
        buffer_size (int): Buffer size in pixels (default: 256)
        cpu_count (int): Number of worker processes to use (default: 4).
            Values below 1 are raised to 1; None lets the executor choose.

    Returns:
        str: Path to combined metadata file, None if no files processed
    """
    setup_logger(log_file=log_file)

    # Clearing the output directory would wipe the inputs if both are the same place
    if os.path.realpath(output_dir) == os.path.realpath(geotiff_dir):
        logger.error(
            f"Output directory must differ from the GeoTIFF directory: {output_dir}"
        )
        return None

    # Prepare output directory
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Output directory: {output_dir}")

    # Clear existing files (sub-directories are left untouched)
    for f in os.listdir(output_dir):
        file_path = os.path.join(output_dir, f)
        if os.path.isfile(file_path):
            os.remove(file_path)
    logger.info(f"Cleared output directory: {output_dir}")

    # Find all GeoTIFF files, ignoring the "*_with_crs.tif" derivatives
    geotiff_files = sorted([
        f for f in os.listdir(geotiff_dir) 
        if f.lower().endswith(('.tif', '.tiff')) and not f.endswith('_with_crs.tif')
    ])

    if not geotiff_files:
        logger.error("No GeoTIFF files found")
        return None

    logger.info(f"Found {len(geotiff_files)} GeoTIFFs to process")

    # Prepare arguments for parallel processing
    args_list = [(f, geotiff_dir, gpkg_path, output_dir, tile_size, buffer_size) 
                for f in geotiff_files]

    # ProcessPoolExecutor rejects max_workers <= 0 (e.g. "cpu_count() - 1" on one core)
    max_workers = None if cpu_count is None else max(1, int(cpu_count))

    # Process files in parallel
    metadata_files = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_file, args): args[0] for args in args_list}

        for future in tqdm(as_completed(futures), total=len(futures), desc="Overall Progress"):
            geotiff_file = futures[future]
            try:
                result = future.result()
                if result:
                    metadata_files.append(result)
                    logger.success(f"Completed {geotiff_file}")
                else:
                    logger.warning(f"No results for {geotiff_file}")
            except Exception as e:
                # Reached for failures outside process_file itself (e.g. a crashed worker)
                logger.error(f"Error processing {geotiff_file}: {str(e)}")

    if not metadata_files:
        logger.error("No metadata files were generated")
        return None

    # Combine metadata from all processed files
    logger.info(f"Combining {len(metadata_files)} metadata files...")
    combined_df = pd.concat([pd.read_parquet(f) for f in metadata_files], ignore_index=True)

    # Clean column names for readability (drop the building-attribute prefix)
    combined_df.columns = [col.replace('CAD_BTT_HS_TOIT_', '') for col in combined_df.columns]

    # Strip the braces around the GUID. regex=False keeps "{" / "}" literal on
    # every pandas version; the guard avoids losing the whole run to a KeyError
    # when the GeoPackage has no such column.
    if 'globalid' in combined_df.columns:
        combined_df['globalid'] = (
            combined_df['globalid']
            .str.replace("{", "", regex=False)
            .str.replace("}", "", regex=False)
        )
    else:
        logger.warning("Column 'globalid' not found in combined metadata")

    # Save combined results
    combined_path = os.path.join(output_dir, "combined_metadata.parquet")
    combined_df.to_parquet(combined_path)
    logger.success(f"Saved combined metadata to {combined_path}")

    return combined_path


## Batch Processing Execution

In [None]:
# Run the tiling pipeline with the notebook-level configuration.
# Note: existing files in TILE_1024_FOLDER_PATH are removed by this call.
batch_process_geotiffs(
    GEOTIFF_ORTHO2019_PATH,
    GPKG_TO_MERGE_PATH,
    TILE_1024_FOLDER_PATH,
    log_file=LOG_FILE,
    tile_size=TILE_SIZE,
    buffer_size=TILE_BUFFER,
    cpu_count=CPU_COUNT,
)

In [None]:
# Load the combined metadata written by the batch run.
# os.path.join is used instead of string concatenation, matching how the
# file path is built when it is written.
df = pd.read_parquet(os.path.join(TILE_1024_FOLDER_PATH, "combined_metadata.parquet"))

In [None]:
# List rows without buffer information, largest footprint first.
# NOTE(review): "has_buffer" and "tile_name" are not among the tile keys written
# by process_geotiff_with_buildings; presumably they come from the GeoPackage
# attributes — verify the columns exist, otherwise this raises KeyError.
missing_buffer_mask = df["has_buffer"].isnull()
report_columns = ["egid", "SHAPE__Area", "SHAPE__Length", "globalid", "sia_cat", "tile_name"]
df.loc[missing_buffer_mask, report_columns].sort_values(by=["SHAPE__Area"], ascending=False)