# Dataset Selection

**Objective:** Select and prepare a balanced dataset from classified rooftops for analysis.

**Workflow:**
1. Load classified rooftop data and GeoTIFF metadata
2. Enrich data by joining rooftop classifications with tile information
3. Create a balanced dataset by stratified sampling of tiles across dominant SIA categories and area bins
4. Generate visualizations and export dataset for further processing

## Imports

In [None]:
import os
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import logging
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess
from collections import Counter
import folium
import pyproj


## Variables

In [None]:
# Timestamp shared by the outputs of this run (format: YYYYMMDD-HHMMSS)
todays_date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Input data paths
GEOPARQUET_CLASSIFICATION_PATH = (
    "data/notebook_02/parquet/02_gdf_toiture_7_classification.parquet"
)
SPLIT_GEOTIFF_1024_PARQUET = "data/notebook_04/geotiff/tile_1024_split/combined_metadata.parquet"

# Sampling parameters
SAMPLES_PER_CAT = 8  # Number of samples per category/bin combination

# Output paths
NOTEBOOK_PATH = Path("data/notebook_05")
GRAPHICS_PATH = NOTEBOOK_PATH / "graphics"
GPKG_SELECTION_DATASET = NOTEBOOK_PATH / "parquet/05_02_dataset.gpkg"

# Visualization data
GPKG_CAD_COMMUNE = "data/SITG/CAD_COMMUNE_2024-11-03.gpkg"

# PNG conversion settings
CONVERT_GEOTIFF_TO_PNG = True
# The folder name embeds the run timestamp, so every run gets its own PNG folder.
# No directory is created here: this cell only declares configuration, and the
# directory-creation cell that follows creates PNG_OUTPUT_PATH.
PNG_OUTPUT_PATH = NOTEBOOK_PATH / f"dataset_{todays_date}"

In [None]:
# Create output directories if they do not exist
for _output_dir in (GRAPHICS_PATH, GPKG_SELECTION_DATASET.parent, PNG_OUTPUT_PATH):
    os.makedirs(_output_dir, exist_ok=True)

# Fail early, with an explicit message, when an input table is missing.
# Both files are read by the loading cells below. An exception is raised instead
# of using `assert`, because assertions are skipped when Python runs with -O.
for _input_path in (GEOPARQUET_CLASSIFICATION_PATH, SPLIT_GEOTIFF_1024_PARQUET):
    if not os.path.exists(_input_path):
        raise FileNotFoundError(f"Required input file not found: {_input_path}")

## Load Data

In [None]:
# Load classified rooftop data
gdf_toiture_7_classification = gpd.read_parquet(GEOPARQUET_CLASSIFICATION_PATH)

# Validate data integrity: every rooftop row must carry a building identifier (EGID)
assert gdf_toiture_7_classification["egid"].notna().all(), "Found rooftop rows without an EGID"

# Report how many rows repeat an EGID. The original bare comparison here had no
# effect: its result was neither asserted nor displayed.
# NOTE(review): repeated EGIDs look expected (later cells count unique EGIDs per
# tile), so this is reported rather than asserted — confirm.
n_duplicated_egid = int(gdf_toiture_7_classification["egid"].duplicated().sum())
print(f"Rows whose EGID already appeared in an earlier row: {n_duplicated_egid}")

# Number of unique building identifiers (rendered as the cell output)
len(gdf_toiture_7_classification["egid"].unique())

## Data Enrichment
### Load GeoTIFF Metadata

In [None]:
# Load tile metadata
df_split_geotiff = pd.read_parquet(SPLIT_GEOTIFF_1024_PARQUET)

# Show the column types of both tables for validation.
# `display` is called explicitly because a notebook cell only renders its last
# bare expression; the first `.dtypes` was previously never shown.
display(df_split_geotiff.dtypes)
display(gdf_toiture_7_classification.dtypes)

### Join Rooftop Classification (`sia_cat`) with Tile Information

In [None]:
# Join keys: every column name present in both tables, geometry excluded
common_columns = [
    name
    for name in set(df_split_geotiff.columns).intersection(gdf_toiture_7_classification.columns)
    if name != "geometry"
]

# Left join of the rooftop classifications onto a private copy of the tile
# metadata, so that every rooftop row is kept
df_split_geotiff_sia_cat = pd.merge(
    left=gdf_toiture_7_classification,
    right=df_split_geotiff.copy(),
    how="left",
    on=common_columns,
)

In [None]:
# Show the first five rows of the merged table
df_split_geotiff_sia_cat.head(n=5)

In [None]:
# After the left join, a rooftop without a matching tile has a null tile_path;
# none are expected
missing_tiles = df_split_geotiff_sia_cat.loc[df_split_geotiff_sia_cat["tile_path"].isna()]
assert missing_tiles.shape[0] == 0

In [None]:
# Persist the enriched table so it can be reloaded later
enriched_parquet_path = NOTEBOOK_PATH / "05_02_split_geotiff_sia_cat.parquet"
df_split_geotiff_sia_cat.to_parquet(enriched_parquet_path)

## Dataset Selection

In [None]:
# Shuffle all rows reproducibly (fixed seed) into a fresh, re-indexed selection table
df_selection = (
    df_split_geotiff_sia_cat.copy()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Show a preview of the rows, then the available columns
for _preview in (df_selection.head(5), df_selection.columns):
    display(_preview)

### Tile Analysis

In [None]:
# Number of distinct buildings (EGID) per tile, largest first
(
    df_selection.groupby("tile_id")["egid"]
    .nunique()
    .reset_index()
    .sort_values(by="egid", ascending=False)
)

In [None]:
# Summed rooftop area (SHAPE__Area) per tile, largest first
(
    df_selection.groupby("tile_id")["SHAPE__Area"]
    .sum()
    .reset_index()
    .sort_values(by="SHAPE__Area", ascending=False)
)

In [None]:
# Number of distinct SIA categories per tile, largest first
(
    df_selection.groupby("tile_id")["sia_cat"]
    .nunique()
    .reset_index()
    .sort_values(by="sia_cat", ascending=False)
)

In [None]:
# Broadcast per-tile aggregates back onto every row of the tile.
# Mapping: new column -> (source column, aggregation applied within each tile)
per_tile_metrics = {
    "egid_per_tile": ("egid", "nunique"),
    "sia_cat_per_tile": ("sia_cat", "nunique"),
    "SHAPE__Area_sum_per_tile": ("SHAPE__Area", "sum"),
}
for _metric_name, (_source_column, _reducer) in per_tile_metrics.items():
    df_selection[_metric_name] = (
        df_selection.groupby("tile_id")[_source_column].transform(_reducer)
    )

### Stratified Sampling by SIA Category and Area

In [None]:
# Collapse the rooftop rows into one row per tile.
# List-valued columns keep every rooftop of the tile; the others are reduced.
tile_aggregations = {
    'globalid': list,
    'geometry_x': list,
    'sia_cat': list,
    'altitude_min': 'mean',
    'altitude_max': 'mean',
    'date_leve': 'first',
    'tile_path': 'first',
    'tile_bounds': 'first',
    "SHAPE__Area": 'sum',
}
tile_groups = df_selection.groupby('tile_id').agg(tile_aggregations).reset_index()


def _most_frequent(categories):
    """Return the most frequent value among *categories*."""
    (winner, _count), = Counter(categories).most_common(1)
    return winner


# Dominant SIA class of a tile = most frequent class among its rooftops
tile_groups['dominant_class'] = tile_groups['sia_cat'].apply(_most_frequent)

# Bin the summed rooftop area of each tile for stratification
area_bins = [0, 200, 500, 1000, 2000, 5000, np.inf]
area_labels = ['0-200', '200-500', '500-1000', '1000-2000', '2000-5000', '5000+']
tile_groups['area_bin'] = pd.cut(tile_groups['SHAPE__Area'], bins=area_bins, labels=area_labels)

area_bin_counts = tile_groups['area_bin'].value_counts().sort_index()
print("Area distribution across bins:")
print(area_bin_counts)

In [None]:
def sample_tiles(group, n_samples):
    """
    Draw at most ``n_samples`` rows from a group.

    Parameters:
        group: DataFrame group to sample from
        n_samples: Maximum number of rows to keep

    Returns:
        DataFrame: ``group`` itself when it holds ``n_samples`` rows or fewer,
        otherwise a random subset of exactly ``n_samples`` rows drawn with a
        fixed seed (42) so repeated runs select the same rows
    """
    # Small groups are kept whole: there is nothing to down-sample
    if len(group) <= n_samples:
        return group

    return group.sample(n=n_samples, random_state=42)

# Perform stratified sampling: keep at most SAMPLES_PER_CAT tiles for every
# (dominant class, area bin) combination that actually occurs.
# The strata are sampled in an explicit loop instead of `groupby(...).apply(...)`:
# recent pandas versions deprecate handing the grouping columns to the applied
# function, and later cells need 'dominant_class' and 'area_bin' in the result.
# `observed=True` skips empty combinations of the categorical `area_bin`.
_strata = tile_groups.groupby(['dominant_class', 'area_bin'], observed=True)
_sampled_strata = [
    sample_tiles(stratum, n_samples=SAMPLES_PER_CAT) for _, stratum in _strata
]

if _sampled_strata:
    sampled_df = pd.concat(_sampled_strata).reset_index(drop=True)
else:
    # No stratum at all: keep an empty table with the same columns
    sampled_df = tile_groups.iloc[0:0].reset_index(drop=True)

def convert_bounds_to_polygon(bounds):
    """
    Build the rectangular Polygon described by tile bounds.

    Parameters:
        bounds: Sequence (minx, miny, maxx, maxy), or its string form
            such as "(minx, miny, maxx, maxy)"

    Returns:
        Polygon: Shapely polygon whose corners are the four bound corners

    Raises:
        ValueError: If the bounds do not hold exactly four values, or if a
            string component cannot be parsed as a float
    """
    values = bounds
    if isinstance(values, str):
        # Drop the surrounding parentheses, then parse each comma-separated number
        values = tuple(float(part) for part in values.strip("()").split(","))

    if len(values) != 4:
        raise ValueError(f"Invalid bounds format: {values}")

    left, bottom, right, top = values
    corners = [(left, bottom), (right, bottom), (right, top), (left, top)]
    return Polygon(corners)

# One rectangular footprint per sampled tile, built from its stored bounds
sampled_df['geometry'] = [
    convert_bounds_to_polygon(tile_bounds) for tile_bounds in sampled_df['tile_bounds']
]

# Wrap the sample as a GeoDataFrame declared in EPSG:2056
sampled_gdf = gpd.GeoDataFrame(sampled_df, geometry='geometry', crs='EPSG:2056')

# Write the sampled tiles to the "sampled_tiles" layer of the GeoPackage
sampled_gdf.to_file(GPKG_SELECTION_DATASET, layer="sampled_tiles", driver="GPKG")

## Visualization
### Statistical Plots

In [None]:
# Number of tiles per dominant SIA class (over all tiles, before sampling)
class_counts = tile_groups['dominant_class'].value_counts().reset_index()
class_counts.columns = ['Classe SIA', 'Nombre de tiles']

sns.set_style("whitegrid")
plt.figure(figsize=(6.5, 5.5))

# One bar per class, ordered by class name
plot = sns.barplot(
    data=class_counts.sort_values(by='Classe SIA'),
    x='Classe SIA',
    y='Nombre de tiles',
    palette='pastel',
)

# Rotate the class names so long labels stay readable
plot.set_xticklabels(plot.get_xticklabels(), rotation=45, horizontalalignment='right')

# Title, axis labels and tick sizes
plt.title("Distribution des classes SIA dominantes par tuile", fontsize=12, pad=10, fontweight='bold')
plt.xlabel("Classe SIA", fontsize=10)
plt.ylabel("Nombre de tuiles", fontsize=10)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Write each bar's count just above it
for bar in plot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    plot.annotate(
        f'{int(bar_height)}',
        (bar_center, bar_height),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontsize=10,
        color='black',
    )

# Final cosmetics: no top/right spines, no grid
sns.despine()
plt.tight_layout()
plt.grid(False)

# Save first, then display
class_plot_path = os.path.join(GRAPHICS_PATH, "05_01_dominant_class_distribution.png")
plt.savefig(class_plot_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Start from a clean pyplot state
plt.close()

# Number of tiles per area bin (over all tiles, before sampling), in bin order
area_counts = tile_groups['area_bin'].value_counts().reset_index()
area_counts.columns = ['Area Bin', 'Nombre de tiles']
area_counts = area_counts.sort_values(by='Area Bin')

sns.set_style("whitegrid")
plt.figure(figsize=(6.5, 4.5))

# One bar per area bin
plot = sns.barplot(
    data=area_counts,
    x='Area Bin',
    y='Nombre de tiles',
    palette='pastel',
)

# Title, axis labels and tick sizes
plt.title("Distribution des tuiles par intervalle de surface", fontsize=12, fontweight='bold', pad=10)
plt.xlabel("Somme des surfaces par tuile en m²", fontsize=10)
plt.ylabel("Nombre de tuiles", fontsize=10)
plt.tick_params(axis='both', labelsize=10)

# Write each bar's count just above it
for bar in plot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    plot.annotate(
        f'{int(bar_height)}',
        (bar_center, bar_height),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontsize=10,
        color='black',
    )

# Final cosmetics: no top/right spines, no grid
sns.despine()
plt.tight_layout()
plt.grid(False)

# Save first, then display
area_plot_path = os.path.join(GRAPHICS_PATH, "05_02_area_bin_distribution.png")
plt.savefig(area_plot_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Create stacked bar chart showing sampling distribution
plt.close()
sns.set_style("whitegrid")

# Area bins in ascending order (bottom to top of each stacked bar)
area_order = ['0-200', '200-500', '500-1000', '1000-2000', '2000-5000', '5000+']

# Tiles per (dominant class, area bin): one row per class, one column per bin.
# The reindex runs BEFORE fillna so a bin absent from the sample becomes a
# column of zeros rather than NaN, which would corrupt the stacking offsets below.
# `observed=False` is passed explicitly because the default for categorical
# keys such as `area_bin` differs between pandas versions.
stacked_data = (
    sampled_df.groupby(['dominant_class', 'area_bin'], observed=False)
    .size()
    .unstack()
    .reindex(columns=area_order)
    .fillna(0)
)

# One pastel color per area bin
colors = sns.color_palette("pastel", len(stacked_data.columns))

plt.figure(figsize=(6.5, 5.5))

# Create stacked bar plot
stacked_data.plot(
    kind='bar',
    stacked=True,
    color=colors,
    ax=plt.gca()
)

# Format plot
plt.xticks(rotation=45, ha='right')

totals_per_class = stacked_data.sum(axis=1)
total_samples = stacked_data.sum().sum()
plt.title(f"Distribution du dataset des classes SIA dominantes\npar intervalle de surface (n={int(total_samples)})",
          fontsize=12, pad=20, fontweight='bold')
plt.xlabel("Classe SIA dominante", fontsize=10)
plt.ylabel("Nombre de tuiles", fontsize=10)
plt.tick_params(axis='both', labelsize=10)

# Configure legend (reversed so its order matches the bottom-to-top stacking)
plt.legend(
    title="Surface totale\n par tuile (m²)",
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
    frameon=True,
    fancybox=True,
    shadow=True,
    fontsize=10,
    title_fontsize=10,
    reverse=True
)

# Keep the historical 0-55 range, but grow it when a bar would not fit
# (for example after raising SAMPLES_PER_CAT)
y_max = 55 if totals_per_class.empty else max(55, totals_per_class.max() * 1.1)
plt.ylim(0, y_max)
sns.despine()
plt.grid(False)

# Add total counts on top of bars
for bar_index, total in enumerate(totals_per_class):
    plt.text(bar_index, total + 0.5, f'{int(total)}',
             ha='center', va='bottom', fontsize=10, color='black')

# Label only the segments holding fewer tiles than the per-stratum target.
# NOTE(review): the threshold was a literal 8, presumably SAMPLES_PER_CAT — confirm.
label_offset = 0.4
for bar_index, (_, row) in enumerate(stacked_data.iterrows()):
    bottom = 0
    annotation_count = 0

    for count in row:
        if 0 < count < SAMPLES_PER_CAT:
            segment_middle = bottom + count / 2

            # Alternate right/left of the bar so neighbouring labels do not overlap
            if annotation_count % 2 == 0:
                x_position = bar_index + label_offset
            else:
                x_position = bar_index - label_offset

            plt.text(x_position, segment_middle, f'{int(count)}',
                     ha='center', va='center', fontsize=9,
                     color='black', fontweight='demibold')

            annotation_count += 1
        bottom += count

plt.tight_layout()
plt.savefig(os.path.join(GRAPHICS_PATH, "05_03_stacked.png"), dpi=300, bbox_inches='tight')
plt.show()

### Spatial Visualization

In [None]:
def bounds_to_polygon(bounds_str):
    """
    Convert tile bounds to a rectangular Polygon geometry.

    Accepts the string form '(minx, miny, maxx, maxy)', with or without spaces
    after the commas, as well as any iterable of four numbers. Only the first
    four values are used.

    Parameters:
        bounds_str: Bounds as a string or as an iterable of numbers

    Returns:
        Polygon: Shapely polygon, or None (after logging an error) when the
        bounds cannot be parsed or hold fewer than four values
    """
    try:
        if isinstance(bounds_str, str):
            # float() tolerates surrounding whitespace, so a bare "," split
            # handles both "1, 2" and "1,2"
            raw_values = bounds_str.strip("()").split(",")
        else:
            raw_values = bounds_str
        coords = [float(value) for value in raw_values]
    except (TypeError, ValueError):
        # TypeError covers None and other non-iterable or non-numeric inputs
        logging.error(f"Error converting bounds to polygon: {bounds_str}")
        return None

    if len(coords) < 4:
        logging.error(f"Error converting bounds to polygon: {bounds_str}")
        return None

    minx, miny, maxx, maxy = coords[:4]
    return Polygon([
        (minx, miny),  # Bottom-left
        (maxx, miny),  # Bottom-right
        (maxx, maxy),  # Top-right
        (minx, maxy)   # Top-left
    ])

# Configure plot style
sns.set_style("white")
sns.set_context("notebook", font_scale=1.2)

# Create one rectangular geometry per tile from its bounds
tile_groups['geometry'] = tile_groups['tile_bounds'].apply(bounds_to_polygon)

# Create GeoDataFrame
gdf = gpd.GeoDataFrame(
    tile_groups,
    geometry='geometry',
    crs="EPSG:2056"
)

# Create color mapping for SIA classes
unique_classes = sorted(gdf['dominant_class'].unique())
palette = sns.color_palette('tab20', n_colors=len(unique_classes))
color_dict = dict(zip(unique_classes, palette))

# Create spatial plot
ax = gdf.plot(
    figsize=(15, 15),
    column='dominant_class',
    legend=True,
    legend_kwds={'loc': 'upper left', 'bbox_to_anchor': (1, 1)},
    edgecolor='black',
    linewidth=0.3,
    alpha=0.7,
    color=gdf['dominant_class'].map(color_dict)
)

# Add commune boundaries
cad_commune = gpd.read_file(GPKG_CAD_COMMUNE, layer="CAD_COMMUNE")
cad_commune = cad_commune.to_crs("EPSG:2056")
cad_commune.plot(ax=ax, color="none", edgecolor="black", linewidth=0.2)

# Create custom legend: one colored marker per SIA class
ax.legend(
    handles=[plt.Line2D([0], [0], marker='o', color='w', label=cls,
                         markerfacecolor=color_dict[cls], markersize=10) for cls in unique_classes],
    title="Classe SIA dominante",
    loc='upper left',
    bbox_to_anchor=(1, 1)
)

ax.set_title(f"Tile par catégorie SIA dominante (total {len(tile_groups)} tiles)", fontsize=16, pad=20)
ax.set_axis_off()
plt.tight_layout()

# Save BEFORE showing: once plt.show() has displayed the figure, a later
# savefig can write an empty image instead of the map.
plt.savefig(os.path.join(GRAPHICS_PATH, "05_04_map_tiles_group.png"), dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Start from a clean pyplot state
plt.close()

# Rebuild the tile footprints from their bounds and wrap the sample as a GeoDataFrame
sampled_df['geometry'] = sampled_df['tile_bounds'].apply(bounds_to_polygon)
gdf = gpd.GeoDataFrame(sampled_df, geometry='geometry', crs="EPSG:2056")

unique_classes = sorted(gdf['dominant_class'].unique())
sns.set_style("whitegrid")

# Saturated colors so the tiles stay visible on the map
vibrant_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57',
                  '#FF9FF3', '#54A0FF', '#5F27CD', '#00D2D3', '#FF9F43',
                  '#A3CB38', '#C44569']

# One color per class, assigned in sorted class order
color_dict = dict(zip(unique_classes, vibrant_colors[:len(unique_classes)]))
tile_colors = gdf['dominant_class'].map(color_dict)

fig, ax = plt.subplots(figsize=(6.5, 6.5), dpi=300)

# Draw each sampled tile filled and outlined with its class color
gdf.plot(
    ax=ax,
    column='dominant_class',
    color=tile_colors,
    edgecolor=tile_colors,
    linewidth=2,
    alpha=1,
)

# Commune boundaries as dashed grey context lines
cad_commune = gpd.read_file(GPKG_CAD_COMMUNE, layer="CAD_COMMUNE").to_crs("EPSG:2056")
cad_commune.plot(ax=ax, color="none", edgecolor="grey", linewidth=0.8, alpha=0.6, ls='--')

# Display names used in the legend for each SIA class
class_labels = {
    'I habitat collectif': 'I Habitat collectif',
    'II habitat individuel': 'II Habitat individuel',
    'III administration': 'III Administration',
    'IV écoles': 'IV Écoles',
    'IX industrie': 'IX Industrie',
    'V commerce': 'V Commerce',
    'VI restauration': 'VI Restauration',
    'VII lieux de rassemblement': 'VII Lieux rassemblement',
    'VIII hôpitaux': 'VIII Hôpitaux',
    'X dépôts': 'X Dépôts',
    'XI installations sportives': 'XI Installations sportives',
    'XII piscines couvertes': 'XII Piscines couvertes'
}


def _legend_label(sia_class):
    """Return the legend text of a class, cut to 22 characters plus '...' when longer than 25."""
    text = class_labels.get(sia_class, sia_class)
    return text[:22] + "..." if len(text) > 25 else text


# One square marker per class, in sorted class order
legend_elements = [
    plt.Line2D([0], [0], marker='s', color='w', label=_legend_label(sia_class),
               markerfacecolor=color_dict[sia_class], markersize=8,
               markeredgecolor='black', markeredgewidth=0.5)
    for sia_class in unique_classes
]

# Add legend
legend = ax.legend(
    handles=legend_elements,
    title="Classe SIA dominante",
    loc='upper right',
    bbox_to_anchor=(0.35, 0.95),
    fontsize=9,
    title_fontsize=10,
)

map_title = f"Tuiles par catégorie SIA dominante\n(total {len(sampled_df)} tuiles)"
ax.set_title(map_title, fontsize=12, pad=-10, fontweight='bold')
ax.set_axis_off()
plt.tight_layout()
plt.subplots_adjust(left=0.02, right=0.98, top=0.95, bottom=0.02)

# Save first, then display and release the figure
sampled_map_path = os.path.join(GRAPHICS_PATH, "05_05_map_sampled_df.png")
plt.savefig(sampled_map_path, dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Coordinate reference systems: Swiss LV95 (source) and WGS84 (target)
swiss_cs = pyproj.CRS.from_epsg(2056)
wgs84_cs = pyproj.CRS.from_epsg(4326)

# Transformer from Swiss coordinates to WGS84; with always_xy=True the output
# order is (longitude, latitude)
transformer = pyproj.Transformer.from_crs(crs_from=swiss_cs, crs_to=wgs84_cs, always_xy=True)

def swiss_to_wgs84(east, north):
    """
    Transform a Swiss easting/northing pair to WGS84.

    Uses the module-level ``transformer``.

    Parameters:
        east: Easting in Swiss coordinates
        north: Northing in Swiss coordinates

    Returns:
        list: [latitude, longitude] in WGS84 (latitude first)
    """
    # The transformer yields (longitude, latitude); swap to latitude-first
    longitude, latitude = transformer.transform(east, north)
    return [latitude, longitude]
   
def _tile_bounds_values(bounds):
    """Return tile bounds as a sequence of numbers, parsing the '(minx, miny, maxx, maxy)' string form."""
    if isinstance(bounds, str):
        parts = bounds.strip('()').replace(' ', '').split(',')
        return tuple(float(part) for part in parts if part)
    return bounds


# Parse the bounds of every sampled tile once. Rows are read directly, instead
# of filtering the whole table by tile_id for each tile (sampled_df holds one
# row per tile), and the result feeds both the map center and the polygons.
tile_extents = []
for tile_id, raw_bounds in zip(sampled_df['tile_id'].tolist(), sampled_df['tile_bounds'].tolist()):
    bounds = _tile_bounds_values(raw_bounds)
    if len(bounds) >= 4:
        tile_extents.append((tile_id, tuple(bounds[:4])))

# South-west and north-east corners of every tile, as [lat, lon]
all_sw_points = [swiss_to_wgs84(minx, miny) for _, (minx, miny, maxx, maxy) in tile_extents]
all_ne_points = [swiss_to_wgs84(maxx, maxy) for _, (minx, miny, maxx, maxy) in tile_extents]

# Center the map on the mean of all corner points
if all_sw_points and all_ne_points:
    corner_points = all_sw_points + all_ne_points
    avg_lat = np.mean([pt[0] for pt in corner_points])
    avg_lon = np.mean([pt[1] for pt in corner_points])
    print(f"Center of all tiles (lat, lon): {avg_lat}, {avg_lon}")

    m = folium.Map(location=[avg_lat, avg_lon], zoom_start=14)
else:
    # Fall back to a fixed default location when no tile has usable bounds
    print("No valid coordinates found")
    m = folium.Map(location=[46.2044, 6.1432], zoom_start=12)

# Add tiles to map
tiles_added = 0
for tile_id, (minx, miny, maxx, maxy) in tile_extents:
    # Convert the four corners to WGS84
    sw = swiss_to_wgs84(minx, miny)
    ne = swiss_to_wgs84(maxx, maxy)
    nw = swiss_to_wgs84(minx, maxy)
    se = swiss_to_wgs84(maxx, miny)

    # Corners are listed around the ring (SW -> NW -> NE -> SE) so the outline
    # does not cross itself
    folium.Polygon(
        locations=[sw, nw, ne, se],
        color='blue',
        weight=1,
        fill=True,
        fill_opacity=0.4,
        tooltip=f"Tile ID: {tile_id}"
    ).add_to(m)

    tiles_added += 1

print(f"Added {tiles_added} tiles to the map")

# Display map
m


## GeoTIFF to PNG Conversion

Note: The PNG export was intended for Roboflow, which was ultimately not used in the methodology. Supervisely accepts GeoTIFF files directly.

In [None]:
# Preview the sampled tiles before conversion (rendered as the cell output)
sampled_df

In [None]:
def convert_tiff_to_png(sample_df, output_dir):
    """
    Convert the GeoTIFF tiles referenced by a DataFrame to PNG files.

    Each unique, non-null path in ``sample_df["tile_path"]`` is converted with
    the ``gdal_translate`` command-line tool into a new timestamped
    sub-directory of ``output_dir``. Failures are logged and never raised, so
    one bad tile does not stop the others. ``sample_df`` is finally written
    next to the PNG files as ``sampled_tiles.csv``.

    Parameters:
        sample_df: DataFrame containing tile_path column with GeoTIFF paths
        output_dir: Base directory for PNG output

    Returns:
        Path: Output directory path containing converted files
    """
    import shutil  # local import: only needed to locate the GDAL executable

    # Create timestamped output directory
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    output_dir = Path(output_dir) / f"PNG_dataset_roboflow_{timestamp}"
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info(f"Output directory created: {output_dir}")

    # Unique source files, as Path objects
    tiff_files = [Path(file) for file in sample_df["tile_path"].dropna().unique().tolist()]
    logging.info(f"Found {len(tiff_files)} unique TIFF files to process")

    # Report a missing GDAL installation once, instead of once per tile
    gdal_available = shutil.which("gdal_translate") is not None
    if not gdal_available:
        logging.error("gdal_translate executable not found on PATH; no PNG file will be produced")
        tiff_files = []

    for tiff_file in tqdm(tiff_files, desc="Converting to PNG"):
        # Validate source file
        if not tiff_file.exists():
            logging.warning(f"Source file not found: {tiff_file}")
            continue

        png_file = output_dir / f"{tiff_file.stem}.png"

        # Skip if already converted
        if png_file.exists():
            logging.info(f"Skipping {tiff_file.name} (already exists)")
            continue

        try:
            # Only ZLEVEL is passed as a creation option: PREDICTOR and COMPRESS
            # are not creation options of GDAL's PNG driver.
            # NOTE(review): "-ot Byte" is applied without "-scale"; this assumes
            # the source tiles are already 8-bit — confirm.
            subprocess.run(
                [
                    "gdal_translate",      # GDAL conversion tool
                    "-of", "PNG",          # Output format
                    "-co", "ZLEVEL=9",     # Maximum PNG compression
                    "-ot", "Byte",         # 8-bit output
                    "-r", "nearest",       # Nearest neighbor resampling
                    str(tiff_file),        # Input file
                    str(png_file),         # Output file
                ],
                check=True,
                capture_output=True,
                text=True,
            )
            logging.info(f"Converted {tiff_file.name}")
        except subprocess.CalledProcessError as e:
            logging.error(f"Error converting {tiff_file}: {e.stderr}")
        except Exception as e:
            # Deliberate best effort: log and continue with the next tile
            logging.error(f"Unexpected error processing {tiff_file}: {str(e)}")

    if gdal_available:
        logging.info(f"Conversion complete. Files saved to {output_dir}")

    # Save metadata for reference
    sample_df.to_csv(output_dir / "sampled_tiles.csv", index=False)

    return output_dir

In [None]:
# Run the optional PNG export only when it is switched on in the configuration
if CONVERT_GEOTIFF_TO_PNG:
    convert_tiff_to_png(sample_df=sampled_df, output_dir=PNG_OUTPUT_PATH)