### Whisp a feature collection

Setup
- NB: use a virtual environment to avoid altering your system Python environment (https://docs.python.org/3/tutorial/venv.html)

In [None]:
# Earth Engine and Common Libraries
import ee
from pathlib import Path

# Initialise the Earth Engine client against the high-volume endpoint.
# If initialisation raises for any reason (presumably missing or expired
# credentials -- confirm), run the authentication flow and initialise again.
try:
    ee.Initialize(project='ee-andyarnellgee', opt_url='https://earthengine-highvolume.googleapis.com')
except Exception:
    ee.Authenticate()
    ee.Initialize(project='ee-andyarnellgee', opt_url='https://earthengine-highvolume.googleapis.com')

In [None]:
# Install openforis-whisp (uncomment line if not already installed)
# !pip install --pre openforis-whisp

In [None]:
import openforis_whisp as whisp

Get a feature collection

In [None]:
# Path to the example GeoJSON used throughout this notebook
# (presumably a file bundled with the openforis_whisp package -- confirm).
GEOJSON_EXAMPLE_FILEPATH = whisp.get_example_data_path("geojson_example.geojson")

# Show the resolved path so it is clear which file is being processed
print (GEOJSON_EXAMPLE_FILEPATH)

In [None]:
import os
import logging
import time
import requests
import geopandas as gpd
import pandas as pd
from pathlib import Path
from datetime import datetime


In [None]:
# Configure logging
def setup_logging(log_level=logging.INFO):
    """Configure root logging and return the notebook's named logger.

    Args:
        log_level: Threshold handed to ``logging.basicConfig``
            (default ``logging.INFO``).

    Returns:
        logging.Logger: The logger registered under ``'whisp_processor'``.
    """
    message_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    logging.basicConfig(level=log_level, format=message_format, datefmt=timestamp_format)

    whisp_logger = logging.getLogger('whisp_processor')
    return whisp_logger


# Module-level logger shared by the helper functions in this notebook
logger = setup_logging()

# Step 1: Load GeoJSON features, add unique IDs and convert them to bounding boxes
def convert_geojson_to_ee_bbox(geojson_filepath) -> ee.FeatureCollection:
    """
    Turn every feature of a GeoJSON file into an Earth Engine bounding-box feature.

    The file is read with geopandas, an ``internal_id`` column (1..n) is added
    when the file does not already have one, and each geometry is replaced by
    the rectangle spanned by its bounds. All non-geometry columns are carried
    over as feature properties.

    Args:
        geojson_filepath (str or Path): Location of the GeoJSON file.

    Returns:
        ee.FeatureCollection: One rectangle feature per successfully processed row.

    Raises:
        ValueError: If the input is not a str/Path, the file cannot be read,
            the file has no features, or no feature could be converted.
    """
    # Guard clause: only path-like input is supported
    if not isinstance(geojson_filepath, (str, Path)):
        raise ValueError("Input must be a file path (str or Path)")

    file_path = os.path.abspath(geojson_filepath)
    print(f"Reading GeoJSON file from: {file_path}")

    try:
        gdf = gpd.read_file(file_path)
    except Exception as e:
        raise ValueError(f"Error reading GeoJSON file: {str(e)}")

    if len(gdf) == 0:
        raise ValueError("GeoJSON contains no features")

    # Sequential IDs starting at 1, only when the file does not bring its own
    if 'internal_id' not in gdf.columns:
        gdf['internal_id'] = range(1, len(gdf) + 1)

    bbox_features = []
    for idx, row in gdf.iterrows():
        try:
            west, south, east, north = row.geometry.bounds
            rectangle = ee.Geometry.Rectangle([west, south, east, north])

            # Carry over every attribute, converting values that expose
            # ``.item()`` (numpy scalars) to native Python and missing values to None
            properties = {}
            for col in gdf.columns:
                if col == 'geometry':
                    continue
                value = row[col]
                if hasattr(value, 'item'):
                    value = value.item()
                elif pd.isna(value):
                    value = None
                properties[col] = value

            bbox_features.append(ee.Feature(rectangle, properties))

        except Exception as e:
            # Best effort: report the failing row and keep going
            print(f"Error processing feature {idx}: {str(e)}")

    if not bbox_features:
        raise ValueError("No valid features found in GeoJSON")

    feature_collection = ee.FeatureCollection(bbox_features)
    print(f"Created Earth Engine FeatureCollection with {len(bbox_features)} bounding box features")

    return feature_collection

# Download GeoTIFF for feature using Earth Engine
def download_geotiff_for_feature(ee_feature, image, output_dir, scale=10, max_retries=3, retry_delay=5):
    """
    Download a GeoTIFF for a specific Earth Engine feature by clipping the image.

    The file is named ``feature_<internal_id>.tif``. If a file with that name
    already exists in ``output_dir`` the download is skipped. Every failed
    attempt (HTTP error, unexpected content type, or raised exception) is
    retried with the same exponential backoff: ``retry_delay * 2**attempt``.

    Args:
        ee_feature: Earth Engine feature to clip the image to
        image: Earth Engine image to download (e.g., whisp.combine_datasets())
        output_dir: Directory to save the GeoTIFF
        scale: Resolution in meters (default 10m)
        max_retries: Maximum number of download attempts
        retry_delay: Base delay in seconds between attempts (doubled each attempt)

    Returns:
        Path to the downloaded GeoTIFF file, or None if every attempt failed
    """
    # Get the feature ID (client-side round trip to Earth Engine)
    try:
        internal_id = ee_feature.get('internal_id').getInfo()
        logger.info(f"Downloading GeoTIFF for feature {internal_id}")
    except Exception as e:
        logger.error(f"Error getting internal_id from feature: {str(e)}")
        internal_id = f"unknown_{datetime.now().strftime('%Y%m%d%H%M%S')}"

    # Ensure output directory exists
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Create a unique filename
    filename = f"feature_{internal_id}.tif"
    output_path = output_dir / filename

    # If file already exists, don't re-download
    if output_path.exists():
        logger.info(f"File {filename} already exists, skipping download")
        return output_path

    # Data is written here first and renamed on success, so an interrupted
    # write can never leave a truncated file that the check above would
    # later mistake for a finished download.
    partial_path = output_dir / f"{filename}.part"

    for attempt in range(1, max_retries + 1):
        try:
            # Clip the image to the feature
            clipped_image = image.clip(ee_feature.geometry())

            # Generate the download URL and time how long it takes
            logger.debug(f"Generating download URL for feature {internal_id}")
            start_time = time.time()
            download_url = clipped_image.getDownloadURL({
                'format': 'GeoTIFF',  # Note: Earth Engine accepts 'GeoTIFF'
                'region': ee_feature.geometry(),
                'scale': scale,
                'crs': 'EPSG:4326'
            })
            url_time = time.time() - start_time
            logger.debug(f"URL generated in {url_time:.2f}s: {download_url[:80]}...")

            # Download the image with timeout
            logger.info(f"Downloading to {output_path}")
            response = requests.get(download_url, timeout=300)  # 5-minute timeout

            if response.status_code == 200:
                # Check if the response is actually a GeoTIFF
                content_type = response.headers.get('Content-Type', '')
                if 'tiff' in content_type.lower() or 'zip' in content_type.lower():
                    with open(partial_path, 'wb') as f:
                        f.write(response.content)
                    partial_path.replace(output_path)
                    logger.info(f"Successfully downloaded {filename}")
                    return output_path

                # Log error if the response isn't a GeoTIFF
                logger.error(f"Download returned non-TIFF content: {content_type}")
                # Save the first part of the response for debugging
                error_file = output_dir / f"error_{internal_id}.txt"
                with open(error_file, 'wb') as f:
                    f.write(response.content[:2000])
                logger.error(f"Saved error content to {error_file}")
            else:
                logger.error(f"Failed to download (status {response.status_code}): {response.text[:200]}")

        except Exception as e:
            # Broad on purpose: both Earth Engine and requests errors are retried
            logger.error(f"Error downloading feature {internal_id}: {str(e)}", exc_info=True)

        # Wait before retrying -- same exponential backoff for every failure kind
        if attempt < max_retries:
            sleep_time = retry_delay * (2 ** attempt)
            logger.info(f"Retrying in {sleep_time} seconds (attempt {attempt+1}/{max_retries})")
            time.sleep(sleep_time)

    logger.error(f"Maximum retries reached for feature {internal_id}")
    return None

def download_geotiffs_for_feature_collection(feature_collection, image, output_dir=None, 
                                            scale=10, max_features=None, max_workers=None,
                                            max_retries=3, retry_delay=5):
    """
    Download one GeoTIFF per feature of an Earth Engine FeatureCollection.

    Features are fetched by index from a server-side list and passed to
    ``download_geotiff_for_feature``. Downloads run one after another unless
    ``max_workers`` is greater than 1, in which case a thread pool is used.

    Args:
        feature_collection: Earth Engine FeatureCollection to process
        image: Earth Engine image to clip and download
        output_dir: Directory to save the GeoTIFFs (default: ~/Downloads/whisp_features)
        scale: Resolution in meters (default 10m)
        max_features: Maximum number of features to process (default: all)
        max_workers: Number of parallel worker threads (default: None, sequential)
        max_retries: Maximum number of retry attempts for each download
        retry_delay: Base delay in seconds between retries

    Returns:
        List of paths to successfully downloaded GeoTIFF files
    """
    import logging
    import concurrent.futures
    from pathlib import Path
    import ee

    logger = logging.getLogger('whisp_processor')

    # Resolve the target folder and make sure it exists
    output_dir = Path(Path.home() / 'Downloads' / 'whisp_features' if output_dir is None else output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Number of features to process, optionally capped by max_features
    total = feature_collection.size().getInfo()
    logger.info(f"Processing Earth Engine FeatureCollection with {total} features")

    if max_features and max_features < total:
        feature_collection = feature_collection.limit(max_features)
        total = max_features
        logger.info(f"Limited to processing first {max_features} features")

    # Server-side list so single features can be addressed by position
    features = feature_collection.toList(total)

    def download_feature(position):
        """Download the feature at ``position``; return its path or None on failure."""
        try:
            return download_geotiff_for_feature(
                ee_feature=ee.Feature(features.get(position)),
                image=image,
                output_dir=output_dir,
                scale=scale,
                max_retries=max_retries,
                retry_delay=retry_delay
            )
        except Exception as e:
            logger.error(f"Error processing feature at index {position}: {str(e)}", exc_info=True)
            return None

    downloaded = []

    if not (max_workers and max_workers > 1):
        # Sequential processing
        logger.info("Processing features sequentially")
        for position in range(total):
            logger.info(f"Processing feature {position+1}/{total}")
            saved_path = download_feature(position)
            if saved_path:
                downloaded.append(saved_path)
    else:
        # Parallel processing
        logger.info(f"Using parallel processing with {max_workers} workers")
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit every download and remember which position each future belongs to
            pending = {}
            for position in range(total):
                pending[executor.submit(download_feature, position)] = position

            # Collect results in completion order
            for finished in concurrent.futures.as_completed(pending):
                position = pending[finished]
                try:
                    saved_path = finished.result()
                    if not saved_path:
                        logger.warning(f"Failed to download feature {position+1}/{total}")
                    else:
                        downloaded.append(saved_path)
                        logger.info(f"Completed feature {position+1}/{total}")
                except Exception as e:
                    logger.error(f"Exception occurred while processing feature {position+1}: {str(e)}")

    logger.info(f"Completed downloading {len(downloaded)}/{total} features successfully")
    return downloaded


In [None]:
# Build the plain (unobscured) bounding-box collection from the example GeoJSON
ee_bbox_collection = convert_geojson_to_ee_bbox(GEOJSON_EXAMPLE_FILEPATH)

Functions for further obscuring Earth Engine features

In [None]:

def extend_bbox(minx, miny, maxx, maxy, extension_distance=None, extension_range=None):
    """
    Grow a bounding box outwards by the same margin on all four sides.

    The margin is either a fixed distance or a value drawn uniformly from a
    range. When both are supplied the range takes precedence; when neither is
    supplied the box is returned unchanged.

    Args:
        minx, miny, maxx, maxy: The original bounding box coordinates
        extension_distance: Fixed margin to add in every direction
        extension_range: Two-element sequence [min_dist, max_dist] to draw the margin from

    Returns:
        Tuple of (minx, miny, maxx, maxy) for the extended bounding box
    """
    import random

    if extension_range is None:
        if extension_distance is None:
            # Nothing requested: hand the box back untouched
            return minx, miny, maxx, maxy
        margin = extension_distance
    else:
        lower, upper = extension_range
        margin = random.uniform(lower, upper)

    # Push the lower-left corner down/left and the upper-right corner up/right
    return minx - margin, miny - margin, maxx + margin, maxy + margin


def shift_bbox(minx, miny, maxx, maxy, max_shift_distance, pixel_length=0.0001):
    """
    Translate a bounding box by a random offset in a random direction.

    The offset length is drawn uniformly between 0 and
    ``max_shift_distance - pixel_length`` (never below 0); the direction is a
    uniformly drawn angle. The box keeps its width and height.

    Args:
        minx, miny, maxx, maxy: The bounding box coordinates
        max_shift_distance: Upper bound for the shift; values <= 0 disable shifting
        pixel_length: Safety margin subtracted from the upper bound

    Returns:
        Tuple of (minx, miny, maxx, maxy) for the shifted bounding box
    """
    import random
    import math

    if max_shift_distance <= 0:
        return minx, miny, maxx, maxy

    # Keep one pixel of slack below the requested maximum
    reach = max(0, max_shift_distance - pixel_length)

    # Polar draw: length first, then heading (order matters for seeded runs)
    magnitude = random.uniform(0, reach)
    heading = random.uniform(0, 2 * math.pi)

    offset_x = magnitude * math.cos(heading)
    offset_y = magnitude * math.sin(heading)

    return (
        minx + offset_x,
        miny + offset_y,
        maxx + offset_x,
        maxy + offset_y,
    )

def generate_random_geometries(gdf, max_distance, proportion=0.5):
    """
    Create decoy rectangles scattered around the features of a GeoDataFrame.

    Each decoy copies the bounding-box size of a randomly chosen original
    feature (varied by up to +/-20% per dimension) and is centred at a random
    point within ``max_distance`` of that feature's bounding-box centre.

    Args:
        gdf: GeoDataFrame with the original features
        max_distance: Maximum distance between a decoy centre and its source feature centre
        proportion: Number of decoys relative to the number of originals
            (at least one decoy is produced when the inputs are valid)

    Returns:
        List of Earth Engine features with random geometries; empty when
        ``proportion`` or ``max_distance`` is not positive or ``gdf`` has no rows
    """
    import random
    import ee
    import pandas as pd
    import math

    if proportion <= 0 or max_distance <= 0 or len(gdf) == 0:
        return []

    # (width, height, centre_x, centre_y) of every original bounding box
    footprints = []
    for _, row in gdf.iterrows():
        minx, miny, maxx, maxy = row.geometry.bounds
        footprints.append((maxx - minx, maxy - miny, (minx + maxx) / 2, (miny + maxy) / 2))

    decoy_count = max(1, int(len(gdf) * proportion))

    decoys = []
    for i in range(decoy_count):
        # Pick the original feature this decoy will sit next to
        anchor_idx = random.randint(0, len(footprints) - 1)
        base_width, base_height, anchor_x, anchor_y = footprints[anchor_idx]

        # Vary each dimension independently by +/-20% (width first, then height)
        width = base_width * random.uniform(0.8, 1.2)
        height = base_height * random.uniform(0.8, 1.2)

        # Random polar offset from the anchor centre: angle first, then distance
        angle = random.uniform(0, 2 * math.pi)
        distance = random.uniform(0, max_distance)
        center_x = anchor_x + (distance * math.cos(angle))
        center_y = anchor_y + (distance * math.sin(angle))

        half_w = width / 2
        half_h = height / 2
        rectangle = ee.Geometry.Rectangle([
            center_x - half_w,
            center_y - half_h,
            center_x + half_w,
            center_y + half_h,
        ])

        decoys.append(ee.Feature(rectangle, {
            'random_feature': True,
            'internal_id': f'random_{i + 1000}',  # Use high numbers to avoid conflicts
            'obscured': True,
            'near_feature_id': anchor_idx + 1  # 1-based position of the source row
        }))

    return decoys

def convert_geojson_to_ee_bbox_obscured(
    geojson_filepath, 
    extension_distance=None, 
    extension_range=None,
    shift_geometries=False,
    shift_proportion=0.5,  # Control how much of the extension is used for shifting
    pixel_length=0.0001,
    add_random_features=False,
    max_distance=0.1,
    random_proportion=0.5
) -> ee.FeatureCollection:
    """
    Reads a GeoJSON file, creates bounding boxes for each feature,
    and converts to Earth Engine FeatureCollection with options to obscure locations.

    The random shift is bounded by the extension that was actually applied to
    each individual box (times ``shift_proportion``), not by the largest value
    the extension could have taken. This keeps the shifted box around the
    original geometry even when ``extension_range`` draws a small extension.

    Args:
        geojson_filepath (str or Path): The filepath to the GeoJSON file
        extension_distance (float): Fixed distance to extend bounding boxes
        extension_range (list): [min_dist, max_dist] for random extension
            (takes precedence over extension_distance when both are given)
        shift_geometries (bool): Whether to shift bounding boxes randomly
        shift_proportion (float): How much of the applied extension can be used
            for shifting (clamped to 0-1, default 0.5)
        pixel_length (float): Length of a pixel to avoid accuracy loss
        add_random_features (bool): Whether to add random decoy features
        max_distance (float): Maximum distance for random features
        random_proportion (float): Proportion of random features to add

    Returns:
        ee.FeatureCollection: Earth Engine FeatureCollection of bounding boxes

    Raises:
        ValueError: If the input is not a str/Path, the file cannot be read,
            the file has no features, or no feature could be converted.
    """
    import os
    import geopandas as gpd
    import ee
    import pandas as pd
    from pathlib import Path

    # Read the GeoJSON file using geopandas
    if isinstance(geojson_filepath, (str, Path)):
        file_path = os.path.abspath(geojson_filepath)
        print(f"Reading GeoJSON file from: {file_path}")

        try:
            # Load GeoJSON directly with geopandas
            gdf = gpd.read_file(file_path)
        except Exception as e:
            # Chain the cause so the original traceback is not lost
            raise ValueError(f"Error reading GeoJSON file: {str(e)}") from e
    else:
        raise ValueError("Input must be a file path (str or Path)")

    # Check if GeoDataFrame is empty
    if len(gdf) == 0:
        raise ValueError("GeoJSON contains no features")

    # Add internal_id if not present
    if 'internal_id' not in gdf.columns:
        gdf['internal_id'] = range(1, len(gdf) + 1)

    # Create a new list with bounding boxes
    bbox_features = []

    # Clamp shift_proportion to the range 0-1
    shift_proportion = max(0, min(1, shift_proportion))

    extension_requested = extension_distance is not None or extension_range is not None

    for idx, row in gdf.iterrows():
        try:
            # Get the bounds of the geometry (minx, miny, maxx, maxy)
            orig_minx, orig_miny, orig_maxx, orig_maxy = row.geometry.bounds
            minx, miny, maxx, maxy = orig_minx, orig_miny, orig_maxx, orig_maxy

            # Apply bounding box extension if requested
            if extension_requested:
                minx, miny, maxx, maxy = extend_bbox(
                    minx, miny, maxx, maxy, 
                    extension_distance=extension_distance,
                    extension_range=extension_range
                )

            # Apply random shift if requested
            if shift_geometries:
                # extend_bbox adds the same margin on every side, so the margin
                # used for this feature can be read back from one edge. Bounding
                # the shift by it keeps the original geometry inside the box.
                applied_extension = orig_minx - minx
                max_shift = applied_extension * shift_proportion

                if max_shift > 0:
                    minx, miny, maxx, maxy = shift_bbox(
                        minx, miny, maxx, maxy, 
                        max_shift, pixel_length
                    )
                else:
                    print(f"Warning: No shifting applied to feature {idx} due to missing extension parameters")

            # Create an Earth Engine Rectangle geometry
            ee_geometry = ee.Geometry.Rectangle([minx, miny, maxx, maxy])

            # Copy properties from the original feature
            properties = {col: row[col] for col in gdf.columns if col != 'geometry'}

            # Convert numpy types to native Python types for proper serialization
            for key, value in properties.items():
                if hasattr(value, 'item'):  # Check if it's a numpy type
                    properties[key] = value.item()  # Convert to Python native type
                elif pd.isna(value):
                    properties[key] = None

            # Create an Earth Engine feature with the bbox geometry
            ee_feature = ee.Feature(ee_geometry, properties)
            bbox_features.append(ee_feature)

        except Exception as e:
            # Best effort: report the failing row and continue with the rest
            print(f"Error processing feature {idx}: {str(e)}")

    # Check if any features were created
    if not bbox_features:
        raise ValueError("No valid features found in GeoJSON")

    # Add random decoy features if requested
    if add_random_features:
        random_features = generate_random_geometries(
            gdf, max_distance, random_proportion
        )

        if random_features:
            bbox_features.extend(random_features)
            print(f"Added {len(random_features)} random decoy features to obscure real locations")

    # Create the Earth Engine FeatureCollection
    feature_collection = ee.FeatureCollection(bbox_features)
    print(f"Created Earth Engine FeatureCollection with {len(bbox_features)} bounding box features")

    return feature_collection

Examples

In [None]:
# Reference collection built by whisp from the same example file
# (presumably with the original, unobscured geometries -- confirm)
ee_collection = whisp.convert_geojson_to_ee(
 GEOJSON_EXAMPLE_FILEPATH
)

# Example 1: Just extend bounding boxes by fixed distance
extended_collection = convert_geojson_to_ee_bbox_obscured(
    GEOJSON_EXAMPLE_FILEPATH,
    extension_distance=0.001  # where 0.001 degrees is ~100m at equator
)

# Example 2: Use random extension distances for each feature
random_extended_collection = convert_geojson_to_ee_bbox_obscured(
    GEOJSON_EXAMPLE_FILEPATH,
    extension_range=[0.0005, 0.001]  # Random extension (0.001 degrees is ~100m at equator)
)

# Example 3: Extend and shift geometries
shifted_collection = convert_geojson_to_ee_bbox_obscured(
    GEOJSON_EXAMPLE_FILEPATH,
    extension_distance=0.001,
    shift_geometries=True,
    shift_proportion=0.5,  # Shift by at most 50% of the extension distance
    pixel_length=0.0001  # ~10m at equator
)

# Example 4: add random features (decoys placed within the default max_distance of 0.1)
random_extras_collection = convert_geojson_to_ee_bbox_obscured(
    GEOJSON_EXAMPLE_FILEPATH,
    add_random_features=True,
    random_proportion=0.1  # Add ~10% extra features as decoys (at least one)
)

# Example 5: Full obscuration - extend, shift, and add random features
fully_obscured_collection = convert_geojson_to_ee_bbox_obscured(
    GEOJSON_EXAMPLE_FILEPATH,
    extension_range=[0.002, 0.003],
    shift_geometries=True,
    shift_proportion=0.9,
    pixel_length=0.0001,  # ~10m at equator
    add_random_features=True,
    max_distance=0.05,  # ~5km at equator (0.001 degrees is ~100m)
    random_proportion= 1  # 1 = add as many decoys as there are real features
)

View

In [None]:
import geemap

# NOTE(review): the name `map` shadows Python's builtin map() for the rest of
# the notebook session; consider renaming once all cells using it are known.
map = geemap.Map()
# Uncomment any of the layers below to compare the obscuring variants
# map.addLayer(extended_collection, {}, "Extended Collection")
# map.addLayer(random_extended_collection, {}, "Random Extended Collection")
# map.addLayer(shifted_collection, {}, "Shifted Collection")
# map.addLayer(random_extras_collection, {}, "Random Extras Collection")
map.addLayer(fully_obscured_collection, {}, "Fully Obscured Collection")
# map.addLayer (ee_bbox_collection, {}, "Original bbox Collection")
map.addLayer (ee_collection, {}, "Original Collection")
# Centre the view on the first plain bounding box at zoom level 10
map.centerObject(ee_bbox_collection.first(), 10)

# Last expression of the cell: displays the interactive map
map

In [None]:
# # Sequential processing
# geotiff_paths = download_geotiffs_for_feature_collection(
#     feature_collection=ee_bbox_collection,
#     image=whisp.combine_datasets(),
#     output_dir=Path.home() / 'whisp_outputs',
#     max_features=10
# )

# # Parallel processing (faster for many features)
# geotiff_paths = download_geotiffs_for_feature_collection(
#     feature_collection=ee_bbox_collection,
#     # feature_collection=fully_obscured_collection,
#     image=whisp.combine_datasets(),
#     max_features=100,
#     max_workers=40  # Process X features concurrently
# )

Sequential stats (client-side, using exactextract)

In [None]:
def geotiff_stats_by_feature_id(
    geojson_path, 
    tiff_dir=None, 
    output_csv=None, 
    tiff_id_pattern=r'feature_(\d+)\.tif', 
    id_column='internal_id',
    ops=None,
    max_features=None
):
    """
    Process GeoTIFF files that match feature IDs in a GeoJSON, run exactextract,
    and save the results to CSV.

    Each GeoTIFF whose filename yields an integer ID (via ``tiff_id_pattern``)
    is paired with the GeoJSON feature carrying the same value in
    ``id_column``. Statistics are appended to the CSV as soon as each feature
    is processed, so partial results survive an interrupted run.

    Args:
        geojson_path (str or Path): Path to the GeoJSON file with features
        tiff_dir (str or Path): Directory containing GeoTIFF files (default: ~/Downloads/whisp_features)
        output_csv (str or Path): Path to save the output CSV (default: timestamped file in tiff_dir)
        tiff_id_pattern (str): Regex pattern whose first group is the ID in the GeoTIFF filename
        id_column (str): Column name in GeoJSON containing feature IDs
        ops (list): Operations to perform with exactextract (default: ['sum'])
        max_features (int): Maximum number of features to process

    Returns:
        tuple: ``(results, csv_path)`` where ``results`` is a GeoDataFrame of
        the combined statistics (a plain DataFrame if nothing was processed or
        the GeoDataFrame could not be built) and ``csv_path`` is the output
        CSV path as a string.
    """
    import os
    import re
    import pandas as pd
    import geopandas as gpd
    from pathlib import Path
    from datetime import datetime
    import logging
    from exactextract import exact_extract

    logger = logging.getLogger('whisp_processor')

    # None stands in for the default so no list object is shared between calls
    if ops is None:
        ops = ['sum']

    # Set default directory if not specified
    if tiff_dir is None:
        tiff_dir = Path.home() / 'Downloads' / 'whisp_features'
    else:
        tiff_dir = Path(tiff_dir)

    # Load the GeoJSON
    logger.info(f"Loading GeoJSON from {geojson_path}")
    gdf = gpd.read_file(geojson_path)

    # Ensure ID column exists
    if id_column not in gdf.columns:
        logger.warning(f"ID column '{id_column}' not found in GeoJSON. Adding sequential IDs.")
        gdf[id_column] = range(1, len(gdf) + 1)

    # Apply max_features if specified
    if max_features and max_features < len(gdf):
        logger.info(f"Limiting to first {max_features} features")
        gdf = gdf.iloc[:max_features]

    # Find all GeoTIFF files in the directory
    tiff_files = [
        file for file in os.listdir(tiff_dir)
        if file.endswith(('.tif', '.tiff'))
    ]

    logger.info(f"Found {len(tiff_files)} GeoTIFF files in {tiff_dir}")

    # Set up output CSV
    if output_csv is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_csv = Path(tiff_dir) / f"feature_stats_{timestamp}.csv"
    else:
        output_csv = Path(output_csv)

    # Per-feature frames are collected and concatenated once at the end,
    # instead of re-copying the growing result on every iteration
    result_frames = []
    csv_started = False
    id_pattern = re.compile(tiff_id_pattern)

    # Track processed features for reporting
    processed_count = 0
    matched_count = 0

    # Process each GeoTIFF file
    for tiff_file in tiff_files:
        # Extract ID from filename using regex
        match = id_pattern.search(tiff_file)
        if not match:
            logger.debug(f"Could not extract ID from filename: {tiff_file}, skipping")
            continue

        # A custom pattern may capture something that is not an integer (or
        # have no group at all); skip that file rather than abort the run
        try:
            tiff_id = int(match.group(1))
        except (ValueError, IndexError):
            logger.debug(f"Could not extract ID from filename: {tiff_file}, skipping")
            continue
        processed_count += 1

        # Find matching feature in GeoJSON
        matching_feature = gdf[gdf[id_column] == tiff_id]
        if len(matching_feature) == 0:
            logger.debug(f"No matching feature found for ID {tiff_id}, skipping")
            continue

        matched_count += 1
        logger.info(f"Processing feature ID: {tiff_id} ({matched_count} of {processed_count} matched)")

        # Full path to GeoTIFF file
        tiff_path = tiff_dir / tiff_file

        try:
            # Run exactextract
            logger.debug(f"Running exactextract on {tiff_file}")
            stats = exact_extract(
                rast=str(tiff_path),
                vec=matching_feature,
                ops=ops,
                output='pandas',
                include_cols=[id_column]
            )

            # Add the geometry column to the results
            stats['geometry'] = matching_feature.iloc[0].geometry

            # First write creates/overwrites the CSV with a header; later
            # writes append rows only
            if csv_started:
                stats.to_csv(output_csv, mode='a', header=False, index=False)
            else:
                stats.to_csv(output_csv, index=False)
                csv_started = True
            result_frames.append(stats)

            logger.info(f"Feature {tiff_id} processed successfully")

        except Exception as e:
            # Best effort: one bad raster must not stop the remaining ones
            logger.error(f"Error processing feature {tiff_id}: {str(e)}")

    all_results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

    if all_results.empty:
        logger.warning("No results generated")
        return all_results, str(output_csv)

    # Convert to GeoDataFrame for spatial analysis
    try:
        result_gdf = gpd.GeoDataFrame(all_results, geometry='geometry')
        if gdf.crs:
            result_gdf = result_gdf.set_crs(gdf.crs)

        logger.info(f"Processed {matched_count}/{processed_count} GeoTIFF files with matching features")
        logger.info(f"Results saved to {output_csv}")
        return result_gdf, str(output_csv)
    except Exception as e:
        logger.error(f"Error creating GeoDataFrame from results: {str(e)}")

    # Fall back to the plain DataFrame when the GeoDataFrame could not be built
    return all_results, str(output_csv)

simple approach for single raster or list of rasters and all features in geojson

list tif files in the directory

In [None]:
import os
import glob
from pathlib import Path

# Directory holding the downloaded GeoTIFFs
folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001'
folder = Path(folder_path)

# Enumerate the TIFFs with pathlib, covering both common extensions
print("\n=== TIFF files using pathlib ===")
tif_files = [*folder.glob('*.tif'), *folder.glob('*.tiff')]

if not tif_files:
    print("No TIFF files found in the directory")
else:
    # One numbered line per file with its size in MB (bytes / 2**20)
    for i, file_path in enumerate(tif_files, start=1):
        file_size = file_path.stat().st_size / (1 << 20)
        print(f"{i}. {file_path.name} - {file_size:.2f} MB")

print(f"\nTotal: {len(tif_files)} TIFF files")

imports

In [None]:
    import os
    import re
    import pandas as pd
    import geopandas as gpd
    from pathlib import Path
    from datetime import datetime
    import logging
    from exactextract import exact_extract
    import concurrent.futures
    import threading
    import time

Try avoiding GDAL by using rio_vrt

In [None]:
# !pip install rio-vrt
from rio_vrt import build_vrt



In [None]:

# Folder with the GeoTIFFs to combine, and the path of the VRT to build from them
folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001_2'
folder = Path(folder_path)
output_vrt = str(folder / "combined_rasters.vrt")

# Collect every TIFF (either extension) and keep string versions of the paths
tif_files = [*folder.glob('*.tif'), *folder.glob('*.tiff')]
tif_file_paths = list(map(str, tif_files))



In [None]:
# # Create the VRT file
# vrt_file = build_vrt(output_vrt, tif_file_paths)

# print(f"VRT file created at: {output_vrt}")

In [None]:
import rasterio
import numpy as np
from pathlib import Path

def convert_to_16bit(input_path, output_path, signed=True):
    """
    Convert a raster to 16-bit (int16 or uint16)

    All bands are read into memory, cast with ``astype`` and written to an
    LZW-compressed GeoTIFF. No scaling or clipping is applied, so values
    outside the target range are not preserved by the cast; a warning is
    printed when the data contains such values.

    Args:
        input_path: Path to input raster
        output_path: Path to save the output raster
        signed: Whether to use signed (int16) or unsigned (uint16) data type
    """
    dtype = 'int16' if signed else 'uint16'
    dtype_range = np.iinfo(dtype)

    # Read everything up front and close the source before writing, so the
    # input handle is not held open while the output is created.
    with rasterio.open(input_path) as src:
        meta = src.meta.copy()
        data = src.read()

    data_min, data_max = data.min(), data.max()

    if data_min < 0 and not signed:
        print("Warning: Negative values found but converting to unsigned int16")
        # You might need to add an offset or rescale

    # Same idea as the warning above, extended to both ends of the range.
    if data_max > dtype_range.max or (signed and data_min < dtype_range.min):
        print(
            f"Warning: Values in [{data_min}, {data_max}] exceed the {dtype} range "
            f"[{dtype_range.min}, {dtype_range.max}] and will not be preserved"
        )

    # A nodata value inherited from the source (e.g. a float sentinel or NaN)
    # may not be representable in the 16-bit type; drop it rather than handing
    # an unrepresentable value to the writer.
    nodata = meta.get('nodata')
    if nodata is not None and not (dtype_range.min <= nodata <= dtype_range.max):
        print(f"Warning: nodata value {nodata} does not fit in {dtype}; writing output without a nodata value")
        meta['nodata'] = None

    # Update metadata with new data type
    meta.update({
        'dtype': dtype,
        'driver': 'GTiff',
        'compress': 'lzw'  # Optional compression
    })

    with rasterio.open(output_path, 'w', **meta) as dst:
        dst.write(data.astype(dtype))

    print(f"Converted {input_path} to 16-bit ({dtype}) at {output_path}")

In [None]:
def batch_convert_to_16bit(folder_path, output_folder=None, signed=True):
    """
    Run convert_to_16bit on every TIFF found directly inside a folder.

    Files keep their original names in the destination. A failure on one file
    is printed and does not stop the remaining conversions.

    Args:
        folder_path: Folder to scan for ``*.tif`` / ``*.tiff`` files
        output_folder: Destination folder (defaults to a "16bit" subfolder of folder_path)
        signed: Passed through to convert_to_16bit (int16 when True, uint16 when False)
    """
    source_dir = Path(folder_path)

    # Resolve the destination and make sure it exists before converting anything
    target_dir = source_dir / "16bit" if output_folder is None else Path(output_folder)
    target_dir.mkdir(exist_ok=True, parents=True)

    rasters = [*source_dir.glob('*.tif'), *source_dir.glob('*.tiff')]
    total = len(rasters)
    print(f"Found {total} raster files to convert")

    for position, raster in enumerate(rasters, start=1):
        print(f"Converting ({position}/{total}): {raster.name}")
        try:
            convert_to_16bit(raster, target_dir / raster.name, signed=signed)
        except Exception as e:
            print(f"Error converting {raster.name}: {e}")

    print(f"Conversion complete. Output files saved to: {target_dir}")

In [None]:
# batch_convert_to_16bit(folder_path, output_folder=folder_path+'_2', signed=True)

In [None]:
# import rasterio
# from rasterio.vrt import WarpedVRT
# import glob
# from pathlib import Path

# # Define the folder path containing your GeoTIFFs
# folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001'
# folder = Path(folder_path)

# # Get list of TIFF files
# tif_files = list(folder.glob('*.tif')) + list(folder.glob('*.tiff'))
# raster_files = ["example.tif", "example2.tif", "...", "examplen.tif"]

# tif_file_paths = [str(file) for file in tif_files]

# # Create a mosaic using rasterio
# output_vrt = str(folder / "combined_rasters.vrt")

# # Build a simple VRT manually
# with open(output_vrt, 'w') as f:
#     f.write('<VRTDataset rasterXSize="10000" rasterYSize="10000">\n')
#     for idx, tif in enumerate(tif_file_paths):
#         f.write(f'  <VRTRasterBand dataType="Float32" band="{idx+1}">\n')
#         f.write(f'    <SimpleSource>\n')
#         f.write(f'      <SourceFilename relativeToVRT="0">{tif}</SourceFilename>\n')
#         f.write(f'      <SourceBand>1</SourceBand>\n')
#         f.write(f'    </SimpleSource>\n')
#         f.write(f'  </VRTRasterBand>\n')
#     f.write('</VRTDataset>')

# print(f"Virtual raster created manually: {output_vrt}")

In [None]:
# import os
# from pathlib import Path
# import rasterio
# from rasterio.vrt import WarpedVRT

# # Define the folder path containing your GeoTIFFs
# folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001'
# folder = Path(folder_path)

# # Get list of TIFF files
# tif_files = list(folder.glob('*.tif')) + list(folder.glob('*.tiff'))
# tif_file_paths = [str(file) for file in tif_files]

# # Output VRT path
# output_vrt = str(folder / "combined_rasters.vrt")

# # Get information from the first raster to determine VRT parameters
# with rasterio.open(tif_file_paths[0]) as src:
#     crs = src.crs
#     nodata = src.nodata
#     dtype = src.dtypes[0]

# # Create VRT with rasterio's build_vrt function
# with rasterio.Env():
#     # Create a temporary text file with the list of rasters
#     list_file = str(folder / "raster_list.txt")
#     with open(list_file, 'w') as f:
#         for tif in tif_file_paths:
#             f.write(f"{tif}\n")
    
#     # Use gdalbuildvrt through subprocess
#     import subprocess
#     cmd = ['gdalbuildvrt', '-separate', '-overwrite', output_vrt, '-input_file_list', list_file]
#     subprocess.run(cmd)
    
#     # Remove temporary file
#     os.remove(list_file)

# print(f"Virtual raster created: {output_vrt}")

In [None]:
# import os
# from pathlib import Path
# import subprocess

# folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001'
# folder = Path(folder_path)
# tif_files = list(folder.glob('*.tif')) + list(folder.glob('*.tiff'))
# tif_file_paths = [str(f) for f in tif_files]

# # Write list of TIFFs to a text file for gdalbuildvrt
# list_file = folder / "raster_list.txt"
# with open(list_file, 'w') as f:
#     for tif in tif_file_paths:
#         f.write(f"{tif}\n")

# # Build the VRT using gdalbuildvrt
# output_vrt = folder / "combined_rasters.vrt"
# cmd = [
#     'gdalbuildvrt', '-separate', '-overwrite',
#     str(output_vrt),
#     '-input_file_list', str(list_file)
# ]
# subprocess.run(cmd, check=True)

# # Remove the temporary list file
# os.remove(list_file)

# print(f"VRT created at: {output_vrt}")

In [None]:
# import rasterio
# from rasterio.merge import merge
# from rasterio.vrt import WarpedVRT
# from pathlib import Path

# # Folder with TIFFs
# folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001'
# folder = Path(folder_path)

# # Find all tiffs
# tif_files = list(folder.glob('*.tif')) + list(folder.glob('*.tiff'))

# # Open the datasets
# datasets = [rasterio.open(str(tif)) for tif in tif_files]

# # Merge them into a single mosaic (in-memory)
# mosaic, out_trans = merge(datasets)

# # Save a proper VRT
# vrt_options = {
#     'resampling': rasterio.enums.Resampling.nearest
# }

# # Save as a VRT file
# vrt_path = str(folder / "combined_rasters.vrt")

# with rasterio.open(
#     vrt_path, 'w',
#     driver='VRT',
#     width=mosaic.shape[2],
#     height=mosaic.shape[1],
#     count=1,
#     dtype=mosaic.dtype.name,
#     transform=out_trans,
#     crs=datasets[0].crs
# ) as dst:
#     dst.write(mosaic[0], 1)

# print(f"Proper VRT saved: {vrt_path}")


In [None]:
# import rasterio
# from pathlib import Path

# # Define the folder path
# folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001'
# folder = Path(folder_path)

# # Get list of TIFF files
# tif_files = list(folder.glob('*.tif')) + list(folder.glob('*.tiff'))
# tif_file_paths = [str(file) for file in tif_files]

# # Output VRT path
# output_vrt = str(folder / "combined_rasters.vrt")

# # Create XML content for VRT file
# with open(output_vrt, 'w') as f:
#     # Open the first file to get dimensions
#     with rasterio.open(tif_file_paths[0]) as src:
#         width = src.width
#         height = src.height
    
#     # Write VRT header
#     f.write(f'<VRTDataset rasterXSize="{width}" rasterYSize="{height}">\n')
    
#     # Add each input file as a separate band
#     for i, tif_path in enumerate(tif_file_paths, 1):
#         f.write(f'  <VRTRasterBand dataType="Int8" band="{i}">\n')
#         f.write('    <SimpleSource>\n')
#         f.write(f'      <SourceFilename relativeToVRT="0">{tif_path}</SourceFilename>\n')
#         f.write('      <SourceBand>1</SourceBand>\n')
#         f.write('    </SimpleSource>\n')
#         f.write('  </VRTRasterBand>\n')
    
#     # Close VRT
#     f.write('</VRTDataset>')

# print(f"VRT file created at: {output_vrt}")

In [None]:
import random
import json
import math
from shapely.geometry import Polygon, mapping
from shapely.validation import make_valid
import pyproj
from shapely.ops import transform
from functools import partial

def generate_random_polygon(min_lon, min_lat, max_lon, max_lat, min_area_ha=1, max_area_ha=10, vertex_count=6):
    """
    Generate a random polygon within bounds with area in specified range (hectares)

    A target area is drawn uniformly from [min_area_ha, max_area_ha]. Up to 10
    candidate polygons are built around random centre points; the first one
    whose area falls within the requested range (with a 20% tolerance on both
    ends) is returned. After each miss the vertex distance is rescaled by
    sqrt(target / actual) so the next candidate is closer to the target.

    Args:
        min_lon, min_lat, max_lon, max_lat: Boundary coordinates
        min_area_ha: Minimum area in hectares
        max_area_ha: Maximum area in hectares
        vertex_count: Number of vertices for the polygon

    Returns:
        tuple: (polygon, area_ha). If no candidate met the area range, the
        candidate whose area was closest to the target is returned.

    Raises:
        RuntimeError: If none of the attempts produced a valid simple polygon.
    """
    # Helper function to calculate area in hectares
    def calculate_area_ha(polygon, lon, lat):
        # UTM zone containing the reference point (clamped so lon == 180 stays in zone 60)
        zone = max(1, min(60, int((lon + 180) / 6) + 1))
        proj_string = f"+proj=utm +zone={zone} +ellps=WGS84 +datum=WGS84 +units=m +no_defs"
        if lat < 0:
            proj_string += " +south"

        # always_xy=True keeps coordinates in (lon, lat) order, matching how the
        # vertices are built below; EPSG:4326 on its own is defined as (lat, lon).
        to_utm = pyproj.Transformer.from_crs("EPSG:4326", proj_string, always_xy=True)

        # Transform the polygon to UTM to calculate area in square meters
        polygon_utm = transform(to_utm.transform, polygon)
        # Convert square meters to hectares (1 hectare = 10,000 sq meters)
        return polygon_utm.area / 10000

    # Initial parameters
    target_area_ha = random.uniform(min_area_ha, max_area_ha)
    max_attempts = 10

    # Starting vertex distance in degrees - a very rough approximation
    # (about 0.01 degrees ~ 1000m at equator). Initialised once, outside the
    # loop, so the rescaling at the end of each attempt carries over.
    size_estimate = math.sqrt(target_area_ha / 100) * 0.01

    # Closest valid candidate so far as (polygon, area_ha); used as the fallback
    best = None

    for _ in range(max_attempts):
        # Start with a random center point
        center_lon = random.uniform(min_lon, max_lon)
        center_lat = random.uniform(min_lat, max_lat)

        # Create vertices around the center
        vertices = []
        for i in range(vertex_count):
            angle = 2 * math.pi * i / vertex_count
            # Add some irregularity
            angle += random.uniform(-0.3, 0.3)
            # Random distance from center (to make irregular polygon)
            distance = random.uniform(0.8, 1.2) * size_estimate

            # Calculate vertex position
            lon = center_lon + distance * math.cos(angle)
            lat = center_lat + distance * math.sin(angle)

            # Ensure within bounds
            lon = min(max(lon, min_lon), max_lon)
            lat = min(max(lat, min_lat), max_lat)

            vertices.append((lon, lat))

        # Close the polygon
        vertices.append(vertices[0])

        # Create and validate the polygon
        poly = Polygon(vertices)
        if not poly.is_valid:
            poly = make_valid(poly)
            if poly.geom_type != 'Polygon':  # Skip if make_valid doesn't return a simple polygon
                continue

        # Calculate actual area
        actual_area_ha = calculate_area_ha(poly, center_lon, center_lat)

        # Check if within target range (+/- 20%)
        if min_area_ha * 0.8 <= actual_area_ha <= max_area_ha * 1.2:
            return poly, actual_area_ha

        if best is None or abs(actual_area_ha - target_area_ha) < abs(best[1] - target_area_ha):
            best = (poly, actual_area_ha)

        # Area grows with the square of the vertex distance, so correct the
        # distance by the square root of the area ratio. A zero-area candidate
        # (e.g. every vertex clamped onto the same edge) gives no usable ratio.
        if actual_area_ha > 0:
            size_estimate *= math.sqrt(target_area_ha / actual_area_ha)

    if best is None:
        raise RuntimeError(f"Could not generate a valid polygon in {max_attempts} attempts")

    # No candidate met the range; fall back to the closest one
    return best

def generate_random_properties(area_ha):
    """Build a random attribute dict for a feature, recording its actual area.

    Returns a dict with keys ``id`` (random integer in 1000-9999), ``area_ha``
    (the given area rounded to 2 decimals), ``land_use`` and ``risk`` (random
    picks from fixed category lists).
    """
    # Values are produced in the same order as the keys appear in the result,
    # so output under a fixed random seed is unchanged.
    feature_id = random.randint(1000, 9999)
    rounded_area = round(area_ha, 2)
    land_use = random.choice(("forest", "agriculture", "settlement", "water", "grassland"))
    risk = random.choice(("low", "medium", "high"))

    properties = {"id": feature_id, "area_ha": rounded_area}
    properties["land_use"] = land_use
    properties["risk"] = risk
    return properties

def create_geojson(bounds, num_polygons=25, min_area_ha=1, max_area_ha=10):
    """Assemble a GeoJSON FeatureCollection (as a dict) of random polygons.

    ``bounds`` is unpacked as (min_lon, min_lat, max_lon, max_lat). Each of the
    ``num_polygons`` features gets a polygon from generate_random_polygon with
    a random vertex count between 4 and 8, plus properties from
    generate_random_properties based on the polygon's actual area. Nothing is
    written to disk here; the caller decides where to save the result.
    """
    west, south, east, north = bounds

    def build_feature():
        # Vertex count is drawn first, then the polygon, then its properties
        n_vertices = random.randint(4, 8)
        shape, shape_area_ha = generate_random_polygon(
            west, south, east, north,
            min_area_ha=min_area_ha,
            max_area_ha=max_area_ha,
            vertex_count=n_vertices,
        )
        return {
            "type": "Feature",
            "properties": generate_random_properties(shape_area_ha),
            "geometry": mapping(shape),
        }

    return {
        "type": "FeatureCollection",
        "features": [build_feature() for _ in range(num_polygons)],
    }


Make random geometries within bounds

In [None]:
# Folder of COGs used for the random-polygon run
folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T194155Z-002_COGS'

# Bounding box from the provided Earth Engine geometry,
# ordered as [min_lon, min_lat, max_lon, max_lat]
bounds = [
    -3.04548260909834, 5.253961384163733,    # min_lon, min_lat
    -1.0179939534016594, 7.48307210714245,   # max_lon, max_lat
]


In [None]:

# Build 1000 random polygons inside the bounding box (area range 0.5 to 1 ha)
random_polygons = create_geojson(
    bounds,
    num_polygons=1000,
    min_area_ha=0.5,
    max_area_ha=1,
)

# Write the collection into folder_path as indented JSON
with open(f"{folder_path}/random_polygons.geojson", "w") as f:
    f.write(json.dumps(random_polygons, indent=2))

print("Created random_polygons.geojson with random polygons")

In [None]:
# Statistic(s) to compute for each polygon
ops = ['sum']

# Alternative inputs kept for reference:
# tiff_path = r'C:\Users\Arnell\Downloads\whisp_image_clip_v0.tif'
# folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001_2' #goetiffs (16bit)

# Every .tif directly inside folder_path; the bare name below shows the list as cell output
list_of_tiffs = glob.glob(f"{folder_path}/*.tif")
list_of_tiffs

In [None]:

# Run exactextract once over the whole raster list and every polygon in the GeoJSON.
# Alternatives tried, kept for reference:
#   rast=str(tiff_path)
#   rast=folder_path+'/combined_rasters.vrt'   # slow with normal tiffs
#   vec=GEOJSON_EXAMPLE_FILEPATH
#   strategy="raster-sequential"               # or "feature-sequential"
#   include_cols=[id_column]
stats = exact_extract(
    rast=list_of_tiffs,
    vec=f"{folder_path}/random_polygons.geojson",
    ops=ops,
    output='pandas',
)


In [None]:
# Show the statistics returned by exact_extract (requested with output='pandas')
print(stats)


In [None]:
# Save the statistics as CSV inside folder_path, without the row index
stats.to_csv(f"{folder_path}/combined_rasters.csv", index=False)


Storing an AI answer to the question "Can I run exactextract on a cloud bucket?":
Use GDAL's virtual file system (advanced)
For Cloud-Optimized GeoTIFFs, you can use GDAL's virtual file system with the /vsigs/ prefix:

In [None]:
# import os
# import gdal
# from exactextract import exact_extract
# import geopandas as gpd

# # Set GCS credentials environment variable
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/to/your/credentials.json'

# # GCS path using GDAL's virtual file system
# gcs_path = "/vsigs/your-bucket-name/path/to/file.tif"

# vector_file = "your_vector_data.geojson"
# gdf = gpd.read_file(vector_file)

# # Try with exactextract
# try:
#     stats = exact_extract(
#         rast=gcs_path, 
#         vec=gdf,
#         ops=["mean"],
#         output='pandas'
#     )
#     print(stats)
# except Exception as e:
#     print(f"Direct access failed: {e}")
#     print("You may need to download the file first.")

In [None]:
# Basic usage with default settings
# results_df, csv_path = geotiff_stats_by_feature_id(
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH
# )

# # Custom directory and operations 
# results_df, csv_path = geotiff_stats_by_feature_id(
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH,
#     tiff_dir=Path.home() / 'my_geotiffs',
#     ops=['sum', 'mean', 'count'],
#     max_features=5
# )

# # Custom ID pattern for different filename format (e.g., 'parcel_123_ndvi.tif')
# results_df, csv_path = geotiff_stats_by_feature_id(
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH,
#     tiff_id_pattern=r'parcel_(\d+)_ndvi\.tif',
#     id_column='parcel_id'
# )

# # Specify output CSV location
# results_df, csv_path = geotiff_stats_by_feature_id(
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH,
#     output_csv=Path.home() / 'analysis' / 'whisp_results.csv'
# )

Stats for a feature collection - parallel processing in batches (client side)

In [None]:
def geotiff_stats_by_feature_id_threaded(
    geojson_path, 
    tiff_dir=None, 
    output_csv=None, 
    tiff_id_pattern=r'feature_(\d+)\.tif', 
    id_column='internal_id',
    ops=None,
    max_features=None,
    max_workers=4,
    batch_size=5  # Process files in batches for better performance
):
    """
    Process GeoTIFF files that match feature IDs in a GeoJSON using thread-based
    parallelism, which avoids the serialization issues of multiprocessing.

    Each GeoTIFF whose filename yields an ID present in the GeoJSON is run
    through exactextract against that single feature. Per-feature results are
    appended to the output CSV as they are collected and also combined into
    one returned table.

    Args:
        geojson_path (str or Path): Path to the GeoJSON file with features
        tiff_dir (str or Path): Directory containing GeoTIFF files (default: ~/Downloads/whisp_features)
        output_csv (str or Path): Path to save the output CSV (default: timestamped file in tiff_dir)
        tiff_id_pattern (str): Regex pattern to extract ID from GeoTIFF filename;
            group 1 must be convertible with int()
        id_column (str): Column name in GeoJSON containing feature IDs; if the
            column is missing, sequential IDs starting at 1 are added
        ops (list): List of operations to perform with exactextract (default: ['sum'])
        max_features (int): Maximum number of features to process
        max_workers (int): Maximum number of concurrent workers (default: 4);
            values of 1 or less (or None) process the batches sequentially
        batch_size (int): Number of files to process in each batch

    Returns:
        gpd.GeoDataFrame: Combined results with a geometry column. If the
            conversion to a GeoDataFrame fails, the combined plain DataFrame
            is returned instead; if nothing was processed, an empty DataFrame.
        str: Path to the output CSV file
    """
    start_time = time.time()
    logger = logging.getLogger('whisp_processor')

    # Resolve the default here rather than in the signature so that no list
    # object is shared between calls.
    if ops is None:
        ops = ['sum']

    # Set default directory if not specified
    if tiff_dir is None:
        tiff_dir = Path.home() / 'Downloads' / 'whisp_features'
    else:
        tiff_dir = Path(tiff_dir)

    # Load the GeoJSON
    logger.info(f"Loading GeoJSON from {geojson_path}")
    gdf = gpd.read_file(geojson_path)

    # Ensure ID column exists
    if id_column not in gdf.columns:
        logger.warning(f"ID column '{id_column}' not found in GeoJSON. Adding sequential IDs.")
        gdf[id_column] = range(1, len(gdf) + 1)

    # Apply max_features if specified
    if max_features and max_features < len(gdf):
        logger.info(f"Limiting to first {max_features} features")
        gdf = gdf.iloc[:max_features]

    # Feature ID -> row position. Positions (not index labels) are stored
    # because the rows are fetched with .iloc in process_batch.
    feature_dict = {feature_id: position for position, feature_id in enumerate(gdf[id_column])}

    # Find matching GeoTIFF files as (feature ID, filename) pairs. The listing
    # is sorted so that batch composition does not depend on directory order.
    id_pattern = re.compile(tiff_id_pattern)
    tiff_files = []
    for file in sorted(os.listdir(tiff_dir)):
        if not file.endswith(('.tif', '.tiff')):
            continue
        match = id_pattern.search(file)
        if not match:
            continue
        tiff_id = int(match.group(1))
        if tiff_id in feature_dict:
            tiff_files.append((tiff_id, file))

    logger.info(f"Found {len(tiff_files)} matching GeoTIFF files in {tiff_dir}")

    # Set up output CSV
    if output_csv is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_csv = tiff_dir / f"feature_stats_{timestamp}.csv"
    else:
        output_csv = Path(output_csv)

    # Create batches of files for processing
    batches = [tiff_files[i:i + batch_size] for i in range(0, len(tiff_files), batch_size)]

    logger.info(f"Created {len(batches)} batches with up to {batch_size} files each")

    # Function to process a batch of files
    def process_batch(batch_files):
        batch_results = []

        for tiff_id, tiff_file in batch_files:
            try:
                # Single-row frame for the feature belonging to this GeoTIFF
                feature = gdf.iloc[[feature_dict[tiff_id]]]

                # Full path to GeoTIFF file
                tiff_path = tiff_dir / tiff_file

                # Execute exactextract
                logger.info(f"Processing feature ID: {tiff_id}")
                stats = exact_extract(
                    rast=str(tiff_path),
                    vec=feature,
                    ops=ops,
                    output='pandas',
                    include_cols=[id_column]
                )

                # Add the geometry column to the results
                stats['geometry'] = feature.iloc[0].geometry

                batch_results.append(stats)
                logger.info(f"Feature {tiff_id} processed successfully")

            except Exception as e:
                # One bad file must not abort the rest of the batch
                logger.error(f"Error processing file {tiff_file}: {str(e)}")

        return batch_results

    # CSV writing: the first result creates the file with a header, every later
    # one is appended without it.
    csv_created = False
    csv_lock = threading.Lock()
    all_results = []

    def collect(batch_results):
        nonlocal csv_created
        for result in batch_results:
            with csv_lock:
                if csv_created:
                    result.to_csv(output_csv, mode='a', header=False, index=False)
                else:
                    result.to_csv(output_csv, index=False)
                    csv_created = True
            all_results.append(result)

    # Process batches (in parallel if max_workers > 1)
    if max_workers and max_workers > 1:
        logger.info(f"Processing batches in parallel with {max_workers} threads")
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit batch tasks
            future_to_batch = {executor.submit(process_batch, batch): i for i, batch in enumerate(batches)}

            # Collect results in completion order
            for future in concurrent.futures.as_completed(future_to_batch):
                batch_idx = future_to_batch[future]
                try:
                    collect(future.result())
                    logger.info(f"Completed batch {batch_idx+1}/{len(batches)}")
                except Exception as e:
                    logger.error(f"Exception in batch {batch_idx+1}: {str(e)}")
    else:
        # Process sequentially
        logger.info("Processing batches sequentially")
        for i, batch in enumerate(batches):
            collect(process_batch(batch))
            logger.info(f"Completed batch {i+1}/{len(batches)}")

    if not all_results:
        logger.warning("No results generated")
        logger.info(f"Total processing time: {time.time() - start_time:.2f}s")
        return pd.DataFrame(), str(output_csv)

    # Combine all results
    all_results_df = pd.concat(all_results, ignore_index=True)

    # Convert to GeoDataFrame
    try:
        result_gdf = gpd.GeoDataFrame(all_results_df, geometry='geometry')
        if gdf.crs:
            result_gdf = result_gdf.set_crs(gdf.crs)
    except Exception as e:
        logger.error(f"Error creating GeoDataFrame from results: {str(e)}")
        # Hand back the plain table rather than discarding computed results
        result_gdf = all_results_df

    logger.info(f"Total processing time: {time.time() - start_time:.2f}s")
    logger.info(f"Results saved to {output_csv}")
    return result_gdf, str(output_csv)

In [None]:
# # Use thread-based parallelism (more reliable than processes for GIS operations)
# results_df, csv_path = geotiff_stats_by_feature_id_threaded(
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH,
#     max_workers=10,  # Adjust based on your machine's capabilities
#     batch_size=10    # Smaller batch size for better load balancing
# )

In [None]:
import gc
# import os
from exactextract import exact_extract

def safely_extract_stats(tiff_path, feature, ops=None, id_column=None):
    """
    A safer wrapper around exactextract that ensures resources are properly released.

    This function isolates the exactextract call and ensures cleanup even if exceptions occur.

    Args:
        tiff_path: Raster passed to exact_extract as ``rast``
        feature: Vector input passed to exact_extract as ``vec``
        ops: List of operations to perform with exactextract (default: ['sum'])
        id_column: Optional column to carry over into the result via ``include_cols``

    Returns:
        The exact_extract result (requested with output='pandas'), or None if
        the call raised; in that case the error is printed, not re-raised.
    """
    import time

    # Resolve the default here rather than in the signature so that no list
    # object is shared between calls.
    if ops is None:
        ops = ['sum']

    result = None
    try:
        result = exact_extract(
            rast=tiff_path,
            vec=feature,
            ops=ops,
            output='pandas',
            include_cols=[id_column] if id_column else None
        )

    except Exception as e:
        print(f"Error processing {tiff_path}: {str(e)}")

    finally:
        # Explicit cleanup to help release the file
        gc.collect()

        # On Windows, a brief delay sometimes helps release file locks. Not
        # wrapped in a catch-all: an interrupt during the pause should propagate.
        time.sleep(0.1)

    return result

Chain with exact extract

In [None]:
def download_and_extract_stats_for_collection(
    feature_collection,
    image,
    geojson_path=None,
    output_dir=None,
    output_csv=None,
    scale=10,
    max_features=None,
    max_workers=None,
    max_retries=3,
    retry_delay=3,
    ops=None,
    id_column='internal_id',
    keep_geotiffs=True
):
    """
    Download a clipped GeoTIFF for each feature of an Earth Engine collection and
    immediately run exactextract on it, renaming per-band columns to band names.

    Each feature is written to ``feature_<id>.tif`` inside ``output_dir``; if that
    file already exists it is reused instead of being downloaded again. Statistics
    are appended to the CSV as features finish and are also returned combined.

    Args:
        feature_collection: Earth Engine FeatureCollection to process
        image: Earth Engine image to clip and download
        geojson_path: Path to matching GeoJSON file (optional). When given, the
            GeoJSON geometry with the same ID is used for extraction; otherwise
            (or when an ID has no match) the Earth Engine feature's own geometry
            is used.
        output_dir: Directory to save the GeoTIFFs (default: ~/Downloads/whisp_features)
        output_csv: Path to save the output CSV (default: timestamped file in output_dir)
        scale: Resolution in meters (default 10m)
        max_features: Maximum number of features to process (default: all)
        max_workers: Maximum number of parallel workers (default: None, sequential)
        max_retries: Maximum number of download attempts for each feature
        retry_delay: Base delay in seconds between attempts (doubled on each retry)
        ops: List of operations to perform with exactextract (default: ['sum'])
        id_column: Property/column name containing feature IDs
        keep_geotiffs: Whether to keep the downloaded GeoTIFF files (default: True)

    Returns:
        tuple: ``(gdf, csv_path)`` where ``gdf`` is a GeoDataFrame with the
        extracted statistics, or None when no feature could be processed or the
        results could not be combined, and ``csv_path`` is the CSV path as a string.
    """
    import ee
    import os
    import time
    import logging
    import concurrent.futures
    import pandas as pd
    import geopandas as gpd
    import requests
    import rasterio
    from pathlib import Path
    from datetime import datetime
    from exactextract import exact_extract

    # Set up logging
    logger = logging.getLogger('whisp_processor')

    # Resolve the default here rather than in the signature (no shared mutable default);
    # a single operation given as a string is wrapped so it is not iterated per character
    if ops is None:
        ops = ['sum']
    elif isinstance(ops, str):
        ops = [ops]

    # Get band names from the Earth Engine image (one request, shared by all features)
    try:
        band_names = image.bandNames().getInfo()
        logger.info(f"Retrieved band names from image: {band_names}")
    except Exception as e:
        logger.warning(f"Failed to get band names from image: {str(e)}")
        band_names = None

    # Set default output directory and make sure it exists
    if output_dir is None:
        output_dir = Path.home() / 'Downloads' / 'whisp_features'
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Set up output CSV
    if output_csv is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_csv = output_dir / f"feature_stats_{timestamp}.csv"
    else:
        output_csv = Path(output_csv)

    # Load GeoJSON if provided (for more accurate extraction)
    if geojson_path:
        logger.info(f"Loading GeoJSON from {geojson_path}")
        source_gdf = gpd.read_file(geojson_path)

        # Ensure ID column exists
        if id_column not in source_gdf.columns:
            logger.warning(f"ID column '{id_column}' not found in GeoJSON. Adding sequential IDs.")
            source_gdf[id_column] = range(1, len(source_gdf) + 1)

        # Map feature ID -> row POSITION. Rows are fetched with .iloc below, so the
        # position is stored rather than the index label.
        feature_dict = {
            feature_id: position
            for position, feature_id in enumerate(source_gdf[id_column])
        }
    else:
        source_gdf = None
        feature_dict = None

    # Get collection size and limit if needed
    collection_size = feature_collection.size().getInfo()
    logger.info(f"Processing Earth Engine FeatureCollection with {collection_size} features")

    if max_features and max_features < collection_size:
        feature_collection = feature_collection.limit(max_features)
        collection_size = max_features
        logger.info(f"Limited to processing first {max_features} features")

    # Get features as a list
    features = feature_collection.toList(collection_size)

    all_results = []

    def record_result(stats):
        """Append one feature's stats to the CSV and to the in-memory results.

        Only ever called from the thread running this function (results are
        consumed there even in parallel mode), so no lock is required.
        """
        if all_results:
            stats.to_csv(output_csv, mode='a', header=False, index=False)
        else:
            # First successful feature: create (or overwrite) the file with a header
            stats.to_csv(output_csv, index=False)
        all_results.append(stats)

    def rename_band_columns(df, band_names, ops):
        """
        Rename generic band index columns to meaningful band names in a DataFrame.

        Args:
            df (pd.DataFrame): DataFrame with stats columns to rename
            band_names (list): Band names, in band order
            ops (list): List of operations (e.g., ['sum', 'mean', 'count'])

        Returns:
            pd.DataFrame: Renamed copy of ``df`` (the input is not modified)
        """
        column_mapping = {}
        for op in ops:
            for band_number, band_name in enumerate(band_names, start=1):
                # Check both possible formats
                format1 = f"{op}_{band_number}"       # e.g., sum_1
                format2 = f"band_{band_number}_{op}"  # e.g., band_1_sum

                if format1 in df.columns:
                    column_mapping[format1] = band_name
                elif format2 in df.columns:
                    column_mapping[format2] = band_name

        # DataFrame.rename returns a new object, leaving ``df`` untouched
        return df.rename(columns=column_mapping)

    def download_and_process_feature(index):
        """Download the GeoTIFF for the feature at ``index`` and extract its stats.

        Returns the stats DataFrame, or None if the download or extraction failed.
        """
        try:
            # Get the feature
            ee_feature = ee.Feature(features.get(index))

            # Get the feature ID. A failed request and a None result are treated
            # alike so that several features can never share "feature_None.tif".
            try:
                internal_id = ee_feature.get(id_column).getInfo()
            except Exception:
                internal_id = None

            if internal_id is None:
                internal_id = f"unknown_{index}"
                logger.warning(f"Could not get ID for feature {index}, using {internal_id}")
            else:
                logger.info(f"Processing feature {internal_id} ({index+1}/{collection_size})")

            # Create a unique filename
            filename = f"feature_{internal_id}.tif"
            output_path = output_dir / filename

            # Skip download if file exists
            if output_path.exists():
                logger.info(f"File {filename} already exists, skipping download")
                return process_downloaded_feature(ee_feature, str(output_path), internal_id)

            for attempt in range(1, max_retries + 1):
                try:
                    # Clip the image to the feature
                    clipped_image = image.clip(ee_feature.geometry())

                    # Generate the download URL
                    logger.debug(f"Generating download URL for feature {internal_id}")
                    download_url = clipped_image.getDownloadURL({
                        'format': 'GeoTIFF',
                        'region': ee_feature.geometry(),
                        'scale': scale,
                        'crs': 'EPSG:4326'
                    })

                    # Download the image with timeout
                    logger.info(f"Downloading to {output_path}")
                    response = requests.get(download_url, timeout=300)

                    if response.status_code == 200:
                        # Check if the response is actually a GeoTIFF
                        # NOTE(review): a 'zip' response is also accepted and saved
                        # with a .tif name - confirm the endpoint never returns a
                        # zipped archive for this request.
                        content_type = response.headers.get('Content-Type', '')
                        if 'tiff' in content_type.lower() or 'zip' in content_type.lower():
                            with open(output_path, 'wb') as f:
                                f.write(response.content)
                            logger.info(f"Successfully downloaded {filename}")

                            # Process the downloaded file
                            return process_downloaded_feature(ee_feature, str(output_path), internal_id)

                        logger.error(f"Download returned non-TIFF content: {content_type}")
                        # Keep the start of the payload for diagnosis
                        error_file = output_dir / f"error_{internal_id}.txt"
                        with open(error_file, 'wb') as f:
                            f.write(response.content[:2000])
                    else:
                        logger.error(f"Failed to download (status {response.status_code})")

                except Exception as e:
                    logger.error(f"Error downloading feature {internal_id}: {str(e)}", exc_info=True)

                # Same exponential back-off for every kind of failure
                if attempt < max_retries:
                    sleep_time = retry_delay * (2 ** attempt)
                    logger.info(f"Retrying in {sleep_time} seconds (attempt {attempt+1}/{max_retries})")
                    time.sleep(sleep_time)

            logger.error(f"Maximum retries reached for feature {internal_id}")
            return None

        except Exception as e:
            logger.error(f"Error processing feature at index {index}: {str(e)}", exc_info=True)
            return None

    def process_downloaded_feature(ee_feature, tiff_path, feature_id):
        """Run exactextract on one downloaded GeoTIFF.

        Returns the stats DataFrame (with a 'geometry' column added), or None on error.
        """
        try:
            # Ensure we have a plain string path, not a Path object
            tiff_path_str = str(tiff_path)

            logger.info(f"Processing downloaded file: {tiff_path_str}")

            local_band_names = band_names

            if local_band_names is None:
                # The Earth Engine query already failed once, so read the names from
                # the file instead of repeating the request for every feature.
                logger.warning(
                    f"Band names not available from image, trying to read from file: {tiff_path_str}"
                )
                try:
                    with rasterio.open(tiff_path_str) as src:
                        if src.descriptions and all(src.descriptions):
                            # If raster has descriptions, use them as band names
                            local_band_names = list(src.descriptions)
                            logger.info(f"Using band descriptions from GeoTIFF: {local_band_names}")
                        else:
                            # Otherwise create generic names
                            local_band_names = [f"band_{i+1}" for i in range(src.count)]
                            logger.info(f"Using generic band names: {local_band_names}")
                except Exception as e:
                    # Not fatal: the stats are still returned, just without renaming
                    logger.warning(f"Failed to read band names from GeoTIFF: {str(e)}")
                    local_band_names = None

            # Find the corresponding feature in source GeoJSON if available
            if source_gdf is not None and feature_id in feature_dict:
                feature = source_gdf.iloc[[feature_dict[feature_id]]]
            else:
                if source_gdf is not None:
                    logger.warning(f"Feature ID {feature_id} not found in GeoJSON")
                # Fall back to the Earth Engine feature's own geometry so that
                # `feature` is always defined (less precise than the source polygon).
                logger.info(f"Using Earth Engine geometry for feature {feature_id}")
                feature = gpd.GeoDataFrame.from_features(
                    [ee_feature.getInfo()], crs='EPSG:4326'
                )
                # include_cols below needs the ID column to be present
                feature[id_column] = feature_id

            # Use exactextract with plain string path
            logger.debug(f"Running exactextract on {tiff_path_str}")
            stats = exact_extract(
                rast=tiff_path_str,
                vec=feature,
                ops=ops,
                output='pandas',
                include_cols=[id_column]
            )

            # Rename columns to use actual band names if available
            if local_band_names:
                stats = rename_band_columns(stats, local_band_names, ops)
                logger.debug(f"Renamed columns using band names: {local_band_names}")

            # Add the geometry column
            stats['geometry'] = feature.iloc[0].geometry

            # Delete the GeoTIFF if not keeping
            if not keep_geotiffs:
                try:
                    os.remove(tiff_path_str)
                    logger.debug(f"Deleted temporary file {tiff_path_str}")
                except Exception as e:
                    logger.warning(f"Failed to delete temporary file {tiff_path_str}: {str(e)}")

            return stats
        except Exception as e:
            logger.error(f"Error extracting stats for feature {feature_id}: {str(e)}")
            return None

    # Process features (parallel or sequential)
    if max_workers and max_workers > 1:
        logger.info(f"Processing features in parallel with {max_workers} workers")
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_index = {
                executor.submit(download_and_process_feature, i): i
                for i in range(collection_size)
            }

            # Process results as they complete
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    stats = future.result()
                    if stats is not None:
                        record_result(stats)
                        logger.info(f"Completed feature {index+1}/{collection_size}")
                    else:
                        logger.warning(f"Failed to process feature {index+1}/{collection_size}")
                except Exception as e:
                    logger.error(f"Exception occurred while processing feature {index+1}: {str(e)}")
    else:
        # Sequential processing
        logger.info("Processing features sequentially")
        for i in range(collection_size):
            logger.info(f"Processing feature {i+1}/{collection_size}")
            stats = download_and_process_feature(i)
            if stats is not None:
                record_result(stats)

    if not all_results:
        logger.warning("No results generated")
        return None, str(output_csv)

    # Combine all results
    try:
        all_results_df = pd.concat(all_results, ignore_index=True)

        result_gdf = gpd.GeoDataFrame(all_results_df, geometry='geometry')

        if source_gdf is not None:
            if source_gdf.crs:
                result_gdf = result_gdf.set_crs(source_gdf.crs)
        else:
            # Without a GeoJSON every geometry came from the Earth Engine fallback,
            # which is built as EPSG:4326 above
            result_gdf = result_gdf.set_crs('EPSG:4326')

        logger.info(f"Completed processing {len(all_results)}/{collection_size} features successfully")
        logger.info(f"Results saved to {output_csv}")

        return result_gdf, str(output_csv)
    except Exception as e:
        logger.warning(f"Error creating final GeoDataFrame: {str(e)}")
        return None, str(output_csv)

In [None]:
# # Basic usage with defaults
# results_df, csv_path = download_and_extract_stats_for_collection(
#     feature_collection=ee_bbox_collection,
#     image=whisp.combine_datasets(),
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH
# )

# Point the example path at a local file instead of the packaged example set earlier.
# folder_path is not defined in this cell - it must already exist in the notebook session.
GEOJSON_EXAMPLE_FILEPATH = folder_path+"/random_polygons.geojson"

# ee_bbox_collection = convert_geojson_to_ee_bbox(GEOJSON_EXAMPLE_FILEPATH)

# Build the obscured bounding-box collection. Every tuning option below is
# commented out, so only the file path is passed and the function's own defaults apply.
fully_obscured_collection = convert_geojson_to_ee_bbox_obscured(
    GEOJSON_EXAMPLE_FILEPATH,
    # extension_range=[0.002, 0.003],
    # shift_geometries=True,
    # shift_proportion=0.9,
    # pixel_length=0.0001,  # ~10m at equator
    # add_random_features=False,
    # max_distance=0.05,  # xkm at equator
    # random_proportion= 0.1  # Add X more features as decoys
)

# Advanced usage: download per-feature GeoTIFFs and extract stats with 30 worker threads
results_df, csv_path = download_and_extract_stats_for_collection(
    feature_collection=fully_obscured_collection,
    image=whisp.combine_datasets(),
    geojson_path=GEOJSON_EXAMPLE_FILEPATH,
    output_dir=folder_path+ "/"+'whisp_on_the_fly_v4',
    # output_csv=Path.home() / 'whisp_analysis' / 'results.csv',
    ops=['sum'],# 'mean', 'count'],
    max_features=1000,
    max_workers=30,
    keep_geotiffs=True  # Keep the downloaded GeoTIFFs (set to False to delete each one after processing)
)

In [None]:
# import rasterio
# tif_path = r"C:\Users\Arnell\Downloads\whisp_features\feature_9.tif"
# def get_band_names(tif_path):
#     """Extract band names from a GeoTIFF file."""
#     with rasterio.open(tif_path) as src:
#         if src.descriptions and all(src.descriptions):
#             return list(src.descriptions)
#         else:
#             return [f"Band {i+1}" for i in range(src.count)]
            
# # Check the updated band names
# bands = get_band_names(tif_path)
# print(bands)

In [None]:
# def rename_band_columns(df, band_names, ops=['sum']):
#     """
#     Rename generic band index columns to meaningful band names in a DataFrame.
    
#     Args:
#         df (pd.DataFrame): DataFrame with stats columns to rename
#         band_names (list): List of band names from Earth Engine image
#         ops (list): List of operations (e.g., ['sum', 'mean', 'count'])
        
#     Returns:
#         pd.DataFrame: DataFrame with renamed columns
#     """
#     # Create a copy of the input DataFrame to avoid modifying the original
#     renamed_df = df.copy()
    
#     # Create a mapping from generic band names to actual band names
#     column_mapping = {}
#     for op in ops:
#         for i, band_name in enumerate(band_names):
#             # Check both possible formats
#             format1 = f"{op}_{i+1}"      # e.g., sum_1 (standard exactextract format)
#             format2 = f"band_{i+1}_{op}" # e.g., band_1_sum (alternative format)
            
#             if format1 in renamed_df.columns:
#                 column_mapping[format1] = band_name
#             elif format2 in renamed_df.columns:
#                 column_mapping[format2] = band_name
    
#     # Apply the renaming and return
#     return renamed_df.rename(columns=column_mapping)



# def rename_stats_columns_with_band_names(csv_path, band_names, ops=['sum']):
#     """
#     Rename statistics columns in a CSV file by replacing generic band indices 
#     with actual band names from Earth Engine image bands.
    
#     Args:
#         csv_path (str): Path to the CSV file with statistics
#         band_names (list): List of band names from image.bandNames().getInfo()
#         ops (list): List of operations (e.g., ['sum', 'mean', 'count'])
        
#     Returns:
#         pd.DataFrame: DataFrame with renamed columns
#     """
#     # import pandas as pd
    
#     # Load the CSV
#     stats = pd.read_csv(csv_path)
    
#     # Print current columns to help diagnose format
#     print("Current columns:", stats.columns.tolist())
#     rename_band_columns
#     # Create a mapping from generic band names to actual band names
#     column_mapping = {}
#     for op in ops:
#         for i, band_name in enumerate(band_names):
#             # Check both possible formats
#             format1 = f"{op}_{i+1}"      # e.g., sum_1 (standard exactextract format)
#             format2 = f"band_{i+1}_{op}" # e.g., band_1_sum (alternative format)
            
#             if format1 in stats.columns:
#                 column_mapping[format1] = band_name
#             elif format2 in stats.columns:
#                 column_mapping[format2] = band_name
    
#     # Print mapping for verification
#     print("Column mapping:", column_mapping)
    
#     # Apply the renaming
#     stats = stats.rename(columns=column_mapping)
    
#     return stats

In [None]:
# csv_path = r"C:\Users\Arnell\Downloads\whisp_features\feature_stats_20250424_190341.csv"
# stats = pd.read_csv(csv_path)
# ops = ['sum']
# band_names = whisp.combine_datasets().bandNames().getInfo()
# stats = rename_band_columns(stats, band_names, ops)

In [None]:
# stats

In [None]:
# import rasterio

# def get_band_names(tif_path):
#     """
#     Extract band names from a GeoTIFF file.
    
#     Args:
#         tif_path (str): Path to the GeoTIFF file
        
#     Returns:
#         list: List of band names/descriptions
#     """
#     with rasterio.open(tif_path) as src:
#         # Try to get band descriptions (often contain band names)
#         band_descriptions = src.descriptions
        
#         # If descriptions are available and not empty, use them
#         if band_descriptions and all(band_descriptions):
#             print(f"Found {len(band_descriptions)} bands with descriptions")
#             return list(band_descriptions)
            
#         # Check for band metadata that might contain names
#         band_names = []
#         for i in range(1, src.count + 1):
#             band_meta = src.tags(i)
#             if band_meta and 'name' in band_meta:
#                 band_names.append(band_meta['name'])
#             else:
#                 # Fall back to generic naming
#                 band_names.append(f"Band {i}")
        
#         print(f"Found {src.count} bands")
#         return band_names


In [None]:

# Example usage: print the band names of one downloaded GeoTIFF
# Replace with your file path
# NOTE(review): in this part of the notebook get_band_names only appears in
# commented-out cells and as a nested helper - confirm it is defined at top level
# before running this cell.
tif_path = r"C:\Users\Arnell\Downloads\whisp_features\feature_1.tif"
band_names = get_band_names(tif_path)
for i, name in enumerate(band_names):
    print(f"Band {i+1}: {name}")

In [None]:
def check_open_files():
    """Report which TIFF files the current process has open.

    Looks at the files psutil lists as open for this process, keeps those whose
    path ends in '.tif' or '.tiff', prints them and returns them.

    Returns:
        list: Paths of the open TIFF files.
    """
    import psutil

    tiff_suffixes = ('.tif', '.tiff')
    tiff_files = []
    for handle in psutil.Process().open_files():
        if handle.path.endswith(tiff_suffixes):
            tiff_files.append(handle.path)

    print(f"Open TIFF files: {tiff_files}")
    return tiff_files

In [None]:
# Print (and display) the TIFF files this process currently has open
check_open_files()

In [None]:
def force_release_tiff_files():
    """
    Force release of stubborn TIFF file locks using multiple approaches.

    Runs a fixed sequence of best-effort steps (garbage collection, GDAL settings,
    dropping exactextract from sys.modules, re-opening files with rasterio) and
    then re-checks which TIFF files this process still has open.

    Returns:
        list: The psutil open-file entries (not plain path strings) for TIFF
        files that are still open after the cleanup.
    """
    import gc
    import os
    import psutil
    import sys
    
    print("Starting aggressive TIFF cleanup...")
    
    # 1. First garbage collection pass
    gc.collect()
    
    # 2. List the TIFF files this process has open (the holder itself is not identified)
    process = psutil.Process()
    tiff_files = [f for f in process.open_files() if f.path.endswith(('.tif', '.tiff'))]
    for file in tiff_files:
        print(f"Locked file: {file}")
    
    # 3. Try to reset libraries that commonly lock files
    try:
        # Reset GDAL
        from osgeo import gdal
        gdal.UseExceptions()  # Make GDAL throw exceptions
        print("Resetting GDAL cache...")
        # NOTE(review): both settings below stay in effect for the rest of the
        # session - confirm that is acceptable for later raster work.
        gdal.SetConfigOption('GDAL_MAX_DATASET_POOL_SIZE', '0')  # Disable dataset pooling
        gdal.SetCacheMax(0)  # Set the cache limit to 0 (intended to clear caches)
        
    except ImportError:
        print("GDAL not directly imported")
    
    # 4. Reset exactextract if it's loaded
    # NOTE(review): this only removes the import-cache entry; whether it releases
    # any file handle is unverified, and names already imported elsewhere keep
    # referring to the old module object.
    if 'exactextract' in sys.modules:
        print("Removing exactextract from sys.modules...")
        del sys.modules['exactextract']
    
    # 5. If rasterio is being used
    try:
        import rasterio
        from rasterio.errors import RasterioIOError
        
        print("Cleaning rasterio environment...")
        # NOTE(review): the created object is neither stored nor entered as a
        # context manager - confirm this call has the intended effect.
        rasterio.env.GDALEnv(CPL_DEBUG=True)  # Create a new environment with debug on
        
        # Try to deliberately close the files
        for file in tiff_files:
            try:
                # This might raise an error, but sometimes forces release
                with rasterio.open(file.path, 'r') as src:
                    pass  # Just open and close it to reset
            except RasterioIOError:
                pass  # Ignore errors, we're just trying to force close
            
    except ImportError:
        print("Rasterio not installed")
    
    # 6. More aggressive garbage collection
    for _ in range(3):
        gc.collect()
    
    # 7. Check what's left
    remaining = [f for f in psutil.Process().open_files() if f.path.endswith(('.tif', '.tiff'))]
    print(f"After aggressive cleanup: {len(remaining)} files still locked")
    
    return remaining

In [None]:
# Run the cleanup and display the TIFF files that are still open afterwards
force_release_tiff_files()

In [None]:
def safely_process_with_isolation(tiff_paths):
    """
    Best-effort attempt to free TIFF files that appear to be locked.

    Steps, in order:
      1. Remove every loaded module whose name contains 'gdal', 'rasterio',
         'exactextract', 'fiona' or 'osgeo' from ``sys.modules``.
      2. Run several garbage collection cycles.
      3. Copy each file to ``temp_<name>`` in the same folder, then move that
         copy over the original.

    If the original cannot be replaced (for example because it is still locked),
    it is left untouched and the temporary copy is kept beside it.

    NOTE(review): removing modules from ``sys.modules`` does not by itself close
    file handles, and later imports will load fresh module objects - confirm this
    step is still wanted.

    Args:
        tiff_paths: Sized collection of TIFF file paths to process.

    Returns:
        None. Progress and errors are reported with print().
    """
    import gc
    import os
    import shutil
    import sys
    from pathlib import Path

    print(f"Attempting to unlock {len(tiff_paths)} files using isolation method...")

    # Step 1: Force reset all relevant modules that might hold locks
    keywords = ('gdal', 'rasterio', 'exactextract', 'fiona', 'osgeo')
    modules_to_unload = [
        module_name
        for module_name in list(sys.modules)
        if any(keyword in module_name for keyword in keywords)
    ]
    for module_name in modules_to_unload:
        # pop() cannot fail if the entry has already gone, so no try/except is needed
        sys.modules.pop(module_name, None)
        print(f"Unloaded: {module_name}")

    # Step 2: Run aggressive garbage collection
    print("Running multiple garbage collection cycles...")
    for _ in range(3):
        gc.collect()

    # Step 3: Attempt a direct file copy approach to break locks
    for tiff_path in tiff_paths:
        try:
            path = Path(tiff_path)
            # Temporary copy lives next to the original under a different name
            temp_path = path.with_name(f"temp_{path.name}")
        except (TypeError, ValueError) as e:
            print(f"Error processing {tiff_path}: {str(e)}")
            continue

        try:
            # Copy the file data rather than moving the file handle
            shutil.copy2(path, temp_path)
        except OSError as e:
            print(f"Error copying file {tiff_path}: {str(e)}")
            continue
        print(f"Created temporary copy: {temp_path}")

        try:
            # One replace instead of remove-then-rename: there is no point at which
            # the original is already deleted but the copy not yet moved into place.
            os.replace(temp_path, path)
        except OSError:
            print(f"Original file still locked, will keep temporary copy: {temp_path}")
        else:
            print(f"Successfully unlocked: {tiff_path}")

    print("Isolation process complete")
In [None]:
# Pass the paths of the currently open TIFF files to the copy-based unlock attempt
safely_process_with_isolation(check_open_files())

Chain for downloading and stats

In [None]:
# from pathlib import Path

# tiff_path = Path("C:/Users/Arnell/Downloads/whisp_features/feature_1.tif")
# print("Exists:", tiff_path.exists())
# print("Is file:", tiff_path.is_file())
# print("Absolute path:", tiff_path.resolve())
# print(str(tiff_path))

In [None]:
# # internal_id ="1"        
# gdf = gpd.read_file(GEOJSON_EXAMPLE_FILEPATH)
# # Find matching feature in the GeoDataFrame
# feature = gdf.iloc[0]
# # if len(feature) == 0:
# #     logger.warning(f"No matching feature found for ID {internal_id}, skipping")
# #     return []

# # Get the geometry from the feature
# geom = feature.geometry#.iloc[0]

In [None]:
# exact_extract(rast=str(tiff_path),
#                vec=gdf,
#                ops=['sum'],
#                output='pandas',
#             #    include_cols=['internal_id']
#             )



In [None]:
# df = whisp.whisp_formatted_stats_ee_to_df(convert_geojson_to_ee_bbox(GEOJSON_EXAMPLE_FILEPATH))
# df

In [None]:
# Display the formatted stats table.
# NOTE(review): in notebook order df_formatted_stats is assigned in the "Whisp it"
# cell further down - run that cell first unless it is also defined earlier.
df_formatted_stats 

Parallel processing test


Whisp it

In [None]:
# Run Whisp on the local GeoJSON file; the result is displayed and saved with .to_csv below.
# folder_path must already be defined in the notebook session.
df_formatted_stats = whisp.whisp_formatted_stats_geojson_to_df(folder_path+'/random_polygons.geojson')
# df_formatted_stats = whisp.whisp_formatted_stats_geojson_to_df(GEOJSON_EXAMPLE_FILEPATH)



Display table

In [None]:
# Display the formatted stats table
df_formatted_stats

In [None]:
# Define the output folder (if running in Sepal change path to preferred folder)
out_directory = Path.home() / 'Downloads'
# Create the folder if it is missing so the save below does not fail
out_directory.mkdir(parents=True, exist_ok=True)

# Define the output file path for CSV
csv_output_file = out_directory / 'whisp_output_table_stats.csv'

# Save the CSV file (formatted stats only; risk columns are added in a later cell)
df_formatted_stats.to_csv(path_or_buf=csv_output_file, index=False)
print(f"Table with formatted stats saved to: {csv_output_file}")

Calculate risk category

In [None]:
# Add risk columns to the formatted stats table; the result is kept in df_w_risk
df_w_risk = whisp.whisp_risk(df=df_formatted_stats)

Display table with risk columns

In [None]:
# Display the table including the risk columns
df_w_risk

Export table to CSV

In [None]:
# Define the output folder
# e.g. when running in Sepal this might be: Path.home() / 'module_results/whisp/'
out_directory = Path.home() / 'Downloads'

# Define the output file path for CSV
csv_output_file = out_directory / 'whisp_output_table_w_risk.csv'

# Save the CSV file (stats plus risk columns)
df_w_risk.to_csv(path_or_buf=csv_output_file, index=False)
print(f"Table with risk columns saved to: {csv_output_file}")

Export to GeoJSON (optional)

In [None]:
# Define the output file path for GeoJSON
# (out_directory is not set here - it comes from the CSV export cell above)
geojson_output_file = out_directory / 'whisp_output_table.geojson'

# Save the GeoJSON file
whisp.convert_df_to_geojson(df_w_risk, geojson_output_file)  # builds a geojson file containing Whisp columns. Uses the geometry column "geo" to create the spatial features.
print(f"GeoJSON file saved to: {geojson_output_file}")