In [None]:
def run_parallel_downloads(image=None, number_of_samples=3, max_workers=4, hectares=4, 
                           band_indices=None, output_dir=None, calculate_zonal_stats=False,
                           request_timeout=300):
    """
    Run parallel downloads of Whisp datasets for random global locations with optional zonal statistics.
    
    Random points are drawn from three hard-coded lon/lat boxes, a bounding box is
    built around each point with ``create_bbox``, the image is clipped to it and
    downloaded as a GeoTIFF through ``getDownloadURL``. Each sample is handled in
    its own worker thread and is timed individually.
    
    Args:
        image: Earth Engine image to process (default: will use whisp.combine_datasets())
        number_of_samples (int): Number of random locations to sample (default: 3)
        max_workers (int): Number of parallel download threads (default: 4)
        hectares (int): Area size for each sample in hectares (default: 4)
        band_indices (list): Optional list of specific band indices to select
        output_dir (Path or str): Directory to save downloaded files (default: Downloads/whisp_samples)
        calculate_zonal_stats (bool): Whether to calculate zonal statistics (default: False)
        request_timeout (float or None): Seconds to wait on the HTTP download before the
            sample is reported as failed; None waits indefinitely (default: 300)
        
    Returns:
        dict: Statistics about the processing times, download results, and zonal stats path.
            Timing keys (avg_time, median_time, min_time, max_time, std_dev) are only
            present when at least one (respectively two) samples completed.
    """
    
    # For zonal statistics
    if calculate_zonal_stats:
        try:
            import rasterio
            import rasterstats
        except ImportError:
            print("Warning: rasterstats and/or rasterio packages not found.")
            print("Installing required packages for zonal statistics...")
            import subprocess
            import sys
            # Go through the running interpreter so the packages land in the
            # environment this code executes in, not whichever "pip" is on PATH.
            subprocess.check_call([sys.executable, "-m", "pip", "install", "rasterstats", "rasterio"])
            import rasterio
            import rasterstats
    
    # Start timing
    start_time = time.time()
    start_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    # Define output directory (accept plain strings as well as Path objects)
    if output_dir is None:
        output_dir = Path.home() / 'Downloads' / 'whisp_samples'
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    
    # Check or create image
    if image is None:
        print("No image provided, using whisp.combine_datasets()")
        image = whisp.combine_datasets()
    
    # Define regions with forest coverage: [lon_min, lon_max, lat_min, lat_max, name]
    forest_regions = [
        [-120, -40, -20, 50, "americas"],
        [-20, 40, -30, 60, "europe_africa"],
        [60, 150, -40, 60, "asia_oceania"]
    ]
    
    # Generate random locations
    random_locations = []
    for _ in range(number_of_samples):
        region = random.choice(forest_regions)
        lon = round(random.uniform(region[0], region[1]), 3)
        lat = round(random.uniform(region[2], region[3]), 3)
        random_locations.append((lon, lat, region[4]))
    
    # Track timing information
    download_times = []
    success_count = 0
    failures = []
    
    # For zonal statistics tracking
    zonal_stats_results = []
    
    # Use ThreadPoolExecutor for parallel downloads
    print(f"Starting {number_of_samples} parallel downloads with {max_workers} workers at {start_datetime}...")
    
    # Worker: returns (success, message, elapsed_seconds, zonal_stat_rows) and never raises
    def process_with_timing(location_data):
        item_start_time = time.time()
        lon, lat, region = location_data
        try:
            # Create the bounding box with specified hectares
            bbox_feature = create_bbox(lon, lat, hectares=hectares)
            
            # Create the combined dataset
            combined_image = image.clip(bbox_feature)
            
            # Select specific bands if requested
            if band_indices is not None:
                combined_image = combined_image.select(band_indices)
                bands_info = f"bands_{'_'.join(map(str, band_indices))}"
            else:
                # Default to all bands
                bands_info = "all_bands"
            
            # Get the download URL
            download_url = combined_image.getDownloadURL({
                'format': 'GeoTIFF',
                'region': bbox_feature.geometry(),
                'scale': 10,
                'crs': 'EPSG:4326'
            })
            
            # Create a unique filename
            filename = f"whisp_{bands_info}_{region}_{lon}_{lat}_{hectares}h.tif"
            output_path = output_dir / filename
            
            # Download the image; the timeout keeps a stalled connection from
            # holding a worker thread forever (a timeout is reported as a failure
            # by the except clause at the bottom of this function).
            response = requests.get(download_url, timeout=request_timeout)
            if response.status_code == 200:
                with open(output_path, 'wb') as f:
                    f.write(response.content)
                
                # Calculate zonal statistics if requested
                local_stats = []
                if calculate_zonal_stats:
                    try:
                        # Open the file with rasterio
                        with rasterio.open(output_path) as src:
                            num_bands = src.count
                            
                            # Create a simple polygon from the bounding box
                            bounds_info = bbox_feature.geometry().bounds().getInfo()
                            geom = _bbox_polygon_from_bounds_info(bounds_info)
                            
                            # Calculate statistics for each band
                            for band in range(1, num_bands + 1):
                                band_name = f"B{band}" if band_indices is None else f"B{band_indices[band-1]}"
                                
                                # Calculate zonal statistics
                                stats = rasterstats.zonal_stats(
                                    geom, 
                                    src.read(band), 
                                    affine=src.transform,
                                    stats=["min", "max", "mean", "median", "std", "count"]
                                )[0]
                                
                                # Add to results
                                for stat_name, stat_value in stats.items():
                                    if stat_value is not None:  # Skip None values
                                        local_stats.append({
                                            "longitude": lon,
                                            "latitude": lat,
                                            "region": region,
                                            "filename": filename,
                                            "band": band_name,
                                            "statistic": stat_name,
                                            "value": stat_value,
                                            "hectares": hectares
                                        })
                    except Exception as e:
                        # Best effort: the download itself succeeded, so only report the problem
                        print(f"Error calculating zonal statistics for {filename}: {str(e)}")
                
                elapsed_time = time.time() - item_start_time
                return True, f"Successfully downloaded {filename} in {elapsed_time:.2f}s", elapsed_time, local_stats
            else:
                elapsed_time = time.time() - item_start_time
                return False, f"Failed to download {filename}: Status {response.status_code} in {elapsed_time:.2f}s", elapsed_time, []
            
        except Exception as e:
            elapsed_time = time.time() - item_start_time
            return False, f"Error processing location {lon}, {lat}: {str(e)} in {elapsed_time:.2f}s", elapsed_time, []
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_location = {executor.submit(process_with_timing, loc): loc for loc in random_locations}
        
        for future in concurrent.futures.as_completed(future_to_location):
            location = future_to_location[future]
            try:
                success, result, elapsed_time, local_stats = future.result()
                download_times.append(elapsed_time)
                
                # Add zonal stats results to the global list
                if local_stats:
                    zonal_stats_results.extend(local_stats)
                
                if success:
                    success_count += 1
                else:
                    failures.append(result)
                print(f"Location {location[0]}, {location[1]}: {result}")
            except Exception as e:
                print(f"Location {location[0]}, {location[1]} generated an exception: {str(e)}")
                failures.append(str(e))
    
    end_time = time.time()
    end_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    total_elapsed = end_time - start_time
    
    # Create a zonal statistics CSV if requested and available
    zonal_stats_file = None
    if calculate_zonal_stats and zonal_stats_results:
        # Convert to DataFrame and save as CSV
        df_stats = pd.DataFrame(zonal_stats_results)
        zonal_stats_file = output_dir / f"whisp_zonal_stats_{start_datetime.replace(':', '-').replace(' ', '_')}.csv"
        df_stats.to_csv(zonal_stats_file, index=False)
        print(f"\nZonal statistics saved to: {zonal_stats_file}")
    
    # processing speed statistics
    stats = {
        "start_time": start_datetime,
        "end_time": end_datetime,
        "total_time_seconds": total_elapsed,
        "success_count": success_count,
        "failure_count": len(failures),
        "total_samples": number_of_samples,
        "success_rate": success_count / number_of_samples * 100 if number_of_samples > 0 else 0,
        "zonal_stats_file": str(zonal_stats_file) if zonal_stats_file else None,
        "zonal_stats_count": len(zonal_stats_results)
    }
    
    # Note: download_times holds the elapsed time of every sample that returned,
    # failed ones included.
    if download_times:
        stats.update({
            "avg_time": mean(download_times),
            "median_time": median(download_times),
            "min_time": min(download_times),
            "max_time": max(download_times)
        })
        
        # Calculate standard deviation if more than one download
        if len(download_times) > 1:
            stats["std_dev"] = stdev(download_times)
    
    # Print summary
    print(f"\nDownload Summary:")
    print(f"  Start time: {start_datetime}")
    print(f"  End time: {end_datetime}")
    print(f"  Total processing time: {total_elapsed:.2f}s")
    print(f"  Success rate: {stats['success_rate']:.1f}% ({success_count}/{number_of_samples})")
    
    if download_times:
        print(f"\nDownload Time Statistics:")
        print(f"  Average time: {stats.get('avg_time', 0):.2f}s")
        print(f"  Median time: {stats.get('median_time', 0):.2f}s")
        print(f"  Min time: {stats.get('min_time', 0):.2f}s")
        print(f"  Max time: {stats.get('max_time', 0):.2f}s")
        if 'std_dev' in stats:
            print(f"  Standard Deviation: {stats['std_dev']:.2f}s")
    
    if calculate_zonal_stats:
        print(f"\nZonal Statistics:")
        print(f"  Statistics calculated: {stats['zonal_stats_count']}")
        if zonal_stats_file:
            print(f"  Statistics saved to: {zonal_stats_file}")
        else:
            print("  No zonal statistics were generated.")
    
    print(f"\nDownloaded images are saved to: {output_dir}")
    
    return stats


def _bbox_polygon_from_bounds_info(bounds_info):
    """Build a GeoJSON rectangle from the result of ``geometry.bounds().getInfo()``.

    A GeoJSON-style dict (with a ``coordinates`` ring) is reduced to its
    min/max extent; any other value is treated as a flat
    ``[west, south, east, north]`` sequence, which is the layout the previous
    implementation indexed into.
    """
    if isinstance(bounds_info, dict):
        ring = bounds_info['coordinates'][0]
        xs = [point[0] for point in ring]
        ys = [point[1] for point in ring]
        west, south, east, north = min(xs), min(ys), max(xs), max(ys)
    else:
        west, south, east, north = bounds_info[:4]
    return {
        'type': 'Polygon',
        'coordinates': [[
            [west, south],
            [east, south],
            [east, north],
            [west, north],
            [west, south],
        ]]
    }

In [None]:
import concurrent.futures
import random
import requests
import time
from pathlib import Path
from statistics import mean, median, stdev

# Script cell: download three random samples in parallel and report timing.
# NOTE(review): relies on `process_location`, which is not defined in the cells
# visible here; it is unpacked below as a (result, elapsed_time) pair - confirm.

# Define output directory
out_directory = Path.home() / 'Downloads' / 'whisp_samples'
out_directory.mkdir(exist_ok=True, parents=True)

# Define regions with forest coverage
# Each entry is [lon_min, lon_max, lat_min, lat_max, region_name]
forest_regions = [
    [-120, -40, -20, 50, "americas"],
    [-20, 40, -30, 60, "europe_africa"],
    [60, 150, -40, 60, "asia_oceania"]
]

# Generate random locations
# Three (lon, lat, region_name) tuples, coordinates rounded to 3 decimals
random_locations = []
for i in range(3):
    region = random.choice(forest_regions)
    lon = round(random.uniform(region[0], region[1]), 3)
    lat = round(random.uniform(region[2], region[3]), 3)
    random_locations.append((lon, lat, region[4]))

# Track timing information
download_times = []

# Use ThreadPoolExecutor for parallel downloads
print("Starting parallel downloads...")
total_start_time = time.time()

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Map each future back to its location so results can be labelled as they finish
    future_to_location = {executor.submit(process_location, loc): loc for loc in random_locations}
    
    for future in concurrent.futures.as_completed(future_to_location):
        location = future_to_location[future]
        try:
            result, elapsed_time = future.result()
            download_times.append(elapsed_time)
            print(f"Location {location[0]}, {location[1]}: {result}")
        except Exception as e:
            # A worker that raised contributes no timing sample
            print(f"Location {location[0]}, {location[1]} generated an exception: {str(e)}")

total_elapsed = time.time() - total_start_time

# Calculate statistics
# download_times holds one entry per future that returned without raising
if download_times:
    avg_time = mean(download_times)
    med_time = median(download_times)
    min_time = min(download_times)
    max_time = max(download_times)
    
    # Calculate standard deviation if more than one download
    if len(download_times) > 1:
        std_dev = stdev(download_times)
        std_dev_info = f"Standard Deviation: {std_dev:.2f}s"
    else:
        std_dev_info = "Standard Deviation: N/A (need at least 2 samples)"
    
    print(f"\nDownload Time Statistics:")
    print(f"  Average time: {avg_time:.2f}s")
    print(f"  Median time: {med_time:.2f}s")
    print(f"  Min time: {min_time:.2f}s")
    print(f"  Max time: {max_time:.2f}s")
    print(f"  {std_dev_info}")
    print(f"  Total time for all downloads: {total_elapsed:.2f}s")
else:
    print("No successful downloads to calculate statistics")

print(f"\nDownloaded images are saved to: {out_directory}")

Zonal statistics testing



In [None]:
    import rasterio
    import pandas as pd
    import geopandas as gpd
    import os
    import re
    from pathlib import Path
    from datetime import datetime
    from shapely.geometry import box
    
    # Ensure the zonal-statistics libraries are importable; if any is missing,
    # install all of them and import again.
    try:
        import rasterio
        import exactextract
        from exactextract import exact_extract
        import geopandas
    except ImportError:
        print("Installing required packages for zonal statistics...")
        import subprocess
        import sys
        # Invoke pip through the running interpreter so the packages are
        # installed into this kernel's environment rather than whichever
        # "pip" executable happens to be first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "rasterio", "exactextract", "geopandas"])
        import rasterio
        import exactextract
        from exactextract import exact_extract
        import geopandas as gpd

In [None]:
def calculate_zonal_stats_for_existing_files(directory=None, output_csv=None):
    """
    Calculate zonal statistics for all GeoTIFF files in the specified directory
    using exactextract.
    
    For every ``*.tif`` file the zone is the raster's own bounding box. All
    bands are summarised in a single ``exact_extract`` call and the result is
    reshaped into one row per (file, band, statistic).
    
    Args:
        directory (Path or str): Directory containing GeoTIFF files
            (default: ~/Downloads/whisp_samples)
        output_csv (Path or str): Path to save the output CSV file (default: same dir with timestamp)
        
    Returns:
        Path: Path to the generated CSV file with statistics, or None when no
        GeoTIFF files were found or no statistics could be calculated
        
    Raises:
        ValueError: If the directory does not exist
    """
    
    # Set default directory if none provided
    if directory is None:
        directory = Path.home() / 'Downloads' / 'whisp_samples'
    else:
        directory = Path(directory)
    
    if not directory.exists():
        raise ValueError(f"Directory not found: {directory}")
    
    # Find all GeoTIFF files
    tiff_files = list(directory.glob("*.tif"))
    if not tiff_files:
        print(f"No GeoTIFF files found in {directory}")
        return None
    
    print(f"Found {len(tiff_files)} GeoTIFF files for analysis")
    
    # Statistics requested from exactextract (its name for the standard deviation is "stdev")
    stat_ops = ['min', 'max', 'mean', 'median', 'stdev', 'count']
    
    # Filenames look like "whisp_all_bands_americas_-65.234_25.789_4h.tif" or
    # "whisp_bands_1_2_americas_-65.234_25.789_4h.tif". The optional prefix
    # group keeps "whisp_<bands info>_" out of the captured region name; names
    # without that prefix are matched exactly as before.
    filename_pattern = re.compile(
        r'(?:whisp_(?:all_bands|bands_[\d_]*\d)_)?'
        r'([a-z_]+)_(-?\d+\.\d+)_(-?\d+\.\d+)_(\d+)h'
    )
    
    # exactextract prefixes result columns with the raster/band name when
    # several bands are processed together ("band_3_mean"); a single-band
    # result uses the bare statistic name ("mean").
    # NOTE(review): column naming taken from the exactextract docs - confirm
    # against the installed version.
    column_pattern = re.compile(r'^(?:band_(\d+)_)?(.+)$')
    
    # Store all statistics
    all_stats = []
    
    # Process each file
    for tiff_file in tiff_files:
        filename = tiff_file.name
        print(f"Processing {filename}...")
        
        try:
            # Extract location info from filename using regex
            match = filename_pattern.search(filename)
            
            if match:
                region = match.group(1)
                lon = float(match.group(2))
                lat = float(match.group(3))
                hectares = int(match.group(4))
            else:
                # Fallback if filename doesn't match pattern
                region = "unknown"
                lon = 0.0
                lat = 0.0
                hectares = 0
            
            # Open the GeoTIFF and compute the statistics while it is open
            with rasterio.open(tiff_file) as src:
                bounds = src.bounds
                num_bands = src.count
                
                # Create a polygon from the bounds
                geom = box(bounds.left, bounds.bottom, bounds.right, bounds.top)
                
                # Create a GeoDataFrame with the polygon
                gdf = gpd.GeoDataFrame({'id': [1], 'geometry': [geom]}, crs=src.crs)
                
                # exact_extract(raster, vector, operations): pass the open
                # dataset (not a bare array) and ask for a pandas result so
                # the single zone can be read back as one row.
                stats = exact_extract(src, gdf, stat_ops, output='pandas')
            
            if stats is None or len(stats) == 0:
                print(f"  No statistics returned for {filename}")
                continue
            
            rows_before = len(all_stats)
            for column, stat_value in stats.iloc[0].to_dict().items():
                band_number, stat_name = column_pattern.match(str(column)).groups()
                if stat_name not in stat_ops:
                    continue  # not one of the requested statistics
                
                # Skip missing values (None, or NaN as pandas reports them)
                if stat_value is None:
                    continue
                if isinstance(stat_value, float) and stat_value != stat_value:
                    continue
                
                # Map exactextract stat names to our standardized names
                if stat_name == 'stdev':
                    stat_name = 'std'
                
                all_stats.append({
                    "filename": filename,
                    "longitude": lon,
                    "latitude": lat,
                    "region": region,
                    "band": f"B{band_number}" if band_number else "B1",
                    "statistic": stat_name,
                    "value": stat_value,
                    "hectares": hectares
                })
            
            print(f"  Successfully calculated {len(all_stats) - rows_before} statistics for {num_bands} bands")
                
        except Exception as e:
            # Best effort: report the file and carry on with the remaining ones
            print(f"  Error processing {filename}: {str(e)}")
    
    # Create dataframe with results
    if all_stats:
        df_stats = pd.DataFrame(all_stats)
        
        # Create output filename if not specified
        if output_csv is None:
            timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            output_csv = directory / f"whisp_zonal_stats_{timestamp}.csv"
        else:
            output_csv = Path(output_csv)
        
        # Save to CSV
        df_stats.to_csv(output_csv, index=False)
        print(f"\nZonal statistics saved to: {output_csv}")
        
        # Print summary
        print("\nSummary statistics:")
        print(f"  Files processed: {len(tiff_files)}")
        print(f"  Total statistics calculated: {len(all_stats)}")
        print(f"  Unique bands: {df_stats['band'].nunique()}")
        
        # Show average value for each statistic type
        print("\nAverage values by statistic type:")
        for stat_type in df_stats['statistic'].unique():
            avg_val = df_stats[df_stats['statistic'] == stat_type]['value'].mean()
            print(f"  {stat_type}: {avg_val:.4f}")
        
        return output_csv
    else:
        print("No statistics were calculated.")
        return None

Testing numpy (fast, but has issues with fractional pixels)

In [None]:
    import numpy as np
    import pandas as pd
    import rasterio
    from pathlib import Path

In [None]:
# Option 1 (disabled): run zonal statistics on all existing GeoTIFF files in the
# function's default directory
# stats_file = calculate_zonal_stats_for_existing_files()

# Option 2 (active): specify a custom directory and output file
# # custom_dir = Path.home() / 'my_geotiffs'
custom_dir = Path.home() / 'Downloads' / 'whisp_samples'
output_file = Path.home() / 'Downloads' / 'A2_my_custom_stats.csv'
# stats_file receives the function's return value (the CSV path, or None)
stats_file = calculate_zonal_stats_for_existing_files(directory=custom_dir, output_csv=output_file)

In [None]:
def simple_band_sums(directory=None, output_csv=None):
    """
    Sum every band of every GeoTIFF in a folder and write the totals to a CSV.
    
    Pixels equal to the dataset's nodata value (when one is set) are excluded
    from the sum.
    
    Args:
        directory: Directory containing GeoTIFF files
            (default: ~/Downloads/whisp_samples)
        output_csv: Path for output CSV (default: "band_sums.csv" in directory)
        
    Returns:
        DataFrame with columns filename, band and sum, or None when nothing
        could be summed.
    """
    # Resolve the input folder and the CSV destination
    directory = Path(directory or Path.home() / 'Downloads' / 'whisp_samples')
    output_csv = output_csv or directory / "band_sums.csv"
    
    # Collect the rasters to process
    rasters = list(directory.glob("*.tif"))
    print(f"Found {len(rasters)} GeoTIFF files")
    
    # One row per (file, band)
    rows = []
    for raster_path in rasters:
        try:
            with rasterio.open(raster_path) as dataset:
                for band_number in range(1, dataset.count + 1):
                    pixels = dataset.read(band_number)
                    
                    # Mask nodata as NaN so nansum leaves it out
                    if dataset.nodata is not None:
                        pixels = pixels.astype('float64')
                        pixels[pixels == dataset.nodata] = np.nan
                    
                    rows.append({
                        'filename': raster_path.name,
                        'band': band_number,
                        'sum': np.nansum(pixels)
                    })
        except Exception as e:
            # Rows already gathered for this file are kept; move on to the next
            print(f"Error processing {raster_path.name}: {str(e)}")
    
    # Nothing to save
    if not rows:
        print("No results were calculated")
        return None
    
    # Write the table and hand it back
    table = pd.DataFrame(rows)
    table.to_csv(output_csv, index=False)
    print(f"Band sums saved to: {output_csv}")
    return table

In [None]:
# Run with the defaults (reads ~/Downloads/whisp_samples, writes band_sums.csv there)
sums_df = simple_band_sums()

# View the results directly (prints None when no band sums were produced)
print(sums_df)

# Pretty quick if already clipped (they weren't): 2 mins for 1000 10 ha tifs

In [None]:
Testing the exactextract package (NOT working)

In [None]:
    import pandas as pd
    import rasterio
    import geopandas as gpd
    import re
    from pathlib import Path
    from datetime import datetime
    from shapely.geometry import box
    from exactextract import exact_extract
    

In [None]:
# def simple_zonal_stats_exactextract(directory=None, output_csv=None):
#     """
#     Ultra-simple zonal statistics calculator using exactextract.
    
#     Args:
#         directory: Directory containing GeoTIFF files
#         output_csv: Path for output CSV
#     """

#     # Setup directory
#     directory = Path(directory or Path.home() / 'Downloads' / 'whisp_samples')
#     output_csv = output_csv or directory / f"whisp_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    
#     # Find GeoTIFF files
#     tiff_files = list(directory.glob("*.tif"))
#     print(f"Found {len(tiff_files)} GeoTIFF files")
    
#     # Store results
#     all_results = []
    
#     # Process each file
#     for tiff_file in tiff_files:
#         print(f"Processing {tiff_file.name}...")
        
#         try:
#             # Extract location info from filename
#             match = re.search(r'([a-z_]+)_(-?\d+\.\d+)_(-?\d+\.\d+)_(\d+)h?', tiff_file.name)
#             region = match.group(1) if match else "unknown"
#             lon = float(match.group(2)) if match else 0.0
#             lat = float(match.group(3)) if match else 0.0
#             hectares = int(match.group(4)) if match else 0
            
#             # Open the file and get bounds
#             with rasterio.open(tiff_file) as src:
#                 # Create a polygon from the bounds
#                 geom = box(*src.bounds)
#                 gdf = gpd.GeoDataFrame({'id': [1], 'geometry': [geom]}, crs=src.crs)
                
#                 # Process each band
#                 for band in range(1, src.count + 1):
#                     # Calculate all statistics at once
#                     stats_df = exact_extract(
#                         src,
#                         gdf,
#                         # ['min', 'max', 'mean', 'median', 'stdev', 'count'],
#                         ['count'],
#                         include_cols=['id'],
#                         output='pandas'
#                     )
                    
#                     # Transform the results to our desired format
#                     for stat_name in ['count']:#['min', 'max', 'mean', 'median', 'stdev', 'count']:
#                         if stat_name in stats_df.columns:
#                             all_results.append({
#                                 'filename': tiff_file.name,
#                                 'longitude': lon,
#                                 'latitude': lat,
#                                 'region': region, 
#                                 'band': f"B{band}",
#                                 'statistic': 'std' if stat_name == 'stdev' else stat_name,
#                                 'value': stats_df.iloc[0][stat_name],
#                                 'hectares': hectares
#                             })
            
#             print(f"  Successfully processed {src.count} bands")
                
#         except Exception as e:
#             print(f"  Error processing {tiff_file.name}: {str(e)}")
    
#     # Create and save DataFrame
#     if all_results:
#         df = pd.DataFrame(all_results)
#         df.to_csv(output_csv, index=False)
#         print(f"\nStatistics saved to: {output_csv}")
#         return output_csv
#     else:
#         print("No statistics were calculated")
#         return None

In [None]:
# NOTE(review): simple_zonal_stats_exactextract appears only as commented-out
# code in the cells visible here, so the call below raises NameError unless
# that definition is restored (or still lives in the kernel from an earlier run).

# Run on default directory
# stats_csv = simple_zonal_stats_exactextract()

# Or specify custom paths
custom_dir = Path.home() / 'Downloads' / 'whisp_samples'
output_file = Path.home() / 'Downloads' / 'a3_whisp_stats.csv'
stats_csv = simple_zonal_stats_exactextract(directory=custom_dir, output_csv=output_file)




Found 3 GeoTIFF files
Processing whisp_all_bands_americas_-77.55_28.798_10h.tif...




  Successfully processed 167 bands
Processing whisp_all_bands_americas_-90.96_25.267_10h.tif...




  Successfully processed 167 bands
Processing whisp_all_bands_europe_africa_32.227_36.984_10h.tif...




  Successfully processed 167 bands
No statistics were calculated
Found 3 GeoTIFF files
Processing whisp_all_bands_americas_-77.55_28.798_10h.tif...




  Successfully processed 167 bands
Processing whisp_all_bands_americas_-90.96_25.267_10h.tif...




  Successfully processed 167 bands
Processing whisp_all_bands_europe_africa_32.227_36.984_10h.tif...
  Successfully processed 167 bands
No statistics were calculated


In [None]:
    import pandas as pd
    import rasterio
    import geopandas as gpd
    import re
    from pathlib import Path
    from datetime import datetime
    from shapely.geometry import box
    from exactextract import exact_extract
    
# def simple_zonal_stats_exactextract(directory=None, output_csv=None):
#     """
#     Ultra-simple zonal statistics calculator using exactextract.
    
#     Args:
#         directory: Directory containing GeoTIFF files
#         output_csv: Path for output CSV
#     """

#     # Setup directory
#     directory = Path(directory or Path.home() / 'Downloads' / 'whisp_samples')
#     output_csv = output_csv or directory / f"whisp_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    
#     # Find GeoTIFF files
#     tiff_files = list(directory.glob("*.tif"))
#     print(f"Found {len(tiff_files)} GeoTIFF files")
    
#     # Store results
#     all_results = []
    
#     # Process each file
#     for tiff_file in tiff_files:
#         print(f"Processing {tiff_file.name}...")
        
#         try:
#             # Extract location info from filename
#             match = re.search(r'([a-z_]+)_(-?\d+\.\d+)_(-?\d+\.\d+)_(\d+)h?', tiff_file.name)
#             region = match.group(1) if match else "unknown"
#             lon = float(match.group(2)) if match else 0.0
#             lat = float(match.group(3)) if match else 0.0
#             hectares = int(match.group(4)) if match else 0
            
#             # Open the file and get bounds
#             with rasterio.open(tiff_file) as src:
#                 # Create a polygon from the bounds
#                 geom = box(*src.bounds)
#                 gdf = gpd.GeoDataFrame({'id': [1], 'geometry': [geom]}, crs=src.crs)
                
#                 # Process each band
#                 for band in range(1, src.count + 1):
#                     # Calculate all statistics at once
#                     stats_df = exact_extract(
#                         src,
#                         gdf,
#                         # ['min', 'max', 'mean', 'median', 'stdev', 'count'],
#                         ['count'],
#                         include_cols=['id'],
#                         output='pandas'
#                     )
                    
#                     # Transform the results to our desired format
#                     for stat_name in ['count']:#['min', 'max', 'mean', 'median', 'stdev', 'count']:
#                         if stat_name in stats_df.columns:
#                             all_results.append({
#                                 'filename': tiff_file.name,
#                                 'longitude': lon,
#                                 'latitude': lat,
#                                 'region': region, 
#                                 'band': f"B{band}",
#                                 'statistic': 'std' if stat_name == 'stdev' else stat_name,
#                                 'value': stats_df.iloc[0][stat_name],
#                                 'hectares': hectares
#                             })
            
#             print(f"  Successfully processed {src.count} bands")
                
#         except Exception as e:
#             print(f"  Error processing {tiff_file.name}: {str(e)}")
    
#     # Create and save DataFrame
#     if all_results:
#         df = pd.DataFrame(all_results)
#         df.to_csv(output_csv, index=False)
#         print(f"\nStatistics saved to: {output_csv}")
#         return output_csv
#     else:
#         print("No statistics were calculated")
#         return None

In [None]:
import rasterio
import numpy as np
from pathlib import Path

def convert_to_16bit(input_path, output_path, signed=True):
    """
    Convert a raster to 16-bit (int16 or uint16) and write it as a
    LZW-compressed GeoTIFF.

    Values that do not fit the target type are clipped (saturated) to its
    range instead of being silently wrapped around by the integer cast, and a
    warning is printed. A source nodata value that cannot be represented in
    the target type is dropped from the output metadata, again with a warning.

    Args:
        input_path: Path to input raster
        output_path: Path to save the output raster
        signed: Whether to use signed (int16) or unsigned (uint16) data type
    """
    dtype = 'int16' if signed else 'uint16'
    limits = np.iinfo(dtype)

    # Everything needed is held in memory, so the source can be closed before
    # the output is opened (this also allows output_path == input_path).
    with rasterio.open(input_path) as src:
        meta = src.meta.copy()
        data = src.read()

    if data.size:
        # nan-aware so that NaN pixels in float rasters do not hide real
        # out-of-range values. NaNs themselves are left untouched; what they
        # become after the integer cast is platform-dependent.
        lowest, highest = np.nanmin(data), np.nanmax(data)
        too_low = lowest < limits.min
        too_high = highest > limits.max
        if too_low or too_high:
            print(f"Warning: values outside the {dtype} range "
                  f"[{limits.min}, {limits.max}] found; clipping them to that range")
            # Clip only the side that is actually violated: that bound is then
            # guaranteed to be representable in the source dtype.
            if too_low:
                data = np.maximum(data, limits.min)
            if too_high:
                data = np.minimum(data, limits.max)

    # NOTE(review): rasterio is expected to reject a nodata value outside the
    # output dtype's range when opening for writing -- confirm against the
    # installed version. Dropping it keeps the conversion from failing.
    nodata = meta.get('nodata')
    if nodata is not None and not (limits.min <= nodata <= limits.max):
        print(f"Warning: nodata value {nodata} cannot be represented as {dtype}; "
              f"writing the output without a nodata value")
        meta['nodata'] = None

    # Update metadata with new data type
    meta.update({
        'dtype': dtype,
        'driver': 'GTiff',
        'compress': 'lzw'  # Optional compression
    })

    with rasterio.open(output_path, 'w', **meta) as dst:
        dst.write(data.astype(dtype))

    print(f"Converted {input_path} to 16-bit ({dtype}) at {output_path}")

In [None]:
def batch_convert_to_16bit(folder_path, output_folder=None, signed=True):
    """
    Run convert_to_16bit on every ``*.tif`` / ``*.tiff`` file directly inside
    a folder, keeping each file's name in the destination folder.

    A failure on one file is printed and does not stop the remaining files.

    Args:
        folder_path: Path containing raster files
        output_folder: Where to save the outputs (defaults to subfolder "16bit")
        signed: Whether to use signed (int16) or unsigned (uint16) data type
    """
    source_dir = Path(folder_path)

    # Destination: explicit folder if given, otherwise "<folder>/16bit"
    target_dir = source_dir / "16bit" if output_folder is None else Path(output_folder)
    target_dir.mkdir(exist_ok=True, parents=True)

    # Non-recursive search, '.tif' matches first, then '.tiff'
    rasters = [*source_dir.glob('*.tif'), *source_dir.glob('*.tiff')]
    total = len(rasters)

    print(f"Found {total} raster files to convert")

    for position, raster in enumerate(rasters, start=1):
        destination = target_dir / raster.name
        print(f"Converting ({position}/{total}): {raster.name}")
        try:
            convert_to_16bit(raster, destination, signed=signed)
        except Exception as err:
            # Best effort: report and move on to the next raster
            print(f"Error converting {raster.name}: {err}")

    print(f"Conversion complete. Output files saved to: {target_dir}")

In [None]:
# batch_convert_to_16bit(folder_path, output_folder=folder_path+'_2', signed=True)

In [None]:
# NOTE: this helper is disabled. The `def` header is commented out together
# with its body: a header followed only by comments is an IndentationError and
# would stop the whole cell from running. Uncomment every line below (header
# included) to re-enable it.
# def add_extrasamples_to_tiff(input_tiff_path, output_tiff_path=None, extrasample_type='unassalpha'):
#     """
#     Add or modify extrasamples in a GeoTIFF file to avoid processing errors.

#     Args:
#         input_tiff_path (str): Path to the input GeoTIFF file
#         output_tiff_path (str, optional): Path for the output GeoTIFF. If None, overwrites input.
#         extrasample_type (str): Type of extrasample to add. Options:
#             - 'unassalpha': Unassociated alpha (transparency) channel
#             - 'assocalpha': Associated alpha channel
#             - 'unspecified': Unspecified extra sample

#     Returns:
#         str: Path to the output GeoTIFF file
#     """
#     # Map string options to rasterio ExtraSample enum values
#     extrasample_mapping = {
#         'unassalpha': 1,#.unassalpha,
#         'assocalpha': 2,#.assocalpha,
#         'unspecified': 0,#ExtraSample.unspecified
#     }

#     if extrasample_type not in extrasample_mapping:
#         raise ValueError(f"Invalid extrasample_type. Choose from: {list(extrasample_mapping.keys())}")

#     # If no output path specified, create a temp file and then overwrite original
#     overwrite = False
#     if output_tiff_path is None:
#         output_tiff_path = str(Path(input_tiff_path).with_name(f"temp_{Path(input_tiff_path).name}"))
#         overwrite = True

#     # Read the source file
#     with rasterio.open(input_tiff_path) as src:
#         # Get metadata
#         profile = src.profile.copy()
#         data = src.read()

#         # Add extrasamples parameter to the profile
#         profile.update({
#             'photometric': 'rgb' if src.count >= 3 else 'minisblack',
#             'extra_samples': [extrasample_mapping[extrasample_type]]
#         })

#         # Write to the new file with extrasamples
#         with rasterio.open(output_tiff_path, 'w', **profile) as dst:
#             dst.write(data)

#     # If we're overwriting the original, replace it
#     if overwrite:
#         import os
#         os.replace(output_tiff_path, input_tiff_path)
#         return input_tiff_path

#     return output_tiff_path

In [None]:
# # Process files in parallel
# import concurrent.futures

# tiff_files = glob.glob(folder_path + "_1/*.tif")

# with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
#     futures = [executor.submit(add_extrasamples_to_tiff, file) for file in tiff_files]
    
#     for i, future in enumerate(concurrent.futures.as_completed(futures)):
#         try:
#             result = future.result()
#             print(f"Processed {i+1}/{len(tiff_files)}: {Path(result).name}")
#         except Exception as e:
#             print(f"Error processing file: {str(e)}")

Sequential stats (client side using exact extract)

In [None]:
# def geotiff_stats_by_feature_id(
#     geojson_path, 
#     tiff_dir=None, 
#     output_csv=None, 
#     tiff_id_pattern=r'feature_(\d+)\.tif', 
#     id_column='internal_id',
#     ops=['sum'],
#     max_features=None
# ):
#     """
#     Process GeoTIFF files that match feature IDs in a GeoJSON, run exactextract,
#     and save the results to CSV.
    
#     Args:
#         geojson_path (str or Path): Path to the GeoJSON file with features
#         tiff_dir (str or Path): Directory containing GeoTIFF files (default: ~/Downloads/whisp_features)
#         output_csv (str or Path): Path to save the output CSV (default: uses timestamp)
#         tiff_id_pattern (str): Regex pattern to extract ID from GeoTIFF filename
#         id_column (str): Column name in GeoJSON containing feature IDs
#         ops (list): List of operations to perform with exactextract
#         max_features (int): Maximum number of features to process
        
#     Returns:
#         pd.DataFrame: Combined results DataFrame
#         str: Path to the output CSV file
#     """
#     import os
#     import re
#     import pandas as pd
#     import geopandas as gpd
#     from pathlib import Path
#     from datetime import datetime
#     import logging
#     from exactextract import exact_extract
    
#     logger = logging.getLogger('whisp_processor')
    
#     # Set default directory if not specified
#     if tiff_dir is None:
#         tiff_dir = Path.home() / 'Downloads' / 'whisp_features'
#     else:
#         tiff_dir = Path(tiff_dir)
    
#     # Load the GeoJSON
#     logger.info(f"Loading GeoJSON from {geojson_path}")
#     gdf = gpd.read_file(geojson_path)
    
#     # Ensure ID column exists
#     if id_column not in gdf.columns:
#         logger.warning(f"ID column '{id_column}' not found in GeoJSON. Adding sequential IDs.")
#         gdf[id_column] = range(1, len(gdf) + 1)
    
#     # Apply max_features if specified
#     if max_features and max_features < len(gdf):
#         logger.info(f"Limiting to first {max_features} features")
#         gdf = gdf.iloc[:max_features]
    
#     # Find all GeoTIFF files in the directory
#     tiff_files = []
#     for file in os.listdir(tiff_dir):
#         if file.endswith('.tif') or file.endswith('.tiff'):
#             tiff_files.append(file)
    
#     logger.info(f"Found {len(tiff_files)} GeoTIFF files in {tiff_dir}")
    
#     # Set up output CSV
#     if output_csv is None:
#         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
#         output_csv = Path(tiff_dir) / f"feature_stats_{timestamp}.csv"
#     else:
#         output_csv = Path(output_csv)
    
#     # Create empty results DataFrame
#     all_results = pd.DataFrame()
#     id_pattern = re.compile(tiff_id_pattern)
    
#     # Track processed features for reporting
#     processed_count = 0
#     matched_count = 0
    
#     # Process each GeoTIFF file
#     for tiff_file in tiff_files:
#         # Extract ID from filename using regex
#         match = id_pattern.search(tiff_file)
#         if not match:
#             logger.debug(f"Could not extract ID from filename: {tiff_file}, skipping")
#             continue
        
#         tiff_id = int(match.group(1))
#         processed_count += 1
        
#         # Find matching feature in GeoJSON
#         matching_feature = gdf[gdf[id_column] == tiff_id]
#         if len(matching_feature) == 0:
#             logger.debug(f"No matching feature found for ID {tiff_id}, skipping")
#             continue
        
#         matched_count += 1
#         logger.info(f"Processing feature ID: {tiff_id} ({matched_count} of {processed_count} matched)")
        
#         # Full path to GeoTIFF file
#         tiff_path = tiff_dir / tiff_file
        
#         try:
#             # Run exactextract
#             logger.debug(f"Running exactextract on {tiff_file}")
#             stats = exact_extract(
#                 rast=str(tiff_path),
#                 vec=matching_feature,
#                 ops=ops,
#                 output='pandas',
#                 include_cols=[id_column]
#             )
            
#             # Add the geometry column to the results
#             stats['geometry'] = matching_feature.iloc[0].geometry
            
#             # Append to results
#             if all_results.empty:
#                 all_results = stats
#                 # Write header to CSV
#                 stats.to_csv(output_csv, index=False)
#             else:
#                 all_results = pd.concat([all_results, stats], ignore_index=True)
#                 # Append to CSV without header
#                 stats.to_csv(output_csv, mode='a', header=False, index=False)
            
#             logger.info(f"Feature {tiff_id} processed successfully")
            
#         except Exception as e:
#             logger.error(f"Error processing feature {tiff_id}: {str(e)}")
    
#     # Convert to GeoDataFrame for spatial analysis
#     if not all_results.empty:
#         try:
#             result_gdf = gpd.GeoDataFrame(all_results, geometry='geometry')
#             if gdf.crs:
#                 result_gdf = result_gdf.set_crs(gdf.crs)
                
#             logger.info(f"Processed {matched_count}/{processed_count} GeoTIFF files with matching features")
#             logger.info(f"Results saved to {output_csv}")
#             return result_gdf, str(output_csv)
#         except Exception as e:
#             logger.error(f"Error creating GeoDataFrame from results: {str(e)}")
    
#     if all_results.empty:
#         logger.warning("No results generated")
    
#     return all_results, str(output_csv)

In [None]:
# geotiff_stats_by_feature_id(geojson_path=GEOJSON_EXAMPLE_FILEPATH,
#                             tiff_dir=folder_path,
#                             output_csv=Path.home() / 'Downloads' / 'whisp_outputs_test' / 'feature_stats.csv',
#                             tiff_id_pattern=r'feature_(\d+)\.tif',
#                             id_column='internal_id',
#                             ops=['sum'],
#                             max_features=100)

2025-04-28 20:40:25,255 - INFO - Loading GeoJSON from C:/Users/Arnell/Downloads/whisp_outputs_test/random_polygons.geojson
2025-04-28 20:40:25,276 - INFO - Limiting to first 100 features
2025-04-28 20:40:25,278 - INFO - Found 100 GeoTIFF files in C:\Users\Arnell\Downloads\whisp_outputs_test
2025-04-28 20:40:25,280 - INFO - Processing feature ID: 1 (1 of 1 matched)
2025-04-28 20:40:25,843 - INFO - Feature 1 processed successfully
2025-04-28 20:40:25,845 - INFO - Processing feature ID: 10 (2 of 2 matched)
2025-04-28 20:40:26,189 - INFO - Feature 10 processed successfully
2025-04-28 20:40:26,189 - INFO - Processing feature ID: 100 (3 of 3 matched)
2025-04-28 20:40:26,538 - INFO - Feature 100 processed successfully
2025-04-28 20:40:26,540 - INFO - Processing feature ID: 11 (4 of 4 matched)
2025-04-28 20:40:26,888 - INFO - Feature 11 processed successfully
2025-04-28 20:40:26,888 - INFO - Processing feature ID: 12 (5 of 5 matched)
2025-04-28 20:40:27,300 - INFO - Feature 12 processed succes

(    internal_id     band_1_sum  band_2_sum     band_3_sum   band_4_sum  \
 0             1  406705.570396         0.0  405668.683872  1036.886524   
 1            10  324134.025841         0.0  243736.799870  6206.581650   
 2           100  367567.668926         0.0  330890.821960  4673.256319   
 3            11  318846.834685         0.0  318846.834685     0.000000   
 4            12  703105.304239         0.0  702217.448984   887.855255   
 ..          ...            ...         ...            ...          ...   
 95           95  527201.388954         0.0    4450.022700     0.000000   
 96           96  479597.749610         0.0  477820.228164  1777.521446   
 97           97  505726.099456         0.0  503382.405997     0.000000   
 98           98  663340.275764         0.0  658894.579536  4445.696228   
 99           99  431827.182816         0.0  431827.182816     0.000000   
 
     band_5_sum  band_6_sum  band_7_sum  band_8_sum     band_9_sum  ...  \
 0          0.0        

Simple approach: run on a single raster (or a list of rasters) for all features in the GeoJSON

list tif files in the directory

In [None]:

# Define the folder path
# folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001'

# Collect every *.tif / *.tiff directly inside `folder_path` with pathlib and
# print a numbered listing with each file's size in megabytes.
print("\n=== TIFF files using pathlib ===")
folder = Path(folder_path)
tif_files = [*folder.glob('*.tif'), *folder.glob('*.tiff')]

if not tif_files:
    print("No TIFF files found in the directory")
else:
    for i, file_path in enumerate(tif_files, start=1):
        # Bytes -> MB
        file_size = file_path.stat().st_size / 1024 ** 2
        print(f"{i}. {file_path.name} - {file_size:.2f} MB")

print(f"\nTotal: {len(tif_files)} TIFF files")


=== TIFF files using pathlib ===
1. feature_1.tif - 0.07 MB
2. feature_10.tif - 0.06 MB
3. feature_100.tif - 0.07 MB
4. feature_101.tif - 0.08 MB
5. feature_102.tif - 0.05 MB
6. feature_103.tif - 0.10 MB
7. feature_104.tif - 0.11 MB
8. feature_105.tif - 0.07 MB
9. feature_106.tif - 0.07 MB
10. feature_107.tif - 0.07 MB
11. feature_108.tif - 0.08 MB
12. feature_109.tif - 0.08 MB
13. feature_11.tif - 0.07 MB
14. feature_110.tif - 0.06 MB
15. feature_111.tif - 0.08 MB
16. feature_112.tif - 0.06 MB
17. feature_113.tif - 0.08 MB
18. feature_114.tif - 0.06 MB
19. feature_115.tif - 0.07 MB
20. feature_116.tif - 0.08 MB
21. feature_117.tif - 0.07 MB
22. feature_118.tif - 0.07 MB
23. feature_119.tif - 0.09 MB
24. feature_12.tif - 0.06 MB
25. feature_120.tif - 0.08 MB
26. feature_121.tif - 0.07 MB
27. feature_122.tif - 0.08 MB
28. feature_123.tif - 0.09 MB
29. feature_124.tif - 0.06 MB
30. feature_125.tif - 0.08 MB
31. feature_126.tif - 0.08 MB
32. feature_127.tif - 0.06 MB
33. feature_128.tif 

GeoTIFF to Cloud-Optimized GeoTIFF Conversion Function
Here's a function that efficiently converts regular GeoTIFFs to Cloud-Optimized GeoTIFFs (COGs) with parallel processing capabilities:

In [None]:
import os
import concurrent.futures
import rasterio
from pathlib import Path
import logging

def convert_to_cogs(tif_files, output_dir=None, max_workers=None, overwrite=False):
    """
    Convert GeoTIFF files to tiled, DEFLATE-compressed GeoTIFFs with overviews.

    Each input ``name.tif`` is written as ``cog_name.tif``. Files are read
    fully into memory, rewritten with 512x512 tiles, and given power-of-two
    overviews (nearest-neighbour) while the reduced size stays >= 256 px.

    Args:
        tif_files: A single path (str / path-like) or any iterable of paths
            (list, tuple, generator, ...) to GeoTIFFs to convert
        output_dir (str, optional): Output directory. If None, uses the same directory as input files
        max_workers (int, optional): Maximum number of parallel workers. Default is None (sequential)
        overwrite (bool, optional): Whether to overwrite existing COG files. Default is False

    Returns:
        list: Paths (str) of the output files, including ones skipped because
        they already existed. Files that failed to convert are logged and
        left out. With ``max_workers > 1`` the order is completion order.
    """
    # Configure logging (no-op if the root logger is already configured)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logger = logging.getLogger('cog_converter')

    # Normalise the input to a list. A lone path is wrapped; any other
    # iterable is materialised so tuples/generators are not mistaken for one path.
    if isinstance(tif_files, (str, bytes, os.PathLike)):
        tif_files = [tif_files]
    elif not isinstance(tif_files, list):
        try:
            tif_files = list(tif_files)
        except TypeError:
            # Not iterable: keep the historical behaviour of treating it as a
            # single (invalid) entry, which is then logged as a failure.
            tif_files = [tif_files]

    # Create output directory if specified and doesn't exist
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def process_file(tif_path):
        """Convert one file; return the output path as str, or None on failure."""
        partial_output = None  # set while an output file may be half-written
        try:
            input_path = Path(tif_path)
            target_dir = Path(output_dir) if output_dir else input_path.parent
            output_path = target_dir / f"cog_{input_path.name}"

            # Skip if output file exists and overwrite is False
            if output_path.exists() and not overwrite:
                logger.info(f"Skipping {input_path.name} (output file already exists)")
                return str(output_path)

            # Imported here so the skip path above does not need it and an
            # import failure is reported like any other per-file error.
            from rasterio.enums import Resampling

            # Fast-to-write settings (prioritise speed over size)
            cog_profile = {
                'driver': 'GTiff',
                'compress': 'DEFLATE',
                'zlevel': 1,              # GTiff's DEFLATE level option; 1 = fastest
                'predictor': 2,
                'tiled': True,
                'blockxsize': 512,
                'blockysize': 512,
                'BIGTIFF': 'IF_SAFER',
                'NUM_THREADS': 'ALL_CPUS'
            }

            with rasterio.open(str(input_path)) as src:
                # Start from the source profile, then apply the settings above
                profile = src.profile.copy()
                profile.update(cog_profile)

                # Handle photometric interpretation for multiband data
                if src.count > 1:
                    profile['photometric'] = 'RGB' if src.count >= 3 else 'MINISBLACK'

                data = src.read()

                # Overview factors 2, 4, 8, ... while the longest side of the
                # reduced image is still at least 256 px
                max_dimension = max(src.width, src.height)
                overview_levels = []
                level = 2
                while max_dimension // level >= 256:
                    overview_levels.append(level)
                    level *= 2

                partial_output = output_path
                with rasterio.open(str(output_path), 'w', **profile) as dst:
                    dst.write(data)

                    if overview_levels:
                        dst.build_overviews(overview_levels, Resampling.nearest)

                    # Set overview-related metadata
                    dst.update_tags(ns='rio_overview', resampling='nearest')

            partial_output = None  # written completely
            logger.info(f"Successfully created COG: {output_path}")
            return str(output_path)

        except Exception as e:
            logger.error(f"Error converting {tif_path}: {str(e)}")
            # Remove a half-written output so a later run with overwrite=False
            # does not skip it as "already exists".
            if partial_output is not None:
                try:
                    partial_output.unlink()
                except FileNotFoundError:
                    pass  # nothing was created
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove partial output {partial_output}: {cleanup_error}")
            return None

    # Process files (in parallel if max_workers > 1)
    output_files = []
    total = len(tif_files)

    if max_workers and max_workers > 1:
        logger.info(f"Processing {total} files using {max_workers} workers")
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_path = {executor.submit(process_file, tif_file): tif_file for tif_file in tif_files}

            completed = 0
            for future in concurrent.futures.as_completed(future_to_path):
                tif_path = future_to_path[future]
                try:
                    result = future.result()
                    if result:
                        output_files.append(result)
                except Exception as exc:
                    logger.error(f"{tif_path} generated an exception: {exc}")

                # Simple progress tracking
                completed += 1
                if completed % 10 == 0 or completed == total:
                    logger.info(f"Completed {completed}/{total} files")
    else:
        logger.info(f"Processing {total} files sequentially")
        for i, tif_file in enumerate(tif_files):
            result = process_file(tif_file)
            if result:
                output_files.append(result)

            # Log progress every 10% or for the first/last item
            if (i+1) % max(1, total//10) == 0 or i == 0 or i+1 == total:
                logger.info(f"Progress: {i+1}/{total} files ({(i+1)/total:.1%})")

    logger.info(f"Converted {len(output_files)}/{total} files successfully")
    return output_files

In [None]:
# Example 1: Convert a list of files with default settings
# tif_files = list(Path(folder_path).glob('*.tif'))

# cog_files = convert_to_cogs(tif_files)

# Example 2: Convert with parallel processing
# cog_files = convert_to_cogs(tif_files, max_workers=25)

# # Example 3: Output to a different directory
# cog_files = convert_to_cogs(tif_files, output_dir="path/to/cog_output")

# # Example 4: Single file conversion
# cog_file = convert_to_cogs("path/to/single.tif")[0]

imports

In [None]:
import os
import re
import pandas as pd
import geopandas as gpd
from pathlib import Path
from datetime import datetime
import logging
from exactextract import exact_extract
import concurrent.futures
import threading
import time
import glob

Try avoiding GDAL by using rio_vrt



VRT file created at: C:\Users\Arnell\Downloads\whisp_outputs_test\combined_rasters.vrt


In [None]:
import geopandas as gpd
import pandas as pd
from exactextract import exact_extract
import concurrent.futures
import time

def exact_extract_in_chunks_parallel(rasters, vector_file, chunk_size=25, ops=['sum'], max_workers=4):
    """
    Run exactextract over a vector file in feature chunks, using a thread pool.

    The features are split into consecutive chunks of ``chunk_size`` rows and
    each chunk is passed to ``exact_extract`` in a worker thread. Chunk
    results are stitched together in chunk order, so the rows of the returned
    frame follow the feature order of the vector file regardless of which
    chunk finishes first. A chunk that fails is reported and left out, so in
    that case the result has fewer rows than the input.

    Args:
        rasters: List of raster files or single raster path
        vector_file: Path to vector file (GeoJSON, shapefile, etc.)
        chunk_size: Number of features to process in each chunk (must be >= 1)
        ops: List of operations to perform (passed through unchanged)
        max_workers: Maximum number of parallel processes/threads to use

    Returns:
        pd.DataFrame: Combined results (empty if nothing was processed)

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    start_time = time.time()

    # Read the vector file
    print(f"Reading vector file: {vector_file}")
    gdf = gpd.read_file(vector_file)
    total_features = len(gdf)
    print(f"Total features to process: {total_features}")

    # Calculate number of chunks
    num_chunks = (total_features + chunk_size - 1) // chunk_size  # Ceiling division
    print(f"Processing in {num_chunks} chunks of up to {chunk_size} features each")
    print(f"Using {max_workers} parallel workers")

    def process_chunk(chunk_idx):
        """Run exact_extract on one chunk; return its DataFrame or None on error."""
        start_idx = chunk_idx * chunk_size
        end_idx = min(start_idx + chunk_size, total_features)

        print(f"Starting chunk {chunk_idx+1}/{num_chunks} (features {start_idx+1}-{end_idx})")
        chunk_start_time = time.time()

        chunk_gdf = gdf.iloc[start_idx:end_idx].copy()

        try:
            chunk_results = exact_extract(
                progress=False,  # Disable progress bar for parallel processing to avoid mixed output
                rast=rasters,
                vec=chunk_gdf,
                # strategy="feature-sequential",
                ops=ops,
                output='pandas'
            )

            chunk_time = time.time() - chunk_start_time
            print(f"Completed chunk {chunk_idx+1}/{num_chunks} in {chunk_time:.2f}s")
            return chunk_results

        except Exception as e:
            print(f"Error processing chunk {chunk_idx+1}/{num_chunks}: {str(e)}")
            return None

    # Results keyed by chunk index so they can be re-assembled in input order
    results_by_chunk = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_chunk = {executor.submit(process_chunk, chunk_idx): chunk_idx
                           for chunk_idx in range(num_chunks)}

        for future in concurrent.futures.as_completed(future_to_chunk):
            chunk_idx = future_to_chunk[future]
            try:
                result = future.result()
            except Exception as e:
                print(f"Exception in chunk {chunk_idx+1}: {str(e)}")
                continue

            if result is not None:
                results_by_chunk[chunk_idx] = result
                print(f"Chunk {chunk_idx+1} integrated into results")

    # One concat in chunk order: keeps rows aligned with the input features
    # and avoids re-copying the accumulated frame for every chunk.
    ordered = [results_by_chunk[idx] for idx in sorted(results_by_chunk)]
    all_results = pd.concat(ordered, ignore_index=True) if ordered else pd.DataFrame()

    total_time = time.time() - start_time
    processed_count = len(all_results)

    print(f"Processing complete. Processed {processed_count}/{total_features} features in {total_time:.2f}s")

    return all_results

In [None]:
# ops = ['sum']
# # tiff_path = r'C:\Users\Arnell\Downloads\whisp_image_clip_v0.tif'

# # folder_path = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001_2' #goetiffs (16bit)
# list_of_tiffs = glob.glob(folder_path + '/*.tif') 
# list_of_tiffs

['C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_1.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_10.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_100.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_101.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_102.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_103.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_104.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_105.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_106.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_107.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_108.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_109.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_11.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_110.tif',
 'C:/Users/Arnell/Downloads/whisp_outputs_test\\feature_111.tif',
 'C:/Users/Arn

In [None]:
# stats = exact_extract(
#     progress=True,
#     # rast=str(tiff_path),
#     # rast = list_of_tiffs,
#     rast = folder_path+'/combined_rasters.vrt',# slow with normal tiffs 
#     # vec=GEOJSON_EXAMPLE_FILEPATH,
#     vec=folder_path+'/random_polygons.geojson',
#     # strategy="raster-sequential",
#     strategy="feature-sequential" ,
#     ops=ops,
#     output='pandas',
#     # include_cols=[id_column]
# )




[1.0%] .




[2.0%] .




[3.0%] .




[4.0%] .




[5.0%] .




[6.0%] .




[7.0%] .




[8.0%] .




[9.0%] .




[10.0%] .




[11.0%] .




[12.0%] .




[13.0%] .




[14.0%] .




[15.0%] .




[16.0%] .




[17.0%] .




[18.0%] .




[19.0%] .




[20.0%] .




[21.0%] .




[22.0%] .




[23.0%] .




[24.0%] .




[25.0%] .




[26.0%] .




[27.0%] .




[28.0%] .




[29.0%] .




[30.0%] .




[31.0%] .




[32.0%] .




[33.0%] .




[34.0%] .




[35.0%] .




[36.0%] .




[37.0%] .




[38.0%] .




[39.0%] .




[40.0%] .




[41.0%] .




[42.0%] .




[43.0%] .




[44.0%] .




[45.0%] .




[46.0%] .




[47.0%] .




[48.0%] .




[49.0%] .




[50.0%] .




[51.0%] .




[52.0%] .




[53.0%] .




[54.0%] .




[55.0%] .




[56.0%] .




[57.0%] .




[58.0%] .




[59.0%] .




[60.0%] .




[61.0%] .




[62.0%] .




[63.0%] .




[64.0%] .




[65.0%] .




[66.0%] .




[67.0%] .




[68.0%] .




[69.0%] .




[70.0%] .




[71.0%] .




[72.0%] .




[73.0%] .




[74.0%] .




[75.0%] .




[76.0%] .




[77.0%] .




[78.0%] .




[79.0%] .




[80.0%] .




[81.0%] .




[82.0%] .




[83.0%] .




[84.0%] .




[85.0%] .




[86.0%] .




[87.0%] .




[88.0%] .




[89.0%] .




[90.0%] .




[91.0%] .




[92.0%] .




[93.0%] .




[94.0%] .




[95.0%] .




[96.0%] .




[97.0%] .




[98.0%] .




[99.0%] .




[100.0%] .


In [None]:
# Display `stats` as the cell output.
# NOTE(review): presumably the DataFrame returned by the exact_extract call in
# the cell above, which is currently commented out -- confirm `stats` is
# defined before running this cell.
stats

Unnamed: 0,band_1_sum,band_2_sum,band_3_sum,band_4_sum,band_5_sum,band_6_sum,band_7_sum,band_8_sum,band_9_sum,band_10_sum,...,band_158_sum,band_159_sum,band_160_sum,band_161_sum,band_162_sum,band_163_sum,band_164_sum,band_165_sum,band_166_sum,band_167_sum
0,198071.723639,0.0,198071.723639,0.000000,0.0,0.0,0.0,4600.645974,198071.723639,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,247469.705494,0.0,247469.705494,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,13774.589131,7696.565175,9067.515696,0.000000,4497.722526,0.000000,888.477123,888.476738,66967.660342
2,168847.500575,0.0,168847.500575,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,6145.792113,6809.445426,1830.564402,0.000000,7093.940114,5242.930819,0.000000,1779.361908,0.000000,8006.458353
3,112732.699477,0.0,112732.699477,0.000000,0.0,0.0,0.0,1629.426180,0.000000,0.0,...,200.187773,890.209946,0.000000,2670.632950,0.000000,5341.270454,0.000000,0.000000,3560.846329,5205.545590
4,183149.324650,0.0,180485.576580,2663.748070,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,827.639816,20829.728709,4347.510675,4593.706558,1775.824516,8545.411985,8118.639575,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,275941.944746,0.0,271497.521673,888.883736,0.0,0.0,0.0,0.000000,0.000000,0.0,...,2666.652969,1447.718328,1777.767723,15440.904624,1783.246634,862.522262,0.000000,3629.362196,5333.311249,3882.485773
96,197364.401033,0.0,197364.401033,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,5763.822023,33852.283043,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
97,166332.723751,0.0,166332.723751,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,1775.815819,0.000000,0.000000,18606.837896,3243.019288,0.000000,0.000000,0.000000
98,191162.604907,0.0,191162.604907,0.000000,0.0,0.0,0.0,0.000000,73039.029963,0.0,...,0.000000,86820.182709,0.000000,2707.816212,0.000000,812.455670,0.000000,0.000000,0.000000,19075.644447


chunked stats

In [None]:
import geopandas as gpd
import pandas as pd
from exactextract import exact_extract

# Function to process in chunks
def exact_extract_in_chunks(rasters, vector_file, chunk_size=25, ops=['sum']):
    """
    Run exactextract over a vector file in fixed-size chunks of features.

    Args:
        rasters: List of raster files or single raster path
        vector_file: Path to vector file (GeoJSON, shapefile, etc.)
        chunk_size: Number of features to process in each chunk (must be >= 1)
        ops: List of operations to perform (passed through unchanged)

    Returns:
        pd.DataFrame: Results of every chunk that succeeded, concatenated in
        chunk order with a fresh 0..n-1 index. A chunk that raises is reported
        on stdout and skipped, so the result may hold fewer rows than the
        vector file has features. An empty DataFrame is returned when no
        chunk produced results.

    Raises:
        ValueError: If chunk_size is smaller than 1.
    """
    # Fail fast with a clear message before any file is read; a zero step
    # would otherwise surface as a cryptic range() error and a negative one
    # would silently produce an empty result.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Read the vector file
    print(f"Reading vector file: {vector_file}")
    gdf = gpd.read_file(vector_file)
    total_features = len(gdf)
    print(f"Total features to process: {total_features}")

    # Per-chunk frames are collected and concatenated once at the end;
    # concatenating inside the loop re-copies all previous rows every time.
    chunk_frames = []

    # Process in chunks
    for start in range(0, total_features, chunk_size):
        end_idx = min(start + chunk_size, total_features)
        print(f"Processing features {start+1} to {end_idx} of {total_features}")

        # Extract the chunk
        chunk_gdf = gdf.iloc[start:end_idx].copy()

        try:
            chunk_results = exact_extract(
                progress=True,
                rast=rasters,
                vec=chunk_gdf,
                strategy="feature-sequential",
                ops=ops,
                output='pandas'
            )
        except Exception as e:
            # Best effort: report the failing chunk and keep going.
            print(f"Error processing chunk {start//chunk_size + 1}: {str(e)}")
            continue

        chunk_frames.append(chunk_results)
        print(f"Chunk complete. Processed {end_idx}/{total_features} features so far.")

    if not chunk_frames:
        return pd.DataFrame()

    return pd.concat(chunk_frames, ignore_index=True)

In [None]:
# # Using your example:
# chunk_size = 25  # Process 25 features at a time
# stats = exact_extract_in_chunks(
#     rasters=list_of_tiffs,
#     vector_file=folder_path+'/random_polygons.geojson',
#     chunk_size=chunk_size,
#     ops=['sum']
# )

# # Save the results
# stats.to_csv(folder_path+'/combined_rasters_chunked.csv', index=False)
# print("Processing complete.")

NameError: name 'list_of_tiffs' is not defined

In [None]:
# def exact_extract_in_chunks_parallel(rasters, vector_file, chunk_size=25, ops=['sum'], max_workers=4, id_column='internal_id'):
#     """
#     Process exactextract in parallel chunks, with optimized raster selection per chunk
#     """
#     start_time = time.time()
    
#     # Read the vector file
#     print(f"Reading vector file: {vector_file}")
#     gdf = gpd.read_file(vector_file)
#     total_features = len(gdf)
    
#     # Ensure id_column exists
#     if id_column not in gdf.columns:
#         print(f"Warning: ID column '{id_column}' not found in GeoJSON. Adding sequential IDs.")
#         gdf[id_column] = range(1, len(gdf) + 1)
    
#     # Create feature ID to index mapping
#     feature_dict = {row[id_column]: idx for idx, row in gdf.iterrows()}
    
#     # If rasters is a directory, create mapping of feature IDs to raster files
#     if isinstance(rasters, str) and os.path.isdir(rasters):
#         raster_dir = rasters
#         id_pattern = re.compile(r'feature_(\d+)\.tif')
#         raster_files = {}
        
#         for file in os.listdir(raster_dir):
#             if file.endswith(('.tif', '.tiff')):
#                 match = id_pattern.search(file)
#                 if match:
#                     tiff_id = int(match.group(1))
#                     if tiff_id in feature_dict:
#                         raster_files[tiff_id] = os.path.join(raster_dir, file)
        
#         print(f"Found {len(raster_files)} matching raster files")
        
#         # Create chunks by feature IDs rather than by indices
#         feature_ids = list(set(feature_dict.keys()) & set(raster_files.keys()))
#     else:
#         # For single raster or list, use all feature IDs
#         feature_ids = list(feature_dict.keys())
#         raster_files = rasters  # May be a single file or list
    
#     # Create chunks of feature IDs
#     chunk_ids = [feature_ids[i:i+chunk_size] for i in range(0, len(feature_ids), chunk_size)]
#     num_chunks = len(chunk_ids)
#     print(f"Processing {len(feature_ids)} features in {num_chunks} chunks")
    
#     # Function to process a single chunk of feature IDs
#     def process_chunk(chunk_idx):
#         ids_in_chunk = chunk_ids[chunk_idx]
#         print(f"Starting chunk {chunk_idx+1}/{num_chunks} with {len(ids_in_chunk)} features")
#         chunk_start_time = time.time()
#         chunk_results = []
        
#         for feature_id in ids_in_chunk:
#             feature_idx = feature_dict[feature_id]
#             feature = gdf.iloc[[feature_idx]]
            
#             # Get the matching raster for this feature ID
#             if isinstance(raster_files, dict):
#                 if feature_id not in raster_files:
#                     print(f"  No raster found for feature ID {feature_id}")
#                     continue
#                 raster_path = raster_files[feature_id]
#             else:
#                 # Use the same raster(s) for all features
#                 raster_path = raster_files
            
#             try:
#                 result = exact_extract(
#                     progress=False,
#                     rast=raster_path,
#                     vec=feature,
#                     strategy="feature-sequential",
#                     ops=ops,
#                     output='pandas',
#                     include_cols=[id_column]
#                 )
                
#                 if result is not None and not result.empty:
#                     chunk_results.append(result)
#                     print(f"  Processed feature ID {feature_id}")
#             except Exception as e:
#                 print(f"  Error processing feature ID {feature_id}: {str(e)}")
        
#         chunk_time = time.time() - chunk_start_time
#         print(f"Completed chunk {chunk_idx+1}/{num_chunks} in {chunk_time:.2f}s")
#         return pd.concat(chunk_results, ignore_index=True) if chunk_results else pd.DataFrame()

#     # Process chunks in parallel
#     all_results = pd.DataFrame()
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
#         future_to_chunk = {executor.submit(process_chunk, i): i for i in range(num_chunks)}
        
#         for future in concurrent.futures.as_completed(future_to_chunk):
#             chunk_idx = future_to_chunk[future]
#             try:
#                 result = future.result()
#                 if not result.empty:
#                     if all_results.empty:
#                         all_results = result
#                     else:
#                         all_results = pd.concat([all_results, result], ignore_index=True)
#             except Exception as e:
#                 print(f"Exception in chunk {chunk_idx+1}: {str(e)}")
    
#     total_time = time.time() - start_time
#     print(f"Processing complete in {total_time:.2f}s")
#     return all_results

In [None]:
# # Example with a directory of TIFFs:
# stats = exact_extract_in_chunks_parallel(
#     rasters=folder_path,  # Directory containing feature_1.tif, feature_2.tif, etc.
#     vector_file=folder_path+'/random_polygons.geojson',
#     chunk_size=25,
#     ops=['sum'],
#     max_workers=4,
#     id_column='internal_id'  # Column in GeoJSON that matches the numbers in TIFF filenames
# )

# # Save the results
# stats.to_csv(folder_path+'/matched_features_results.csv', index=False)

Reading vector file: C:/Users/Arnell/Downloads/whisp_outputs_test/random_polygons.geojson


NameError: name 're' is not defined

In [None]:
# Preview `stats`, which is defined in an earlier cell
# (presumably the exactextract result DataFrame — confirm before relying on it).
print(stats)


    cog_feature_1_band_1_sum  cog_feature_1_band_2_sum  \
0                        0.0                       0.0   
1                        0.0                       0.0   
2                        0.0                       0.0   
3                        0.0                       0.0   
4                        0.0                       0.0   
..                       ...                       ...   
95                       0.0                       0.0   
96                       0.0                       0.0   
97                       0.0                       0.0   
98                       0.0                       0.0   
99                       0.0                       0.0   

    cog_feature_1_band_3_sum  cog_feature_1_band_4_sum  \
0                        0.0                       0.0   
1                        0.0                       0.0   
2                        0.0                       0.0   
3                        0.0                       0.0   
4            

In [None]:
# Save `stats` as combined_rasters.csv under `folder_path`, without the index column
# (assumes `stats` is a pandas DataFrame).
# NOTE(review): `stats` and `folder_path` come from earlier cells — confirm they are in scope.
stats.to_csv(folder_path+'/combined_rasters.csv', index=False)


Storing the AI answer to "Can I run exactextract on a cloud bucket?":
 Use GDAL's virtual file system (advanced)
For Cloud-Optimized GeoTIFFs, you can use GDAL's virtual file system with the /vsigs/ prefix:

In [None]:
# import os
# import gdal
# from exactextract import exact_extract
# import geopandas as gpd

# # Set GCS credentials environment variable
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/to/your/credentials.json'

# # GCS path using GDAL's virtual file system
# gcs_path = "/vsigs/your-bucket-name/path/to/file.tif"

# vector_file = "your_vector_data.geojson"
# gdf = gpd.read_file(vector_file)

# # Try with exactextract
# try:
#     stats = exact_extract(
#         rast=gcs_path, 
#         vec=gdf,
#         ops=["mean"],
#         output='pandas'
#     )
#     print(stats)
# except Exception as e:
#     print(f"Direct access failed: {e}")
#     print("You may need to download the file first.")

In [None]:
# Basic usage with default settings
# results_df, csv_path = geotiff_stats_by_feature_id(
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH
# )

# # Custom directory and operations 
# results_df, csv_path = geotiff_stats_by_feature_id(
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH,
#     tiff_dir=Path.home() / 'my_geotiffs',
#     ops=['sum', 'mean', 'count'],
#     max_features=5
# )

# # Custom ID pattern for different filename format (e.g., 'parcel_123_ndvi.tif')
# results_df, csv_path = geotiff_stats_by_feature_id(
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH,
#     tiff_id_pattern=r'parcel_(\d+)_ndvi\.tif',
#     id_column='parcel_id'
# )

# # Specify output CSV location
# results_df, csv_path = geotiff_stats_by_feature_id(
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH,
#     output_csv=Path.home() / 'analysis' / 'whisp_results.csv'
# )

Stats for fc - parallel processing in batches (client side) 

In [None]:
def geotiff_stats_by_feature_id_threaded(
    geojson_path, 
    tiff_dir=None, 
    output_csv=None, 
    tiff_id_pattern=r'feature_(\d+)\.tif', 
    id_column='internal_id',
    ops=['sum'],
    max_features=None,
    max_workers=4,
    batch_size=5  # Process files in batches for better performance
):
    """
    Compute exactextract statistics for GeoTIFF files whose filename ID matches
    a feature ID in a GeoJSON, optionally spreading batches over a thread pool.

    Each matching GeoTIFF is paired with the single feature that carries the
    same ID. Results are appended to a CSV as batches finish and are also
    returned in memory. A file that fails is logged and skipped.

    Args:
        geojson_path (str or Path): Path to the GeoJSON file with features
        tiff_dir (str or Path): Directory containing GeoTIFF files (default: ~/Downloads/whisp_features)
        output_csv (str or Path): Path to save the output CSV
            (default: feature_stats_<timestamp>.csv inside tiff_dir)
        tiff_id_pattern (str): Regex pattern whose first group captures the
            integer ID in a GeoTIFF filename
        id_column (str): Column name in GeoJSON containing feature IDs; when
            missing, sequential IDs starting at 1 are added
        ops (list): List of operations to perform with exactextract
        max_features (int): Maximum number of features to process
        max_workers (int): Maximum number of concurrent workers (default: 4);
            values of 1 or less process the batches sequentially
        batch_size (int): Number of files to process in each batch (must be >= 1)

    Returns:
        tuple: (results, csv_path)
            results: gpd.GeoDataFrame with the statistics and a 'geometry'
                column; a plain pd.DataFrame with the same rows if the
                GeoDataFrame cannot be built; an empty pd.DataFrame when
                nothing was processed.
            csv_path (str): Path of the output CSV. The file is only created
                once at least one result has been written.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    # Function-scope import so the cell does not depend on an earlier cell
    # having imported it.
    import concurrent.futures

    # Validate up front: a zero step would raise a cryptic range() error and a
    # negative one would silently create no batches.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    start_time = time.time()
    logger = logging.getLogger('whisp_processor')

    # Set default directory if not specified
    if tiff_dir is None:
        tiff_dir = Path.home() / 'Downloads' / 'whisp_features'
    else:
        tiff_dir = Path(tiff_dir)

    # Load the GeoJSON
    logger.info(f"Loading GeoJSON from {geojson_path}")
    gdf = gpd.read_file(geojson_path)

    # Ensure ID column exists
    if id_column not in gdf.columns:
        logger.warning(f"ID column '{id_column}' not found in GeoJSON. Adding sequential IDs.")
        gdf[id_column] = range(1, len(gdf) + 1)

    # Apply max_features if specified
    if max_features and max_features < len(gdf):
        logger.info(f"Limiting to first {max_features} features")
        gdf = gdf.iloc[:max_features]

    # Map each feature ID to its row *position*. The lookup below uses .iloc,
    # so index labels would select the wrong row whenever the index is not
    # the default 0..n-1 range.
    feature_dict = {
        feature_id: position
        for position, feature_id in enumerate(gdf[id_column])
    }

    # Find matching GeoTIFF files, remembering the parsed ID with each name so
    # workers do not have to re-parse it. Sorted for a reproducible batch layout.
    id_pattern = re.compile(tiff_id_pattern)
    tiff_files = []
    for file in sorted(os.listdir(tiff_dir)):
        if not file.endswith(('.tif', '.tiff')):
            continue
        match = id_pattern.search(file)
        if not match:
            continue
        try:
            tiff_id = int(match.group(1))
        except (IndexError, TypeError, ValueError):
            # A custom pattern may lack a group or capture non-digits; skip
            # the file instead of aborting the whole scan.
            logger.debug(f"Could not extract ID from filename: {file}")
            continue
        if tiff_id in feature_dict:
            tiff_files.append((tiff_id, file))

    logger.info(f"Found {len(tiff_files)} matching GeoTIFF files in {tiff_dir}")

    # Set up output CSV
    if output_csv is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_csv = Path(tiff_dir) / f"feature_stats_{timestamp}.csv"
    else:
        output_csv = Path(output_csv)

    # Create batches of files for processing
    batches = [
        tiff_files[i:i + batch_size]
        for i in range(0, len(tiff_files), batch_size)
    ]
    logger.info(f"Created {len(batches)} batches with up to {batch_size} files each")

    def process_batch(batch_files):
        """Run exactextract for every (id, filename) pair; return the per-file frames."""
        batch_results = []

        for tiff_id, tiff_file in batch_files:
            try:
                # One-row frame holding the feature that matches this GeoTIFF
                feature = gdf.iloc[[feature_dict[tiff_id]]]
                tiff_path = tiff_dir / tiff_file

                logger.info(f"Processing feature ID: {tiff_id}")
                stats = exact_extract(
                    rast=str(tiff_path),
                    vec=feature,
                    ops=ops,
                    output='pandas',
                    include_cols=[id_column]
                )

                # Add the geometry column to the results
                stats['geometry'] = feature.iloc[0].geometry

                batch_results.append(stats)
                logger.info(f"Feature {tiff_id} processed successfully")

            except Exception as e:
                # Best effort: one bad file must not sink the whole batch.
                logger.error(f"Error processing file {tiff_file}: {str(e)}")

        return batch_results

    all_results = []
    csv_created = False

    def record_results(batch_results):
        """Append a finished batch to the CSV and to the in-memory results.

        Only ever called from the thread running this function (worker threads
        just return their frames), so no lock is required.
        """
        nonlocal csv_created
        for result in batch_results:
            if csv_created:
                result.to_csv(output_csv, mode='a', header=False, index=False)
            else:
                # First write creates the file and emits the header row.
                result.to_csv(output_csv, index=False)
                csv_created = True
            all_results.append(result)

    # Process batches (in parallel if max_workers > 1)
    if max_workers > 1:
        logger.info(f"Processing batches in parallel with {max_workers} threads")
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_batch = {
                executor.submit(process_batch, batch): i
                for i, batch in enumerate(batches)
            }

            # Results are recorded in completion order, not submission order.
            for future in concurrent.futures.as_completed(future_to_batch):
                batch_idx = future_to_batch[future]
                try:
                    record_results(future.result())
                    logger.info(f"Completed batch {batch_idx+1}/{len(batches)}")
                except Exception as e:
                    logger.error(f"Exception in batch {batch_idx+1}: {str(e)}")
    else:
        logger.info("Processing batches sequentially")
        for i, batch in enumerate(batches):
            record_results(process_batch(batch))
            logger.info(f"Completed batch {i+1}/{len(batches)}")

    if not all_results:
        logger.warning("No results generated")
        logger.info(f"Total processing time: {time.time() - start_time:.2f}s")
        return pd.DataFrame(), str(output_csv)

    # Combine all results
    all_results_df = pd.concat(all_results, ignore_index=True)

    # Convert to GeoDataFrame; if that fails, hand back the plain DataFrame
    # rather than discarding statistics that were already computed.
    try:
        results = gpd.GeoDataFrame(all_results_df, geometry='geometry')
        if gdf.crs:
            results = results.set_crs(gdf.crs)
    except Exception as e:
        logger.error(f"Error creating GeoDataFrame from results: {str(e)}")
        results = all_results_df

    logger.info(f"Total processing time: {time.time() - start_time:.2f}s")
    logger.info(f"Results saved to {output_csv}")
    return results, str(output_csv)

In [None]:
# Run the thread-based per-feature statistics on the example GeoJSON.
# tiff_dir and output_csv are not passed, so the function's defaults apply.
results_df, csv_path = geotiff_stats_by_feature_id_threaded(
    geojson_path=GEOJSON_EXAMPLE_FILEPATH,
    max_workers=10,  # Adjust based on your machine's capabilities
    batch_size=10    # Files per batch (the function default is 5)
)

2025-04-28 19:44:23,468 - INFO - Loading GeoJSON from C:/Users/Arnell/Downloads/whisp_outputs_test/random_polygons.geojson
2025-04-28 19:44:23,485 - INFO - Found 100 matching GeoTIFF files in C:\Users\Arnell\Downloads\whisp_features


NameError: name 'threading' is not defined

In [None]:
import gc
# import os
from exactextract import exact_extract

def safely_extract_stats(tiff_path, feature, ops=['sum'], id_column=None):
    """
    Run exactextract for one raster/feature pair without letting errors escape.

    Any exception raised by the extraction is printed and swallowed. After the
    call, whether it succeeded or not, a garbage-collection pass is forced as
    a best-effort way to drop lingering references to the raster.

    Args:
        tiff_path: Raster passed to exactextract as ``rast``
        feature: Vector input passed to exactextract as ``vec``
        ops: List of operations to perform
        id_column: Optional column to carry over into the result; when falsy,
            no extra columns are requested

    Returns:
        The exactextract result (requested with ``output='pandas'``), or None
        if the extraction raised.
    """
    # Function-scope imports: the cell that defines this function only
    # imports gc at module level.
    import sys
    import time

    result = None
    try:
        result = exact_extract(
            rast=tiff_path,
            vec=feature,
            ops=ops,
            output='pandas',
            include_cols=[id_column] if id_column else None
        )

    except Exception as e:
        print(f"Error processing {tiff_path}: {str(e)}")

    finally:
        # Explicit cleanup to help release the file
        gc.collect()

        # Only Windows gets the brief pause, which sometimes helps release
        # file locks. It is no longer wrapped in a bare except, so Ctrl-C
        # during the pause is not swallowed.
        if sys.platform.startswith("win"):
            time.sleep(0.1)

    return result

Chain with exact extract

In [None]:
def download_and_extract_stats_for_collection(
    feature_collection, 
    image, 
    geojson_path=None,
    output_dir=None,
    output_csv=None,
    scale=10, 
    max_features=None, 
    max_workers=None,
    max_retries=3, 
    retry_delay=3,
    ops=['sum'],
    id_column='internal_id',
    keep_geotiffs=True
):
    """
    Combined function that downloads GeoTIFFs for features in a collection and
    immediately runs exactextract to calculate statistics with proper band names.
    
    Args:
        feature_collection: Earth Engine FeatureCollection to process
        image: Earth Engine image to clip and download
        geojson_path: Path to matching GeoJSON file (optional, for more precise polygon extraction)
        output_dir: Directory to save the GeoTIFFs (default: ~/Downloads/whisp_features)
        output_csv: Path to save the output CSV (default: uses timestamp)
        scale: Resolution in meters (default 10m)
        max_features: Maximum number of features to process (default: all)
        max_workers: Maximum number of parallel workers (default: None, sequential)
        max_retries: Maximum number of retry attempts for each download
        retry_delay: Base delay in seconds between retries
        ops: List of operations to perform with exactextract
        id_column: Column name in GeoJSON containing feature IDs
        keep_geotiffs: Whether to keep the downloaded GeoTIFF files (default: True)
        
    Returns:
        gdf: GeoDataFrame with extracted statistics
        csv_path: Path to the saved CSV file
    """
    import ee
    import os
    import time
    import logging
    import concurrent.futures
    import pandas as pd
    import geopandas as gpd
    import requests
    import rasterio
    from pathlib import Path
    from datetime import datetime
    import threading
    from exactextract import exact_extract
    
    # Set up logging
    logger = logging.getLogger('whisp_processor')
    
    # Get band names from the Earth Engine image
    try:
        # Get band information from the Earth Engine image
        band_names = image.bandNames().getInfo()
        logger.info(f"Retrieved band names from image: {band_names}")
    except Exception as e:
        logger.warning(f"Failed to get band names from image: {str(e)}")
        band_names = None
    
    # Set default output directory
    if output_dir is None:
        output_dir = Path.home() / 'Downloads' / 'whisp_features'
    
    # Create directory if it doesn't exist
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    
    # Set up output CSV
    if output_csv is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_csv = output_dir / f"feature_stats_{timestamp}.csv"
    else:
        output_csv = Path(output_csv)
    
    # Load GeoJSON if provided (for more accurate extraction)
    if geojson_path:
        logger.info(f"Loading GeoJSON from {geojson_path}")
        source_gdf = gpd.read_file(geojson_path)
        
        # Ensure ID column exists
        if id_column not in source_gdf.columns:
            logger.warning(f"ID column '{id_column}' not found in GeoJSON. Adding sequential IDs.")
            source_gdf[id_column] = range(1, len(source_gdf) + 1)
            
        # Create feature lookup dictionary for faster access
        feature_dict = {}
        for idx, row in source_gdf.iterrows():
            feature_id = row[id_column]
            feature_dict[feature_id] = idx
    else:
        source_gdf = None
        feature_dict = None
    
    # Get collection size and limit if needed
    collection_size = feature_collection.size().getInfo()
    logger.info(f"Processing Earth Engine FeatureCollection with {collection_size} features")
    
    if max_features and max_features < collection_size:
        feature_collection = feature_collection.limit(max_features)
        collection_size = max_features
        logger.info(f"Limited to processing first {max_features} features")
    
    # Get features as a list
    features = feature_collection.toList(collection_size)
    
    # CSV writing synchronization
    csv_created = False
    csv_lock = threading.Lock()
    all_results = []
    
    
    def rename_band_columns(df, band_names, ops=['sum']):
        """
        Rename generic band index columns to meaningful band names in a DataFrame.
        
        Args:
            df (pd.DataFrame): DataFrame with stats columns to rename
            band_names (list): List of band names from Earth Engine image
            ops (list): List of operations (e.g., ['sum', 'mean', 'count'])
            
        Returns:
            pd.DataFrame: DataFrame with renamed columns
        """
        # Create a copy of the input DataFrame to avoid modifying the original
        renamed_df = df.copy()
        
        # Create a mapping from generic band names to actual band names
        column_mapping = {}
        for op in ops:
            for i, band_name in enumerate(band_names):
                # Check both possible formats
                format1 = f"{op}_{i+1}"      # e.g., sum_1 (standard exactextract format)
                format2 = f"band_{i+1}_{op}" # e.g., band_1_sum (alternative format)
                
                if format1 in renamed_df.columns:
                    column_mapping[format1] = band_name
                elif format2 in renamed_df.columns:
                    column_mapping[format2] = band_name
        
        # Apply the renaming and return
        return renamed_df.rename(columns=column_mapping)


    def download_and_process_feature(index):
        try:
            # Get the feature
            ee_feature = ee.Feature(features.get(index))
            
            # Get the feature ID
            try:
                internal_id = ee_feature.get(id_column).getInfo()
                logger.info(f"Processing feature {internal_id} ({index+1}/{collection_size})")
            except Exception:
                internal_id = f"unknown_{index}"
                logger.warning(f"Could not get ID for feature {index}, using {internal_id}")
            
            # Create a unique filename
            filename = f"feature_{internal_id}.tif"
            output_path = output_dir / filename
            
            # Skip download if file exists
            if output_path.exists():
                logger.info(f"File {filename} already exists, skipping download")
                # Always use process_downloaded_feature for consistency
                return process_downloaded_feature(ee_feature, str(output_path), internal_id)
            
            # Download the file
            retries = 0
            while retries < max_retries:
                try:
                    # Clip the image to the feature
                    clipped_image = image.clip(ee_feature.geometry())
                    
                    # Generate the download URL
                    logger.debug(f"Generating download URL for feature {internal_id}")
                    download_url = clipped_image.getDownloadURL({
                        'format': 'GeoTIFF',
                        'region': ee_feature.geometry(),
                        'scale': scale,
                        'crs': 'EPSG:4326'
                    })
                    
                    # Download the image with timeout
                    logger.info(f"Downloading to {output_path}")
                    response = requests.get(download_url, timeout=300)
                    
                    if response.status_code == 200:
                        # Check if the response is actually a GeoTIFF
                        content_type = response.headers.get('Content-Type', '')
                        if 'tiff' in content_type.lower() or 'zip' in content_type.lower():
                            with open(output_path, 'wb') as f:
                                f.write(response.content)
                            logger.info(f"Successfully downloaded {filename}")
                            
                            # Process the downloaded file
                            return process_downloaded_feature(ee_feature, str(output_path), internal_id)
                        else:
                            logger.error(f"Download returned non-TIFF content: {content_type}")
                            error_file = output_dir / f"error_{internal_id}.txt"
                            with open(error_file, 'wb') as f:
                                f.write(response.content[:2000])
                            retries += 1
                    else:
                        logger.error(f"Failed to download (status {response.status_code})")
                        retries += 1
                    
                    # Wait before retrying
                    if retries < max_retries:
                        sleep_time = retry_delay * (2 ** retries)
                        logger.info(f"Retrying in {sleep_time} seconds (attempt {retries+1}/{max_retries})")
                        time.sleep(sleep_time)
                
                except Exception as e:
                    logger.error(f"Error downloading feature {internal_id}: {str(e)}", exc_info=True)
                    retries += 1
                    if retries < max_retries:
                        logger.info(f"Retrying in {retry_delay} seconds (attempt {retries+1}/{max_retries})")
                        time.sleep(retry_delay)
            
            logger.error(f"Maximum retries reached for feature {internal_id}")
            return None
        
        except Exception as e:
            logger.error(f"Error processing feature at index {index}: {str(e)}", exc_info=True)
            return None
        
    
    # Helper function to process a downloaded GeoTIFF file
    def process_downloaded_feature(ee_feature, tiff_path, feature_id):
        """
        Compute zonal statistics for one downloaded GeoTIFF against its source polygon.

        Relies on names from the enclosing scope: ``band_names``, ``image``,
        ``source_gdf``, ``feature_dict``, ``ops``, ``id_column``,
        ``keep_geotiffs`` and ``logger``.

        Args:
            ee_feature: Feature the GeoTIFF was downloaded for. Not used by the
                current implementation; kept so existing callers keep working.
            tiff_path (str or Path): Location of the downloaded GeoTIFF.
            feature_id: Key used to look the polygon up in ``feature_dict``.

        Returns:
            The statistics table produced by ``exact_extract`` (requested with
            ``output='pandas'``) with a ``geometry`` column added, or None when
            the feature cannot be matched or any step fails.
        """
        try:
            # exact_extract and os.remove are given a plain string, not a Path object
            tiff_path_str = str(tiff_path)

            logger.info(f"Processing downloaded file: {tiff_path_str}")

            # Without a matching source polygon there is nothing to extract over.
            # Return explicitly instead of failing later on an unassigned
            # ``feature`` variable (no geometry fallback is implemented).
            if source_gdf is None or feature_id not in feature_dict:
                logger.warning(f"Feature ID {feature_id} not found in GeoJSON")
                return None
            feature = source_gdf.iloc[[feature_dict[feature_id]]]

            # Prefer band names already resolved by the enclosing function
            local_band_names = band_names

            if local_band_names is None:
                logger.info(f"Band names not available from image, trying to read from file: {tiff_path_str}")
                try:
                    with rasterio.open(tiff_path_str) as src:
                        if src.descriptions and all(src.descriptions):
                            # Every band carries a description: use them as names
                            local_band_names = list(src.descriptions)
                            logger.info(f"Using band descriptions from GeoTIFF: {local_band_names}")
                        else:
                            # Otherwise create generic names
                            local_band_names = [f"band_{i+1}" for i in range(src.count)]
                            logger.info(f"Using generic band names: {local_band_names}")
                except Exception as e:
                    logger.warning(f"Failed to read band names from GeoTIFF: {str(e)}")
                    # Last resort: ask Earth Engine for the image's band names
                    local_band_names = image.bandNames().getInfo()

            # Use exactextract with plain string path
            logger.debug(f"Running exactextract on {tiff_path_str}")
            stats = exact_extract(
                rast=tiff_path_str,
                vec=feature,
                ops=ops,
                output='pandas',
                include_cols=[id_column]
            )

            # Rename columns to use actual band names if available
            if local_band_names:
                stats = rename_band_columns(stats, local_band_names, ops=ops)
                logger.debug(f"Renamed columns using band names: {local_band_names}")

            # Add the geometry column
            stats['geometry'] = feature.iloc[0].geometry

            # Delete the GeoTIFF if not keeping; a failed delete is only a warning
            if not keep_geotiffs:
                try:
                    os.remove(tiff_path_str)
                    logger.debug(f"Deleted temporary file {tiff_path_str}")
                except Exception as e:
                    logger.warning(f"Failed to delete temporary file {tiff_path_str}: {str(e)}")

            return stats
        except Exception as e:
            logger.error(f"Error extracting stats for feature {feature_id}: {str(e)}")
            return None
    
    # Process features (parallel or sequential)
    if max_workers and max_workers > 1:
        logger.info(f"Processing features in parallel with {max_workers} workers")
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_index = {
                executor.submit(download_and_process_feature, i): i 
                for i in range(collection_size)
            }
            
            # Process results as they complete
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    stats = future.result()
                    if stats is not None:
                        # Write to CSV with proper synchronization
                        with csv_lock:
                            if not csv_created:
                                stats.to_csv(output_csv, index=False)
                                csv_created = True
                            else:
                                stats.to_csv(output_csv, mode='a', header=False, index=False)
                        # Add to results
                        all_results.append(stats)
                        logger.info(f"Completed feature {index+1}/{collection_size}")
                    else:
                        logger.warning(f"Failed to process feature {index+1}/{collection_size}")
                except Exception as e:
                    logger.error(f"Exception occurred while processing feature {index+1}: {str(e)}")
    else:
        # Sequential processing
        logger.info("Processing features sequentially")
        for i in range(collection_size):
            logger.info(f"Processing feature {i+1}/{collection_size}")
            stats = download_and_process_feature(i)
            if stats is not None:
                # Write to CSV
                if not csv_created:
                    stats.to_csv(output_csv, index=False)
                    csv_created = True
                else:
                    stats.to_csv(output_csv, mode='a', header=False, index=False)
                # Add to results
                all_results.append(stats)
    
    # Combine all results
    if all_results:
        try:
            all_results_df = pd.concat(all_results, ignore_index=True)

            result_gdf = gpd.GeoDataFrame(all_results_df, geometry='geometry')
            
            if source_gdf is not None and source_gdf.crs:
                result_gdf = result_gdf.set_crs(source_gdf.crs)
            
            logger.info(f"Completed processing {len(all_results)}/{collection_size} features successfully")
            logger.info(f"Results saved to {output_csv}")                    
                
            return result_gdf, str(output_csv)
        except Exception as e:
            logger.warning(f"Error creating final GeoDataFrame: {str(e)}")
    
    if not all_results:
        logger.warning("No results generated")
    return None, str(output_csv)

In [None]:
# # Basic usage with defaults
# results_df, csv_path = download_and_extract_stats_for_collection(
#     feature_collection=ee_bbox_collection,
#     image=whisp.combine_datasets(),
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH
# )

# Input polygons; folder_path must already be defined by an earlier cell
GEOJSON_EXAMPLE_FILEPATH = folder_path+"/random_polygons.geojson"

# ee_bbox_collection = convert_geojson_to_ee_bbox(GEOJSON_EXAMPLE_FILEPATH)

# Example 5: Full obscuration - extend, shift, and add random features
# NOTE: every optional argument below is commented out, so the function's
# own defaults are what actually apply in this call.
fully_obscured_collection = convert_geojson_to_ee_bbox_obscured(
    GEOJSON_EXAMPLE_FILEPATH,
    # extension_range=[0.002, 0.003],
    # shift_geometries=True,
    # shift_proportion=0.9,
    # pixel_length=0.0001,  # ~10m at equator
    # add_random_features=False,
    # max_distance=0.05,  # xkm at equator
    # random_proportion= 0.1  # Add X more features as decoys
)

# Advanced usage
# results_df is None when no feature produced statistics; csv_path is always a string.
results_df, csv_path = download_and_extract_stats_for_collection(
    feature_collection=fully_obscured_collection,
    image=whisp.combine_datasets(),
    geojson_path=GEOJSON_EXAMPLE_FILEPATH,
    output_dir=folder_path+ "/"+'whisp_on_the_fly_v4',
    # output_csv=Path.home() / 'whisp_analysis' / 'results.csv',
    ops=['sum'],# 'mean', 'count'],
    max_features=1000,
    max_workers=30,  # > 1, so features are processed in a thread pool
    keep_geotiffs=True  # Keep downloaded GeoTIFFs; set False to delete each one after its stats are extracted
)

In [None]:
# import rasterio
# tif_path = r"C:\Users\Arnell\Downloads\whisp_features\feature_9.tif"
# def get_band_names(tif_path):
#     """Extract band names from a GeoTIFF file."""
#     with rasterio.open(tif_path) as src:
#         if src.descriptions and all(src.descriptions):
#             return list(src.descriptions)
#         else:
#             return [f"Band {i+1}" for i in range(src.count)]
            
# # Check the updated band names
# bands = get_band_names(tif_path)
# print(bands)

In [None]:
# def rename_band_columns(df, band_names, ops=['sum']):
#     """
#     Rename generic band index columns to meaningful band names in a DataFrame.
    
#     Args:
#         df (pd.DataFrame): DataFrame with stats columns to rename
#         band_names (list): List of band names from Earth Engine image
#         ops (list): List of operations (e.g., ['sum', 'mean', 'count'])
        
#     Returns:
#         pd.DataFrame: DataFrame with renamed columns
#     """
#     # Create a copy of the input DataFrame to avoid modifying the original
#     renamed_df = df.copy()
    
#     # Create a mapping from generic band names to actual band names
#     column_mapping = {}
#     for op in ops:
#         for i, band_name in enumerate(band_names):
#             # Check both possible formats
#             format1 = f"{op}_{i+1}"      # e.g., sum_1 (standard exactextract format)
#             format2 = f"band_{i+1}_{op}" # e.g., band_1_sum (alternative format)
            
#             if format1 in renamed_df.columns:
#                 column_mapping[format1] = band_name
#             elif format2 in renamed_df.columns:
#                 column_mapping[format2] = band_name
    
#     # Apply the renaming and return
#     return renamed_df.rename(columns=column_mapping)



# def rename_stats_columns_with_band_names(csv_path, band_names, ops=['sum']):
#     """
#     Rename statistics columns in a CSV file by replacing generic band indices 
#     with actual band names from Earth Engine image bands.
    
#     Args:
#         csv_path (str): Path to the CSV file with statistics
#         band_names (list): List of band names from image.bandNames().getInfo()
#         ops (list): List of operations (e.g., ['sum', 'mean', 'count'])
        
#     Returns:
#         pd.DataFrame: DataFrame with renamed columns
#     """
#     # import pandas as pd
    
#     # Load the CSV
#     stats = pd.read_csv(csv_path)
    
#     # Print current columns to help diagnose format
#     print("Current columns:", stats.columns.tolist())
#     rename_band_columns
#     # Create a mapping from generic band names to actual band names
#     column_mapping = {}
#     for op in ops:
#         for i, band_name in enumerate(band_names):
#             # Check both possible formats
#             format1 = f"{op}_{i+1}"      # e.g., sum_1 (standard exactextract format)
#             format2 = f"band_{i+1}_{op}" # e.g., band_1_sum (alternative format)
            
#             if format1 in stats.columns:
#                 column_mapping[format1] = band_name
#             elif format2 in stats.columns:
#                 column_mapping[format2] = band_name
    
#     # Print mapping for verification
#     print("Column mapping:", column_mapping)
    
#     # Apply the renaming
#     stats = stats.rename(columns=column_mapping)
    
#     return stats

In [None]:
# csv_path = r"C:\Users\Arnell\Downloads\whisp_features\feature_stats_20250424_190341.csv"
# stats = pd.read_csv(csv_path)
# ops = ['sum']
# band_names = whisp.combine_datasets().bandNames().getInfo()
# stats = rename_band_columns(stats, band_names, ops)

In [None]:
# stats

In [None]:
# import rasterio

# def get_band_names(tif_path):
#     """
#     Extract band names from a GeoTIFF file.
    
#     Args:
#         tif_path (str): Path to the GeoTIFF file
        
#     Returns:
#         list: List of band names/descriptions
#     """
#     with rasterio.open(tif_path) as src:
#         # Try to get band descriptions (often contain band names)
#         band_descriptions = src.descriptions
        
#         # If descriptions are available and not empty, use them
#         if band_descriptions and all(band_descriptions):
#             print(f"Found {len(band_descriptions)} bands with descriptions")
#             return list(band_descriptions)
            
#         # Check for band metadata that might contain names
#         band_names = []
#         for i in range(1, src.count + 1):
#             band_meta = src.tags(i)
#             if band_meta and 'name' in band_meta:
#                 band_names.append(band_meta['name'])
#             else:
#                 # Fall back to generic naming
#                 band_names.append(f"Band {i}")
        
#         print(f"Found {src.count} bands")
#         return band_names


In [None]:

# Example usage: print the band names of one downloaded GeoTIFF.
# Replace with your file path
# NOTE(review): get_band_names is only defined in commented-out cells (or as a
# nested helper) in the visible part of this notebook; this cell needs it
# defined at notebook scope first - confirm before running.
tif_path = r"C:\Users\Arnell\Downloads\whisp_features\feature_1.tif"
band_names = get_band_names(tif_path)
for i, name in enumerate(band_names):
    print(f"Band {i+1}: {name}")

In [None]:
def check_open_files():
    """
    List the TIFF files currently held open by this Python process.

    The result is printed and also returned so it can be passed straight to
    other helpers. Extension matching is case-insensitive, so ``.TIF`` and
    ``.TIFF`` files are reported as well.

    Returns:
        list[str]: Paths of open files whose name ends in ``.tif`` or ``.tiff``.
    """
    import psutil

    open_paths = (entry.path for entry in psutil.Process().open_files())
    tiff_files = [p for p in open_paths if p.lower().endswith(('.tif', '.tiff'))]
    print(f"Open TIFF files: {tiff_files}")
    return tiff_files  # Return the list of file paths

In [None]:
# Show which TIFF files this kernel process currently has open
check_open_files()

In [None]:
def force_release_tiff_files():
    """
    Best-effort attempt to make this process let go of open TIFF files.

    Steps, in order: garbage-collect, report the TIFFs still open, change GDAL
    settings, drop ``exactextract`` from ``sys.modules``, re-open and close
    each file through rasterio, garbage-collect again, then report what is
    still open. Missing optional libraries (GDAL bindings, rasterio) are
    reported and skipped.

    Returns:
        list: psutil open-file records for TIFFs that are still open afterwards
        (each record exposes a ``path`` attribute).
    """
    import gc
    import sys

    import psutil

    def _open_tiffs():
        # Open-file records of this process whose path ends in .tif/.tiff (any case)
        return [
            f for f in psutil.Process().open_files()
            if f.path.lower().endswith(('.tif', '.tiff'))
        ]

    print("Starting aggressive TIFF cleanup...")

    # 1. First garbage collection pass
    gc.collect()

    # 2. Try to identify what's holding the files
    tiff_files = _open_tiffs()
    for file in tiff_files:
        print(f"Locked file: {file}")

    # 3. Try to reset libraries that commonly lock files
    try:
        # Reset GDAL
        from osgeo import gdal
        gdal.UseExceptions()  # Make GDAL throw exceptions
        print("Resetting GDAL cache...")
        gdal.SetConfigOption('GDAL_MAX_DATASET_POOL_SIZE', '0')  # Disable dataset pooling
        # NOTE(review): both settings stay in effect after this function returns
        # (cache limit left at 0) - confirm that is acceptable for later raster work.
        gdal.SetCacheMax(0)  # Clear caches

    except ImportError:
        print("GDAL not directly imported")

    # 4. Reset exactextract if it's loaded
    # NOTE(review): this only removes the import-cache entry; objects that
    # already reference the module keep it alive - verify it helps in practice.
    if 'exactextract' in sys.modules:
        print("Removing exactextract from sys.modules...")
        del sys.modules['exactextract']

    # 5. If rasterio is being used
    try:
        import rasterio
        from rasterio.errors import RasterioIOError

        print("Cleaning rasterio environment...")
        # NOTE(review): the created object is discarded immediately - confirm
        # this call has any lasting effect.
        rasterio.env.GDALEnv(CPL_DEBUG=True)  # Create a new environment with debug on

        # Try to deliberately close the files
        for file in tiff_files:
            try:
                # This might raise an error, but sometimes forces release
                with rasterio.open(file.path, 'r') as src:
                    pass  # Just open and close it to reset
            except RasterioIOError:
                pass  # Ignore errors, we're just trying to force close

    except ImportError:
        print("Rasterio not installed")

    # 6. More aggressive garbage collection
    for _ in range(3):
        gc.collect()

    # 7. Check what's left
    remaining = _open_tiffs()
    print(f"After aggressive cleanup: {len(remaining)} files still locked")

    return remaining

In [None]:
# Run the cleanup; the cell output shows the open-file records that remain
force_release_tiff_files()

In [None]:
def safely_process_with_isolation(tiff_paths):
    """
    Try to free locked TIFF files by unloading raster libraries and swapping
    each file for a fresh copy of itself.

    First every loaded module whose name contains ``gdal``, ``rasterio``,
    ``exactextract``, ``fiona`` or ``osgeo`` is removed from ``sys.modules``
    and garbage collection is run. Then, for each path, the file is copied to
    ``temp_<name>`` next to it, the original is deleted and the copy renamed
    back. If the delete or rename fails, the temporary copy is left in place.
    Progress and failures are printed; nothing is raised for per-file errors.

    Args:
        tiff_paths: Sized collection of file paths (str or path-like) to process.

    Returns:
        None
    """
    import gc
    import os
    import shutil
    import sys
    from pathlib import Path

    print(f"Attempting to unlock {len(tiff_paths)} files using isolation method...")

    # Step 1: Force reset all relevant modules that might hold locks
    keywords = ('gdal', 'rasterio', 'exactextract', 'fiona', 'osgeo')
    # Snapshot the names first: sys.modules must not change size while iterated
    modules_to_unload = [
        name for name in list(sys.modules)
        if any(keyword in name for keyword in keywords)
    ]
    for module_name in modules_to_unload:
        try:
            del sys.modules[module_name]
        except KeyError:
            # Already gone (e.g. removed by another thread); nothing to report
            continue
        print(f"Unloaded: {module_name}")

    # Step 2: Run aggressive garbage collection
    print("Running multiple garbage collection cycles...")
    for _ in range(3):
        gc.collect()

    # Step 3: Attempt a direct file copy approach to break locks
    for tiff_path in tiff_paths:
        try:
            path = Path(tiff_path)
            # Temporary copy lives beside the original under a different name
            temp_path = path.with_name(f"temp_{path.name}")
        except Exception as e:
            print(f"Error processing {tiff_path}: {str(e)}")
            continue

        try:
            # Copy the file data rather than moving the file handle
            shutil.copy2(tiff_path, temp_path)
            print(f"Created temporary copy: {temp_path}")
        except Exception as e:
            print(f"Error copying file {tiff_path}: {str(e)}")
            continue

        # Remove original (might fail if still locked), then put the copy in
        # its place. Only filesystem errors are treated as "still locked";
        # KeyboardInterrupt and similar are no longer swallowed.
        try:
            os.remove(tiff_path)
            os.rename(temp_path, tiff_path)
            print(f"Successfully unlocked: {tiff_path}")
        except OSError:
            print(f"Original file still locked, will keep temporary copy: {temp_path}")

    print("Isolation process complete")

In [None]:
# Pass the paths of the currently open TIFFs to the copy-and-replace unlock routine
safely_process_with_isolation(check_open_files())

Chain for downloading and stats

In [None]:
# from pathlib import Path

# tiff_path = Path("C:/Users/Arnell/Downloads/whisp_features/feature_1.tif")
# print("Exists:", tiff_path.exists())
# print("Is file:", tiff_path.is_file())
# print("Absolute path:", tiff_path.resolve())
# print(str(tiff_path))

In [None]:
# # internal_id ="1"        
# gdf = gpd.read_file(GEOJSON_EXAMPLE_FILEPATH)
# # Find matching feature in the GeoDataFrame
# feature = gdf.iloc[0]
# # if len(feature) == 0:
# #     logger.warning(f"No matching feature found for ID {internal_id}, skipping")
# #     return []

# # Get the geometry from the feature
# geom = feature.geometry#.iloc[0]

In [None]:
# exact_extract(rast=str(tiff_path),
#                vec=gdf,
#                ops=['sum'],
#                output='pandas',
#             #    include_cols=['internal_id']
#             )



In [None]:
# df = whisp.whisp_formatted_stats_ee_to_df(convert_geojson_to_ee_bbox(GEOJSON_EXAMPLE_FILEPATH))
# df

Parallel processing test


Whisp it

In [None]:
# df_formatted_stats = whisp.whisp_formatted_stats_geojson_to_df(folder_path+'/random_polygons.geojson')
# Same call as the commented line above, using the path constant defined earlier in the notebook
df_formatted_stats = whisp.whisp_formatted_stats_geojson_to_df(GEOJSON_EXAMPLE_FILEPATH)



Reading GeoJSON file from: C:\Users\Arnell\Downloads\whisp_outputs_test\random_polygons.geojson
['Area', 'European_Primary_Forest', 'GLC_FCS30D_TC_2022', 'GLC_FCS30D_crop_2022', 'IFL_2020', 'IIASA_planted_plantation', 'Cocoa_bnetd', 'Oil_palm_Descals', 'ESA_fire_before_2020', 'ESA_fire_2001', 'ESA_fire_2002', 'ESA_fire_2003', 'ESA_fire_2004', 'ESA_fire_2005', 'ESA_fire_2006', 'ESA_fire_2007', 'ESA_fire_2008', 'ESA_fire_2009', 'ESA_fire_2010', 'ESA_fire_2011', 'ESA_fire_2012', 'ESA_fire_2013', 'ESA_fire_2014', 'ESA_fire_2015', 'ESA_fire_2016', 'ESA_fire_2017', 'ESA_fire_2018', 'ESA_fire_2019', 'ESA_fire_2020', 'ESA_TC_2020', 'ESRI_2023_TC', 'ESRI_2023_crop', 'Cocoa_ETH', 'Cocoa_2023_FDaP', 'Cocoa_FDaP', 'Forest_FDaP', 'Oil_palm_2023_FDaP', 'Oil_palm_FDaP', 'Rubber_2023_FDaP', 'Rubber_FDaP', 'GFT_naturally_regenerating', 'GFT_planted_plantation', 'GFT_primary', 'GFC_TC_2020', 'GFC_loss_after_2020', 'GFC_loss_before_2020', 'GFC_loss_year_2001', 'GFC_loss_year_2002', 'GFC_loss_year_2003', 

Display table

In [None]:
# Last expression of the cell, so the notebook renders the table
df_formatted_stats

Unnamed: 0,plotId,external_id,Area,Geometry_type,Country,ProducerCountry,Admin_Level_1,Centroid_lon,Centroid_lat,Unit,...,TMF_regrowth_2023,ESRI_2023_TC,GLC_FCS30D_TC_2022,Oil_palm_2023_FDaP,Rubber_2023_FDaP,Cocoa_2023_FDaP,ESRI_2023_crop,GLC_FCS30D_crop_2022,GFW_logging,geo
0,1,,2.908,Polygon,GHA,GH,Central Region,-1.318184,5.773201,ha,...,1.188,2.908,2.908,0.998,0.0,0.625,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-1.31919..."
1,2,,1.681,Polygon,GHA,GH,Bono Region,-2.300386,7.247708,ha,...,0.944,1.681,1.681,0.000,0.0,0.767,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-2.30125..."
2,3,,2.531,Polygon,GHA,GH,Western Region,-1.954633,5.950335,ha,...,0.089,2.061,2.489,0.000,0.0,0.341,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-1.95555..."
3,4,,3.342,Polygon,GHA,GH,Western Region,-2.295417,5.493379,ha,...,0.210,3.342,3.342,0.000,0.0,0.289,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-2.29637..."
4,5,,1.915,Polygon,GHA,GH,Western North Region,-2.559378,6.324447,ha,...,1.884,1.915,1.915,0.039,0.0,0.698,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-2.55999..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,,2.506,Polygon,GHA,GH,Ashanti Region,-1.393980,6.017721,ha,...,0.000,2.506,2.506,0.010,0.0,1.588,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-1.39479..."
96,97,,1.135,Polygon,GHA,GH,Ahafo Region,-1.888497,7.228614,ha,...,0.000,1.135,1.135,0.000,0.0,0.000,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-1.88917..."
97,98,,2.036,Polygon,GHA,GH,Ashanti Region,-1.814841,6.868563,ha,...,0.351,2.036,2.036,0.000,0.0,0.803,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-1.81565..."
98,99,,1.838,Polygon,GHA,GH,Ashanti Region,-2.117506,6.487262,ha,...,0.000,1.838,1.838,0.079,0.0,0.000,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-2.11825..."
