### Whisp a feature collection

Setup
- NB use a virtual environment to avoid altering your python environment (https://docs.python.org/3/tutorial/venv.html)

In [19]:
# Earth Engine and Common Libraries
import ee
from pathlib import Path

try:
    ee.Initialize(project='ee-andyarnellgee')#, opt_url='https://earthengine-highvolume.googleapis.com')
except Exception:
    ee.Authenticate()
    ee.Initialize(project='ee-andyarnellgee')#, opt_url='https://earthengine-highvolume.googleapis.com')

In [20]:
# Install openforis-whisp (uncomment line if not already installed)
# !pip install --pre openforis-whisp

In [21]:
import openforis_whisp as whisp

In [None]:
import ee.batch


image = whisp.combine_datasets(pixel_area=False)


geometry = ee.Geometry.Polygon(
        [[[-3.04548260909834, 7.48307210714245],
          [-3.04548260909834, 5.253961384163733],
          [-1.0179939534016594, 5.253961384163733],
          [-1.0179939534016594, 7.48307210714245]]], None, False);


task = ee.batch.Export.image.toDrive(
                    image=image,
                    description='whisp_image_clip_v0',
                    scale=10,
                    region=geometry,
                    folder='gee_whisp_image_export',
                    skipEmptyTiles=True,
                    # prefix='whisp_image_clip_v0',
                    formatOptions={'cloudOptimized': True}, 
                    maxPixels=1e13,
                    fileFormat='GeoTIFF',
)


task.start()

['Area', 'European_Primary_Forest', 'GLC_FCS30D_TC_2022', 'GLC_FCS30D_crop_2022', 'IFL_2020', 'IIASA_planted_plantation', 'Cocoa_bnetd', 'Oil_palm_Descals', 'ESA_fire_before_2020', 'ESA_fire_2001', 'ESA_fire_2002', 'ESA_fire_2003', 'ESA_fire_2004', 'ESA_fire_2005', 'ESA_fire_2006', 'ESA_fire_2007', 'ESA_fire_2008', 'ESA_fire_2009', 'ESA_fire_2010', 'ESA_fire_2011', 'ESA_fire_2012', 'ESA_fire_2013', 'ESA_fire_2014', 'ESA_fire_2015', 'ESA_fire_2016', 'ESA_fire_2017', 'ESA_fire_2018', 'ESA_fire_2019', 'ESA_fire_2020', 'ESA_TC_2020', 'ESRI_2023_TC', 'ESRI_2023_crop', 'Cocoa_ETH', 'Cocoa_2023_FDaP', 'Cocoa_FDaP', 'Forest_FDaP', 'Oil_palm_2023_FDaP', 'Oil_palm_FDaP', 'Rubber_2023_FDaP', 'Rubber_FDaP', 'GFT_naturally_regenerating', 'GFT_planted_plantation', 'GFT_primary', 'GFC_TC_2020', 'GFC_loss_after_2020', 'GFC_loss_before_2020', 'GFC_loss_year_2001', 'GFC_loss_year_2002', 'GFC_loss_year_2003', 'GFC_loss_year_2004', 'GFC_loss_year_2005', 'GFC_loss_year_2006', 'GFC_loss_year_2007', 'GFC_los

In [32]:

# Create descriptive filenames 
description = 'whisp_image_clip_v0'
filename = f"whisp_exports/{description}"

# Start the export task
task = ee.batch.Export.image.toCloudStorage(
    image=image,
    description=description,
    bucket='whisp_bucket',
    fileNamePrefix=filename,
    scale=10,
    region=geometry,
    maxPixels=1e13,
    fileFormat='GeoTIFF',
    formatOptions={
        'cloudOptimized': True,
        # 'fileDimensions': 2048  # Optional: set tile size for COG
    }
)

task.start()

In [None]:
import os
import glob
import rasterio
from rasterio.merge import merge
from rasterio.enums import Resampling

# Set your input folder and output COG path
# input_folder = r'C:\Users\Arnell\OneDrive - Food and Agriculture Organization\project_work\p0004_commodity_mapper_support\work_in_progress\cog_test'
input_folder = r'C:\Users\Arnell\Downloads\drive-download-20250427T115601Z-001'

output_cog = input_folder+'/output_cog.tif'

print ('input_folder:', input_folder)
print ('output_cog:', output_cog)



In [None]:
# Find all TIFF files in the folder
tif_files = glob.glob(os.path.join(input_folder, '*.tif'))

# Open all the tiffs
src_files_to_mosaic = [rasterio.open(fp) for fp in tif_files]

# Merge them
mosaic, out_transform = merge(src_files_to_mosaic)

gdal_translate input.tif output_cog.tif -of COG -co COMPRESS=LZW -co BLOCKSIZE=512

# Take the metadata from the first file and update
out_meta = src_files_to_mosaic[0].meta.copy()
out_meta.update({
    "driver": "COG",
    "height": mosaic.shape[1],
    "width": mosaic.shape[2],
    "transform": out_transform,
    "compress": "deflate",   # optional, makes file smaller
    # "BIGTIFF": "YES",
})


# Write the mosaic to a new COG
with rasterio.open(output_cog, 'w', **out_meta) as dest:
    dest.write(mosaic)

# Close the input datasets
for src in src_files_to_mosaic:
    src.close()

Get a feature collection

In [4]:
GEOJSON_EXAMPLE_FILEPATH = whisp.get_example_data_path("geojson_example.geojson")

print (GEOJSON_EXAMPLE_FILEPATH)

..\tests\fixtures\geojson_example.geojson


In [5]:
df_formatted_stats = whisp.whisp_formatted_stats_geojson_to_df(GEOJSON_EXAMPLE_FILEPATH)



Reading GeoJSON file from: c:\Users\Arnell\Documents\GitHub\whisp\tests\fixtures\geojson_example.geojson
['Area', 'European_Primary_Forest', 'GLC_FCS30D_TC_2022', 'GLC_FCS30D_crop_2022', 'IFL_2020', 'IIASA_planted_plantation', 'Cocoa_bnetd', 'Oil_palm_Descals', 'ESA_fire_before_2020', 'ESA_fire_2001', 'ESA_fire_2002', 'ESA_fire_2003', 'ESA_fire_2004', 'ESA_fire_2005', 'ESA_fire_2006', 'ESA_fire_2007', 'ESA_fire_2008', 'ESA_fire_2009', 'ESA_fire_2010', 'ESA_fire_2011', 'ESA_fire_2012', 'ESA_fire_2013', 'ESA_fire_2014', 'ESA_fire_2015', 'ESA_fire_2016', 'ESA_fire_2017', 'ESA_fire_2018', 'ESA_fire_2019', 'ESA_fire_2020', 'ESA_TC_2020', 'ESRI_2023_TC', 'ESRI_2023_crop', 'Cocoa_ETH', 'Cocoa_2023_FDaP', 'Cocoa_FDaP', 'Forest_FDaP', 'Oil_palm_2023_FDaP', 'Oil_palm_FDaP', 'Rubber_2023_FDaP', 'Rubber_FDaP', 'GFT_naturally_regenerating', 'GFT_planted_plantation', 'GFT_primary', 'GFC_TC_2020', 'GFC_loss_after_2020', 'GFC_loss_before_2020', 'GFC_loss_year_2001', 'GFC_loss_year_2002', 'GFC_loss_yea

In [6]:
df_formatted_stats 

Unnamed: 0,plotId,external_id,Area,Geometry_type,Country,ProducerCountry,Admin_Level_1,Centroid_lon,Centroid_lat,Unit,...,TMF_regrowth_2023,ESRI_2023_TC,GLC_FCS30D_TC_2022,Oil_palm_2023_FDaP,Rubber_2023_FDaP,Cocoa_2023_FDaP,ESRI_2023_crop,GLC_FCS30D_crop_2022,GFW_logging,geo
0,1,,1.939,Polygon,GHA,GH,Ashanti Region,-1.611942,6.15954,ha,...,0.803,1.939,1.939,1.834,0.0,0.0,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-1.61283..."
1,2,,4.152,Polygon,GHA,GH,Ashanti Region,-1.644732,6.104735,ha,...,0.017,4.152,4.063,0.042,0.0,0.003,0.0,0.089,0.0,"{'type': 'Polygon', 'coordinates': [[[-1.64615..."
2,3,,16.6,Polygon,GHA,GH,Western Region,-2.157144,5.981149,ha,...,0.0,16.6,16.511,0.661,0.0,0.0,0.0,0.089,0.0,"{'type': 'Polygon', 'coordinates': [[[-2.15951..."
3,4,,31.212999,Polygon,IDN,ID,South Sumatra,103.956096,-3.054668,ha,...,0.0,6.332,27.767,26.664,2.145,0.0,24.882,3.356,0.0,"{'type': 'Polygon', 'coordinates': [[[103.9514..."
4,5,,1.964,Polygon,IDN,ID,South Sumatra,103.970371,-3.068831,ha,...,0.316,1.934,0.686,0.0,1.626,0.0,0.0,1.278,0.0,"{'type': 'Polygon', 'coordinates': [[[103.9694..."
5,6,,12.725,Polygon,IDN,ID,South Sumatra,103.975182,-3.082922,ha,...,1.431,12.725,12.152,0.204,0.05,0.0,0.0,0.573,0.0,"{'type': 'Polygon', 'coordinates': [[[103.9731..."
6,7,,20.882,Polygon,IDN,ID,South Sumatra,103.977512,-3.083808,ha,...,4.897,20.882,20.120001,0.118,0.238,0.0,0.0,0.762,0.0,"{'type': 'Polygon', 'coordinates': [[[103.9749..."
7,8,,8.279,Polygon,CIV,CI,Lagunes,-4.101646,5.711935,ha,...,3.282,6.147,8.279,0.089,1.175,0.072,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-4.10288..."
8,9,,1.981,Polygon,CIV,CI,Lagunes,-4.086848,5.673811,ha,...,0.715,1.981,1.981,0.11,0.032,0.276,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-4.08767..."
9,10,,3.797,Polygon,CIV,CI,District Autonome D'Abidjan,-4.119589,5.572136,ha,...,2.857,3.797,3.717,0.68,3.134,0.0,0.0,0.08,0.0,"{'type': 'Polygon', 'coordinates': [[[-4.12062..."


In [7]:
import math
def create_bbox(center_lon, center_lat, hectares=4):
    """
    Create a bounding box as an Earth Engine feature with a specified area.
    
    Args:
        center_lon (float): Longitude of the center point
        center_lat (float): Latitude of the center point
        hectares (float): Size of the bounding box in hectares (default: 4)
        
    Returns:
        ee.Feature: Earth Engine feature representing a bounding box
    """
    # Calculate area and side length
    # 1 hectare = 10,000 sq meters
    area_sq_meters = hectares * 10000
    side_length_meters = math.sqrt(area_sq_meters)
    half_side = side_length_meters / 2
    
    # Create a point at the specified coordinates
    center_point = ee.Geometry.Point([center_lon, center_lat])
    
    # Approximate conversion to degrees
    # This varies with latitude, but roughly 111km per degree at equator
    # More precise conversion: 1 degree = 111,320 meters * cos(latitude) for longitude
    meters_per_degree = 111320 * math.cos(math.radians(abs(center_lat)))
    lat_offset = half_side / 111320  # Latitude degrees
    lon_offset = half_side / meters_per_degree  # Longitude degrees
    
    # Create a square by specifying bounds in all directions
    bbox = ee.Geometry.Rectangle(
        [
            center_lon - lon_offset,  # Precise conversion to degrees
            center_lat - lat_offset,
            center_lon + lon_offset,
            center_lat + lat_offset
        ]
    )
    
    # Create feature with properties
    return ee.Feature(bbox, {'hectares': hectares})

In [None]:
# Example usage
center_lon = -76.934  # Example longitude - replace with your location
center_lat = 6.145   # Example latitude

# Create the bounding box feature
# bbox_feature = create_4ha_bbox(center_lon, center_lat)

whisp.combine_datasets().clip(create_bbox(center_lon, center_lat,1000)).select([0,1]).getDownloadURL(
    {scale:10,format:'GeoTIFF'}
    )


In [None]:


# Select an image — e.g., Hansen Tree Cover (UMD GFC)
image = ee.Image("UMD/hansen/global_forest_change_2023_v1_11") \
    # .select("treecover2000") 
    # .gt(10).selfMask().rename("tree_gt_10")

# Define a small region of interest — e.g., 5 km square around a point in Central Africa
region = ee.Geometry.Point([15.0, 0.5]).buffer(1200).bounds()  # 5 km extent

# Clip image to region
image_clipped = whisp.combine_datasets().clip(region)
# Generate download URL
download_url = image_clipped.getDownloadURL({
    'scale': 10,
    'region': region,
    'format': 'GeoTIFF'
})

print("Download URL:", download_url)


In [None]:
# Function to process a single location with multiband export
def process_location(location_data, hectares=4):
    """
    Process a location and download a multiband GeoTIFF of the specified area.
    
    Args:
        location_data: Tuple containing (longitude, latitude, region_name)
        hectares: Area size in hectares (default: 4)
        
    Returns:
        str: Status message about the download
    """
    lon, lat, region = location_data
    try:
        # Create the bounding box with specified hectares
        bbox_feature = create_bbox(lon, lat, hectares=hectares)
        
        # Create the combined dataset
        combined_image = whisp.combine_datasets().clip(bbox_feature)
        
        # Get all band names from the combined dataset
        band_names = combined_image.bandNames().getInfo()
        
        # Get the download URL with multiband GeoTIFF format
        download_url = combined_image.getDownloadURL({
            'format': 'GeoTIFF',
            # 'bands': band_names,  # Include all bands
            'region': bbox_feature.geometry(),
            'scale': 10,  # Resolution in meters (adjust as needed)
            'crs': 'EPSG:4326'
        })
        
        # Create a unique filename that includes the area size
        filename = f"whisp_multiband_{region}_{lon}_{lat}_{hectares}ha.tif"
        output_path = out_directory / filename
        
        # Download the image
        response = requests.get(download_url)
        if response.status_code == 200:
            with open(output_path, 'wb') as f:
                f.write(response.content)
            return f"Successfully downloaded {hectares}ha multiband image: {filename}"
        else:
            return f"Failed to download {filename}: Status {response.status_code}"
        
    except Exception as e:
        return f"Error processing location {lon}, {lat} ({hectares}ha): {str(e)}"

In [None]:
out_directory = Path("whisp_samples")
# Download with default 4 hectares
# result = process_location((lon, lat, region))
lon = 15.0
lat = 0.5
# Download with custom area size
result = process_location((lon, lat, region), hectares=10)
print(result)
# # For parallel processing with different area sizes
# with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
#     # Create tasks with different area sizes
#     tasks = [
#         executor.submit(process_location, loc, hectares=1) for loc in random_locations[:5]
#     ] + [
#         executor.submit(process_location, loc, hectares=10) for loc in random_locations[5:10]
#     ]
    
#     for future in concurrent.futures.as_completed(tasks):
#         try:
#             result = future.result()
#             print(result)
#         except Exception as e:
#             print(f"Task generated an exception: {str(e)}")

Created random_polygons.geojson with 30 random polygons


'c:\\Users\\Arnell\\Documents\\GitHub\\whisp\\notebooks'

In [None]:
import random
import time
import requests
from pathlib import Path
import numpy as np

# Define output directory
out_directory = Path.home() / 'Downloads' / 'whisp_samples'
out_directory.mkdir(exist_ok=True, parents=True)

# Define regions with forest coverage (to make results more interesting)
# Format: [min_lon, max_lon, min_lat, max_lat, region_name]
forest_regions = [
    [-120, -40, -20, 50, "americas"],  # Americas
    [-20, 40, -30, 60, "europe_africa"],  # Europe/Africa
    [60, 150, -40, 60, "asia_oceania"]  # Asia/Oceania
]

# Generate 10 random locations across the forest regions
random_locations = []
for i in range(2):
    # Choose a random region
    region = random.choice(forest_regions)
    
    # Generate random coordinates within the region
    lon = random.uniform(region[0], region[1])
    lat = random.uniform(region[2], region[3])
    
    # Round to 3 decimal places
    lon = round(lon, 3)
    lat = round(lat, 3)
    
    random_locations.append((lon, lat, region[4]))

# Function to download the image
def download_image(url, output_path):
    response = requests.get(url)
    if response.status_code == 200:
        with open(output_path, 'wb') as f:
            f.write(response.content)
        return True
    return False

# Process each location and download the combined dataset
for i, (lon, lat, region) in enumerate(random_locations):
    print(f"Processing location {i+1}/10: {lon}, {lat} ({region})")
    
    try:
        # Create the bounding box
        bbox_feature = create_bbox(lon, lat,10)
        
        # Get the download URL
        combined_image = whisp.combine_datasets().clip(bbox_feature)
             
        # Get the download URL with multiband GeoTIFF format
        download_url = combined_image.getDownloadURL({
            'format': 'GeoTIFF',
            # 'bands': band_names,  # Include all bands
            'region': bbox_feature.geometry(),
            'scale': 10,  # Resolution in meters (adjust as needed)
            'crs': 'EPSG:4326'
        })
        
        # Create a unique filename based on coordinates
        filename = f"whisp_sample_{region}_{lon}_{lat}.tif"
        output_path = out_directory / filename
        
        # Download the image
        print(f"  Downloading to {output_path}")
        success = download_image(download_url, output_path)
        
        if success:
            print(f"  Successfully downloaded {filename}")
        else:
            print(f"  Failed to download {filename}")
        
        # Pause to avoid overwhelming the server
        time.sleep(2)
        
    except Exception as e:
        print(f"  Error processing location {lon}, {lat}: {str(e)}")
        
        # Continue with the next location
        continue

print(f"\nDownloaded images are saved to: {out_directory}")

In [None]:
# !pip install rasterio # Install rasterio for TIFF file validation

import rasterio
import os


In [None]:

def validate_tiff_files(directory):
    """Check if TIFF files in directory are valid and print their properties."""
    directory_path = Path(directory)
    tiff_files = list(directory_path.glob('*.tif'))
    
    if not tiff_files:
        print("No TIFF files found in the directory.")
        return
    
    print(f"Found {len(tiff_files)} TIFF files to check:")
    
    for tiff_file in tiff_files:
        print(f"\nChecking {tiff_file.name}...")
        try:
            with rasterio.open(tiff_file) as src:
                print(f"  Valid GeoTIFF: Yes")
                print(f"  Dimensions: {src.width} x {src.height} pixels")
                print(f"  Number of bands: {src.count}")
                print(f"  Coordinate system: {src.crs}")
                print(f"  Bounds: {src.bounds}")
        except Exception as e:
            print(f"  Invalid or unreadable file: {str(e)}")
            
            # Try to get file size
            try:
                print(f"  File size: {os.path.getsize(tiff_file)} bytes")
            except:
                pass

# Run the validation on your downloaded files
validate_tiff_files(out_directory)

Parallel processing test


In [None]:
# Check file headers to determine actual format
def check_file_format(file_path):
    with open(file_path, 'rb') as f:
        header = f.read(20)  # Read first 20 bytes to identify format
    
    if header.startswith(b'PK'):
        return "ZIP file"
    elif header.startswith(b'{'):
        return "JSON file (likely an error response)"
    elif header.startswith(b'\x89PNG'):
        return "PNG image"
    else:
        return f"Unknown format (hex): {header.hex()}"

# Check a sample file
sample_file = list(out_directory.glob('*.tif'))[0]
print(f"File {sample_file.name} appears to be: {check_file_format(sample_file)}")

In [None]:
import concurrent.futures
import random
import requests
from pathlib import Path

# Define output directory
out_directory = Path.home() / 'Downloads' / 'whisp_samples'
out_directory.mkdir(exist_ok=True, parents=True)

# Define regions with forest coverage
forest_regions = [
    [-120, -40, -20, 50, "americas"],  
    [-20, 40, -30, 60, "europe_africa"],
    [60, 150, -40, 60, "asia_oceania"]
]

# Generate random locations
random_locations = []
for i in range(10):
    region = random.choice(forest_regions)
    lon = round(random.uniform(region[0], region[1]), 3)
    lat = round(random.uniform(region[2], region[3]), 3)
    random_locations.append((lon, lat, region[4]))

# Function to process a single location
def process_location(location_data):
    lon, lat, region = location_data
    try:
        # Create the bounding box
        bbox_feature = create_bbox(lon, lat)
        
        # Get the download URL
        combined_image = whisp.combine_datasets().clip(bbox_feature)
             
        # Get the download URL with multiband GeoTIFF format
        download_url = combined_image.getDownloadURL({
            'format': 'GeoTIFF',
            # 'bands': band_names,  # Include all bands
            'region': bbox_feature.geometry(),
            'scale': 10,  # Resolution in meters (adjust as needed)
            'crs': 'EPSG:4326'
        })
        
        
        # Create a unique filename
        filename = f"whisp_sample_{region}_{lon}_{lat}.tif"
        output_path = out_directory / filename
        
        # Download the image
        response = requests.get(download_url)
        if response.status_code == 200:
            with open(output_path, 'wb') as f:
                f.write(response.content)
            return f"Successfully downloaded {filename}"
        else:
            return f"Failed to download {filename}: Status {response.status_code}"
        
    except Exception as e:
        return f"Error processing location {lon}, {lat}: {str(e)}"

# Use ThreadPoolExecutor for parallel downloads (max 4 concurrent downloads)
print("Starting parallel downloads...")
with concurrent.futures.ThreadPoolExecutor(max_workers=40) as executor:
    future_to_location = {executor.submit(process_location, loc): loc for loc in random_locations}
    
    for future in concurrent.futures.as_completed(future_to_location):
        location = future_to_location[future]
        try:
            result = future.result()
            print(f"Location {location[0]}, {location[1]}: {result}")
        except Exception as e:
            print(f"Location {location[0]}, {location[1]} generated an exception: {str(e)}")

print(f"\nDownloaded images are saved to: {out_directory}")

Concurrent with timings

In [8]:
import concurrent.futures
import random
import requests
import time
import pandas as pd
import numpy as np
from pathlib import Path
from statistics import mean, median, stdev
from datetime import datetime


In [None]:
def run_parallel_downloads(image=None, number_of_samples=3, max_workers=4, hectares=4, 
                           band_indices=None, output_dir=None, calculate_zonal_stats=False):
    """
    Run parallel downloads of Whisp datasets for random global locations with optional zonal statistics.
    
    Args:
        image: Earth Engine image to process (default: will use whisp.combine_datasets())
        number_of_samples (int): Number of random locations to sample (default: 3)
        max_workers (int): Number of parallel download threads (default: 4)
        hectares (int): Area size for each sample in hectares (default: 4)
        band_indices (list): Optional list of specific band indices to select
        output_dir (Path): Directory to save downloaded files (default: Downloads/whisp_samples)
        calculate_zonal_stats (bool): Whether to calculate zonal statistics (default: False)
        
    Returns:
        dict: Statistics about the processing times, download results, and zonal stats path
    """
 
    
    # For zonal statistics
    if calculate_zonal_stats:
        try:
            import rasterio
            import rasterstats
        except ImportError:
            print("Warning: rasterstats and/or rasterio packages not found.")
            print("Installing required packages for zonal statistics...")
            import subprocess
            subprocess.check_call(["pip", "install", "rasterstats", "rasterio"])
            import rasterio
            import rasterstats
    
    # Start timing
    start_time = time.time()
    start_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    # Define output directory
    if output_dir is None:
        output_dir = Path.home() / 'Downloads' / 'whisp_samples'
    output_dir.mkdir(exist_ok=True, parents=True)
    
    # Check or create image
    if image is None:
        print("No image provided, using whisp.combine_datasets()")
        image = whisp.combine_datasets()
    
    # Define regions with forest coverage
    forest_regions = [
        [-120, -40, -20, 50, "americas"],
        [-20, 40, -30, 60, "europe_africa"],
        [60, 150, -40, 60, "asia_oceania"]
    ]
    
    # Generate random locations
    random_locations = []
    for i in range(number_of_samples):
        region = random.choice(forest_regions)
        lon = round(random.uniform(region[0], region[1]), 3)
        lat = round(random.uniform(region[2], region[3]), 3)
        random_locations.append((lon, lat, region[4]))
    
    # Track timing information
    download_times = []
    success_count = 0
    failures = []
    
    # For zonal statistics tracking
    zonal_stats_results = []
    
    # Use ThreadPoolExecutor for parallel downloads
    print(f"Starting {number_of_samples} parallel downloads with {max_workers} workers at {start_datetime}...")
    
    # Modified process_location function with timing and zonal stats
    def process_with_timing(location_data):
        item_start_time = time.time()
        lon, lat, region = location_data
        try:
            # Create the bounding box with specified hectares
            bbox_feature = create_bbox(lon, lat, hectares=hectares)
            
            # Create the combined dataset
            combined_image = image.clip(bbox_feature)
            
            # Select specific bands if requested
            if band_indices is not None:
                combined_image = combined_image.select(band_indices)
                bands_info = f"bands_{'_'.join(map(str, band_indices))}"
            else:
                # Default to all bands
                bands_info = "all_bands"
            
            # Get the download URL
            download_url = combined_image.getDownloadURL({
                'format': 'GeoTIFF',
                'region': bbox_feature.geometry(),
                'scale': 10,
                'crs': 'EPSG:4326'
            })
            
            # Create a unique filename
            filename = f"whisp_{bands_info}_{region}_{lon}_{lat}_{hectares}h.tif"
            output_path = output_dir / filename
            
            # Download the image
            response = requests.get(download_url)
            if response.status_code == 200:
                with open(output_path, 'wb') as f:
                    f.write(response.content)
                
                # Calculate zonal statistics if requested
                local_stats = []
                if calculate_zonal_stats:
                    try:
                        # Open the file with rasterio
                        with rasterio.open(output_path) as src:
                            num_bands = src.count
                            
                            # Create a simple polygon from the bounding box
                            bbox_geom = bbox_feature.geometry().bounds().getInfo()
                            geom = {
                                'type': 'Polygon',
                                'coordinates': [[
                                    [bbox_geom[0], bbox_geom[1]],
                                    [bbox_geom[2], bbox_geom[1]],
                                    [bbox_geom[2], bbox_geom[3]],
                                    [bbox_geom[0], bbox_geom[3]],
                                    [bbox_geom[0], bbox_geom[1]],
                                ]]
                            }
                            print(bbox_geom)
                            # Calculate statistics for each band
                            for band in range(1, num_bands + 1):
                                band_name = f"B{band}" if band_indices is None else f"B{band_indices[band-1]}"
                                
                                # Calculate zonal statistics
                                stats = rasterstats.zonal_stats(
                                    geom, 
                                    src.read(band), 
                                    affine=src.transform,
                                    stats=["min", "max", "mean", "median", "std", "count"]
                                )[0]
                                
                                # Add to results
                                for stat_name, stat_value in stats.items():
                                    if stat_value is not None:  # Skip None values
                                        local_stats.append({
                                            "longitude": lon,
                                            "latitude": lat,
                                            "region": region,
                                            "filename": filename,
                                            "band": band_name,
                                            "statistic": stat_name,
                                            "value": stat_value,
                                            "hectares": hectares
                                        })
                    except Exception as e:
                        print(f"Error calculating zonal statistics for {filename}: {str(e)}")
                
                elapsed_time = time.time() - item_start_time
                return True, f"Successfully downloaded {filename} in {elapsed_time:.2f}s", elapsed_time, local_stats
            else:
                elapsed_time = time.time() - item_start_time
                return False, f"Failed to download {filename}: Status {response.status_code} in {elapsed_time:.2f}s", elapsed_time, []
            
        except Exception as e:
            elapsed_time = time.time() - item_start_time
            return False, f"Error processing location {lon}, {lat}: {str(e)} in {elapsed_time:.2f}s", elapsed_time, []
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_location = {executor.submit(process_with_timing, loc): loc for loc in random_locations}
        
        for future in concurrent.futures.as_completed(future_to_location):
            location = future_to_location[future]
            try:
                success, result, elapsed_time, local_stats = future.result()
                download_times.append(elapsed_time)
                
                # Add zonal stats results to the global list
                if local_stats:
                    zonal_stats_results.extend(local_stats)
                
                if success:
                    success_count += 1
                else:
                    failures.append(result)
                print(f"Location {location[0]}, {location[1]}: {result}")
            except Exception as e:
                print(f"Location {location[0]}, {location[1]} generated an exception: {str(e)}")
                failures.append(str(e))
    
    end_time = time.time()
    end_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    total_elapsed = end_time - start_time
    
    # Create a zonal statistics CSV if requested and available
    zonal_stats_file = None
    if calculate_zonal_stats and zonal_stats_results:
        # Convert to DataFrame and save as CSV
        df_stats = pd.DataFrame(zonal_stats_results)
        zonal_stats_file = output_dir / f"whisp_zonal_stats_{start_datetime.replace(':', '-').replace(' ', '_')}.csv"
        df_stats.to_csv(zonal_stats_file, index=False)
        print(f"\nZonal statistics saved to: {zonal_stats_file}")
    
    # processing speed statistics
    stats = {
        "start_time": start_datetime,
        "end_time": end_datetime,
        "total_time_seconds": total_elapsed,
        "success_count": success_count,
        "failure_count": len(failures),
        "total_samples": number_of_samples,
        "success_rate": success_count / number_of_samples * 100 if number_of_samples > 0 else 0,
        "zonal_stats_file": str(zonal_stats_file) if zonal_stats_file else None,
        "zonal_stats_count": len(zonal_stats_results)
    }
    
    if download_times:
        stats.update({
            "avg_time": mean(download_times),
            "median_time": median(download_times),
            "min_time": min(download_times),
            "max_time": max(download_times)
        })
        
        # Calculate standard deviation if more than one download
        if len(download_times) > 1:
            stats["std_dev"] = stdev(download_times)
    
    # Print summary
    print(f"\nDownload Summary:")
    print(f"  Start time: {start_datetime}")
    print(f"  End time: {end_datetime}")
    print(f"  Total processing time: {total_elapsed:.2f}s")
    print(f"  Success rate: {stats['success_rate']:.1f}% ({success_count}/{number_of_samples})")
    
    if download_times:
        print(f"\nDownload Time Statistics:")
        print(f"  Average time: {stats.get('avg_time', 0):.2f}s")
        print(f"  Median time: {stats.get('median_time', 0):.2f}s")
        print(f"  Min time: {stats.get('min_time', 0):.2f}s")
        print(f"  Max time: {stats.get('max_time', 0):.2f}s")
        if 'std_dev' in stats:
            print(f"  Standard Deviation: {stats['std_dev']:.2f}s")
    
    if calculate_zonal_stats:
        print(f"\nZonal Statistics:")
        print(f"  Statistics calculated: {stats['zonal_stats_count']}")
        if zonal_stats_file:
            print(f"  Statistics saved to: {zonal_stats_file}")
        else:
            print("  No zonal statistics were generated.")
    
    print(f"\nDownloaded images are saved to: {output_dir}")
    
    return stats

In [10]:
image = whisp.combine_datasets()
# Import required libraries
import ee
# import openforis_whisp as whisp
from pathlib import Path

# # Initialize Earth Engine if needed
# try:
#     ee.Initialize()
# except:
#     ee.Authenticate()
#     ee.Initialize()

# Get the Whisp combined dataset
image = whisp.combine_datasets()

# Run with zonal statistics enabled
stats = run_parallel_downloads(
    image=image,
    number_of_samples=10,
    max_workers=40,
    hectares=10,
    # band_indices=[0, 1, 2],  # Only download first 3 bands
    calculate_zonal_stats=True  # Enable zonal statistics calculation
)

# Print statistics information
print(f"Job completed in {stats['total_time_seconds']:.2f} seconds")
print(f"Success rate: {stats['success_rate']:.1f}%")
print(f"Zonal statistics file: {stats['zonal_stats_file']}")
# # Run with default parameters (3 samples, 4 workers)
# # stats = run_parallel_downloads()

# # Run with custom parameters
# stats = run_parallel_downloads(
#     image = image,
#     number_of_samples=1000,
#     max_workers=40,
#     hectares=10,
#     # band_indices=[0, 1, 2]  # Only download first 3 bands
# )

# # Access statistics programmatically
# print(f"Job completed in {stats['total_time_seconds']:.2f} seconds")
# print(f"Success rate: {stats['success_rate']:.1f}%")

['Area', 'European_Primary_Forest', 'GLC_FCS30D_TC_2022', 'GLC_FCS30D_crop_2022', 'IFL_2020', 'IIASA_planted_plantation', 'Cocoa_bnetd', 'Oil_palm_Descals', 'ESA_fire_before_2020', 'ESA_fire_2001', 'ESA_fire_2002', 'ESA_fire_2003', 'ESA_fire_2004', 'ESA_fire_2005', 'ESA_fire_2006', 'ESA_fire_2007', 'ESA_fire_2008', 'ESA_fire_2009', 'ESA_fire_2010', 'ESA_fire_2011', 'ESA_fire_2012', 'ESA_fire_2013', 'ESA_fire_2014', 'ESA_fire_2015', 'ESA_fire_2016', 'ESA_fire_2017', 'ESA_fire_2018', 'ESA_fire_2019', 'ESA_fire_2020', 'ESA_TC_2020', 'ESRI_2023_TC', 'ESRI_2023_crop', 'Cocoa_ETH', 'Cocoa_2023_FDaP', 'Cocoa_FDaP', 'Forest_FDaP', 'Oil_palm_2023_FDaP', 'Oil_palm_FDaP', 'Rubber_2023_FDaP', 'Rubber_FDaP', 'GFT_naturally_regenerating', 'GFT_planted_plantation', 'GFT_primary', 'GFC_TC_2020', 'GFC_loss_after_2020', 'GFC_loss_before_2020', 'GFC_loss_year_2001', 'GFC_loss_year_2002', 'GFC_loss_year_2003', 'GFC_loss_year_2004', 'GFC_loss_year_2005', 'GFC_loss_year_2006', 'GFC_loss_year_2007', 'GFC_los



Error calculating zonal statistics for whisp_all_bands_europe_africa_32.227_36.984_10h.tif: 0
Location 32.227, 36.984: Successfully downloaded whisp_all_bands_europe_africa_32.227_36.984_10h.tif in 4.61s
Error calculating zonal statistics for whisp_all_bands_europe_africa_18.619_-6.145_10h.tif: 0
Location 18.619, -6.145: Successfully downloaded whisp_all_bands_europe_africa_18.619_-6.145_10h.tif in 4.74s




Error calculating zonal statistics for whisp_all_bands_europe_africa_-17.877_-7.419_10h.tif: 0
Location -17.877, -7.419: Successfully downloaded whisp_all_bands_europe_africa_-17.877_-7.419_10h.tif in 5.39s




Error calculating zonal statistics for whisp_all_bands_americas_-106.819_43.557_10h.tif: 0
Location -106.819, 43.557: Successfully downloaded whisp_all_bands_americas_-106.819_43.557_10h.tif in 5.67s
Error calculating zonal statistics for whisp_all_bands_americas_-77.55_28.798_10h.tif: 0
Error calculating zonal statistics for whisp_all_bands_asia_oceania_87.462_-18.858_10h.tif: 0
Location -77.55, 28.798: Successfully downloaded whisp_all_bands_americas_-77.55_28.798_10h.tif in 5.83s
Location 87.462, -18.858: Successfully downloaded whisp_all_bands_asia_oceania_87.462_-18.858_10h.tif in 5.90s
Error calculating zonal statistics for whisp_all_bands_asia_oceania_63.353_42.934_10h.tif: 0
Location 63.353, 42.934: Successfully downloaded whisp_all_bands_asia_oceania_63.353_42.934_10h.tif in 5.94s
Error calculating zonal statistics for whisp_all_bands_americas_-113.913_11.507_10h.tif: 0
Location -113.913, 11.507: Successfully downloaded whisp_all_bands_americas_-113.913_11.507_10h.tif in 6.28s



Error calculating zonal statistics for whisp_all_bands_europe_africa_28.605_19.826_10h.tif: 0
Location 28.605, 19.826: Successfully downloaded whisp_all_bands_europe_africa_28.605_19.826_10h.tif in 8.16s

Download Summary:
  Start time: 2025-04-16 18:13:57
  End time: 2025-04-16 18:14:05
  Total processing time: 8.26s
  Success rate: 100.0% (10/10)

Download Time Statistics:
  Average time: 5.89s
  Median time: 5.86s
  Min time: 4.61s
  Max time: 8.16s
  Standard Deviation: 0.99s

Zonal Statistics:
  Statistics calculated: 0
  No zonal statistics were generated.

Downloaded images are saved to: C:\Users\Arnell\Downloads\whisp_samples
Job completed in 8.26 seconds
Success rate: 100.0%
Zonal statistics file: None


testing extact extract package 

In [None]:
    import pandas as pd
    import rasterio
    import geopandas as gpd
    import re
    from pathlib import Path
    from datetime import datetime
    from shapely.geometry import box
    from exactextract import exact_extract
    

In [None]:
# def simple_zonal_stats_exactextract(directory=None, output_csv=None):
#     """
#     Ultra-simple zonal statistics calculator using exactextract.
    
#     Args:
#         directory: Directory containing GeoTIFF files
#         output_csv: Path for output CSV
#     """

#     # Setup directory
#     directory = Path(directory or Path.home() / 'Downloads' / 'whisp_samples')
#     output_csv = output_csv or directory / f"whisp_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    
#     # Find GeoTIFF files
#     tiff_files = list(directory.glob("*.tif"))
#     print(f"Found {len(tiff_files)} GeoTIFF files")
    
#     # Store results
#     all_results = []
    
#     # Process each file
#     for tiff_file in tiff_files:
#         print(f"Processing {tiff_file.name}...")
        
#         try:
#             # Extract location info from filename
#             match = re.search(r'([a-z_]+)_(-?\d+\.\d+)_(-?\d+\.\d+)_(\d+)h?', tiff_file.name)
#             region = match.group(1) if match else "unknown"
#             lon = float(match.group(2)) if match else 0.0
#             lat = float(match.group(3)) if match else 0.0
#             hectares = int(match.group(4)) if match else 0
            
#             # Open the file and get bounds
#             with rasterio.open(tiff_file) as src:
#                 # Create a polygon from the bounds
#                 geom = box(*src.bounds)
#                 gdf = gpd.GeoDataFrame({'id': [1], 'geometry': [geom]}, crs=src.crs)
                
#                 # Process each band
#                 for band in range(1, src.count + 1):
#                     # Calculate all statistics at once
#                     stats_df = exact_extract(
#                         src,
#                         gdf,
#                         # ['min', 'max', 'mean', 'median', 'stdev', 'count'],
#                         ['count'],
#                         include_cols=['id'],
#                         output='pandas'
#                     )
                    
#                     # Transform the results to our desired format
#                     for stat_name in ['count']:#['min', 'max', 'mean', 'median', 'stdev', 'count']:
#                         if stat_name in stats_df.columns:
#                             all_results.append({
#                                 'filename': tiff_file.name,
#                                 'longitude': lon,
#                                 'latitude': lat,
#                                 'region': region, 
#                                 'band': f"B{band}",
#                                 'statistic': 'std' if stat_name == 'stdev' else stat_name,
#                                 'value': stats_df.iloc[0][stat_name],
#                                 'hectares': hectares
#                             })
            
#             print(f"  Successfully processed {src.count} bands")
                
#         except Exception as e:
#             print(f"  Error processing {tiff_file.name}: {str(e)}")
    
#     # Create and save DataFrame
#     if all_results:
#         df = pd.DataFrame(all_results)
#         df.to_csv(output_csv, index=False)
#         print(f"\nStatistics saved to: {output_csv}")
#         return output_csv
#     else:
#         print("No statistics were calculated")
#         return None

In [None]:
# Run on default directory
# stats_csv = simple_zonal_stats_exactextract()

# Or specify custom paths
custom_dir = Path.home() / 'Downloads' / 'whisp_samples'
output_file = Path.home() / 'Downloads' / 'a3_whisp_stats.csv'
stats_csv = simple_zonal_stats_exactextract(directory=custom_dir, output_csv=output_file)




Found 3 GeoTIFF files
Processing whisp_all_bands_americas_-77.55_28.798_10h.tif...




  Successfully processed 167 bands
Processing whisp_all_bands_americas_-90.96_25.267_10h.tif...




  Successfully processed 167 bands
Processing whisp_all_bands_europe_africa_32.227_36.984_10h.tif...




  Successfully processed 167 bands
No statistics were calculated
Found 3 GeoTIFF files
Processing whisp_all_bands_americas_-77.55_28.798_10h.tif...




  Successfully processed 167 bands
Processing whisp_all_bands_americas_-90.96_25.267_10h.tif...




  Successfully processed 167 bands
Processing whisp_all_bands_europe_africa_32.227_36.984_10h.tif...
  Successfully processed 167 bands
No statistics were calculated


testing numpy

In [None]:
    import numpy as np
    import pandas as pd
    import rasterio
    from pathlib import Path

In [45]:
def simple_band_sums(directory=None, output_csv=None):
    """
    Super simple function that just outputs filename and sum of each band.
    
    Args:
        directory: Directory containing GeoTIFF files
        output_csv: Path for output CSV
    """

    
    # Setup directory
    directory = Path(directory or Path.home() / 'Downloads' / 'whisp_samples')
    output_csv = output_csv or directory / "band_sums.csv"
    
    # Find all tiff files
    tiff_files = list(directory.glob("*.tif"))
    print(f"Found {len(tiff_files)} GeoTIFF files")
    
    # Store results
    results = []
    
    # Process each file
    for tiff_file in tiff_files:
        try:
            with rasterio.open(tiff_file) as src:
                # Process each band with direct NumPy sum
                for band in range(1, src.count + 1):
                    # Read band data and handle NoData
                    band_data = src.read(band)
                    if src.nodata is not None:
                        band_data = band_data.astype('float64')
                        band_data[band_data == src.nodata] = np.nan
                    
                    # Calculate sum ignoring NaN values
                    sum_value = np.nansum(band_data)
                    
                    # Add to results
                    results.append({
                        'filename': tiff_file.name,
                        'band': band,
                        'sum': sum_value
                    })
            
        except Exception as e:
            print(f"Error processing {tiff_file.name}: {str(e)}")
    
    # Create and save DataFrame
    if results:
        df = pd.DataFrame(results)
        df.to_csv(output_csv, index=False)
        print(f"Band sums saved to: {output_csv}")
        return df
    else:
        print("No results were calculated")
        return None

In [None]:
# Just run it
sums_df = simple_band_sums()

# View the results directly
print(sums_df)

#pretty quick if already clipped (they werent): 2 mins for 1000 10ha tifs



Found 1004 GeoTIFF files




Band sums saved to: C:\Users\Arnell\Downloads\whisp_samples\band_sums.csv
                                                 filename  band            sum
0        whisp_all_bands_americas_-100.532_45.211_10h.tif     1  106946.890625
1        whisp_all_bands_americas_-100.532_45.211_10h.tif     2       0.000000
2        whisp_all_bands_americas_-100.532_45.211_10h.tif     3       0.000000
3        whisp_all_bands_americas_-100.532_45.211_10h.tif     4       0.000000
4        whisp_all_bands_americas_-100.532_45.211_10h.tif     5       0.000000
...                                                   ...   ...            ...
165039  whisp_bands_0_1_2_europe_africa_29.59_34.161_1...     2       0.000000
165040  whisp_bands_0_1_2_europe_africa_29.59_34.161_1...     3       0.000000
165041  whisp_bands_0_1_2_europe_africa_7.448_59.732_1...     1  106805.778076
165042  whisp_bands_0_1_2_europe_africa_7.448_59.732_1...     2       0.000000
165043  whisp_bands_0_1_2_europe_africa_7.448_59.732_1...

testing zonal stats


In [11]:
    import rasterio
    import pandas as pd
    import geopandas as gpd
    import os
    import re
    from pathlib import Path
    from datetime import datetime
    from shapely.geometry import box
    
    # Ensure we have the required libraries
    try:
        import rasterio
        import exactextract
        from exactextract import exact_extract
        import geopandas
    except ImportError:
        print("Installing required packages for zonal statistics...")
        import subprocess
        subprocess.check_call(["pip", "install", "rasterio", "exactextract", "geopandas"])
        import rasterio
        import exactextract
        from exactextract import exact_extract
        import geopandas as gpd

In [24]:
def calculate_zonal_stats_for_existing_files(directory=None, output_csv=None):
    """
    Calculate zonal statistics for all GeoTIFF files in the specified directory
    using exactextract for better performance.
    
    Args:
        directory (Path or str): Directory containing GeoTIFF files
        output_csv (Path or str): Path to save the output CSV file (default: same dir with timestamp)
        
    Returns:
        Path: Path to the generated CSV file with statistics
    """
    
    # Set default directory if none provided
    if directory is None:
        directory = Path.home() / 'Downloads' / 'whisp_samples'
    else:
        directory = Path(directory)
    
    if not directory.exists():
        raise ValueError(f"Directory not found: {directory}")
    
    # Find all GeoTIFF files
    tiff_files = list(directory.glob("*.tif"))
    if not tiff_files:
        print(f"No GeoTIFF files found in {directory}")
        return None
    
    print(f"Found {len(tiff_files)} GeoTIFF files for analysis")
    
    # Store all statistics
    all_stats = []
    
    # Process each file
    for tiff_file in tiff_files:
        filename = tiff_file.name
        print(f"Processing {filename}...")
        
        try:
            # Extract location info from filename using regex
            # Looking for patterns like "whisp_all_bands_americas_-65.234_25.789_4h.tif"
            match = re.search(r'([a-z_]+)_(-?\d+\.\d+)_(-?\d+\.\d+)_(\d+)h', filename)
            
            if match:
                region = match.group(1)
                lon = float(match.group(2))
                lat = float(match.group(3))
                hectares = int(match.group(4))
            else:
                # Fallback if filename doesn't match pattern
                region = "unknown"
                lon = 0.0
                lat = 0.0
                hectares = 0
            
            # Open the GeoTIFF and get metadata
            with rasterio.open(tiff_file) as src:
                # Get bounds and transform
                bounds = src.bounds
                transform = src.transform
                num_bands = src.count
                
                # Create a polygon from the bounds
                geom = box(bounds.left, bounds.bottom, bounds.right, bounds.top)
                
                # Create a GeoDataFrame with the polygon
                gdf = gpd.GeoDataFrame({'id': [1], 'geometry': [geom]}, crs=src.crs)
                
                # Calculate statistics for each band
                for band_idx in range(1, num_bands + 1):
                    # Read band data
                    band_data = src.read(band_idx)
                    band_name = f"B{band_idx}"
                    
                    # Calculate zonal statistics using exactextract
                    # Remove the problematic parameter
                    stats = exact_extract(
                        band_data,
                        gdf,
                        transform,
                    #     ['min', 'max', 'mean', 'median', 'stdev', 'count']
                    
                    )
                    
                    # Check if stats is valid
                    if stats is not None and len(stats) > 0:
                        stats_dict = stats.iloc[0].to_dict()
                        
                        # Add each statistic to results with standardized names
                        for stat_name, stat_value in stats_dict.items():
                            # Map exactextract stat names to our standardized names
                            if stat_name == 'stdev':
                                stat_name = 'std'
                                
                            if stat_value is not None:  # Skip None values
                                all_stats.append({
                                    "filename": filename,
                                    "longitude": lon,
                                    "latitude": lat,
                                    "region": region,
                                    "band": band_name,
                                    "statistic": stat_name,
                                    "value": stat_value,
                                    "hectares": hectares
                                })
            
            print(f"  Successfully calculated statistics for band {band_idx}")
                
        except Exception as e:
            print(f"  Error processing {filename}: {str(e)}")
    
    # Create dataframe with results
    if all_stats:
        df_stats = pd.DataFrame(all_stats)
        
        # Create output filename if not specified
        if output_csv is None:
            timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            output_csv = directory / f"whisp_zonal_stats_{timestamp}.csv"
        else:
            output_csv = Path(output_csv)
        
        # Save to CSV
        df_stats.to_csv(output_csv, index=False)
        print(f"\nZonal statistics saved to: {output_csv}")
        
        # Print summary
        print("\nSummary statistics:")
        print(f"  Files processed: {len(tiff_files)}")
        print(f"  Total statistics calculated: {len(all_stats)}")
        print(f"  Unique bands: {df_stats['band'].nunique()}")
        
        # Show average value for each statistic type
        print("\nAverage values by statistic type:")
        for stat_type in df_stats['statistic'].unique():
            avg_val = df_stats[df_stats['statistic'] == stat_type]['value'].mean()
            print(f"  {stat_type}: {avg_val:.4f}")
        
        return output_csv
    else:
        print("No statistics were calculated.")
        return None

In [25]:
# Run zonal statistics on all existing GeoTIFF files in the default directory
# stats_file = calculate_zonal_stats_for_existing_files()

# # Or specify a custom directory and output file
# # custom_dir = Path.home() / 'my_geotiffs'
custom_dir = Path.home() / 'Downloads' / 'whisp_samples'
output_file = Path.home() / 'Downloads' / 'A2_my_custom_stats.csv'
stats_file = calculate_zonal_stats_for_existing_files(directory=custom_dir, output_csv=output_file)



Found 10 GeoTIFF files for analysis
Processing whisp_all_bands_americas_-106.819_43.557_10h.tif...
  Error processing whisp_all_bands_americas_-106.819_43.557_10h.tif: Unhandled raster datatype
Processing whisp_all_bands_americas_-113.913_11.507_10h.tif...
  Error processing whisp_all_bands_americas_-113.913_11.507_10h.tif: Unhandled raster datatype
Processing whisp_all_bands_americas_-77.55_28.798_10h.tif...
  Error processing whisp_all_bands_americas_-77.55_28.798_10h.tif: Unhandled raster datatype
Processing whisp_all_bands_americas_-90.96_25.267_10h.tif...




  Error processing whisp_all_bands_americas_-90.96_25.267_10h.tif: Unhandled raster datatype
Processing whisp_all_bands_asia_oceania_63.353_42.934_10h.tif...
  Error processing whisp_all_bands_asia_oceania_63.353_42.934_10h.tif: Unhandled raster datatype
Processing whisp_all_bands_asia_oceania_87.462_-18.858_10h.tif...
  Error processing whisp_all_bands_asia_oceania_87.462_-18.858_10h.tif: Unhandled raster datatype
Processing whisp_all_bands_europe_africa_-17.877_-7.419_10h.tif...
  Error processing whisp_all_bands_europe_africa_-17.877_-7.419_10h.tif: Unhandled raster datatype
Processing whisp_all_bands_europe_africa_18.619_-6.145_10h.tif...




  Error processing whisp_all_bands_europe_africa_18.619_-6.145_10h.tif: Unhandled raster datatype
Processing whisp_all_bands_europe_africa_28.605_19.826_10h.tif...
  Error processing whisp_all_bands_europe_africa_28.605_19.826_10h.tif: Unhandled raster datatype
Processing whisp_all_bands_europe_africa_32.227_36.984_10h.tif...
  Error processing whisp_all_bands_europe_africa_32.227_36.984_10h.tif: Unhandled raster datatype
No statistics were calculated.


not working

In [None]:
def process_location(image,location_data, hectares=4, band_indices=None):
    """
    Process a location and download a GeoTIFF with selected bands.
    
    Args:
        location_data: Tuple containing (longitude, latitude, region_name)
        hectares: Area size in hectares (default: 4)
        band_indices
        : List of band indices to include (default: None = all bands)
        
    Returns:
        str: Status message about the download
    """
    lon, lat, region = location_data
    try:
        # Create the bounding box with specified hectares
        bbox_feature = create_bbox(lon, lat, hectares=hectares)
        
        # Create the combined dataset
        combined_image = image.clip(bbox_feature)
        
        # # Select specific bands if requested
        # if band_indices is not None:
        #     combined_image = combined_image.select(band_indices)
        #     bands_info = f"bands_{'-'.join(map(str, band_indices))}"
        # else:
        #     # Get all band names for default case
        #     band_names = combined_image.bandNames().getInfo()
        #     combined_image = combined_image.select(band_names)
        #     bands_info = "all_bands"
        
        # Get the download URL
        download_url = combined_image.getDownloadURL({
            'format': 'Geo_TIFF',
            'region': bbox_feature.geometry(),
            'scale': 10,
            'crs': 'EPSG:4326'
        })
        
        # Create a unique filename
        filename = f"whisp_{region}_{lon}_{lat}_{hectares}ha.tif"
        output_path = out_directory / filename
        
        # Download the image
        response = requests.get(download_url)
        if response.status_code == 200:
            with open(output_path, 'wb') as f:
                f.write(response.content)
            return f"Successfully downloaded {filename}"
        else:
            return f"Failed to download {filename}: Status {response.status_code}"
        
    except Exception as e:
        return f"Error processing location {lon}, {lat}: {str(e)}"

In [None]:
image = whisp.combine_datasets()

# Download with specific bands (indices 0, 1, 2)
result = process_location(image, (lon, lat, region), hectares=10, band_indices=[0, 1, 2])
print(result)
# Download with non-sequential bands (indices 0, 3, 5)
result = process_location(image,(lon, lat, region), hectares=10, band_indices=[0, 3, 5])
print(result)
# Download all bands (default)
result = process_location(image,(lon, lat, region), hectares=10)
print(result)

In [None]:
import concurrent.futures
import random
import requests
import time
from pathlib import Path
from statistics import mean, median, stdev

# Define output directory
out_directory = Path.home() / 'Downloads' / 'whisp_samples'
out_directory.mkdir(exist_ok=True, parents=True)

# Define regions with forest coverage
forest_regions = [
    [-120, -40, -20, 50, "americas"],
    [-20, 40, -30, 60, "europe_africa"],
    [60, 150, -40, 60, "asia_oceania"]
]

# Generate random locations
random_locations = []
for i in range(3):
    region = random.choice(forest_regions)
    lon = round(random.uniform(region[0], region[1]), 3)
    lat = round(random.uniform(region[2], region[3]), 3)
    random_locations.append((lon, lat, region[4]))

# Track timing information
download_times = []

# Use ThreadPoolExecutor for parallel downloads
print("Starting parallel downloads...")
total_start_time = time.time()

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_to_location = {executor.submit(process_location, loc): loc for loc in random_locations}
    
    for future in concurrent.futures.as_completed(future_to_location):
        location = future_to_location[future]
        try:
            result, elapsed_time = future.result()
            download_times.append(elapsed_time)
            print(f"Location {location[0]}, {location[1]}: {result}")
        except Exception as e:
            print(f"Location {location[0]}, {location[1]} generated an exception: {str(e)}")

total_elapsed = time.time() - total_start_time

# Calculate statistics
if download_times:
    avg_time = mean(download_times)
    med_time = median(download_times)
    min_time = min(download_times)
    max_time = max(download_times)
    
    # Calculate standard deviation if more than one download
    if len(download_times) > 1:
        std_dev = stdev(download_times)
        std_dev_info = f"Standard Deviation: {std_dev:.2f}s"
    else:
        std_dev_info = "Standard Deviation: N/A (need at least 2 samples)"
    
    print(f"\nDownload Time Statistics:")
    print(f"  Average time: {avg_time:.2f}s")
    print(f"  Median time: {med_time:.2f}s")
    print(f"  Min time: {min_time:.2f}s")
    print(f"  Max time: {max_time:.2f}s")
    print(f"  {std_dev_info}")
    print(f"  Total time for all downloads: {total_elapsed:.2f}s")
else:
    print("No successful downloads to calculate statistics")

print(f"\nDownloaded images are saved to: {out_directory}")

Whisp it

In [None]:
df_formatted_stats = whisp.whisp_formatted_stats_geojson_to_df(GEOJSON_EXAMPLE_FILEPATH)

Display table

In [None]:
df_formatted_stats

In [None]:
# Define the output folder (if running in Sepal change path to preferred folder) 
out_directory = Path.home() / 'Downloads'

# Define the output file path for CSV
csv_output_file = out_directory / 'whisp_output_table_stats.csv'

# Save the CSV file
df_formatted_stats.to_csv(path_or_buf=csv_output_file, index=False)
print(f"Table with risk columns saved to: {csv_output_file}")

Calculate risk category

In [None]:
# add risk columns to end of dataframe
df_w_risk = whisp.whisp_risk(df=df_formatted_stats)

Display table with risk columns

In [None]:
df_w_risk

Export table to CSV

In [None]:
# Define the output folder 
# e.g. in running in Sepal this might be: Path.home() / 'module_results/whisp/'
out_directory = Path.home() / 'Downloads'

# Define the output file path for CSV
csv_output_file = out_directory / 'whisp_output_table_w_risk.csv'

# Save the CSV file
df_w_risk.to_csv(path_or_buf=csv_output_file, index=False)
print(f"Table with risk columns saved to: {csv_output_file}")

Export to GeoJSON (optional)

In [None]:
# Define the output file path for GeoJSON
geojson_output_file = out_directory / 'whisp_output_table.geojson'

# Save the GeoJSON file
whisp.convert_df_to_geojson(df_w_risk, geojson_output_file)  # builds a geojson file containing Whisp columns. Uses the geometry column "geo" to create the spatial features.
print(f"GeoJSON file saved to: {geojson_output_file}")