#### Main changes
- Using selfMask() to avoid empty pixels (a lot of bands are sparse)
- Skipping the validation step – pandera should be fast, but there is some temporary schema generation that takes time
- Using high volume end point and concurrent processing
- Using reduceRegions instead of a mapped reduceRegion – a chunk of code for choosing ha or percent etc. is based on using reduceRegion, and this change also allowed that code to be skipped
- Skipping the use of points to get the admin details (country and level 1 info) and water_flag (should be based on image but was using vector admin still)


In [66]:
import ee

# Reset Earth Engine completely
ee.Reset()

# Initialize with standard (normal) endpoint
# ee.Initialize()

In [67]:
# Earth Engine and Common Libraries
import ee
from pathlib import Path

# Authenticate and initialize Earth Engine
try:
    ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')  # Try to use existing credentials first
except Exception:
    ee.Authenticate()
    ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')

In [68]:
# !pip install --upgrade --pre openforis-whisp

In [69]:
combined_reducer = ee.Reducer.sum().combine(ee.Reducer.median(),sharedInputs=True)

In [70]:
# Check which endpoint is now active
print("EE Data Base URL:", ee.data._cloud_api_base_url)
print("EE API Base URL:", ee.data._api_base_url)

# Check whether the high-volume endpoint is active
if 'highvolume' in str(ee.data._cloud_api_base_url):
    print("✅ Using HIGH-VOLUME endpoint")
else:
    print("❌ Using STANDARD endpoint")

EE Data Base URL: https://earthengine-highvolume.googleapis.com
EE API Base URL: https://earthengine-highvolume.googleapis.com/api
✅ Using HIGH-VOLUME endpoint


In [71]:
import openforis_whisp as whisp

In [72]:
whisp_image = whisp.combine_datasets()

Whisp multiband image compiled


In [73]:
import ee
import geopandas as gpd
import pandas as pd
import time
import threading
from queue import Queue
import logging
from typing import List, Optional, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import openforis_whisp as whisp
import tempfile
import os
import sys

# Configure logging ONCE. force=True makes basicConfig remove (and close) any
# handlers already attached to the root logger, so re-running this cell does
# not stack duplicate handlers and no manual removal loop is needed.
logging.basicConfig(
    level=logging.WARNING,
    format='%(levelname)s: %(message)s',
    stream=sys.stdout,
    force=True,
)
logger = logging.getLogger("whisp-batch")

# Reduce log noise from the Google API client libraries
logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR)
logging.getLogger('googleapiclient').setLevel(logging.WARNING)

# Defaults used by the batch-processing functions below
EE_MAX_CONCURRENT = 10       # default number of worker threads (max_concurrent)
EE_FEATURES_PER_BATCH = 25   # default number of features per reduceRegions request (batch_size)
MAX_RETRIES = 3              # default number of attempts per batch (max_retries)

def validate_geodataframe(gdf: gpd.GeoDataFrame, remove_nulls: bool = True, fix_invalid: bool = True) -> gpd.GeoDataFrame:
    """Validate and optionally clean a GeoDataFrame's geometries.

    The caller's frame is never modified: rows are dropped by filtering and
    geometries are repaired on a copy.

    Args:
        gdf: Features to check.
        remove_nulls: If True, drop rows whose geometry is missing.
        fix_invalid: If True, repair invalid geometries with shapely's
            ``make_valid``.

    Returns:
        The cleaned GeoDataFrame. When nothing needed changing this is the
        input object itself.
    """
    if remove_nulls:
        null_mask = gdf.geometry.isna()
        null_count = null_mask.sum()
        if null_count > 0:
            print(f"⚠️  Found {null_count} null geometries - removing...", flush=True)
            gdf = gdf[~null_mask]
    if fix_invalid:
        valid_count = gdf.geometry.is_valid.sum()
        invalid_count = len(gdf) - valid_count
        if invalid_count > 0:
            print(f"⚠️  Found {invalid_count} invalid geometries - fixing...", flush=True)
            from shapely.validation import make_valid
            # Work on a copy: assigning into the input (or into a filtered
            # slice of it) would mutate the caller's data or trigger pandas'
            # SettingWithCopyWarning.
            gdf = gdf.copy()
            gdf['geometry'] = gdf['geometry'].apply(
                lambda g: make_valid(g) if g is not None and not g.is_valid else g
            )
    print(f"✅ Validation complete. {len(gdf):,} geometries ready.", flush=True)
    return gdf

def batch_geodataframe(gdf: gpd.GeoDataFrame, batch_size: int) -> List[gpd.GeoDataFrame]:
    """Cut a GeoDataFrame into consecutive row slices of at most ``batch_size`` rows.

    The slices follow the frame's positional row order; the last one may be
    shorter than ``batch_size``.
    """
    batches = []
    for start in range(0, len(gdf), batch_size):
        stop = start + batch_size
        batches.append(gdf.iloc[start:stop])
    return batches

def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
    """Turn one batch of features into an Earth Engine FeatureCollection.

    The batch is written to a temporary GeoJSON file, which is passed to
    ``whisp.convert_geojson_to_ee``. The temporary file is deleted afterwards,
    whether or not the conversion succeeded.
    """
    descriptor, geojson_tmp = tempfile.mkstemp(suffix='.geojson', text=True)
    try:
        # Only the path is needed; the file is re-opened by name when writing.
        os.close(descriptor)
        batch_gdf.to_file(geojson_tmp, driver='GeoJSON')
        return whisp.convert_geojson_to_ee(geojson_tmp)
    finally:
        # Brief pause before cleanup (presumably to let file handles be
        # released before deletion - TODO confirm).
        time.sleep(0.1)
        if os.path.exists(geojson_tmp):
            try:
                os.unlink(geojson_tmp)
            except OSError as cleanup_error:
                # Best effort: a leftover temp file must not fail the batch.
                logger.warning(f"Could not delete temp file {geojson_tmp}: {cleanup_error}")

def process_ee_feature_collection(feature_collection: ee.FeatureCollection, whisp_image: ee.Image, reducer: ee.Reducer, batch_idx: int, max_retries: int = MAX_RETRIES) -> pd.DataFrame:
    """Run ``reduceRegions`` on one batch, retrying on failure, and return a DataFrame.

    Args:
        feature_collection: Features of this batch.
        whisp_image: Image whose bands are reduced over each feature.
        reducer: Reducer applied by ``reduceRegions`` (at scale 10).
        batch_idx: Zero-based batch index; used (1-based) in messages only.
        max_retries: Total number of attempts before giving up.

    Returns:
        The reduced statistics converted with ``whisp.convert_ee_to_df``.

    Raises:
        Exception: On a geometry transformation error (not retried) or when
            quota/rate-limit errors persist through the last attempt. The
            original Earth Engine error is attached as ``__cause__``.
        ee.EEException: Any other Earth Engine error on the last attempt.
        RuntimeError: If no attempt was made (``max_retries`` <= 0).
    """
    batch_number = batch_idx + 1
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            results = whisp_image.reduceRegions(
                collection=feature_collection,
                reducer=reducer,
                scale=10
            )
            return whisp.convert_ee_to_df(results)
        except ee.EEException as e:
            error_msg = str(e)
            lowered_msg = error_msg.lower()
            if "Unable to transform geometry" in error_msg:
                # Retrying cannot fix a bad geometry, so fail immediately.
                raise Exception(f"Geometry transformation error in batch {batch_number}: {error_msg}") from e
            if "Quota" in error_msg or "limit" in lowered_msg:
                if is_last_attempt:
                    # Keep the underlying message: the "limit" match is broad,
                    # so the real cause must stay visible to the caller.
                    raise Exception(f"Quota/rate limit exhausted for batch {batch_number}: {error_msg}") from e
                backoff = min(30, 2 ** attempt)
                print(f"⏳ Quota/rate limit hit, waiting {backoff}s before retry...", flush=True)
            elif "timeout" in lowered_msg or "timed out" in lowered_msg:
                if is_last_attempt:
                    raise
                backoff = min(15, 2 ** attempt)
                print(f"⏳ Timeout, retrying in {backoff}s...", flush=True)
            else:
                if is_last_attempt:
                    raise
                backoff = min(10, 2 ** attempt)
            time.sleep(backoff)
        except Exception:
            # Non-EE failures (e.g. during DataFrame conversion): short backoff.
            if is_last_attempt:
                raise
            time.sleep(min(5, 2 ** attempt))
    # Only reachable when the loop body never ran (max_retries <= 0).
    raise RuntimeError(f"Failed to process batch {batch_number} after {max_retries} attempts")

def process_geojson_file(
    geojson_path: str,
    whisp_image: ee.Image,
    reducer: ee.Reducer,
    batch_size: int = EE_FEATURES_PER_BATCH,
    max_concurrent: int = EE_MAX_CONCURRENT,
    validate_null_geometries: bool = True,
    validate_invalid_geometries: bool = True,
    max_retries: int = MAX_RETRIES
) -> pd.DataFrame:
    """Process a GeoJSON file in concurrent batches using Whisp and Earth Engine.

    Args:
        geojson_path: Path of the GeoJSON file to read.
        whisp_image: Image whose bands are reduced over each feature.
        reducer: Reducer passed to ``reduceRegions``.
        batch_size: Number of features sent per Earth Engine request.
        max_concurrent: Number of worker threads issuing requests.
        validate_null_geometries: Drop features with a missing geometry.
        validate_invalid_geometries: Repair invalid geometries before sending.
        max_retries: Attempts per batch before it is reported as failed.

    Returns:
        One DataFrame with the rows of all successful batches, concatenated in
        input (batch) order. Failed batches are reported on stdout and their
        features are absent. An empty DataFrame is returned when nothing
        succeeded.
    """
    print("🔍 Loading and validating GeoJSON file...", flush=True)
    gdf = gpd.read_file(geojson_path)
    print(f"📁 Loaded {len(gdf):,} features from {geojson_path}", flush=True)
    gdf = validate_geodataframe(gdf, remove_nulls=validate_null_geometries, fix_invalid=validate_invalid_geometries)
    batches = batch_geodataframe(gdf, batch_size)
    if not batches:
        print("❌ No features to process", flush=True)
        return pd.DataFrame()
    print(f"📊 Processing {len(gdf):,} features in {len(batches)} batches ({batch_size} features/batch)", flush=True)
    print(f"🔄 Running {max_concurrent} concurrent requests...", flush=True)

    def process_one_batch(batch_gdf, batch_idx):
        fc = convert_batch_to_ee(batch_gdf)
        return process_ee_feature_collection(fc, whisp_image, reducer, batch_idx, max_retries)

    # Keyed by batch index so the output order does not depend on which
    # request happens to finish first.
    results_by_batch = {}
    failed_batches = []
    with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
        future_to_idx = {executor.submit(process_one_batch, batch, i): i for i, batch in enumerate(batches)}
        for future in as_completed(future_to_idx):
            batch_idx = future_to_idx[future]
            try:
                results_by_batch[batch_idx] = future.result()
                print(f"⏳ Progress: Batch {batch_idx + 1} ✓", flush=True)
            except Exception as e:
                # A failed batch must not abort the others; record and continue.
                failed_batches.append(batch_idx)
                print(f"❌ Batch {batch_idx + 1} failed: {str(e)[:80]}...", flush=True)

    if not results_by_batch:
        print("❌ No results produced - all batches failed", flush=True)
        return pd.DataFrame()
    if failed_batches:
        failed_numbers = ", ".join(str(i + 1) for i in sorted(failed_batches))
        print(f"⚠️  {len(failed_batches)} of {len(batches)} batches failed (batch {failed_numbers}); their features are missing from the results", flush=True)
    ordered_results = [results_by_batch[i] for i in sorted(results_by_batch)]
    return pd.concat(ordered_results, ignore_index=True)

In [74]:
!pip show openforis-whisp

Name: openforis-whisp
Version: 2.0.0b1
Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
Home-page: 
Author: Andy Arnell
Author-email: andrew.arnell@fao.org
License: MIT
Location: c:\Users\Arnell\Documents\GitHub\whisp\.venv\Lib\site-packages
Editable project location: C:\Users\Arnell\Documents\GitHub\whisp
Requires: country_converter, earthengine-api, geojson, geopandas, ipykernel, numpy, pandas, pandera, pydantic-core, python-dotenv, rsa, shapely
Required-by: 


In [75]:
folder_path = (r"C:\Users\Arnell\Downloads\a_processing_tests")  # Replace with your folder path

In [76]:
GEOJSON_EXAMPLE_FILEPATH = folder_path+"/random_polygons.geojson"

# Define bounds from the provided Earth Engine geometry
# # area in Ghana 
# bounds = [ 
#     -3.04548260909834,  # min_lon
#     5.253961384163733,  # min_lat
#     -1.0179939534016594,  # max_lon
#     7.48307210714245    # max_lat
# ]

# area in China
bounds = [
    90.44831497309737,  # min_lon
    20.686366665187148,  # min_lat
    114.57868606684737,  # max_lon
    30.79200348254393    # max_lat
]

# # Brazil etc
# bounds = [-81.06002305884182,
#         -19.332462745930076,
#         -31.48971055884182,
#          9.600139384904205
#         ]

Benchmarking different processing methods

In [None]:
# Comprehensive benchmarking: compare methods and endpoints as specified
import time
import pandas as pd
import openforis_whisp as whisp
import tempfile
import json
import os
import numpy as np

def _time_benchmark_run(method, polygon_count, vertices, area, features_per_request=None):
    """Generate test polygons, run one method on them and return elapsed seconds.

    The timer covers Earth Engine initialisation, ``whisp.combine_datasets()``
    and the processing call; polygon generation and temp-file handling are
    excluded. Polygons are generated inside the module-level ``bounds``.
    The temporary GeoJSON file is removed even if the run raises.
    """
    geojson = whisp.generate_test_polygons(
        bounds=bounds,
        num_polygons=polygon_count,
        min_area_ha=area,
        max_area_ha=area,
        min_number_vert=vertices,
        max_number_vert=vertices
    )
    with tempfile.NamedTemporaryFile(suffix='.geojson', delete=False, mode='w') as tmp:
        json.dump(geojson, tmp)
        tmp_path = tmp.name
    try:
        start = time.time()
        # Always pass the endpoint URL explicitly: once the client has been
        # initialised, ee.Initialize() without a URL keeps the previously
        # configured endpoint, so "standard" runs could otherwise silently
        # use the high-volume endpoint.
        if method == "batched":
            ee.Initialize(opt_url="https://earthengine-highvolume.googleapis.com")
        else:
            ee.Initialize(opt_url="https://earthengine.googleapis.com")
        whisp_image = whisp.combine_datasets()
        if method == "batched":
            process_geojson_file(
                geojson_path=tmp_path,
                whisp_image=whisp_image,
                reducer=ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True),
                batch_size=features_per_request,
                max_concurrent=20,
                validate_null_geometries=True,
                validate_invalid_geometries=False,
                max_retries=2
            )
        elif method == "reduceRegions":
            fc = whisp.convert_geojson_to_ee(tmp_path)
            reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
            ee_results = whisp_image.reduceRegions(fc, reducer=reducer, scale=10)
            whisp.convert_ee_to_df(ee_results)
        elif method == "whisp_formatted_stats_geojson_to_df":
            whisp.whisp_formatted_stats_geojson_to_df(
                input_geojson_filepath=tmp_path,
                whisp_image=whisp_image
            )
        # Any other method name runs no processing; only the setup is timed.
        return time.time() - start
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # best effort: a leftover temp file is harmless


def benchmark_all_methods(
    polygon_counts=(100,),
    vertices_list=(10,),
    area_list=(500,),
    features_per_request_list=(10,),
    n_repeats=3,
    method_choices=("reduceRegions", "whisp_formatted_stats_geojson_to_df", "batched")
):
    """Time each processing method over a grid of test-polygon settings.

    Args:
        polygon_counts: Numbers of polygons to generate per run.
        vertices_list: Vertex counts per polygon (used as both min and max).
        area_list: Polygon areas in hectares (used as both min and max).
        features_per_request_list: Batch sizes; only used by the "batched" method.
        n_repeats: Number of timed repetitions per configuration.
        method_choices: Methods to benchmark. "batched" runs
            ``process_geojson_file`` on the high-volume endpoint; the other
            methods run on the standard endpoint.

    Returns:
        A DataFrame with one row per run: endpoint, method, repeat,
        polygon_count, vertices, area, features_per_request (None for
        non-batched methods) and elapsed_sec.
    """
    from itertools import product

    results = []
    for method in method_choices:
        is_batched = method == "batched"
        endpoint = "high-volume" if is_batched else "standard"
        # Batch size is only meaningful for the batched method.
        request_sizes = features_per_request_list if is_batched else (None,)
        for polygon_count, vertices, area, features_per_request in product(
            polygon_counts, vertices_list, area_list, request_sizes
        ):
            for repeat in range(n_repeats):
                elapsed = _time_benchmark_run(
                    method, polygon_count, vertices, area, features_per_request
                )
                results.append({
                    "endpoint": endpoint,
                    "method": method,
                    "repeat": repeat + 1,
                    "polygon_count": polygon_count,
                    "vertices": vertices,
                    "area": area,
                    "features_per_request": features_per_request,
                    "elapsed_sec": elapsed
                })
    return pd.DataFrame(results)


# Example usage:
benchmark_df = benchmark_all_methods(
    polygon_counts=[100],
    vertices_list=[10],
    area_list=[500],
    features_per_request_list=[10],
    n_repeats=3,
    method_choices=["reduceRegions"]#,"whisp_formatted_stats_geojson_to_df"]#["batched","reduceRegions", "whisp_formatted_stats_geojson_to_df"]
)
display(benchmark_df)


In [None]:

benchmark_df.to_csv(folder_path+"/benchmark_all_methods_results.csv", index=False)
print("Benchmarking complete. Results saved to benchmark_all_methods_results.csv.")

In [None]:
# Aggregate elapsed_sec (mean / min / max) for each benchmark configuration.
# NB: the combined CSV was assembled manually from 'benchmark_all_methods_results.csv' outputs.
all_res_df = pd.read_csv(folder_path+"/combined_benchmark_all_methods_results.csv")

# groupby drops rows whose key is NaN, so give the non-batched runs an explicit 'NA' label
all_res_df["features_per_request"] = all_res_df["features_per_request"].fillna('NA')
agg_df = all_res_df.groupby(
    ['datasets_py_edits', 'method', 'polygon_count', 'vertices', 'area', 'features_per_request']
)['elapsed_sec'].agg(
    average_elapsed_sec='mean',
    min_elapsed_sec='min',
    max_elapsed_sec='max'
).reset_index()

# elapsed_sec is the time for a whole run of polygon_count features, so the
# per-feature time is obtained by dividing by polygon_count (not by the batch
# size), which keeps batched and non-batched methods comparable.
agg_df["avg_sec_per_feat"] = agg_df["average_elapsed_sec"] / agg_df["polygon_count"]
# agg_df["min_sec_per_feat"] = agg_df["min_elapsed_sec"] / agg_df["polygon_count"]
# agg_df["max_sec_per_feat"] = agg_df["max_elapsed_sec"] / agg_df["polygon_count"]

agg_df

In [None]:
# Save the aggregated benchmark table next to the raw results
agg_df.to_csv(folder_path+"/aggregated_combined_all_methods_results.csv", index=False)

Separate testing of functions

In [78]:
# Option 1: Use simple bounds (list)
random_geojson = whisp.generate_test_polygons(
    bounds=bounds, 
    num_polygons=10, 
    min_area_ha=50, 
    max_area_ha=100, 
    min_number_vert=500,     
    max_number_vert=1000     
)

# Option 2: Use Earth Engine Geometry directly (commented examples)
# Get bounds from a specific country
# china = ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017').filter(ee.Filter.eq('country_na', 'China')).first().geometry()
# random_geojson = whisp.generate_test_polygons(china, num_polygons=10, min_area_ha=100, max_area_ha=1000)

# Or get bounds from multiple countries
# latam = ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017').filter(ee.Filter.inList('country_na', ['Brazil', 'Colombia', 'Peru'])).geometry()
# random_geojson = whisp.generate_test_polygons(latam, num_polygons=50, min_area_ha=10, max_area_ha=100)

# GEOJSON_EXAMPLE_FILEPATH = folder_path + "/random_polygons.geojson"?
print(GEOJSON_EXAMPLE_FILEPATH)
import json
# Save the GeoJSON to a file
with open(GEOJSON_EXAMPLE_FILEPATH, 'w') as f:
    json.dump(random_geojson, f)

# Use example Whisp inputs (optional)
# GEOJSON_EXAMPLE_FILEPATH = whisp.get_example_data_path("geojson_example.geojson")


# Add IDs to your existing GeoJSON file

# #Save to a new file (instead of overwriting)
# # whisp.reformat_geojson_properties(
# whisp.reformat_geojson_properties(
    
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH, 
#     id_field="internal_id",
#     output_path=folder_path + "/random_polygons_with_ids.geojson",
#     remove_properties=True
# )


C:\Users\Arnell\Downloads\a_processing_tests/random_polygons.geojson


In [79]:
# GEOJSON_EXAMPLE_FILEPATH = folder_path+"/RSPO-Concessions-Version-10-May-2025.geojson"

In [85]:
FEATURES_PER_EE_REQUEST = 10
MAX_CONCURRENT_EE_REQUESTS = 20

In [None]:
# pd.DataFrame(benchmark_df).to_csv(folder_path + "/whisp_benchmark_results.csv", index=False)

In [88]:
if __name__=="__main__":
    
    result_df = process_geojson_file(
        geojson_path=GEOJSON_EXAMPLE_FILEPATH,
        whisp_image=whisp_image,
        reducer=combined_reducer,
        batch_size=FEATURES_PER_EE_REQUEST,
        max_concurrent=MAX_CONCURRENT_EE_REQUESTS,
        validate_null_geometries=True,
        validate_invalid_geometries=False,
        max_retries=3,
        # ee_version="v1"  # Add this if you implement versioning
    )

🔍 Loading and validating GeoJSON file...
📁 Loaded 10 features from C:\Users\Arnell\Downloads\a_processing_tests/random_polygons.geojson
📁 Loaded 10 features from C:\Users\Arnell\Downloads\a_processing_tests/random_polygons.geojson


✅ Validation complete. 10 geometries ready.
📊 Processing 10 features in 1 batches (10 features/batch)
📊 Processing 10 features in 1 batches (10 features/batch)
🔄 Running 20 concurrent requests...
🔄 Running 20 concurrent requests...
Reading GeoJSON file from: C:\Users\Arnell\AppData\Local\Temp\tmp10bez02j.geojsonReading GeoJSON file from: C:\Users\Arnell\AppData\Local\Temp\tmp10bez02j.geojson

⏳ Progress: Batch 1 ✓
⏳ Progress: Batch 1 ✓


In [89]:
result_df  # Display first few rows of combined results

Unnamed: 0,geo,Area_median,Area_sum,Cocoa_2023_FDaP_median,Cocoa_2023_FDaP_sum,Cocoa_ETH_median,Cocoa_ETH_sum,Cocoa_FDaP_median,Cocoa_FDaP_sum,Coffee_FDaP_2023_median,...,TMF_plant_sum,TMF_regrowth_2023_median,TMF_regrowth_2023_sum,TMF_undist_median,TMF_undist_sum,actual_area_ha,actual_vertices,internal_id,requested_area_ha,requested_vertices
0,"{'type': 'Polygon', 'coordinates': [[[98.83074...",92.890625,735986.100621,,0,,0,,0,,...,0,92.889313,129571.101509,92.890457,80226.143863,69.06,620,1,66.71,620
1,"{'type': 'Polygon', 'coordinates': [[[103.1105...",89.768295,754464.288947,,0,,0,,0,,...,0,,0.0,,0.0,68.3,709,2,67.47,709
2,"{'type': 'Polygon', 'coordinates': [[[106.3549...",88.585541,717705.2754,,0,,0,,0,,...,0,,0.0,,0.0,64.08,844,3,66.86,844
3,"{'type': 'Polygon', 'coordinates': [[[94.38605...",88.899467,638516.87618,,0,,0,,0,,...,0,88.900017,24803.08429,,0.0,57.22,809,4,60.24,809
4,"{'type': 'Polygon', 'coordinates': [[[114.2661...",91.582565,863238.858537,,0,,0,,0,,...,0,,0.0,,0.0,79.8,955,5,88.21,955
5,"{'type': 'Polygon', 'coordinates': [[[110.6325...",89.962616,800512.312311,,0,,0,,0,,...,0,,0.0,,0.0,72.63,819,6,70.43,819
6,"{'type': 'Polygon', 'coordinates': [[[109.5698...",88.368523,899393.943528,,0,,0,,0,,...,0,,0.0,,0.0,80.1,679,7,73.63,679
7,"{'type': 'Polygon', 'coordinates': [[[96.44365...",91.147011,711969.398977,,0,,0,,0,,...,0,91.148666,7383.028725,,0.0,65.49,831,8,60.41,831
8,"{'type': 'Polygon', 'coordinates': [[[93.92365...",86.006149,880545.429243,,0,,0,,0,,...,0,,0.0,,0.0,76.24,788,9,77.85,788
9,"{'type': 'Polygon', 'coordinates': [[[102.6207...",88.787193,659995.622364,,0,,0,,0,,...,0,,0.0,,0.0,59.07,963,10,57.76,963


In [90]:
# Define the output folder
# e.g. if running in Sepal this might be: Path.home() / 'module_results/whisp/'
out_directory = Path.home() / 'downloads'

# Create the folder if needed; to_csv fails when the target directory does not exist
out_directory.mkdir(parents=True, exist_ok=True)

# Define the output file path for CSV
csv_output_file = out_directory / 'whisp_output_table.csv'

# Save the CSV file
result_df.to_csv(path_or_buf=csv_output_file, index=False)
print(f"Table with risk columns saved to: {csv_output_file}")

Table with risk columns saved to: C:\Users\Arnell\downloads\whisp_output_table.csv


In [None]:
# Define the output file path for GeoJSON
geojson_output_file = out_directory / 'whisp_output_geo.geojson'

# Save the GeoJSON file
whisp.convert_df_to_geojson(result_df, geojson_output_file)  # builds a geojson file containing Whisp columns. Uses the geometry column "geo" to create the spatial features.
print(f"GeoJSON file saved to: {geojson_output_file}")

Classic Whisp

In [None]:
ee.Reset()

In [None]:
# Earth Engine and Common Libraries
import ee
from pathlib import Path

# Authenticate and initialize Earth Engine with STANDARD endpoint
# (The concurrent processing section above uses high-volume endpoint)
try:
    ee.Initialize()  # Standard endpoint (default)
except Exception:
    ee.Authenticate()
    ee.Initialize()  # Standard endpoint (default)

In [None]:
# Check which endpoint is now active
print("EE Data Base URL:", ee.data._cloud_api_base_url)
print("EE API Base URL:", ee.data._api_base_url)

# Check if using standard endpoint
if 'highvolume' in str(ee.data._cloud_api_base_url):
    print("❌ Still using HIGH-VOLUME endpoint")
else:
    print("✅ Now using STANDARD endpoint")

In [None]:
import openforis_whisp as whisp


In [None]:
!pip show openforis-whisp

In [None]:
#### whisp = whisp.whisp_formatted_stats_geojson_to_df(GEOJSON_EXAMPLE_FILEPATH)
# whisp = whisp.whisp_stats_geojson_to_df(GEOJSON_EXAMPLE_FILEPATH,whisp_image=whisp_image)

In [None]:
import openforis_whisp as whisp
fc = whisp.convert_geojson_to_ee(GEOJSON_EXAMPLE_FILEPATH)
# print(fc.size().getInfo())  # Print number of features in the collection


In [None]:
whisp_image = whisp.combine_datasets()

In [None]:
combined_reducer = ee.Reducer.sum().combine(ee.Reducer.median(),sharedInputs=True)
results = whisp_image.reduceRegions(fc, reducer=combined_reducer, scale=10)
whisp.convert_ee_to_df(results)

In [None]:
results = whisp.whisp_formatted_stats_geojson_to_df(input_geojson_filepath=GEOJSON_EXAMPLE_FILEPATH,whisp_image=whisp_image)