#### Main changes
- Using selfMask() to avoid empty pixels (a lot of bands are sparse)
- Skipping the validation step – pandera should be fast, but there is some temporary schema generation that takes time
- Using high volume end point and concurrent processing
- Using reduceRegions instead of mapped reduceRegion - a chunk of code for choosing ha or percent etc is based on using reduceRegion and it also allowed to skip
- Skipping the use of points to get the admin details (country and level 1 info) and water_flag (should be based on image but was using vector admin still)


In [1]:
import ee

# Reset Earth Engine completely, so the initialisation in the next cell
# starts from a clean client and can select its own endpoint
ee.Reset()

# Initialize with standard (normal) endpoint
# (left disabled: the next cell initialises against the high-volume endpoint)
# ee.Initialize()

In [2]:
# Earth Engine and Common Libraries
import ee
from pathlib import Path

# Authenticate and initialize Earth Engine against the high-volume endpoint.
# The first attempt relies on credentials already available on this machine;
# if it fails for any reason, run the authentication flow and initialise
# again with the same endpoint URL.
try:
    ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')  # Try to use existing credentials first
except Exception:
    ee.Authenticate()
    ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')

In [3]:
# !pip install --upgrade --pre openforis-whisp

In [4]:
# One reducer that yields both the sum and the median of every band.
# sharedInputs=True makes the two reducers consume the same inputs; the
# result tables further down show the outputs as <band>_sum / <band>_median.
combined_reducer = ee.Reducer.sum().combine(ee.Reducer.median(),sharedInputs=True)

In [5]:
# Report the base URLs the Earth Engine client is currently pointed at
print("EE Data Base URL:", ee.data._cloud_api_base_url)
print("EE API Base URL:", ee.data._api_base_url)

# The high-volume endpoint is recognised by "highvolume" in the cloud API URL
print(
    "‚úÖ Using HIGH-VOLUME endpoint"
    if 'highvolume' in str(ee.data._cloud_api_base_url)
    else "‚ùå Using STANDARD endpoint"
)

EE Data Base URL: https://earthengine-highvolume.googleapis.com
EE API Base URL: https://earthengine-highvolume.googleapis.com/api
‚úÖ Using HIGH-VOLUME endpoint


In [None]:
import openforis_whisp as whisp

In [8]:

# Build the combined Whisp image once; the batch processor defined below reads
# it as the module-level name `whisp_image`.
whisp_image = whisp.combine_datasets()

Whisp multiband image compiled


In [9]:
import ee
import geopandas as gpd
import pandas as pd
import time
import threading
from queue import Queue
import logging
from typing import List, Optional, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import openforis_whisp as whisp
import tempfile
import os
import sys

# Configure logging ONCE - avoid duplicate handlers.
# force=True makes basicConfig remove *and close* every handler already
# attached to the root logger before installing the new one, so re-running
# this cell never stacks handlers and no separate clean-up loop is needed.
logging.basicConfig(
    level=logging.WARNING,
    format='%(levelname)s: %(message)s',
    stream=sys.stdout,
    force=True,  # Force reconfiguration
)
logger = logging.getLogger("whisp-batch")

# Suppress verbose logging from Google API libraries
logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR)
logging.getLogger('googleapiclient').setLevel(logging.WARNING)

# Optimized configuration for EE high-volume processing
EE_MAX_CONCURRENT = 10        # default number of concurrent requests
EE_FEATURES_PER_BATCH = 25    # default number of features per request
MAX_RETRIES = 3               # attempts per batch before giving up


class OptimizedWhispProcessor:
    """Run Whisp statistics for a GeoJSON file in concurrent feature batches.

    The input is split into fixed-size batches. Each batch is written to a
    temporary GeoJSON file, converted with ``whisp.convert_geojson_to_ee()``
    and reduced with ``whisp_image.reduceRegions()``. ``whisp_image`` and
    ``combined_reducer`` are read from module scope.
    """

    def __init__(self, max_concurrent=EE_MAX_CONCURRENT, features_per_batch=EE_FEATURES_PER_BATCH):
        """Store the batching settings and initialise the run counters.

        Args:
            max_concurrent: Size of the worker thread pool and of the semaphore
                each batch acquires while it runs.
            features_per_batch: Number of features placed in one batch.
        """
        self.max_concurrent = max_concurrent
        self.features_per_batch = features_per_batch
        self.semaphore = threading.Semaphore(max_concurrent)
        self.results = {}
        # Batch counters for the most recent run
        self.processing_stats = {'completed': 0, 'failed': 0, 'total': 0}
        # One {'batch': <1-based number>, 'error': <message>} entry per failed batch
        self.failed_batches = []
        self.max_consecutive_failures = 3  # Stop if 3 batches fail in a row

    def process_file_optimized(self, geojson_path: str, national_codes: Optional[List[str]] = None) -> pd.DataFrame:
        """Load a GeoJSON file, process it in concurrent batches and merge the results.

        Args:
            geojson_path: Path of the GeoJSON file, read with geopandas.
            national_codes: Forwarded unchanged to the batch workers; the
                active ``reduceRegions`` code path does not read it.

        Returns:
            The concatenated rows of every successful batch, or an empty
            DataFrame when the file cannot be loaded, holds no usable
            features, or no batch succeeds.
        """
        print(f"üîç Loading and validating GeoJSON file...", flush=True)

        # The counters describe a single run: reset them so a reused processor
        # does not report figures left over from an earlier file.
        self.processing_stats.update(completed=0, failed=0, total=0)
        self.failed_batches = []

        try:
            gdf = self._load_and_validate(geojson_path)
        except Exception as e:
            print(f"‚ùå Failed to load/validate GeoJSON: {e}", flush=True)
            return pd.DataFrame()

        total_features = len(gdf)
        if total_features == 0:
            # Nothing to submit; say so instead of reporting failed batches.
            print("No features left to process after validation", flush=True)
            return pd.DataFrame()

        # Split into feature batches
        feature_batches = [
            gdf.iloc[start:start + self.features_per_batch]
            for start in range(0, total_features, self.features_per_batch)
        ]

        total_batches = len(feature_batches)
        self.processing_stats['total'] = total_batches
        print(f"üìä Processing {total_features:,} features in {total_batches} batches ({self.features_per_batch} features/batch)", flush=True)
        print(f"üîÑ Running {self.max_concurrent} concurrent requests...", flush=True)
        print(f"üõë Will stop if {self.max_consecutive_failures} consecutive batches fail", flush=True)

        # Track progress and failures
        completed_batches = 0
        failed_batches = 0
        consecutive_failures = 0
        results = []

        with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor:
            print(f"üöÄ Submitting all {total_batches} batches concurrently...", flush=True)

            # Submit ALL batches at once; the pool size caps how many run together
            future_to_batch = {
                executor.submit(self._process_feature_batch, batch, national_codes, i): i
                for i, batch in enumerate(feature_batches)
            }

            print(f"‚úÖ All batches submitted - processing with {self.max_concurrent} concurrent workers...\n", flush=True)

            # Results arrive in completion order, so "consecutive" below means
            # consecutive completions, not consecutive batch numbers.
            for future in as_completed(future_to_batch):
                batch_idx = future_to_batch[future]
                try:
                    batch_result = future.result()
                except Exception as e:
                    failed_batches += 1
                    consecutive_failures += 1
                    error_msg = str(e)

                    self.processing_stats['failed'] = failed_batches
                    self.failed_batches.append({'batch': batch_idx + 1, 'error': error_msg})

                    print(f"‚ùå Batch {batch_idx + 1} failed: {error_msg[:80]}...", flush=True)

                    # Early stopping only on excessive consecutive failures
                    if consecutive_failures >= self.max_consecutive_failures:
                        print(f"üõë STOPPING: {consecutive_failures} consecutive failures detected", flush=True)
                        print(f"üí° This suggests systematic issues - cancelling remaining batches", flush=True)

                        # Only batches that have not started can be cancelled;
                        # leaving the ``with`` block still waits for running ones.
                        for remaining_future in future_to_batch:
                            if not remaining_future.done():
                                remaining_future.cancel()
                        break
                else:
                    results.append(batch_result)
                    completed_batches += 1
                    consecutive_failures = 0  # Reset failure counter on success

                    self.processing_stats['completed'] = completed_batches

                    total_processed = completed_batches + failed_batches
                    success_rate = completed_batches / total_processed * 100 if total_processed > 0 else 0

                    # flush=True keeps the per-batch progress line unbuffered
                    print(f"‚è≥ Progress: {completed_batches}/{total_batches} batches completed ({success_rate:.1f}% success) - Batch #{batch_idx + 1} ‚úì", flush=True)

            # Final newline after progress
            print(flush=True)

        # The caller prints the summary; only the data is returned here.
        if results:
            return pd.concat(results, ignore_index=True)

        print("‚ùå No results produced - all batches failed", flush=True)
        print("üí° Suggestions:", flush=True)
        print("   - Check if GeoJSON has valid geometries", flush=True)
        print("   - Try smaller batch sizes (FEATURES_PER_EE_REQUEST)", flush=True)
        print("   - Verify Earth Engine authentication", flush=True)
        print("   - Check if features are within valid coordinate ranges", flush=True)
        return pd.DataFrame()

    def _load_and_validate(self, geojson_path: str) -> gpd.GeoDataFrame:
        """Read the file, drop null geometries and repair invalid ones.

        Any exception raised by reading or repairing propagates to the caller.
        """
        gdf = gpd.read_file(geojson_path)
        print(f"üìÅ Loaded {len(gdf):,} features from {geojson_path}", flush=True)

        # Basic geometry validation
        invalid_geoms = gdf.geometry.isna().sum()
        if invalid_geoms > 0:
            print(f"‚ö†Ô∏è  Found {invalid_geoms} null geometries - removing...", flush=True)
            gdf = gdf[~gdf.geometry.isna()]

        # Check for valid geometries
        valid_geoms = gdf.geometry.is_valid.sum()
        invalid_geom_count = len(gdf) - valid_geoms
        if invalid_geom_count > 0:
            print(f"‚ö†Ô∏è  Found {invalid_geom_count} invalid geometries - fixing...", flush=True)
            from shapely.validation import make_valid
            gdf['geometry'] = gdf['geometry'].apply(lambda g: make_valid(g) if g and not g.is_valid else g)

        print(f"‚úÖ Validated {len(gdf):,} geometries", flush=True)
        return gdf

    def _process_feature_batch(self, batch_gdf: gpd.GeoDataFrame, national_codes: Optional[List[str]], batch_idx: int) -> pd.DataFrame:
        """Process one batch through a temporary GeoJSON file.

        Args:
            batch_gdf: The features of this batch.
            national_codes: Passed through to ``_process_ee_feature_collection``.
            batch_idx: Zero-based batch index, used in messages as ``batch_idx + 1``.

        Returns:
            The DataFrame produced for this batch.

        Raises:
            Exception: Any failure, re-raised with the batch number in the
                message and the original error attached as ``__cause__``.
        """
        with self.semaphore:
            temp_geojson_path = None
            try:
                # mkstemp returns an open descriptor; close it straight away so
                # Windows lets other code open the same path.
                temp_fd, temp_geojson_path = tempfile.mkstemp(suffix='.geojson', text=True)
                os.close(temp_fd)

                # Save batch to temporary GeoJSON file
                batch_gdf.to_file(temp_geojson_path, driver='GeoJSON')

                # Use whisp to convert GeoJSON to EE FeatureCollection
                feature_collection = whisp.convert_geojson_to_ee(temp_geojson_path)

                return self._process_ee_feature_collection(feature_collection, national_codes, batch_idx)

            except Exception as e:
                raise Exception(f"Batch {batch_idx + 1} processing failed: {str(e)}") from e

            finally:
                # Clean up temporary file with proper error handling
                if temp_geojson_path and os.path.exists(temp_geojson_path):
                    try:
                        # Small delay to ensure file is released
                        time.sleep(0.1)
                        os.unlink(temp_geojson_path)
                    except OSError as cleanup_error:
                        # If we can't delete, log it but don't fail
                        logger.warning(f"Could not delete temp file {temp_geojson_path}: {cleanup_error}")

    def _process_ee_feature_collection(self, feature_collection: ee.FeatureCollection,
                                 national_codes: Optional[List[str]], batch_idx: int) -> pd.DataFrame:
        """Reduce ``whisp_image`` over the collection, retrying up to ``MAX_RETRIES`` times.

        Args:
            feature_collection: Features to reduce.
            national_codes: Not used by the active code path.
            batch_idx: Zero-based batch index, used in error messages.

        Returns:
            The reduced collection converted with ``whisp.convert_ee_to_df()``.

        Raises:
            Exception: Immediately for "Unable to transform geometry" errors,
                and once retries run out for quota/limit errors.
            ee.EEException: Other Earth Engine errors after the last attempt.
            RuntimeError: If the loop ends without a result or an error
                (only possible when ``MAX_RETRIES`` is 0).
        """
        # NOTE: whisp.whisp_stats_ee_to_df(...) and
        # whisp.whisp_formatted_stats_ee_to_df(...) were tried here as
        # alternatives to the direct reduceRegions call.
        for attempt in range(MAX_RETRIES):
            is_last_attempt = attempt >= MAX_RETRIES - 1
            try:
                results = whisp_image.reduceRegions(
                    collection=feature_collection,
                    reducer=combined_reducer,
                    scale=10
                )
                return whisp.convert_ee_to_df(results)

            except ee.EEException as e:
                error_msg = str(e)

                # Geometry errors will not improve on retry: fail right away
                if "Unable to transform geometry" in error_msg:
                    raise Exception(f"Geometry transformation error in batch {batch_idx + 1}: {error_msg}") from e

                if "Quota" in error_msg or "limit" in error_msg.lower():
                    if is_last_attempt:
                        raise Exception(f"Quota/rate limit exhausted for batch {batch_idx + 1}") from e
                    backoff = min(30, 2 ** attempt)
                    print(f"‚è≥ Quota/rate limit hit, waiting {backoff}s before retry...", flush=True)
                elif "timeout" in error_msg.lower():
                    if is_last_attempt:
                        raise
                    backoff = min(15, 2 ** attempt)
                    print(f"‚è≥ Timeout, retrying in {backoff}s...", flush=True)
                else:
                    if is_last_attempt:
                        raise
                    backoff = min(10, 2 ** attempt)

                time.sleep(backoff)

            except Exception:
                if is_last_attempt:
                    raise
                time.sleep(min(5, 2 ** attempt))

        raise RuntimeError(f"Failed to process batch {batch_idx + 1} after {MAX_RETRIES} attempts")

In [10]:
!pip show openforis-whisp

Name: openforis-whisp
Version: 2.0.0b1
Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
Home-page: 
Author: Andy Arnell
Author-email: andrew.arnell@fao.org
License: MIT
Location: c:\Users\Arnell\Documents\GitHub\whisp\.venv\Lib\site-packages
Editable project location: C:\Users\Arnell\Documents\GitHub\whisp
Requires: country_converter, earthengine-api, geojson, geopandas, ipykernel, numpy, pandas, pandera, pydantic-core, python-dotenv, rsa, shapely
Required-by: 


In [11]:
# Working folder for the generated test GeoJSON (raw string keeps the
# Windows backslashes literal)
folder_path = (r"C:\Users\Arnell\Downloads\a_processing_tests")  # Replace with your folder path

In [12]:
# GeoJSON file used for the generated test polygons
GEOJSON_EXAMPLE_FILEPATH = folder_path+"/random_polygons.geojson"

# Define bounds from the provided Earth Engine geometry
# Order of the values: [min_lon, min_lat, max_lon, max_lat]
# # area in Ghana 
# bounds = [ 
#     -3.04548260909834,  # min_lon
#     5.253961384163733,  # min_lat
#     -1.0179939534016594,  # max_lon
#     7.48307210714245    # max_lat
# ]

# area in China (the active choice; the other regions are kept for reference)
bounds = [
    90.44831497309737,  # min_lon
    20.686366665187148,  # min_lat
    114.57868606684737,  # max_lon
    30.79200348254393    # max_lat
]

# # Brazil etc
# bounds = [-81.06002305884182,
#         -19.332462745930076,
#         -31.48971055884182,
#          9.600139384904205
#         ]

In [13]:
# Take Brazil's geometry from the LSIB country boundaries and pass it to the
# whisp helper to generate random test polygons
brazil = ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017').filter(
    ee.Filter.eq('country_na', 'Brazil')
).first().geometry()
geojson = whisp.generate_test_polygons(brazil, 
                                num_polygons=10, 
                                max_area_ha=100, 
                                min_number_vert=10, 
                                max_number_vert=100
                                )

# # Or multiple countries
# latam = ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017').filter(
#     ee.Filter.inList('country_na', ['Brazil', 'Colombia', 'Peru'])
# ).geometry()
# geojson = whisp.generate_test_polygons(latam, num_polygons=50)

In [14]:
import json

# Destination of the generated test polygons, inside folder_path
GEOJSON_EXAMPLE_FILEPATH = f"{folder_path}/random_polygons.geojson"
print(GEOJSON_EXAMPLE_FILEPATH)

# Serialise the GeoJSON object and write the text to the file
with open(GEOJSON_EXAMPLE_FILEPATH, 'w') as f:
    f.write(json.dumps(geojson))


C:\Users\Arnell\Downloads\a_processing_tests/random_polygons.geojson


In [15]:
# Option 1: Use simple bounds (list)
# Requests 200 random polygons within `bounds`, with a requested area of
# 50-100 ha and between 50 and 1000 vertices each.
random_geojson = whisp.generate_test_polygons(
    bounds=bounds, 
    num_polygons=200, 
    min_area_ha=50, 
    max_area_ha=100, 
    min_number_vert=50,      # keep this below max_number_vert
    max_number_vert=1000     # upper limit on vertices per polygon
)

# Option 2: Use Earth Engine Geometry directly (commented examples)
# Get bounds from a specific country
# china = ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017').filter(ee.Filter.eq('country_na', 'China')).first().geometry()
# random_geojson = whisp.generate_test_polygons(china, num_polygons=10, min_area_ha=100, max_area_ha=1000)

# Or get bounds from multiple countries
# latam = ee.FeatureCollection('USDOS/LSIB_SIMPLE/2017').filter(ee.Filter.inList('country_na', ['Brazil', 'Colombia', 'Peru'])).geometry()
# random_geojson = whisp.generate_test_polygons(latam, num_polygons=50, min_area_ha=10, max_area_ha=100)

# Same path as the earlier save cell, so this write replaces that file
GEOJSON_EXAMPLE_FILEPATH = folder_path + "/random_polygons.geojson"
print(GEOJSON_EXAMPLE_FILEPATH)
import json
# Save the GeoJSON to a file
with open(GEOJSON_EXAMPLE_FILEPATH, 'w') as f:
    json.dump(random_geojson, f)

# Use example Whisp inputs (optional)
# GEOJSON_EXAMPLE_FILEPATH = whisp.get_example_data_path("geojson_example.geojson")


# Add IDs to your existing GeoJSON file

# #Save to a new file (instead of overwriting)
# # whisp.reformat_geojson_properties(
# whisp.reformat_geojson_properties(
    
#     geojson_path=GEOJSON_EXAMPLE_FILEPATH, 
#     id_field="internal_id",
#     output_path=folder_path + "/random_polygons_with_ids.geojson",
#     remove_properties=True
# )

C:\Users\Arnell\Downloads\a_processing_tests/random_polygons.geojson


In [16]:
# GEOJSON_EXAMPLE_FILEPATH = folder_path+"/RSPO-Concessions-Version-10-May-2025.geojson"

In [17]:
# Example usage with controlled batch sizes
if __name__ == "__main__":

    # Batch size and concurrency for this run; these are passed to the
    # processor and take the place of its default arguments.
    FEATURES_PER_EE_REQUEST = 50 # Small batches for complex geometries
    MAX_CONCURRENT_EE_REQUESTS = 20  # Conservative for quota management

    # Initialize processor
    processor = OptimizedWhispProcessor(
        max_concurrent=MAX_CONCURRENT_EE_REQUESTS,
        features_per_batch=FEATURES_PER_EE_REQUEST
    )

    # Process file with controlled batching
    try:
        # GEOJSON_EXAMPLE_FILEPATH = whisp.get_example_data_path("geojson_example.geojson")

        logger.info(f"Processing with {FEATURES_PER_EE_REQUEST} features per Earth Engine request")
        logger.info(f"Maximum {MAX_CONCURRENT_EE_REQUESTS} concurrent requests")

        result_df = processor.process_file_optimized(
            GEOJSON_EXAMPLE_FILEPATH,
            # national_codes=["br", "co"]
        )

        if not result_df.empty:
            print(f"\n‚úÖ Success! Processed {len(result_df)} features")
            print("\nFirst 5 rows:")
            print(result_df.head())

            # Save results. to_csv does not create missing folders, so make
            # sure the target directory exists before writing.
            output_csv = Path.home() / "downloads" / "optimized_whisp_results.csv"
            output_csv.parent.mkdir(parents=True, exist_ok=True)
            result_df.to_csv(output_csv, index=False)
            logger.info("Results saved to optimized_whisp_results.csv")
        else:
            print("\n‚ùå No results produced")

        # Display processing statistics and success rate
        print(f"\nüìä Processing Statistics:")
        print(f"   Processing stats: {processor.processing_stats}")

        # Success rate over the batches that actually finished (ok or failed)
        completed = processor.processing_stats.get('completed', 0)
        failed = processor.processing_stats.get('failed', 0)
        total_attempts = completed + failed
        if total_attempts > 0:
            success_rate = (completed / total_attempts) * 100
            print(f"\nüéØ Overall Success Rate: {success_rate:.1f}%")
            print(f"   ‚úÖ Completed: {completed}")
            print(f"   ‚ùå Failed: {failed}")

    except Exception as e:
        # Top-level boundary of the notebook cell: report the error, do not re-raise
        logger.error(f"Processing failed: {e}")
        print(f"\n‚ùå Fatal error: {e}")

üîç Loading and validating GeoJSON file...
üìÅ Loaded 200 features from C:\Users\Arnell\Downloads\a_processing_tests/random_polygons.geojson
‚úÖ Validated 200 geometries
üìä Processing 200 features in 4 batches (50 features/batch)
üìÅ Loaded 200 features from C:\Users\Arnell\Downloads\a_processing_tests/random_polygons.geojson
‚úÖ Validated 200 geometries
üìä Processing 200 features in 4 batches (50 features/batch)
üîÑ Running 20 concurrent requests...
üõë Will stop if 3 consecutive batches fail
üöÄ Submitting all 4 batches concurrently...
üîÑ Running 20 concurrent requests...
üõë Will stop if 3 consecutive batches fail
üöÄ Submitting all 4 batches concurrently...
Reading GeoJSON file from: C:\Users\Arnell\AppData\Local\Temp\tmphz1n9kyw.geojson
‚úÖ All batches submitted - processing with 20 concurrent workers...

Reading GeoJSON file from: C:\Users\Arnell\AppData\Local\Temp\tmphz1n9kyw.geojson
‚úÖ All batches submitted - processing with 20 concurrent workers...

Reading GeoJ

  combined_df = pd.concat(results, ignore_index=True)


In [18]:
result_df  # Last expression of the cell: the notebook displays the combined results table

Unnamed: 0,geo,Area_median,Area_sum,Cocoa_2023_FDaP_median,Cocoa_2023_FDaP_sum,Cocoa_ETH_median,Cocoa_ETH_sum,Cocoa_FDaP_median,Cocoa_FDaP_sum,Coffee_FDaP_2023_median,...,TMF_plant_sum,TMF_regrowth_2023_median,TMF_regrowth_2023_sum,TMF_undist_median,TMF_undist_sum,actual_area_ha,actual_vertices,internal_id,requested_area_ha,requested_vertices
0,"{'type': 'Polygon', 'coordinates': [[[91.96837...",90.833130,5.705311e+05,,0,,0,,0,,...,0,90.833878,5012.562173,,0.0,52.29,571,51,53.51,571
1,"{'type': 'Polygon', 'coordinates': [[[108.3274...",93.026390,8.120903e+05,,0,,0,,0,,...,0,,0.000000,,0.0,76.32,722,52,73.07,722
2,"{'type': 'Polygon', 'coordinates': [[[112.9834...",88.230644,1.085563e+06,,0,,0,,0,,...,0,,0.000000,,0.0,96.52,70,53,88.57,70
3,"{'type': 'Polygon', 'coordinates': [[[108.4685...",88.318504,6.506089e+05,,0,,0,,0,,...,0,,0.000000,,0.0,57.91,925,54,54.57,925
4,"{'type': 'Polygon', 'coordinates': [[[108.3975...",88.901604,1.009702e+06,,0,,0,,0,,...,0,,0.000000,,0.0,90.49,329,55,96.17,329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,"{'type': 'Polygon', 'coordinates': [[[100.4817...",91.934227,6.557443e+05,,0,,0,,0,,...,0,,0.000000,,0.0,60.87,128,196,56.86,128
196,"{'type': 'Polygon', 'coordinates': [[[101.5073...",91.154373,5.691002e+05,,0,,0,,0,,...,0,,0.000000,,0.0,52.35,205,197,50.27,205
197,"{'type': 'Polygon', 'coordinates': [[[99.36307...",88.253922,9.281237e+05,,0,,0,,0,,...,0,,0.000000,,0.0,82.55,993,198,85.19,993
198,"{'type': 'Polygon', 'coordinates': [[[101.3104...",85.674759,9.301358e+05,,0,,0,,0,,...,0,,0.000000,,0.0,80.21,158,199,81.31,158


In [19]:
# Define the output folder 
# e.g. in running in Sepal this might be: Path.home() / 'module_results/whisp/'
out_directory = Path.home() / 'downloads'

# to_csv does not create missing folders, so make sure the folder exists
out_directory.mkdir(parents=True, exist_ok=True)

# Define the output file path for CSV
csv_output_file = out_directory / 'whisp_output_table.csv'

# Save the CSV file
result_df.to_csv(path_or_buf=csv_output_file, index=False)
print(f"Table with risk columns saved to: {csv_output_file}")

Table with risk columns saved to: C:\Users\Arnell\downloads\whisp_output_table.csv


In [20]:
# Define the output file path for GeoJSON (same folder as the CSV export;
# `out_directory` comes from the previous cell)
geojson_output_file = out_directory / 'whisp_output_geo.geojson'

# Save the GeoJSON file
whisp.convert_df_to_geojson(result_df, geojson_output_file)  # builds a geojson file containing Whisp columns. Uses the geometry column "geo" to create the spatial features.
print(f"GeoJSON file saved to: {geojson_output_file}")

GeoJSON saved to C:\Users\Arnell\downloads\whisp_output_geo.geojson
GeoJSON file saved to: C:\Users\Arnell\downloads\whisp_output_geo.geojson


Classic Whisp

In [21]:
# Reset Earth Engine so the following cell can initialise it again, this time
# on the standard endpoint
ee.Reset()

In [22]:
# Earth Engine and Common Libraries
import ee
from pathlib import Path

# Authenticate and initialize Earth Engine with STANDARD endpoint
# (The concurrent processing section above uses high-volume endpoint)
# Existing credentials are tried first; on any failure the authentication
# flow runs and initialisation is repeated.
try:
    ee.Initialize()  # Standard endpoint (default)
except Exception:
    ee.Authenticate()
    ee.Initialize()  # Standard endpoint (default)

In [23]:
# Report the base URLs the Earth Engine client is currently pointed at
print("EE Data Base URL:", ee.data._cloud_api_base_url)
print("EE API Base URL:", ee.data._api_base_url)

# "highvolume" in the cloud API URL means the switch to standard did not happen
print(
    "‚ùå Still using HIGH-VOLUME endpoint"
    if 'highvolume' in str(ee.data._cloud_api_base_url)
    else "‚úÖ Now using STANDARD endpoint"
)

EE Data Base URL: https://earthengine.googleapis.com
EE API Base URL: https://earthengine.googleapis.com/api
‚úÖ Now using STANDARD endpoint


In [24]:
import openforis_whisp as whisp


In [25]:
!pip show openforis-whisp

Name: openforis-whisp
Version: 2.0.0b1
Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
Home-page: 
Author: Andy Arnell
Author-email: andrew.arnell@fao.org
License: MIT
Location: c:\Users\Arnell\Documents\GitHub\whisp\.venv\Lib\site-packages
Editable project location: C:\Users\Arnell\Documents\GitHub\whisp
Requires: country_converter, earthengine-api, geojson, geopandas, ipykernel, numpy, pandas, pandera, pydantic-core, python-dotenv, rsa, shapely
Required-by: 


In [26]:
#### whisp = whisp.whisp_formatted_stats_geojson_to_df(GEOJSON_EXAMPLE_FILEPATH)
# whisp = whisp.whisp_stats_geojson_to_df(GEOJSON_EXAMPLE_FILEPATH,whisp_image=whisp_image)

In [27]:
import openforis_whisp as whisp
# Convert the whole test file to a single Earth Engine FeatureCollection
# (no batching in this "classic" path)
fc = whisp.convert_geojson_to_ee(GEOJSON_EXAMPLE_FILEPATH)
print(fc.size().getInfo())  # Print number of features in the collection


Reading GeoJSON file from: C:\Users\Arnell\Downloads\a_processing_tests\random_polygons.geojson
200
200


In [28]:
# Rebuild the combined Whisp image after Earth Engine was reset and
# re-initialised above
whisp_image = whisp.combine_datasets()

Whisp multiband image compiled


In [29]:
# Same sum + median reducer as in the concurrent section, recreated after the
# Earth Engine reset
combined_reducer = ee.Reducer.sum().combine(ee.Reducer.median(),sharedInputs=True)

In [30]:
# One reduceRegions call over the full collection, using the same reducer and
# scale=10 as the batch processor
results = whisp_image.reduceRegions(fc, reducer=combined_reducer, scale=10)


In [31]:
# Convert the reduced collection to a DataFrame; as the last expression of
# the cell it is displayed but not stored in a variable
whisp.convert_ee_to_df(results)

Unnamed: 0,geo,Area_median,Area_sum,Cocoa_2023_FDaP_median,Cocoa_2023_FDaP_sum,Cocoa_ETH_median,Cocoa_ETH_sum,Cocoa_FDaP_median,Cocoa_FDaP_sum,Coffee_FDaP_2023_median,...,TMF_plant_sum,TMF_regrowth_2023_median,TMF_regrowth_2023_sum,TMF_undist_median,TMF_undist_sum,actual_area_ha,actual_vertices,internal_id,requested_area_ha,requested_vertices
0,"{'type': 'Polygon', 'coordinates': [[[101.4470...",90.404190,5.940053e+05,,0,,0,,0,,...,0,,0.000000,,0.000000,54.18,702,1,53.96,702
1,"{'type': 'Polygon', 'coordinates': [[[104.6790...",91.706573,6.369356e+05,,0,,0,,0,,...,0,,0.000000,,0.000000,58.97,661,2,54.89,661
2,"{'type': 'Polygon', 'coordinates': [[[93.92380...",92.855515,7.922338e+05,,0,,0,,0,,...,0,92.856117,78527.096953,92.854805,66766.022801,74.30,971,3,74.20,971
3,"{'type': 'Polygon', 'coordinates': [[[99.75578...",87.602486,8.785513e+05,,0,,0,,0,,...,0,,0.000000,,0.000000,77.53,384,4,72.21,384
4,"{'type': 'Polygon', 'coordinates': [[[91.52270...",92.887070,8.724163e+05,,0,,0,,0,,...,0,,0.000000,,0.000000,81.86,414,5,74.67,414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,"{'type': 'Polygon', 'coordinates': [[[106.4827...",90.384163,1.026758e+06,,0,,0,,0,,...,0,,0.000000,,0.000000,93.62,332,196,99.57,332
196,"{'type': 'Polygon', 'coordinates': [[[93.49027...",92.670715,8.357174e+05,,0,,0,,0,,...,0,92.671051,65514.512464,92.669884,68337.497915,78.22,772,197,81.35,772
197,"{'type': 'Polygon', 'coordinates': [[[110.4099...",92.652054,6.813723e+05,,0,,0,,0,,...,0,,0.000000,,0.000000,63.76,300,198,67.12,300
198,"{'type': 'Polygon', 'coordinates': [[[94.64630...",86.658569,9.673970e+05,,0,,0,,0,,...,0,,0.000000,,0.000000,84.42,579,199,83.80,579


In [None]:
# Test: How long does schema BUILDING actually take?
# This tests the full process: loading CSVs + creating Pandera schema
import time
import openforis_whisp as whisp
from pathlib import Path
import os

print("Testing schema building from lookup files...")
print("=" * 60)

# Find and delete any existing cache files first.
# NOTE: this removes schema_*.pkl files from the installed package's
# 'parameters' folder.
cache_dir = Path(whisp.__file__).parent / 'parameters'
cache_files = list(cache_dir.glob('schema_*.pkl'))
print(f"\nüóëÔ∏è  Found {len(cache_files)} existing cache file(s)")
for cache_file in cache_files:
    print(f"   Deleting: {cache_file.name}")
    cache_file.unlink()

# Force rebuild. The variable is removed again in `finally`, so a failing
# build cannot leave it set for the rest of the session.
os.environ['WHISP_FORCE_SCHEMA_REBUILD'] = 'true'
try:
    print("\nüî® Building schema from scratch (no cache)...")
    # perf_counter is a monotonic, high-resolution clock meant for intervals
    start = time.perf_counter()
    schema = whisp.load_schema_if_any_file_changed()
    elapsed = time.perf_counter() - start

    # Check if new cache was created
    new_cache_files = list(cache_dir.glob('schema_*.pkl'))
    print(f"\n‚úÖ New cache file created: {new_cache_files[0].name if new_cache_files else 'NONE!'}")

    print(f"\nüìä Results:")
    print(f"   Schema build took: {elapsed*1000:.2f}ms")
    print(f"   That's {elapsed:.4f} seconds")
    print(f"   Schema has {len(schema.columns)} columns")

    if elapsed < 0.1:
        print("   ‚úÖ Nearly instant - no meaningful delay!")
    elif elapsed < 1.0:
        print("   ‚ö° Very fast - subsecond")
    elif elapsed < 2.0:
        print(f"   ‚è±Ô∏è  About 1 second - reasonable")
    else:
        print(f"   ‚è±Ô∏è  Took {elapsed:.1f} seconds")

    print(f"\nüí° The '2-5 seconds' claim is {'ACCURATE' if elapsed >= 2 else 'EXAGGERATED'}")
finally:
    # Clean up
    os.environ.pop('WHISP_FORCE_SCHEMA_REBUILD', None)

Testing schema building from lookup files...
[logger.py | info() | l.23] INFO: üîß Force rebuild enabled (WHISP_FORCE_SCHEMA_REBUILD=true)
[logger.py | info() | l.23] INFO: üî® Building universal schema from lookup files (includes ALL countries)
[logger.py | info() | l.23] INFO: üî® Building universal schema from lookup files (includes ALL countries)


[logger.py | info() | l.23] INFO: ‚úÖ Schema cached to disk: schema_09602fba54d62cde788cef8e9ca1e5b3.pkl
[logger.py | info() | l.23] INFO: ‚úÖ Universal schema built and cached (209 columns)

üìä Results:
   Schema build took: 83.29ms
   That's 0.0833 seconds
   ‚úÖ Nearly instant - no meaningful delay!

üí° The '2-5 seconds' claim is EXAGGERATED
[logger.py | info() | l.23] INFO: ‚úÖ Universal schema built and cached (209 columns)

üìä Results:
   Schema build took: 83.29ms
   That's 0.0833 seconds
   ‚úÖ Nearly instant - no meaningful delay!

üí° The '2-5 seconds' claim is EXAGGERATED
