# Test: WHISP Concurrent & Sequential Processing

Testing new concurrent and sequential stats processing functions with proper logging, progress tracking, and endpoint validation.

## Test Structure

- Concurrent processing (high-volume endpoint)
- Sequential processing (standard endpoint)
- Results comparison and validation

## Part 1: Setup

Reset Earth Engine so that each part below can initialize it against the endpoint it needs. Logging is configured in Part 2.

In [None]:
import ee

# Reset the Earth Engine client library so that it starts from a clean state
# before it is initialized in the next part.
ee.Reset()
print("✅ Earth Engine reset")

## Part 2: CONCURRENT PROCESSING (High-Volume Endpoint)

Test concurrent processing with the high-volume endpoint

In [None]:
import ee

# Initialize Earth Engine against the high-volume endpoint, which the
# concurrent test in this part expects. The URL has to be passed explicitly:
# a bare ee.Initialize() call uses the library's default endpoint.
HIGH_VOLUME_EE_URL = 'https://earthengine-highvolume.googleapis.com'

try:
    ee.Initialize(opt_url=HIGH_VOLUME_EE_URL)
    print("Initialized with high-volume endpoint")
except Exception as init_error:
    # Presumably missing or expired credentials: authenticate once and retry.
    # If the retry fails too, the error propagates instead of being hidden
    # behind a message that claims the endpoint is in use.
    print(f"Initialization failed ({init_error}); authenticating...")
    ee.Authenticate()
    ee.Initialize(opt_url=HIGH_VOLUME_EE_URL)
    print("Authenticated and initialized with high-volume endpoint")

# Turn off the client's debugging output for the test run.
ee.data.setDebuggingEnabled(False)

In [None]:
# Check which API base URL the client is pointed at and report whether it is
# the high-volume one.
# NOTE(review): this reads a private attribute of ee.data; confirm it exists
# in the installed earthengine-api version.
api_url = str(ee.data._cloud_api_base_url)
using_high_volume = 'highvolume' in api_url
print(
    "✅ Using HIGH-VOLUME endpoint"
    if using_high_volume
    else "❌ WARNING: Not using high-volume endpoint!"
)

In [None]:
import openforis_whisp as whisp
import logging
from openforis_whisp.concurrent_stats import (
    setup_concurrent_logger,
    validate_ee_endpoint,
    whisp_stats_geojson_to_df_concurrent,
    check_ee_endpoint,
)

print("✅ Imported concurrent stats module")

In [None]:
# Create the logger that is passed to the processing calls below (INFO level)
# and emit one message to confirm it is working.
logger = setup_concurrent_logger(level=logging.INFO)
logger.info("Logging configured")

In [None]:
# Choose whether to include additional custom layers
USE_CUSTOM_BANDS = True  # True = add extra Earth Engine data to Whisp

In [None]:
# =============================================================================
# CUSTOM BANDS SETUP (OPTIONAL) - runs only if USE_CUSTOM_BANDS = True above
# =============================================================================
# USE_CUSTOM_BANDS is read from the toggle cell above. It is intentionally not
# re-assigned here: doing so would silently override the value chosen there.

# Defined up front so both names exist even when the setup is skipped;
# None stands for "no custom bands".
custom_ee_image = None
custom_bands = None

if USE_CUSTOM_BANDS:

    # Step 1: Define custom Earth Engine images (binary values 0 or 1)
    custom_images = {
        # Placeholder constant image. A real layer could be, for example:
        # ee.Image("UMD/hansen/global_forest_change_2024_v1_12").select("treecover2000").gt(10).selfMask()
        'example_treecover': ee.Image(1),
        # Placeholder random binary image. A real layer could be, for example:
        # ee.ImageCollection("projects/forestdatapartnership/assets/cocoa/model_2025a").filter(ee.Filter.date('2020-01-01', '2021-01-01')).mosaic().gt(.8).selfMask()
        'nXX_example_commodity': ee.Image.random(seed=1).gte(.5).reproject(crs='EPSG:4326', scale=10),
        # add more images as needed (prefix 'nXX_' = iso2 code for national dataset)
    }

    # Step 2: Define metadata for each custom band (keys must match above)
    # Themes: 'treecover', 'commodities', 'disturbance_before', 'disturbance_after'
    # Timber themes: 'primary', 'naturally_reg_2020', 'planted_plantation_2020', etc.
    custom_bands_info = {
        'example_treecover': {
            'ISO2_code': "",          # Country code (empty = all countries)
            'theme': 'treecover',     # Risk theme
            'theme_timber': "",       # Timber theme (if applicable)
            'use_for_risk': 1,        # Include in risk calculations (1=yes, 0=no)
            'use_for_risk_timber': 0, # Include in timber risk (1=yes, 0=no)
        },
        'nXX_example_commodity': {
            'ISO2_code': "XX",
            'theme': 'commodities',
            'theme_timber': "",
            'use_for_risk': 1,
            'use_for_risk_timber': 0,
        },
        # add more band metadata as needed
    }

    # Step 3: Combine custom bands and extract names
    custom_ee_image = whisp.combine_custom_bands(custom_images, custom_bands_info)

    # Band names are the metadata keys, in the order they were defined.
    custom_bands = list(custom_bands_info)


In [None]:
# Choose additional national datasets to include (currently three countries: 'co', 'ci', 'br').
base_iso2_codes = ['co', 'ci', 'br']

# Automatically add any custom ISO2 codes from custom_bands_info if
# USE_CUSTOM_BANDS is True. Codes are lower-cased before the duplicate check
# and appended in the order the bands are defined, so the resulting list is
# the same on every run and never holds the same code twice.
iso2_codes_list = base_iso2_codes.copy()
if USE_CUSTOM_BANDS:
    for band_info in custom_bands_info.values():
        # Missing or empty codes mean "all countries" and add nothing.
        code = (band_info.get('ISO2_code') or '').lower()
        if code and code not in iso2_codes_list:
            iso2_codes_list.append(code)

In [None]:
import openforis_whisp as whisp

print("Imported concurrent stats module")

In [None]:
# Settings for the generated test polygons; each one is passed under the same
# keyword name to whisp.generate_test_polygons in the next cell.
num_polygons=1000  # Smaller dataset for testing
# min and max are equal for both area and vertex count, so no range is requested
# (area presumably in hectares, going by the parameter name - confirm)
min_area_ha=10
max_area_ha=10
min_number_vert=10
max_number_vert=10

In [None]:
# Generate test data (or use your own GeoJSON)
import geopandas as gpd
import json
import tempfile
import os
import io
from contextlib import redirect_stdout

# Build the sampling region from GAUL 2024 level-1 units selected by name.
# NOTE(review): the filter matches on the unit name only, so same-named units
# in other countries would also be selected - confirm this is intended.
target_state_names = ['Amazonas', 'Mato Grosso', 'Rondônia', 'Pará']
gaul_level1 = ee.FeatureCollection("projects/sat-io/open-datasets/FAO/GAUL/GAUL_2024_L1")
state_geom = gaul_level1.filter(ee.Filter.inList('gaul1_name', target_state_names))
bounds = state_geom.geometry().bounds()

# Generate the polygons, discarding anything the generator prints to stdout.
polygon_settings = dict(
    bounds=bounds,
    num_polygons=num_polygons,
    min_area_ha=min_area_ha,
    max_area_ha=max_area_ha,
    min_number_vert=min_number_vert,
    max_number_vert=max_number_vert,
)
discarded_output = io.StringIO()
with redirect_stdout(discarded_output):
    random_geojson = whisp.generate_test_polygons(**polygon_settings)

# Write the GeoJSON to a temporary file; the processing call takes a file path.
temp_fd, concurrent_geojson_path = tempfile.mkstemp(suffix='.geojson', text=True)
os.close(temp_fd)  # only the path is needed; the file is reopened by name
with open(concurrent_geojson_path, 'w') as f:
    json.dump(random_geojson, f)

feature_count = len(random_geojson['features'])
print(f"Generated test GeoJSON with {feature_count} features")
print(f"   Saved to: {concurrent_geojson_path}")

In [None]:
# Helper function to validate GeoJSON file size
def validate_geojson_size(geojson_path, max_size_mb=10):
    """
    Report whether a GeoJSON file on disk is no larger than a size limit.

    Args:
        geojson_path: Path to the GeoJSON file
        max_size_mb: Largest accepted size in MB, inclusive (default: 10);
            one MB is counted as 1024 * 1024 bytes

    Returns:
        tuple: (is_valid, size_mb, message) - whether the file is within the
        limit, its size in MB, and a human-readable summary of the check

    Raises:
        OSError: propagated from os.path.getsize, e.g. when the file does
            not exist.
    """
    import os

    bytes_per_mb = 1024 * 1024
    size_mb = os.path.getsize(geojson_path) / bytes_per_mb
    within_limit = size_mb <= max_size_mb

    # Handle the failing case first, then fall through to the success case.
    if not within_limit:
        return within_limit, size_mb, f"GeoJSON TOO LARGE: {size_mb:.2f} MB (limit: {max_size_mb} MB)"

    return within_limit, size_mb, f"GeoJSON size OK: {size_mb:.2f} MB (limit: {max_size_mb} MB)"

# Run the size check on the GeoJSON written for the concurrent test and show
# the summary text, which is the last element of the returned tuple.
test_path = concurrent_geojson_path
msg = validate_geojson_size(test_path, max_size_mb=10)
size_report = msg[2]
print(size_report)

In [None]:
# National codes passed as `national_codes` to the concurrent test below
iso2_codes = ['br', 'co', 'ci']

# Optional, currently disabled: build the Whisp image explicitly and list its bands
# whisp_image = whisp.combine_datasets(national_codes=iso2_codes)
# band_names = whisp_image.bandNames().getInfo()
# print(f"Created Whisp image with {len(band_names)} bands")

In [None]:
# Test concurrent: GeoJSON → DataFrame with automatic formatting
print("\n" + "="*70)
print("TEST 1: Concurrent GeoJSON → DataFrame (Formatted)")
print("="*70 + "\n")

# Bind the result name before the attempt so that it always exists afterwards:
# it stays None when processing fails, which is what the comparison cell
# checks for. A failure in the summary prints does not discard a valid result.
df_concurrent = None

try:
    df_concurrent = whisp.whisp_formatted_stats_geojson_to_df_concurrent(
        input_geojson_filepath=concurrent_geojson_path,
        # whisp_image=whisp_image,
        # custom_bands=custom_bands if USE_CUSTOM_BANDS else None,
        national_codes=iso2_codes,
        batch_size=10,
        max_concurrent=20,
        validate_geometries=False,
        add_metadata_server=False,
        logger=logger,
    )

    print("\n✅ SUCCESS: Concurrent processing complete!")
    print(f"   Processed: {df_concurrent.shape[0]} features")
    print(f"   Output columns: {df_concurrent.shape[1]}")
    print("\n   First row sample:")
    print(df_concurrent.iloc[0, :8])

except Exception as e:
    # Report the failure with its traceback but keep the notebook running.
    print(f"❌ ERROR: {str(e)}")
    import traceback
    traceback.print_exc()

In [None]:
df_concurrent

## Part 3: SEQUENTIAL PROCESSING (For Comparison)

Test sequential (standard endpoint) processing as an alternative approach

### Part 3A: Switch to Standard Endpoint

Switch from high-volume to standard endpoint for sequential testing

In [None]:
import ee

# Reset the Earth Engine client library so that it can be initialized again
# against a different endpoint in the next cell.
ee.Reset()
print("✅ Earth Engine reset")

In [None]:
# Initialize Earth Engine against the STANDARD endpoint. If the first attempt
# fails, authenticate and try once more; a second failure propagates.
STANDARD_EE_URL = 'https://earthengine.googleapis.com'

try:
    ee.Initialize(opt_url=STANDARD_EE_URL)
except Exception:
    ee.Authenticate()
    ee.Initialize(opt_url=STANDARD_EE_URL)
    print("✅ Authenticated and initialized with standard endpoint")
else:
    print("✅ Initialized with standard endpoint")

In [None]:
# Generate fresh test data for sequential testing (avoid caching)
print("\n" + "="*80)
print("GENERATING TEST DATA")
print("="*80)

try:
    # Uses the same keyword arguments as the generate_test_polygons call made
    # for the concurrent test, and the `bounds` region computed there.
    # NOTE(review): that call takes no seed argument, so none is passed here;
    # min_number_vert=10 is an assumed lower bound - confirm both against the
    # installed whisp version.
    random_geojson_sequential = whisp.generate_test_polygons(
        bounds=bounds,
        num_polygons=5,
        min_area_ha=5000,  # Larger areas
        max_area_ha=5000,
        min_number_vert=10,
        max_number_vert=100,
    )

    temp_fd_sequential, sequential_geojson_path = tempfile.mkstemp(suffix='.geojson', text=True)
    # Only the path is needed; close the descriptor so it does not leak.
    os.close(temp_fd_sequential)

    with open(sequential_geojson_path, 'w') as f:
        json.dump(random_geojson_sequential, f)

    print(f"✅ Generated fresh test GeoJSON with {len(random_geojson_sequential['features'])} features")
    print(f"   Saved to: {sequential_geojson_path}")
except Exception as e:
    # The sequential test reads `sequential_geojson_path`, so the fallback has
    # to set that name (presumably get_example_data_path returns a file path,
    # as its name suggests - confirm).
    print(f"   Fallback: Using example data ({e})")
    sequential_geojson_path = whisp.get_example_data_path("geojson_example.geojson")

In [None]:
# Test sequential: GeoJSON → DataFrame (Sequential Processing)
print("\nTEST 2: Sequential GeoJSON → DataFrame (Sequential)")
print("-" * 80)

# Bind the result name before the attempt: it stays None when processing
# fails, and a failure in the summary prints does not discard a valid result.
df_sequential = None

try:
    df_sequential = whisp.whisp_formatted_stats_geojson_to_df_sequential(
        input_geojson_filepath=sequential_geojson_path,
        # Lower case, matching the other national-code lists in this notebook.
        # NOTE(review): the concurrent test passes three codes, so its output
        # columns may differ from this run's - confirm that is intended.
        national_codes=['br'],
        add_metadata_client_side=True,
        logger=logger,
    )

    print("\n✅ SUCCESS: Sequential processing complete!")
    print(f"   Processed: {df_sequential.shape[0]} features")
    print(f"   Output columns: {df_sequential.shape[1]}")
    print("\n   First row preview:")
    print(df_sequential.iloc[0, :8])

except Exception as e:
    # Report the failure with its traceback but keep the notebook running.
    print(f"\n❌ FAILED: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Compare concurrent vs sequential results
print("\nCOMPARISON: Concurrent vs Sequential")
print("=" * 80)

# Look the results up by name so that a test cell which failed, or was never
# run, produces a clear message here instead of a NameError.
concurrent_result = globals().get('df_concurrent')
sequential_result = globals().get('df_sequential')

if concurrent_result is not None and sequential_result is not None:
    print(f"\nConcurrent shape:  {concurrent_result.shape}")
    print(f"Sequential shape:  {sequential_result.shape}")

    # Verify they return same columns (order is ignored)
    concurrent_columns = set(concurrent_result.columns)
    sequential_columns = set(sequential_result.columns)
    if concurrent_columns == sequential_columns:
        print("\n✅ Column names match!")
    else:
        # Each line lists the columns found only in that result.
        print("\n⚠️  Column names differ")
        print(f"   Concurrent columns: {concurrent_columns - sequential_columns}")
        print(f"   Sequential columns: {sequential_columns - concurrent_columns}")

    print("\n✅ Sequential is simpler and better for debugging")
else:
    # Name the missing result so the skipped comparison is visible.
    missing_results = [
        label
        for label, result in (("concurrent", concurrent_result), ("sequential", sequential_result))
        if result is None
    ]
    print(f"\n⚠️  Comparison skipped: no result available for {', '.join(missing_results)}")

In [None]:
df_sequential

In [None]:
# Display the sequential result. The name previously used here,
# `df_non_concurrent`, is not defined anywhere in this notebook.
df_sequential