# WHISP: Concurrent vs Sequential Processing Test

Minimal test notebook comparing concurrent (high-volume endpoint) vs sequential (standard endpoint) processing.

## Quick Start
1. Initialize Earth Engine
2. Generate test data
3. Run concurrent processing
4. Run sequential processing
5. Compare results
6. Export results

## Part 1: Setup & Initialization

In [None]:
import ee
import gc
import logging
import json
import tempfile
import os
import io
from contextlib import redirect_stdout
from pathlib import Path

# Start from a clean Earth Engine session so that state left over from an
# earlier run of this notebook is dropped before initializing.
ee.Reset()
gc.collect()

try:
    ee.Initialize()
except Exception:
    # Initialization failed (presumably because no credentials are stored
    # yet): run the interactive auth flow, then initialize again so the
    # session is actually usable before success is reported below.
    # `except Exception` (not a bare `except`) lets Ctrl-C / SystemExit
    # propagate instead of launching the auth flow.
    ee.Authenticate()
    ee.Initialize()

print("[OK] Earth Engine initialized")

In [None]:
# Import WHISP
import openforis_whisp as whisp
from openforis_whisp.advanced_stats import (
    whisp_formatted_stats_geojson_to_df_concurrent,
    whisp_formatted_stats_geojson_to_df_sequential,
)

# Dedicated notebook logger (passed as `logger=` to the WHISP calls further
# down); its threshold is set to INFO.
logger = logging.getLogger("whisp-concurrent")
logger.setLevel(level=logging.INFO)

print('[OK] WHISP imported')

In [None]:
# Test configuration: country codes handed to WHISP, plus the count, area
# range (ha) and vertex-count range used when generating random polygons.
iso2_codes = ['br', 'co', 'ci']
num_polygons = 200
min_area_ha, max_area_ha = 5, 10
min_number_vert, max_number_vert = 50, 100

# One print call, one line per argument: same output as four separate prints.
print(
    "[OK] Test parameters configured",
    f"     Polygons: {num_polygons}",
    f"     Area: {min_area_ha}-{max_area_ha} ha",
    f"     Vertices: {min_number_vert}-{max_number_vert}",
    sep="\n",
)

## Part 2: Generate Test Data

In [None]:
# Test region (Brazil Amazon): GAUL 2024 level-1 units selected by name; the
# bounding box of their combined geometry is passed to the polygon generator.
gaul_level1 = ee.FeatureCollection("projects/sat-io/open-datasets/FAO/GAUL/GAUL_2024_L1")
state_names = ['Amazonas', 'Mato Grosso', 'Rondônia', 'Pará']
state_geom = gaul_level1.filter(ee.Filter.inList('gaul1_name', state_names))
bounds = state_geom.geometry().bounds()

# Generate the random test polygons; anything the generator prints to stdout
# is captured in a throwaway buffer to keep the cell output short.
polygon_options = dict(
    bounds=bounds,
    num_polygons=num_polygons,
    min_area_ha=min_area_ha,
    max_area_ha=max_area_ha,
    min_number_vert=min_number_vert,
    max_number_vert=max_number_vert,
)
with redirect_stdout(io.StringIO()):
    random_geojson = whisp.generate_test_polygons(**polygon_options)

# Reserve a unique temp file name, release the OS-level handle, then write the
# GeoJSON through a regular Python file object.
temp_fd, test_geojson_path = tempfile.mkstemp(suffix='.geojson', text=True)
os.close(temp_fd)
with open(test_geojson_path, 'w') as geojson_out:
    json.dump(random_geojson, geojson_out)

print("[OK] Generated test GeoJSON")
print(f"     Features: {len(random_geojson['features'])}")
print(f"     File: {test_geojson_path}")

## Part 3: Concurrent Processing (High-Volume Endpoint)

In [None]:
# Re-initialize Earth Engine against the high-volume endpoint.
high_volume_url = 'https://earthengine-highvolume.googleapis.com'
ee.Reset()
ee.Initialize(opt_url=high_volume_url)

# Check the base URL the client reports and say which endpoint is active.
# NOTE(review): `_cloud_api_base_url` is a private attribute of `ee.data`.
api_url = str(ee.data._cloud_api_base_url)
print(
    "[OK] Using HIGH-VOLUME endpoint"
    if 'highvolume' in api_url
    else "[WARNING] Not using high-volume endpoint"
)

In [None]:
# Run the concurrent WHISP pipeline on the generated GeoJSON and report the
# size of the resulting table.
banner = "=" * 70
print(f"\n{banner}")
print("TEST 1: CONCURRENT PROCESSING")
print(f"{banner}\n")

try:
    # Options are assembled inside the `try` so that a missing input name is
    # reported by the handler below, exactly like any processing failure.
    concurrent_options = dict(
        input_geojson_filepath=test_geojson_path,
        national_codes=iso2_codes,
        batch_size=10,
        max_concurrent=30,
        validate_geometries=False,
        add_metadata_server=False,
        logger=logger,
    )
    df_concurrent = whisp_formatted_stats_geojson_to_df_concurrent(**concurrent_options)

    print("[OK] Concurrent processing complete!")
    print(f"     Rows: {df_concurrent.shape[0]}")
    print(f"     Columns: {df_concurrent.shape[1]}")
except Exception as exc:
    # Best-effort cell: show the message and full traceback, keep going.
    import traceback
    print(f"[ERROR] {exc}")
    traceback.print_exc()

## Part 4: Sequential Processing (Standard Endpoint)

In [None]:
# Switch back to the standard Earth Engine endpoint for the sequential test.
ee.Reset()
try:
    ee.Initialize(opt_url='https://earthengine.googleapis.com')
except Exception:
    # Initialization failed (presumably missing or expired credentials):
    # authenticate interactively and retry. `except Exception` rather than a
    # bare `except` so Ctrl-C / SystemExit are not turned into an auth prompt.
    ee.Authenticate()
    ee.Initialize(opt_url='https://earthengine.googleapis.com')

print("[OK] Using STANDARD endpoint")

In [None]:
# Run the sequential WHISP pipeline on the same GeoJSON and report the size
# of the resulting table.
banner = "=" * 70
print(f"\n{banner}")
print("TEST 2: SEQUENTIAL PROCESSING")
print(f"{banner}\n")

try:
    # Options are assembled inside the `try` so that a missing input name is
    # reported by the handler below, exactly like any processing failure.
    sequential_options = dict(
        input_geojson_filepath=test_geojson_path,
        national_codes=iso2_codes,
        add_metadata_client_side=True,
        logger=logger,
    )
    df_sequential = whisp_formatted_stats_geojson_to_df_sequential(**sequential_options)

    print("[OK] Sequential processing complete!")
    print(f"     Rows: {df_sequential.shape[0]}")
    print(f"     Columns: {df_sequential.shape[1]}")
except Exception as exc:
    # Best-effort cell: show the message and full traceback, keep going.
    import traceback
    print(f"[ERROR] {exc}")
    traceback.print_exc()

## Part 5: Compare Results

In [None]:
# Compare the two result tables: print both shapes, then report whether the
# sets of column names agree and, if not, which names are unique to each side.
rule = "=" * 70
print(f"\n{rule}")
print("COMPARISON: Concurrent vs Sequential")
print(f"{rule}\n")

if 'df_concurrent' not in locals() or 'df_sequential' not in locals():
    # At least one processing cell did not leave a dataframe behind.
    print("[ERROR] Both dataframes needed for comparison")
else:
    print(f"Concurrent shape:  {df_concurrent.shape}")
    print(f"Sequential shape:  {df_sequential.shape}")

    concurrent_cols = set(df_concurrent.columns)
    sequential_cols = set(df_sequential.columns)
    concurrent_only = concurrent_cols - sequential_cols
    sequential_only = sequential_cols - concurrent_cols

    # Two sets are equal exactly when neither has an element the other lacks.
    if not (concurrent_only or sequential_only):
        print("\n[OK] Column names match!")
    else:
        print("\n[WARNING] Column names differ")
        if concurrent_only:
            print(f"  Only in concurrent: {concurrent_only}")
        if sequential_only:
            print(f"  Only in sequential: {sequential_only}")

## Part 6: Export Results

In [None]:
# Results go to <home>/downloads/whisp_results; the folder and any missing
# parent folders are created, and an already existing folder is accepted.
out_directory = Path.home().joinpath('downloads', 'whisp_results')
out_directory.mkdir(exist_ok=True, parents=True)

print("Output directory:", out_directory)

In [None]:
# Write the concurrent results to CSV (without the index column), provided
# the concurrent run left a dataframe in the notebook namespace.
if 'df_concurrent' not in locals():
    print("[SKIP] Concurrent results not available")
else:
    csv_file = out_directory.joinpath('concurrent_results.csv')
    df_concurrent.to_csv(csv_file, index=False)
    print(f"[OK] Concurrent results exported: {csv_file}")
    print(f"     Rows: {df_concurrent.shape[0]}")

In [None]:
# Write the sequential results to CSV (without the index column), provided
# the sequential run left a dataframe in the notebook namespace.
if 'df_sequential' not in locals():
    print("[SKIP] Sequential results not available")
else:
    csv_file = out_directory.joinpath('sequential_results.csv')
    df_sequential.to_csv(csv_file, index=False)
    print(f"[OK] Sequential results exported: {csv_file}")
    print(f"     Rows: {df_sequential.shape[0]}")

In [None]:
# Export the concurrent results as GeoJSON when a geometry ('geo') column exists.
if 'df_concurrent' not in locals():
    # A missing dataframe is reported as such rather than as a missing
    # geometry column; the wording matches the CSV export cell's skip message.
    print("[SKIP] Concurrent results not available")
elif 'geo' not in df_concurrent.columns:
    print("[INFO] No geometry column found (GeoJSON export skipped)")
else:
    geojson_file = out_directory / 'concurrent_results.geojson'
    try:
        whisp.convert_df_to_geojson(df_concurrent, geojson_file)
        print(f"[OK] Concurrent GeoJSON exported: {geojson_file}")
    except Exception as e:
        # Best-effort export: report the failure but keep the notebook running.
        print(f"[WARNING] Could not export GeoJSON: {e}")

## Summary

- **Concurrent**: Processes multiple batches in parallel using the high-volume endpoint
- **Sequential**: Processes one batch at a time using the standard endpoint
- **Results**: Both methods should produce identical data (same rows and columns); note that the comparison in Part 5 checks only the dataframe shapes and column names, not the cell values
- **Use case**: Concurrent is faster for large datasets; sequential is useful for debugging