### Whisp a feature collection

### Setup
- NB use a virtual environment to avoid altering your python environment (https://docs.python.org/3/tutorial/venv.html)

In [1]:
# Earth Engine and Common Libraries
import ee
from pathlib import Path

try:
    ee.Initialize(project='ee-andyarnellgee', opt_url='https://earthengine-highvolume.googleapis.com')
except Exception:
    ee.Authenticate()
    ee.Initialize(project='ee-andyarnellgee', opt_url='https://earthengine-highvolume.googleapis.com')

In [2]:
# Install openforis-whisp (uncomment line if not already installed)
# !pip install --pre openforis-whisp 

# NB installed in editable mode (from terminal: pip install -e [dev])

Installing other requirements 

In [3]:
import openforis_whisp as whisp
# import geopandas as gpd
# import pandas as pd
import json
import geemap


In [4]:
# all_layers = whisp.combine_datasets(pixel_area=False)
# print('Combining datasets complete')
# # print(all_layers.bandNames().getInfo())

Make a folder

In [5]:
folder_path = 'C:/Users/Arnell/Downloads/whisp_example_demo_5' #COGS
Path(folder_path).mkdir(parents=True, exist_ok=True)
print(f"Folder ready: {folder_path}")

Folder ready: C:/Users/Arnell/Downloads/whisp_example_demo_5


Get features

In [6]:
GEOJSON_EXAMPLE_FILEPATH = folder_path+"/random_polygons.geojson"

In [7]:
# # ...existing code...

# def analyze_geojson_grid_coverage(
#     geojson_path, 
#     geom_threshold=250, 
#     bbox_count_threshold=5,
#     resolution_deg=1/80,  # ~30 arc-seconds = 1/120 degrees
#     id_field = 'internal_id',
#     start_index = 1,
# ):
#     import geopandas as gpd
#     import numpy as np
#     from shapely.geometry import box

#     gdf = gpd.read_file(geojson_path)
#     if len(gdf) <= geom_threshold:
#         print("Geometry count below threshold; skipping grid analysis.")
#         return None

#     minx, miny, maxx, maxy = gdf.total_bounds
#     x_coords = np.arange(minx, maxx, resolution_deg)
#     y_coords = np.arange(miny, maxy, resolution_deg)
#     grid_cells = [box(x, y, x + resolution_deg, y + resolution_deg) for x in x_coords for y in y_coords]
#     grid_gdf = gpd.GeoDataFrame({'geometry': grid_cells}, crs=gdf.crs)



#     # Spatial join: which polygons intersect which grid cells
#     joined = gpd.sjoin(grid_gdf, gdf, how="inner", predicate="intersects")
#     # Count how many polygons intersect each grid cell
#     counts = joined.groupby(joined.index).size()
#     # Filter grid cells by threshold
#     selected_cells = grid_gdf.loc[counts[counts >= bbox_count_threshold].index]


#     # Add sequential numeric IDs
#     selected_cells[id_field] = [i + start_index for i in range(len(selected_cells))]

#     return selected_cells

# # ...existing code...

Area of interest 

In [29]:
# Define bounds from the provided Earth Engine geometry
# area in Ghana 
# bounds = [ 
#     -3.04548260909834,  # min_lon
#     5.253961384163733,  # min_lat
#     -1.0179939534016594,  # max_lon
#     7.48307210714245    # max_lat
# ]

# bounds = [ 
#     -2,  # min_lon
#     5.3,  # min_lat
#     -1.9,  # max_lon
#     5.4    # max_lat
# ]

#area in Brazil (mostly)
bounds = [ 
        -40, # min_lon
        20, #min_lat
        -38, # max_lon
        21 # max_lat
]

# area in China
# bounds = [
#     103.44831497309737,  # min_lon
#     25.686366665187148,  # min_lat
#     109.57868606684737,  # max_lon
#     28.79200348254393    # max_lat
# ]

In [30]:
random_geojson = whisp.create_geojson(
    bounds, 
    num_polygons=50, 
    min_area_ha=1, 
    max_area_ha=10, 
    min_number_vert=5, 
    max_number_vert=10)

# Save the GeoJSON to a file
with open(GEOJSON_EXAMPLE_FILEPATH, 'w') as f:
    json.dump(random_geojson, f)

# Use example Whisp inputs (optional)
# GEOJSON_EXAMPLE_FILEPATH = whisp.get_example_data_path("geojson_example.geojson")


In [31]:
# selected = analyze_geojson_grid_coverage(
#     GEOJSON_EXAMPLE_FILEPATH, 
#     geom_threshold=30, 
#     bbox_count_threshold=1, 
#     id_field='internal_id',
#     )

# print(f"Selected grid cells: {len(selected)}")
# if selected is not None:
#     selected.to_file("selected_grid_cells.geojson", driver="GeoJSON")


In [32]:
# GEOJSON_GRID_SELECT = "selected_grid_cells.geojson"

# whisp.reformat_geojson_properties(
#     geojson_path=GEOJSON_GRID_SELECT, 
#     id_field="internal_id",
#     # output_path=folder_path + "/random_polygons_with_ids.geojson",
#     output_path=folder_path + "/"+GEOJSON_GRID_SELECT,
#     remove_properties=True
# )


Make a copy of GeoJSON

In [33]:
# Add IDs to your existing GeoJSON file

#Save to a new file (instead of overwriting)
whisp.reformat_geojson_properties(
    geojson_path=GEOJSON_EXAMPLE_FILEPATH, 
    id_field="internal_id",
    output_path=folder_path + "/random_polygons_with_ids.geojson",
    remove_properties=True
)



Added internal_id to GeoJSON and saved to C:/Users/Arnell/Downloads/whisp_example_demo_5/random_polygons_with_ids.geojson


### Local Whisp stats processing chain

Input example geojson


In [34]:
GEOJSON_EXAMPLE_FILEPATH = folder_path + "/random_polygons_with_ids.geojson"

Obscure/hide the input polygon locations using bounding boxes

In [36]:
# get feature collection from geojson
ee_collection = whisp.convert_geojson_to_ee(
 GEOJSON_EXAMPLE_FILEPATH
)

# slight obscuration - bounding box
ee_bbox_collection = whisp.convert_geojson_to_ee_bbox(
GEOJSON_EXAMPLE_FILEPATH
)

#selected larger grid cells contianing multiple features
# ee_grid_select = whisp.convert_geojson_to_ee(GEOJSON_GRID_SELECT)

# Full obscuration - extend, shift, and add random features
fully_obscured_collection = whisp.convert_geojson_to_ee_bbox_obscured(
    GEOJSON_EXAMPLE_FILEPATH,
    extension_range=[0.001, 0.004],
    shift_geometries=True,
    shift_proportion=1.4,
    pixel_length=0.0002,  # ~10m at equator
    add_random_features=True,
    max_distance=0.07,  # xkm at equator
    random_proportion= .05  # Add X more features as decoys (as a proportion of the original features)
)

map = geemap.Map()

# map.addLayer (ee_grid_select, {}, "ee_grid_select")

map.addLayer(fully_obscured_collection, {}, "Fully Obscured Collection")
# map.addLayer (ee_bbox_collection, {}, "Original bbox Collection")
map.addLayer (ee_collection, {}, "Original Collection")


map.centerObject(ee_collection.first(), 12)  # Center the map on the first feature in the collection

map

Reading GeoJSON file from: C:\Users\Arnell\Downloads\whisp_example_demo_5\random_polygons_with_ids.geojson
Created Earth Engine FeatureCollection with 50 bounding box features


Map(center=[20.58081566274601, -39.56227723793295], controls=(WidgetControl(options=['position', 'transparent_…

In [15]:
# ee_grid_select.first().getInfo()  # Print the first feature in the grid select collection

Download layers and process locally

#### Option A: Simple API (recommended)
Use `whisp_stats_local()` for a one-liner that handles the entire workflow:

In [46]:
# Simple one-liner: handles obscuration, download, VRT, extraction, and cleanup
stats_local = whisp.whisp_stats_local(
    input_geojson_filepath=GEOJSON_EXAMPLE_FILEPATH,
    output_dir=folder_path,
    # Obscuration settings
    extension_range=(0.002, 0.005),
    shift_geometries=True,
    shift_proportion=0.5,
    add_random_features=False,
    # Processing settings  
    max_download_workers=25,
    max_extract_workers=30,
    chunk_size=10,
    cleanup_files=True,  # Remove temp files after processing
)

INFO: Mode: local
Whisp multiband image compiled
INFO: Progress: 5/50 downloads (10%) | Elapsed: 1s | ETA: 9s
INFO: Progress: 10/50 downloads (20%) | Elapsed: 1s | ETA: 4s
INFO: Progress: 15/50 downloads (30%) | Elapsed: 1s | ETA: 3s
INFO: Progress: 20/50 downloads (40%) | Elapsed: 1s | ETA: 2s
INFO: Progress: 25/50 downloads (50%) | Elapsed: 1s | ETA: 1s
INFO: Progress: 30/50 downloads (60%) | Elapsed: 1s | ETA: 1s
INFO: Progress: 35/50 downloads (70%) | Elapsed: 2s | ETA: 1s
INFO: Progress: 40/50 downloads (80%) | Elapsed: 2s | ETA: 1s
INFO: Progress: 45/50 downloads (90%) | Elapsed: 2s | ETA: 0s
INFO: Progress: 50/50 downloads (100%) | Total time: 2s
INFO: Processing 50 features in 5 batches (local mode)...
INFO: Progress: 1/5 batches (20%) | Elapsed: 7s | ETA: calculating...
INFO: Progress: 2/5 batches (40%) | Elapsed: 7s | ETA: 12s
INFO: Progress: 3/5 batches (60%) | Elapsed: 8s | ETA: 6s
INFO: Progress: 4/5 batches (80%) | Elapsed: 8s | ETA: 2s
INFO: Progress: 5/5 batches (100%) 

#### Option B: Step-by-step (for more control)
Use individual functions when you need finer control over each step:

In [37]:
# Full obscuration - extend, shift, and add random features
fully_obscured_collection = whisp.convert_geojson_to_ee_bbox_obscured(
    GEOJSON_EXAMPLE_FILEPATH,
    extension_range=[0.002, 0.005],
    shift_geometries=True,
    shift_proportion=0.5,
    pixel_length=0.0002,  # ~10m at equator
    add_random_features=False,
    max_distance=0.07,  # xkm at equator
    random_proportion= 0.25  # Add X more features as decoys
)

# Get the combined image and band names for column renaming
image = whisp.combine_datasets()
band_names = image.bandNames().getInfo()
print(f"Image has {len(band_names)} bands")

# Parallel processing (faster for many features)
# Uses ThreadPoolExecutor - optimal for I/O-bound downloads
geotiff_paths = whisp.download_geotiffs_for_feature_collection(
    feature_collection=fully_obscured_collection,
    output_dir=folder_path,
    image=image,
    max_features=1000,
    max_workers=30,  # Process X features concurrently
)
whisp.create_vrt_from_folder(folder_path)

# Local zonal stats using parallel ProcessPoolExecutor (optimal for CPU-bound work)
stats = whisp.exact_extract_in_chunks_parallel(
    rasters=folder_path+'/combined_rasters.vrt',
    vector_file=GEOJSON_EXAMPLE_FILEPATH,
    chunk_size=10,
    ops=['sum'],
    max_workers=30,  # Adjust based on your CPU cores
    band_names=band_names,  # Pass band names for proper column naming
)

Whisp multiband image compiled
Image has 196 bands
INFO: Progress: 5/50 downloads (10%) | Elapsed: 1s | ETA: 5s
INFO: Progress: 10/50 downloads (20%) | Elapsed: 1s | ETA: 3s
INFO: Progress: 15/50 downloads (30%) | Elapsed: 1s | ETA: 2s
INFO: Progress: 20/50 downloads (40%) | Elapsed: 1s | ETA: 1s
INFO: Progress: 25/50 downloads (50%) | Elapsed: 1s | ETA: 1s
INFO: Progress: 30/50 downloads (60%) | Elapsed: 1s | ETA: 1s
INFO: Progress: 35/50 downloads (70%) | Elapsed: 1s | ETA: 0s
INFO: Progress: 40/50 downloads (80%) | Elapsed: 1s | ETA: 0s
INFO: Progress: 45/50 downloads (90%) | Elapsed: 1s | ETA: 0s
INFO: Progress: 50/50 downloads (100%) | Total time: 1s
INFO: Processing 50 features in 5 batches (local mode)...
INFO: Progress: 1/5 batches (20%) | Elapsed: 7s | ETA: calculating...
INFO: Progress: 2/5 batches (40%) | Elapsed: 7s | ETA: 12s
INFO: Progress: 3/5 batches (60%) | Elapsed: 7s | ETA: 5s
INFO: Progress: 4/5 batches (80%) | Elapsed: 9s | ETA: 2s
INFO: Progress: 5/5 batches (100%

Save the results of the local processing

In [18]:
stats.to_csv(folder_path+'/whisp_output_local_processing.csv', index=False)

In [19]:
whisp.combine_datasets()

Whisp multiband image compiled


In [20]:
# # optional

# whisp.delete_all_files_in_folder(folder_path,"*vrt*")
# whisp.delete_all_files_in_folder(folder_path,"*tif*") 
# whisp.delete_all_files_in_folder(folder_path,"*geojson*")
# whisp.delete_all_files_in_folder(folder_path,"*csv*")
# whisp.delete_folder(folder_path) #everything and folder

## Regular Whisp 

In [21]:
# Earth Engine and Common Libraries
import ee
from pathlib import Path

try:
    ee.Initialize(project='ee-andyarnellgee')#, opt_url='https://earthengine-highvolume.googleapis.com')
except Exception:
    ee.Authenticate()
    ee.Initialize(project='ee-andyarnellgee')# opt_url='https://earthengine-highvolume.googleapis.com')

### Whisp it

In [45]:
# df_stats = whisp.whisp_formatted_stats_geojson_to_df(input_geojson_filepath=GEOJSON_EXAMPLE_FILEPATH,external_id_column=None)
df_stats = whisp.whisp_formatted_stats_geojson_to_df(input_geojson_filepath=GEOJSON_EXAMPLE_FILEPATH,external_id_column=None, mode="concurrent")

INFO: Mode: concurrent
INFO: Loaded 50 features
INFO: Processing 50 features in 5 batches (concurrent mode)...
INFO: Progress: 1/5 batches (20%) | Elapsed: 4s | ETA: calculating...
INFO: Progress: 2/5 batches (40%) | Elapsed: 5s | ETA: calculating...
INFO: Progress: 3/5 batches (60%) | Elapsed: 5s | ETA: calculating...
INFO: Progress: 4/5 batches (80%) | Elapsed: 5s | ETA: calculating...
INFO: Progress: 5/5 batches (100%) | Total time: 19s
INFO: Processing complete: 5/5 batches in 19s
INFO: Processing complete: 50 features
INFO: Concurrent processing + formatting + validation complete


### Display results

In [23]:
df_stats

Unnamed: 0,plotId,external_id,Area,Geometry_type,Country,ProducerCountry,Admin_Level_1,Centroid_lon,Centroid_lat,Unit,...,TMF_regrowth_2023,ESRI_2023_TC,Oil_palm_2023_FDaP,Rubber_2023_FDaP,Coffee_FDaP_2023,Cocoa_2023_FDaP,ESRI_crop_gain_2020_2023,GFW_logging_before_2020,geo,whisp_processing_metadata
0,1,,8.315,Polygon,BRA,BR,Amazonas,-66.099522,-1.930211,ha,...,0.0,8.315,0.0,0.0,0.0,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-66.1009...","{'whisp_version': '1.0.0a1', 'processing_times..."
1,2,,9.026,Polygon,Unknown,not found,Unknown,-38.397412,-27.142287,ha,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-38.3991...","{'whisp_version': '1.0.0a1', 'processing_times..."
2,3,,5.699,Polygon,BRA,BR,Piauí,-42.686616,-6.129782,ha,...,0.0,1.602,0.0,0.022,0.02,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-42.6880...","{'whisp_version': '1.0.0a1', 'processing_times..."
3,4,,4.88,Polygon,BRA,BR,Maranhão,-46.712073,-7.687659,ha,...,0.0,4.752,0.0,0.127,0.0,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-46.7133...","{'whisp_version': '1.0.0a1', 'processing_times..."
4,5,,4.283,Polygon,ARG,AR,Santiago Del Estero,-61.849904,-28.622885,ha,...,0.0,0.52,0.0,0.0,0.0,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-61.8510...","{'whisp_version': '1.0.0a1', 'processing_times..."
5,6,,6.711,Polygon,BRA,BR,Maranhão,-46.433828,-4.228716,ha,...,0.0,0.0,0.0,0.0,0.0,0.0,6.71,0.0,"{'type': 'Polygon', 'coordinates': [[[-46.4353...","{'whisp_version': '1.0.0a1', 'processing_times..."
6,7,,9.554,Polygon,Unknown,not found,Unknown,-33.225119,-19.303387,ha,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-33.2265...","{'whisp_version': '1.0.0a1', 'processing_times..."
7,8,,3.011,Polygon,BOL,BO,Tarija,-62.689328,-21.525242,ha,...,0.0,3.011,0.0,0.0,0.0,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-62.6908...","{'whisp_version': '1.0.0a1', 'processing_times..."
8,9,,7.076,Polygon,PRY,PY,Itapúa,-55.184122,-26.906326,ha,...,0.0,0.0,0.0,0.0,0.0,0.0,0.006,0.0,"{'type': 'Polygon', 'coordinates': [[[-55.1854...","{'whisp_version': '1.0.0a1', 'processing_times..."
9,10,,2.15,Polygon,BRA,BR,Paraíba,-38.019434,-6.776552,ha,...,0.0,0.386,0.0,0.02,0.0,0.0,0.0,0.0,"{'type': 'Polygon', 'coordinates': [[[-38.0202...","{'whisp_version': '1.0.0a1', 'processing_times..."


### Export table to CSV

In [24]:
df_stats.to_csv(folder_path+"/"+"whisp_output_regular.csv",index=False)

In [25]:
# Define the list of desired band names
selected_bands = [
    "nBR_DETER_forestdegradation_Amazon_after2020",
    "nBR_DETER_forestdegradation_Amazon_upto2020",
    "nBR_INPE_TCamz_perennial_2020",
    "nBR_INPE_TCsilviculture_Amazon_2020",
    "nBR_MapBiomas_col9_pc_2020",
    "nBR_MapBiomas_col9_rice_2020"
]

# Get the combined image and select only the desired bands
all_layers = whisp.combine_datasets(pixel_area=False)
selected_layers = all_layers.select(selected_bands)

# Create a geemap map
import geemap
Map = geemap.Map()

# Add each band as a separate binary layer (0/1), thresholding at >0
for band in selected_bands:
    binary_layer = selected_layers.select(band).gt(0).selfMask()
    Map.addLayer(binary_layer, {'min': 0, 'max': 1, 'palette': ['white', 'green']}, band)

Map.centerObject(ee_collection.first(), 8)  # Adjust zoom as needed
Map

TypeError: combine_datasets() got an unexpected keyword argument 'pixel_area'

In [None]:
# # Get all band names from the combined image
# all_layers = whisp.combine_datasets(pixel_area=False)
# all_band_names = all_layers.bandNames().getInfo()

# # Use wildcard: select all bands starting with "nBR_"
# selected_bands = [b for b in all_band_names if b.startswith("nBR_")]

# # Select only these bands
# selected_layers = all_layers.select(selected_bands)
# print (f"Selected bands: {selected_bands}")
# import geemap
# Map = geemap.Map()

# # Add each band as a separate binary layer (0/1), thresholding at >0
# for band in selected_bands:
#     binary_layer = selected_layers.select(band).gt(0).selfMask()
#     Map.addLayer(binary_layer, {'min': 0, 'max': 1, 'palette': ['white', 'green']}, band)

# Map.centerObject(ee_collection.first(), 8)
# Map