### Whisp a GeoJSON

Setup
- Use a [virtual environment](https://docs.python.org/3/tutorial/venv.html) to avoid altering your Python environment

Usage:
- Use this notebook with smaller datasets (e.g., up to 10,000 features). 
- For larger datasets consider the 'whisp_geojson_to_drive.ipynb' notebook, which is more suited to heavy processing
- Please report issues with this notebook [here](https://github.com/forestdatapartnership/whisp/issues)

In [100]:
# Earth Engine and Common Libraries
import ee
from pathlib import Path

# Authenticate and initialize Earth Engine.
# Initialization is attempted first; the interactive ee.Authenticate() flow is
# only triggered when that attempt raises, after which initialization is retried.
try:
    ee.Initialize()  # Try to use existing credentials first
except Exception:
    ee.Authenticate() # Authenticate may open a browser window
    ee.Initialize()

# NB if not working add your cloud project: ee.Initialize(project="your_gee_cloud_project_name")

In [101]:
# Install openforis-whisp (uncomment line if not already installed)
# !pip install --pre openforis-whisp

# NB for editable mode install via your terminal with: pip install -e .[dev]

In [102]:
import openforis_whisp as whisp

In [103]:
import pandas as pd
import geopandas as gpd
from openforis_whisp.parameters.config_runtime import (
    admin_1_column, 
    iso3_country_column, 
    iso2_country_column, 
    geometry_type_column, 
    centroid_x_coord_column, 
    centroid_y_coord_column
)


In [None]:
def join_admin_codes(df, lookup_dict, id_col='id_col'):
    """
    Join admin names and ISO country codes to a DataFrame using a lookup dictionary.

    Output columns are named using config_runtime.py variables.

    Args:
        df (pd.DataFrame): Table holding the admin code in ``id_col``. It is
            copied before the join key is added, so the caller's frame is
            not modified.
        lookup_dict (dict): Maps an integer admin code to a dict with the keys
            'gaul0_name', 'gaul1_name', 'iso2_code' and 'iso3_code'.
        id_col (str): Name of the column in ``df`` that holds the admin code.
            Missing values are allowed; those rows get no match.

    Returns:
        pd.DataFrame: The rows of ``df`` (left join, fresh integer index) with
        the admin level 1 name, ISO3 code and ISO2 code appended. The
        'gaul1_code' and 'gaul0_name' lookup columns and the temporary join
        key are removed.

    Raises:
        KeyError: If ``id_col`` is not a column of ``df``.
    """
    from openforis_whisp.parameters.config_runtime import admin_1_column, iso3_country_column, iso2_country_column

    # Temporary join key; named so it is unlikely to clash with user columns.
    join_key = "__gaul1_join_key__"

    lookup_df = pd.DataFrame.from_dict(lookup_dict, orient='index')
    lookup_df.index.name = 'gaul1_code'
    lookup_df = lookup_df.reset_index()

    # Codes may arrive as floats with NaN (no admin unit found). Cast them to
    # int so they match the integer lookup keys; -9999 is a sentinel for
    # missing codes and is not expected to be a key of the lookup.
    keyed_df = df.copy()
    keyed_df[join_key] = keyed_df[id_col].fillna(-9999).astype(int)

    merged_df = keyed_df.merge(lookup_df, left_on=join_key, right_on='gaul1_code', how='left')
    merged_df = merged_df.rename(columns={
        'gaul1_name': admin_1_column,
        'iso3_code': iso3_country_column,
        'iso2_code': iso2_country_column
    })
    merged_df = merged_df.drop(columns=[join_key, 'gaul1_code', "gaul0_name"])
    return merged_df
# Usage:
# result_df = join_admin_codes(example_df, lookup_dict, id_col='id_col')

In [196]:
# Function to extract centroid, geometry type, and coordinates from a GeoDataFrame using GeoPandas (faster for local data)

def extract_centroid_and_geomtype_gpd(
    gdf,
    x_col='centroid_x',
    y_col='centroid_y',
    type_col='geometry_type',
    external_id_col=None,
    return_attributes_only=False
):
    """
    Append centroid x/y values and the geometry type of each row to a copy of ``gdf``.

    Only plain value columns are written: no centroid geometry column is added
    and the input frame is left untouched. Centroids are taken from
    ``gdf.geometry.centroid`` as-is; no reprojection happens here.

    Args:
        gdf (GeoDataFrame): Input GeoDataFrame.
        x_col (str): Name of the column receiving the centroid x value.
        y_col (str): Name of the column receiving the centroid y value.
        type_col (str): Name of the column receiving the geometry type.
        external_id_col (str, optional): Existing ID column to carry into the
            attributes-only output. Ignored when it is not a column of ``gdf``.
        return_attributes_only (bool, optional): If True, return only the new
            columns (preceded by ``external_id_col`` when present) with a
            fresh integer index.

    Returns:
        GeoDataFrame or DataFrame: The enriched copy, or the attribute subset
        when ``return_attributes_only`` is True.
    """
    enriched = gdf.copy()

    # Store coordinates as plain values, not as a geometry column.
    centres = enriched.geometry.centroid
    enriched[x_col], enriched[y_col] = centres.x, centres.y
    enriched[type_col] = enriched.geometry.geom_type

    if not return_attributes_only:
        return enriched

    keep = [x_col, y_col, type_col]
    if external_id_col and external_id_col in enriched.columns:
        keep.insert(0, external_id_col)
    return enriched[keep].reset_index(drop=True)

# Example usage:
# gdf = gpd.read_file(GEOJSON_EXAMPLE_FILEPATH)
# gdf_with_centroids = extract_centroid_and_geomtype_gpd(gdf, return_attributes_only=True)
# print(gdf_with_centroids[["centroid_x", "centroid_y", "geometry_type"]].head())


In [197]:
# --- Helper functions for extracting properties from FeatureCollection ---

# Function to extract centroid, geometry type, and coordinates from an ee.Feature

# (Assumes you have a FeatureCollection 'fc')

def extract_centroid_and_geomtype(fc, x_col='centroid_x', y_col='centroid_y', type_col='geometry_type'):
    """
    Set centroid coordinates and geometry type as properties on every feature of ``fc``.

    Args:
        fc: Collection whose ``map`` method is used to process each feature
            (an ee.FeatureCollection in this notebook).
        x_col (str): Property name for the first centroid coordinate.
        y_col (str): Property name for the second centroid coordinate.
        type_col (str): Property name for the geometry type.

    Returns:
        The result of ``fc.map(...)``: each feature with the three properties set.
    """
    def _round_6dp(value):
        # Scale up, round to an integer, scale back down: 6 decimal places.
        return ee.Number(value).multiply(1e6).round().divide(1e6)

    def _annotate(feature):
        # centroid(1): the argument is presumably maxError - confirm against the EE API.
        lon_lat = feature.geometry().centroid(1).coordinates()
        properties = {
            x_col: _round_6dp(lon_lat.get(0)),
            y_col: _round_6dp(lon_lat.get(1)),
            type_col: feature.geometry().type()
        }
        return feature.set(properties)

    return fc.map(_annotate)

# # Example usage:
# fc_with_centroids = extract_centroid_and_geomtype(fc)
# df = whisp.convert_ee_to_df(fc_with_centroids, remove_geom=True)


# # Now join admin codes using lookup_dict and the centroid-based admin code column
# from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
# result_df = join_admin_codes(df, lookup_dict, id_col='first')  # 'first' is the admin code column from reduceRegions

# result_df.head()

In [198]:
import sys
from pathlib import Path

# Import the lookup dictionary.
# Prefer the installed openforis_whisp package (the same package imported
# earlier in this notebook) so the cell does not depend on the current working
# directory. Fall back to the source checkout only when that import fails.
try:
    from openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict
except ImportError:
    # Add project root to sys.path (assumes the notebook runs one level below it)
    sys.path.append(str(Path.cwd().parent))
    from src.openforis_whisp.parameters.lookup_gaul1_admin import lookup_dict

# Print the first 5 items
print(list(lookup_dict.items())[:5])

[(1000, {'gaul0_name': 'Abyei', 'gaul1_name': 'Administrative Unit Not Available', 'iso2_code': 'not found', 'iso3_code': 'xAB'}), (1001, {'gaul0_name': 'Algeria', 'gaul1_name': 'Adrar', 'iso2_code': 'DZ', 'iso3_code': 'DZA'}), (1002, {'gaul0_name': 'Algeria', 'gaul1_name': 'Ain-Defla', 'iso2_code': 'DZ', 'iso3_code': 'DZA'}), (1003, {'gaul0_name': 'Algeria', 'gaul1_name': 'Ain-Temouchent', 'iso2_code': 'DZ', 'iso3_code': 'DZA'}), (1004, {'gaul0_name': 'Algeria', 'gaul1_name': 'Alger', 'iso2_code': 'DZ', 'iso3_code': 'DZA'})]


In [199]:
# Working folder for the generated test data. It is derived from the current
# user's home directory instead of a hard-coded absolute path, so the notebook
# runs on any machine. Kept as a str so string-based uses keep working.
folder_path = str(Path.home() / "Downloads" / "a_processing_tests")
# Create the folder up front; a later cell writes the GeoJSON file into it.
Path(folder_path).mkdir(parents=True, exist_ok=True)

GEOJSON_EXAMPLE_FILEPATH = str(Path(folder_path) / "random_polygons.geojson")

# Bounding box of the GAUL level-1 features whose 'gaul0_name' is 'Brazil'.
geom = (ee.FeatureCollection("projects/sat-io/open-datasets/FAO/GAUL/GAUL_2024_L1")
    .filter(ee.Filter.eq('gaul0_name', 'Brazil')).geometry().bounds()
)

# Generate random test polygons inside that bounding box.
geojson = whisp.generate_test_polygons(
    bounds = geom,
    num_polygons=100,
    min_area_ha=10,
    max_area_ha=20,
    min_number_vert=100,
    max_number_vert=200
    )



[utils.py | generate_test_polygons() | l.378] INFO: Extracting bounds from Earth Engine Geometry...


2025-10-24 11:44:41,676 - INFO - Refreshing credentials due to a 401 response. Attempt 1/2.


[utils.py | generate_test_polygons() | l.391] INFO: Bounds: [-73.98, -33.75, -28.85, 5.27]
[utils.py | generate_test_polygons() | l.419] INFO: Generating 100 test polygons with 100-200 vertices...
[utils.py | generate_test_polygons() | l.419] INFO: Generating 100 test polygons with 100-200 vertices...
[utils.py | generate_test_polygons() | l.467] INFO: Generated 100 polygons!
[utils.py | generate_test_polygons() | l.473] INFO: Vertex count - Requested: 100-199, Actual: 100-199
[utils.py | generate_test_polygons() | l.481] INFO: Area (ha) - Requested: 10.2-20.0, Actual: 9.3-21.0
[utils.py | generate_test_polygons() | l.467] INFO: Generated 100 polygons!
[utils.py | generate_test_polygons() | l.473] INFO: Vertex count - Requested: 100-199, Actual: 100-199
[utils.py | generate_test_polygons() | l.481] INFO: Area (ha) - Requested: 10.2-20.0, Actual: 9.3-21.0


Prepare inputs

In [200]:
import json

# Persist the generated polygons so the file-based loaders below can read them.
with open(GEOJSON_EXAMPLE_FILEPATH, mode="w") as f:
    f.write(json.dumps(geojson))
    


In [201]:
# Read the polygons back from disk as a GeoDataFrame.
gdf = gpd.read_file(GEOJSON_EXAMPLE_FILEPATH)
# The reprojection is commented out, so gdf_reproj is currently just another name for gdf.
gdf_reproj = gdf#.to_crs(epsg=6933)  # Reproject to equal area if necessary
# Add centroid and geometry-type columns, named after the config_runtime constants.
gdf_w_centroids = extract_centroid_and_geomtype_gpd(gdf_reproj, x_col=centroid_x_coord_column, y_col=centroid_y_coord_column, type_col=geometry_type_column)
# NOTE(review): to_crs presumably transforms only the geometry column, so the centroid
# value columns computed above would stay in the earlier CRS if the reprojection
# on gdf_reproj is ever enabled - confirm before enabling it.
gdf_w_centroids_unproj = gdf_w_centroids.to_crs(epsg=4326) 
# Plain-text preview of the three new columns, then rich display of the full frame.
print(gdf_w_centroids_unproj[[centroid_x_coord_column, centroid_y_coord_column, geometry_type_column]])
gdf_w_centroids_unproj

    Centroid_lon  Centroid_lat Geometry_type
0     -31.908015     -9.797776       Polygon
1     -30.143440     -0.586545       Polygon
2     -39.121779    -25.112656       Polygon
3     -46.688431      3.741962       Polygon
4     -39.326899     -6.198758       Polygon
..           ...           ...           ...
95    -55.386472    -23.799740       Polygon
96    -36.587440    -29.623171       Polygon
97    -36.288693    -12.388983       Polygon
98    -40.331960    -27.912064       Polygon
99    -37.153729    -12.062214       Polygon

[100 rows x 3 columns]





  centroid_points = gdf.geometry.centroid


Unnamed: 0,internal_id,requested_vertices,actual_vertices,requested_area_ha,actual_area_ha,geometry,Centroid_lon,Centroid_lat,Geometry_type
0,1,130,130,19.39,20.99,"POLYGON ((-31.90546 -9.79775, -31.90551 -9.797...",-31.908015,-9.797776,Polygon
1,2,184,184,14.07,13.45,"POLYGON ((-30.14149 -0.58653, -30.1415 -0.5864...",-30.143440,-0.586545,Polygon
2,3,160,160,13.69,14.98,"POLYGON ((-39.1193 -25.11266, -39.11926 -25.11...",-39.121779,-25.112656,Polygon
3,4,186,186,14.73,16.35,"POLYGON ((-46.68619 3.74197, -46.68624 3.74204...",-46.688431,3.741962,Polygon
4,5,125,125,17.14,16.71,"POLYGON ((-39.32469 -6.19875, -39.32468 -6.198...",-39.326899,-6.198758,Polygon
...,...,...,...,...,...,...,...,...,...
95,96,160,160,16.54,16.75,"POLYGON ((-55.38453 -23.79975, -55.38453 -23.7...",-55.386472,-23.799740,Polygon
96,97,106,106,12.99,13.19,"POLYGON ((-36.58499 -29.62317, -36.58493 -29.6...",-36.587440,-29.623171,Polygon
97,98,170,170,12.92,12.33,"POLYGON ((-36.28691 -12.38899, -36.28696 -12.3...",-36.288693,-12.388983,Polygon
98,99,103,103,17.65,17.88,"POLYGON ((-40.32923 -27.91205, -40.32915 -27.9...",-40.331960,-27.912064,Polygon


Process GeoJSON

In [202]:
# import json
# # import ee

# def convert_geojson_to_ee_quick(geojson_filepath):
#     with open(geojson_filepath, "r") as f:
#         geojson_data = json.load(f)
#     return ee.FeatureCollection(geojson_data)

In [203]:
# Load the GeoJSON file through the whisp helper; the result is used as an
# Earth Engine collection (fc.map, reduceRegions) in the cells below.
fc = whisp.convert_geojson_to_ee(GEOJSON_EXAMPLE_FILEPATH)
# fc = convert_geojson_to_ee_quick(GEOJSON_EXAMPLE_FILEPATH)

In [204]:
# Add centroid / geometry-type properties server-side, using the config_runtime column names.
fc_w_centroid = extract_centroid_and_geomtype(fc, x_col=centroid_x_coord_column, y_col=centroid_y_coord_column, type_col=geometry_type_column)
# Fetch and print only the first feature as a sanity check (getInfo pulls it to the client).
print(fc_w_centroid.first().getInfo())

{'type': 'Feature', 'geometry': {'type': 'Polygon', 'coordinates': [[[-31.905456, -9.797751], [-31.905514, -9.797631], [-31.905591, -9.797517], [-31.905678, -9.797411], [-31.905761, -9.797311], [-31.905832, -9.797215], [-31.905886, -9.797118], [-31.905924, -9.797018], [-31.905954, -9.796914], [-31.905984, -9.79681], [-31.906024, -9.796709], [-31.906082, -9.796617], [-31.90616, -9.796539], [-31.906256, -9.796477], [-31.906363, -9.796429], [-31.906472, -9.796389], [-31.906574, -9.79635], [-31.906663, -9.796304], [-31.906738, -9.796244], [-31.906801, -9.796169], [-31.906857, -9.796082], [-31.906915, -9.795989], [-31.90698, -9.795899], [-31.907056, -9.795823], [-31.907145, -9.795766], [-31.907244, -9.795731], [-31.907347, -9.795713], [-31.907452, -9.795703], [-31.907555, -9.795693], [-31.907655, -9.795671], [-31.907752, -9.795632], [-31.907852, -9.795575], [-31.907956, -9.795505], [-31.908066, -9.795432], [-31.908183, -9.795368], [-31.908305, -9.795324], [-31.908428, -9.795306], [-31.90854

In [205]:
# import json
# import time

# geojson_filepath = GEOJSON_EXAMPLE_FILEPATH#"your_file.geojson"

# # --- Method 1: Using json.load ---
# def method_json_load(path):
#     with open(path) as f:
#         return ee.FeatureCollection(json.load(f))

# # --- Method 2: Using raw string (no JSON parsing) ---
# def method_read_string(path):
#     with open(path) as f:
#         return ee.FeatureCollection(f.read())

# # --- Method 3: Using preloaded dict (in-memory) ---
# def method_preloaded_dict(data):
#     return ee.FeatureCollection(data)

# # --- Benchmark ---
# start = time.time()
# fc1 = method_json_load(geojson_filepath)
# print("Feature count (example):", fc1.size().getInfo())
# t1 = time.time() - start

# start = time.time()
# fc2 = method_read_string(geojson_filepath)
# t2 = time.time() - start

# with open(geojson_filepath) as f:
#     data = json.load(f)
# start = time.time()
# fc3 = method_preloaded_dict(data)
# print("Feature count (example):", fc3.size().getInfo())
# t3 = time.time() - start

# print(f"json.load()       : {t1:.5f} sec")
# print(f"read() string     : {t2:.5f} sec")
# print(f"preloaded dict    : {t3:.5f} sec")

# # Optional: confirm identical feature counts
# # print("Feature count (example):", fc1.size().getInfo())
# # print("Feature count (example):", fc2.size().getInfo())
# # print("Feature count (example):", fc3.size().getInfo())


In [None]:
# Load the admin-code image asset and rename its band to "admin_gaul".
# NOTE(review): the asset lives under a personal project path - confirm other users can read it.
admin_image = ee.Image("projects/ee-andyarnellgee/assets/admin_gaul").rename("admin_gaul")

In [207]:
# Replace every feature by a new feature built from its centroid geometry only,
# so the original properties are not carried over.
fc_centroids = fc.map(lambda feature: ee.Feature(feature.geometry().centroid()) )
# Print the first centroid feature as a sanity check.
print(fc_centroids.first().getInfo())

{'type': 'Feature', 'geometry': {'type': 'Point', 'coordinates': [-31.90801548024177, -9.79777628148788]}, 'id': '0', 'properties': {}}


In [208]:
# Sample the admin image at each centroid with the 'first' reducer at scale=500;
# the sampled value is read back below from the 'first' property.
admin_codes = admin_image.reduceRegions(fc_centroids,reducer='first',scale=500)
# Pull the sampled codes to the client and print them.
print(admin_codes.aggregate_array('first').getInfo())

[1848, 1906, 1838, 2373, 1838, 1864, 1846, 1855, 1846, 1858, 1852, 1843, 2373, 1855, 1858, 1860, 1854, 1846, 1764, 2182, 1838, 1853, 1860, 1855, 1854, 1831, 2203, 1854, 1769, 1869, 1854, 1858, 1761, 1864, 1760, 2216, 1853, 1753, 1768, 1853, 2373, 1837, 1846, 2220, 1858, 1858, 1945, 1858, 1843, 1854, 1846, 1858, 1854]


In [209]:
# Convert the reduceRegions result to a DataFrame via the whisp helper, with remove_geom=True.
df = whisp.convert_ee_to_df(admin_codes, remove_geom=True)

In [210]:
# Drop the 'geo' column if it is present; rebinding keeps the cell safe to re-run.
df = df.drop(columns=["geo"], errors="ignore")

In [190]:
# Display the admin-code table produced above.
df

Unnamed: 0,first
0,
1,1845.0
2,
3,
4,1861.0
...,...
95,1846.0
96,2276.0
97,2197.0
98,1853.0


In [191]:
# Join admin names / ISO codes on the 'first' column, then drop that raw code column.
result_df = join_admin_codes(df, lookup_dict, id_col='first').drop(columns=['first'])

In [192]:
# Display the joined admin names and country codes.
result_df

Unnamed: 0,Admin_Level_1,ProducerCountry,Country
0,,,
1,Amapá,BR,BRA
2,,,
3,,,
4,Rio De Janeiro,BR,BRA
...,...,...,...
95,Amazonas,BR,BRA
96,Sipaliwini,SR,SUR
97,Presidente Hayes,PY,PRY
98,Mato Grosso,BR,BRA
