## This notebook uses some slightly strange operations to sample negatives using ESRI LULC

### First we clip the RA AOI to Sumatra only for sampling (Java is more complicated)

In [3]:
import geopandas as gpd
# Load the Sumatra bounding box and the full RA Indonesia AOI from GCS.
sumatra_bb = gpd.read_file(
    'gs://demeter-labs/tea/geometries/sumatra_bounding_box.geojson'
)
ra_aoi = gpd.read_file(
    'gs://demeter-labs/tea/geometries/ra_aoi_indonesia.geojson'
)

# Keep only the part of the RA AOI that intersects the Sumatra bounding box,
# then write the result back to GCS (it is re-read from this path further down).
sumatra_aoi = ra_aoi.overlay(sumatra_bb, how='intersection')
sumatra_aoi.to_file(
    'gs://demeter-labs/tea/geometries/ra_sumatra_only_aoi.geojson'
)

### We did some quick manual surveying of tea plantations in Sumatra using Google + Google Street View for the exploratory phase and saved approximate bounding boxes. We combine these with the RA-provided tea polygons to make sure we don't accidentally sample any Sumatran tea into the negative dataset.

In [7]:
import ee
# Authenticate to Earth Engine with a service-account key file and initialise
# the client against the 'earthindex' project.
# NOTE(review): the key-file path is specific to one user's machine; consider
# sourcing it from configuration or an environment variable.
credentials = ee.ServiceAccountCredentials(
    email='242968905260-compute@developer.gserviceaccount.com',
    key_file='/home/christopher.x.ren/.config/earthengine/earthindex-7d2c9b94c507.json',
)
ee.Initialize(credentials=credentials, project='earthindex')
import geemap
import pandas as pd
import geopandas as gpd

# Inputs: the positive (tea) points and the Sumatra-only sampling AOI.
pos_gdf = gpd.read_parquet(
    '/home/christopher.x.ren/datasets/ra_tea/pos_gdf_v1_sumatra_2024-11-10.parquet'
)
sumatra_aoi = gpd.read_file(
    'gs://demeter-labs/tea/geometries/ra_sumatra_only_aoi.geojson'
)

# Interactive map used to eyeball the layers added below.
m = geemap.Map(zoom=5, center=[-0.7893, 113.9213])

# Google Maps hybrid basemap (lyrs=y) as an XYZ tile layer.
url = 'https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}'
m.add_tile_layer(url=url, name='Google Hybrid', attribution='Google')

# Legend for the remapped ESRI LULC classes. The two lists are parallel:
# position i holds the name and colour used for remapped class value i + 1.
lulc_viz = dict(
    names=[
        'Water', 'Trees', 'Flooded Vegetation',
        'Crops', 'Built Area', 'Bare Ground',
        'Snow/Ice', 'Clouds', 'Rangeland',
    ],
    colors=[
        '#1A5BAB', '#358221', '#87D19E',
        '#FFDB5C', '#ED022A', '#EDE9E4',
        '#F2FAFF', '#C8C8C8', '#800080',
    ],
)

def remap_lulc(image):
    """Renumber the LULC class codes of ``image`` to a contiguous 1-9 range.

    Calls ``image.remap`` so that the source codes 1, 2, 4, 5, 7, 8, 9, 10, 11
    become 1 through 9 respectively, and returns whatever ``remap`` returns.
    """
    source_codes = [1, 2, 4, 5, 7, 8, 9, 10, 11]
    contiguous_codes = list(range(1, len(source_codes) + 1))
    return image.remap(source_codes, contiguous_codes)

# ESRI 10 m LULC time series: mosaic the 2023 tiles and renumber classes to 1-9.
esri_lulc = ee.ImageCollection(
    "projects/sat-io/open-datasets/landcover/ESRI_Global-LULC_10m_TS"
)
lulc2023 = esri_lulc.filterDate('2023-01-01', '2023-12-31').mosaic()
lulc2023_remapped = remap_lulc(lulc2023)

# Land-cover layer, coloured with the legend palette over class values 1..9.
m.addLayer(
    lulc2023_remapped,
    dict(min=1, max=9, palette=lulc_viz['colors']),
    'ESRI LULC 2023',
)
# Sampling AOI outline and the positive points for visual reference.
m.add_gdf(sumatra_aoi, layer_name='Sumatra Sampling AOI')
m.addLayer(geemap.gdf_to_ee(pos_gdf), dict(color='blue'), 'Positive Points')

# Leave the map as the cell's final expression so the notebook renders it.
m

Map(center=[-0.7893, 113.9213], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchD…

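### Sample using ESRI LULC: adjust the number of samples per class as needed. Here we are sampling the 'Trees' and 'Built Area' classes (plus 'Water' and 'Rangeland'). We could ostensibly also sample the 'Crops' class since we've removed most of the tea areas; however, for v0 we'll explicitly use the cocoa and coffee polygons provided by the RA for that. (Note: `classValues` in the cell below still includes remapped class 4, 'Crops' — remove it if crop negatives should come only from the RA polygons.)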

In [8]:
import shapely
# Earth Engine geometry for the sampling region, built from the first geometry
# in `sumatra_aoi` only.
# NOTE(review): assumes the AOI is a single feature — confirm.
ee_sumatra_aoi = ee.Geometry(
    shapely.geometry.mapping(sumatra_aoi.geometry.iloc[0])
)

# Stratified sample of the remapped LULC image. `classValues` and `classPoints`
# are parallel lists giving the requested point count for each remapped class
# value; numPoints=0 is presumably the count for any class not listed (per the
# Earth Engine docs — verify). A fixed seed keeps the draw repeatable.
lulc_samples = lulc2023_remapped.stratifiedSample(
    numPoints=0,
    classValues=[1, 2, 4, 5, 9],
    classPoints=[200, 1000, 1000, 1000, 300],
    region=ee_sumatra_aoi,
    scale=320,
    seed=0,
    geometries=True,
)

m.addLayer(lulc_samples, {}, 'LULC Sampled Points')

In [9]:
# Pull the sampled points out of Earth Engine into a GeoDataFrame.
lulc_samples_gdf = geemap.ee_to_gdf(lulc_samples)
print(f"Original number of samples: {len(lulc_samples_gdf)}")

# Goal: drop every sample that lies within 320 units (metres in UTM) of a
# positive point. Buffering needs a metric CRS, so derive a UTM zone from the
# longitude of the first positive point.
utm_zone = int(((pos_gdf.geometry.iloc[0].x + 180) / 6) + 1)
utm_crs = f'EPSG:326{utm_zone:02d}'  # EPSG:326xx = northern-hemisphere UTM zones

# Reproject the samples and the positives into that UTM zone.
lulc_samples_utm = lulc_samples_gdf.to_crs(utm_crs)
pos_gdf_utm = pos_gdf.to_crs(utm_crs)

# Buffer each positive by 320 and dissolve the buffers into a single geometry.
pos_gdf_buffered = pos_gdf_utm.copy()
pos_gdf_buffered.geometry = pos_gdf_utm.geometry.buffer(320)
pos_union = pos_gdf_buffered.geometry.unary_union

# Keep only the samples that do not intersect the buffered positives.
lulc_samples_utm = lulc_samples_utm.loc[
    ~lulc_samples_utm.geometry.intersects(pos_union)
]

# Return to geographic coordinates (EPSG:4326).
lulc_samples_gdf_filtered = lulc_samples_utm.to_crs('EPSG:4326')

# Human-readable label for each sampled class value in the 'remapped' band.
# Keys are the contiguous codes produced by remap_lulc, in the same order as
# lulc_viz['names']: 1=Water, 2=Trees, 3=Flooded Vegetation, 4=Crops,
# 5=Built Area, 6=Bare Ground, 7=Snow/Ice, 8=Clouds, 9=Rangeland.
class_mapping = {
    1: 'Water',
    2: 'Trees',
    4: 'Crops',
    # Fix: remapped value 5 is Built Area, not Crops. It was previously
    # labelled 'Crops', which folded all built-area samples into the crop label.
    5: 'Built Area',
    9: 'Rangeland'
}

# Add a 'class' column holding the label for each sample's remapped value.
lulc_samples_gdf_filtered['class'] = lulc_samples_gdf_filtered['remapped'].map(class_mapping)






Original number of samples: 3500


In [10]:
# Report how many samples remain after removing those that intersect the
# buffered positive points.
print(f"Number of samples after filtering: {len(lulc_samples_gdf_filtered)}")

Number of samples after filtering: 3491


In [11]:
# Persist the filtered, labelled negative samples to a local Parquet file.
# NOTE(review): the file name hard-codes the sample count (3491) and lists
# water/built/tree/rangeland, while the sampling cell also requests remapped
# class 4 (Crops) — rename if the sampling parameters change.
lulc_samples_gdf_filtered.to_parquet(
    '/home/christopher.x.ren/datasets/ra_tea/sumatra_neg_water_built_tree_rangeland_samples_3491.parquet')