In [71]:
import geopandas as gpd
import pandas as pd
def _read_labelled(parquet_path, label):
    """Read a GeoParquet file and set its 'class' column to `label` on every row."""
    gdf = gpd.read_parquet(parquet_path)
    gdf['class'] = label
    return gdf


def _stack(parts):
    """Concatenate GeoDataFrames row-wise (original indices are kept as-is)."""
    return gpd.GeoDataFrame(pd.concat(parts), geometry='geometry')


# Positive points: v1 per-island files -> combined v2 file.
java_pos_gdf = _read_labelled(
    "/home/christopher.x.ren/datasets/ra_tea/pos_gdf_v1_java_2024-11-08.parquet",
    'java_pos',
)
sumatra_pos_gdf = _read_labelled(
    "/home/christopher.x.ren/datasets/ra_tea/pos_gdf_v1_sumatra_2024-11-10.parquet",
    'sumatra_pos',
)
pos_gdf = _stack([java_pos_gdf, sumatra_pos_gdf])
pos_gdf.to_parquet("/home/christopher.x.ren/datasets/ra_tea/pos_gdf_v2_java_sumatra_2024-11-11.parquet")

# Negative points: same procedure as for the positives.
java_neg_gdf = _read_labelled(
    "/home/christopher.x.ren/datasets/ra_tea/neg_gdf_v1_java_2024-11-08.parquet",
    'java_neg',
)
sumatra_neg_gdf = _read_labelled(
    "/home/christopher.x.ren/datasets/ra_tea/neg_gdf_v1_sumatra_2024-11-10.parquet",
    'sumatra_neg',
)
neg_gdf = _stack([java_neg_gdf, sumatra_neg_gdf])
neg_gdf.to_parquet("/home/christopher.x.ren/datasets/ra_tea/neg_gdf_v2_java_sumatra_2024-11-11.parquet")

In [66]:
import ee
import geopandas as gpd
import geemap
import shapely
# Authenticate to Earth Engine with a service-account key file on local disk.
_EE_SERVICE_ACCOUNT = '242968905260-compute@developer.gserviceaccount.com'
_EE_KEY_FILE = '/home/christopher.x.ren/.config/earthengine/earthindex-7d2c9b94c507.json'
credentials = ee.ServiceAccountCredentials(_EE_SERVICE_ACCOUNT, _EE_KEY_FILE)
ee.Initialize(credentials=credentials, project='earthindex')


def _first_geometry_to_ee(aoi_gdf):
    """Turn the first geometry of `aoi_gdf` into an ee.Geometry via its GeoJSON mapping."""
    first_shape = aoi_gdf.geometry.iloc[0]
    return ee.Geometry(shapely.geometry.mapping(first_shape))


# Sampling areas of interest, read from GeoJSON files in cloud storage.
sumatra_aoi = gpd.read_file('gs://demeter-labs/tea/geometries/ra_sumatra_only_aoi.geojson')
java_aoi = gpd.read_file('gs://demeter-labs/tea/geometries/java_bounding_box.geojson')
ee_java_aoi = _first_geometry_to_ee(java_aoi)
ee_sumatra_aoi = _first_geometry_to_ee(sumatra_aoi)

# Land-cover class values to sample, and the base number of points for each
# (the two lists are the same length and are paired by position).
esri_lc_classes = [1, 2, 4, 5, 9]
num_points_per_class = [2, 1000, 1000, 1000, 300]

In [67]:
# Interactive map, initially centred on (-0.7893, 113.9213) at zoom level 5.
m = geemap.Map(zoom=5, center=[-0.7893, 113.9213])

# Google Maps hybrid tiles as an extra basemap layer.
url = 'https://mt1.google.com/vt/lyrs=y&x={x}&y={y}&z={z}'
m.add_tile_layer(url, name='Google Hybrid', attribution='Google')

# Legend entries as (class name, hex colour) pairs, in display order.
_lulc_legend = [
    ('Water', '#1A5BAB'),
    ('Trees', '#358221'),
    ('Flooded Vegetation', '#87D19E'),
    ('Crops', '#FFDB5C'),
    ('Built Area', '#ED022A'),
    ('Bare Ground', '#EDE9E4'),
    ('Snow/Ice', '#F2FAFF'),
    ('Clouds', '#C8C8C8'),
    ('Rangeland', '#800080'),
]
lulc_viz = {
    'names': [entry[0] for entry in _lulc_legend],
    'colors': [entry[1] for entry in _lulc_legend],
}

def remap_lulc(image):
    """Renumber the land-cover codes of `image` to consecutive integers.

    The source codes 1, 2, 4, 5, 7, 8, 9, 10, 11 are mapped, in that order,
    onto 1 through 9 using the image's own ``remap`` method.

    Args:
        image: object exposing ``remap(from_values, to_values)``.

    Returns:
        Whatever ``image.remap`` returns for the two code lists.
    """
    source_codes = [1, 2, 4, 5, 7, 8, 9, 10, 11]
    consecutive_codes = list(range(1, len(source_codes) + 1))
    return image.remap(source_codes, consecutive_codes)

# ESRI 10 m land-cover time series: mosaic the images dated within 2023,
# then renumber the classes with remap_lulc.
esri_lulc = ee.ImageCollection("projects/sat-io/open-datasets/landcover/ESRI_Global-LULC_10m_TS")
_lulc_2023_images = esri_lulc.filterDate('2023-01-01', '2023-12-31')
lulc2023 = _lulc_2023_images.mosaic()
lulc2023_remapped = remap_lulc(lulc2023)

# Land-cover raster, coloured with the legend palette over the value range 1-9.
_lulc_vis_params = {'min': 1, 'max': 9, 'palette': lulc_viz['colors']}
m.addLayer(lulc2023_remapped, _lulc_vis_params, 'ESRI LULC 2023')

# Sumatra sampling area outline and the positive points.
m.add_gdf(sumatra_aoi, layer_name='Sumatra Sampling AOI')
_pos_points_fc = geemap.gdf_to_ee(pos_gdf)
m.addLayer(_pos_points_fc, {'color': 'blue'}, 'Positive Points')


In [68]:
import numpy as np
# Per-class point counts: 30x the base counts, for each region.
sumatra_num_points = [base_count * 30 for base_count in num_points_per_class]
java_num_points = [base_count * 30 for base_count in num_points_per_class]


def _sample_lulc(region, class_points):
    """Stratified sample of the remapped 2023 land cover inside `region`.

    `class_points` gives the number of points for each value in
    `esri_lc_classes`; numPoints=0 is passed as the default count.
    Sampling uses scale=320, a fixed seed of 0, and keeps point geometries.
    """
    return lulc2023_remapped.stratifiedSample(
        numPoints=0,
        classValues=esri_lc_classes,
        classPoints=class_points,
        region=region,
        scale=320,
        seed=0,
        geometries=True,
    )


sumatra_lulc_samples = _sample_lulc(ee_sumatra_aoi, sumatra_num_points)
java_lulc_samples = _sample_lulc(ee_java_aoi, java_num_points)

# Show both sample sets on the map.
for _samples, _colour, _layer_name in (
    (sumatra_lulc_samples, 'black', 'Sumatra LULC Sampled Points'),
    (java_lulc_samples, 'white', 'Java LULC Sampled Points'),
):
    m.addLayer(_samples, {'color': _colour}, _layer_name)


In [69]:
# Labels for the *remapped* land-cover values that were sampled. In the
# remapped numbering (see remap_lulc / lulc_viz['names']) 4 is Crops and
# 5 is Built Area, so 5 must not share the crops label.
java_class_mapping = {
    1: 'java_water',
    2: 'java_trees',
    4: 'java_crops',
    5: 'java_built',
    9: 'java_rangeland'
}

sumatra_class_mapping = {
    1: 'sumatra_water',
    2: 'sumatra_trees',
    4: 'sumatra_crops',
    5: 'sumatra_built',
    9: 'sumatra_rangeland'
}

# Download the Earth Engine samples and label them from the 'remapped' band value.
java_neg_samples_gdf = geemap.ee_to_gdf(java_lulc_samples)
java_neg_samples_gdf['class'] = java_neg_samples_gdf['remapped'].map(java_class_mapping)

sumatra_neg_samples_gdf = geemap.ee_to_gdf(sumatra_lulc_samples)
sumatra_neg_samples_gdf['class'] = sumatra_neg_samples_gdf['remapped'].map(sumatra_class_mapping)

# Combine samples before filtering
lulc_samples_gdf = gpd.GeoDataFrame(pd.concat([java_neg_samples_gdf, sumatra_neg_samples_gdf]), geometry='geometry')
print(f"Original number of samples: {len(lulc_samples_gdf)}")

# Samples closer than this to any positive point are discarded (metres).
POS_EXCLUSION_RADIUS_M = 320

# UTM zone number of every point, derived from its x coordinate
# (assumes both frames hold longitude/latitude coordinates -- TODO confirm for pos_gdf).
utm_zones = ((pos_gdf.geometry.x + 180) / 6 + 1).astype(int)
lulc_samples_gdf['utm_zone'] = ((lulc_samples_gdf.geometry.x + 180) / 6 + 1).astype(int)
unique_zones = utm_zones.unique()

# Plain arrays for masking: both frames carry repeated index labels after concat.
pos_zone_values = utm_zones.values
sample_zone_values = lulc_samples_gdf['utm_zone'].values

# Visit the zones that contain positives first (original order), then any zone
# that only contains samples. Iterating over the positive zones alone would
# silently drop every sample lying in a zone without positive points.
positive_zone_set = set(unique_zones)
sample_only_zones = [z for z in pd.unique(sample_zone_values) if z not in positive_zone_set]
zones_to_process = list(unique_zones) + sample_only_zones

# Process each UTM zone separately and combine results
# NOTE: positives are only compared with samples of the same zone, so a positive
# point within the exclusion radius of a zone edge does not remove samples that
# lie just across that edge.
filtered_samples = []
for zone in zones_to_process:
    zone_samples_gdf = lulc_samples_gdf[sample_zone_values == zone]
    if zone_samples_gdf.empty:
        continue

    zone_pos_gdf = pos_gdf[pos_zone_values == zone]
    if zone_pos_gdf.empty:
        # Nothing to exclude against in this zone: keep all of its samples.
        filtered_samples.append(zone_samples_gdf.to_crs('EPSG:4326'))
        continue

    # EPSG:326xx is the northern-hemisphere UTM series; it is used for every
    # latitude here. Southern points simply get negative northings, which does
    # not affect the metric distances needed for the buffer.
    utm_crs = f'EPSG:326{zone:02d}'

    # Project points in this zone to their UTM CRS
    zone_pos_utm = zone_pos_gdf.to_crs(utm_crs)
    zone_samples_utm = zone_samples_gdf.to_crs(utm_crs)

    # Buffer the positive points and merge the buffers into one geometry.
    # union_all() replaces the deprecated unary_union in newer GeoPandas.
    pos_buffers = zone_pos_utm.geometry.buffer(POS_EXCLUSION_RADIUS_M)
    if hasattr(pos_buffers, 'union_all'):
        zone_union = pos_buffers.union_all()
    else:
        zone_union = pos_buffers.unary_union

    # Filter samples in this zone
    zone_filtered = zone_samples_utm[~zone_samples_utm.geometry.intersects(zone_union)]
    filtered_samples.append(zone_filtered.to_crs('EPSG:4326'))

# Combine results and convert back to WGS84 (pd.concat raises on an empty list,
# so fall back to an empty slice of the input when nothing was collected).
if filtered_samples:
    combined_samples = pd.concat(filtered_samples)
else:
    combined_samples = lulc_samples_gdf.iloc[0:0]
lulc_samples_gdf_filtered = gpd.GeoDataFrame(combined_samples, geometry='geometry')
lulc_samples_gdf_filtered = lulc_samples_gdf_filtered.to_crs('EPSG:4326')
print(f"Number of samples after filtering: {len(lulc_samples_gdf_filtered)}")

Original number of samples: 198120
Number of samples after filtering: 196861


In [70]:
# Put the actual row count in the file name instead of a hard-coded number,
# so the name cannot drift from the data when the filtered set changes.
n_filtered_samples = len(lulc_samples_gdf_filtered)
lulc_samples_gdf_filtered.to_parquet(
    f"/home/christopher.x.ren/datasets/ra_tea/java_sumatra_water_built_tree_rangeland_samples_{n_filtered_samples}.parquet")