In [1]:
import os
import subprocess

import geopandas as gpd
import pandas as pd
from tqdm import tqdm

# Load the Planet embedding tiles; only the id and geometry columns are read
# here (the embedding column is not needed to compute centroids).
planet_gdf = gpd.read_parquet(
    "/home/christopher.x.ren/embeddings/ra_tea/planet_embeddings_v2/ps_monthly_sen2_normalized_analytic_8b_sr_subscription_2024_10_mosaic.parquet",
    columns=["id", "geometry"])

# Download the labeled tile-classifier dataset from GCS to a local temp file.
tmp_path = '/tmp/tile_classifier_dataset_v2_java-sumatra.parquet'
# check=True raises CalledProcessError when gsutil fails, so a failed download
# cannot silently fall through to reading a stale or missing local file.
subprocess.run(
    [
        'gsutil', 'cp',
        'gs://demeter-labs/tea/classifier-datasets/tile_classifier_dataset_v2_java-sumatra.parquet',
        tmp_path,
    ],
    check=True)
v2_dataset = gpd.read_parquet(tmp_path)


# UTM zone number (1-60) derived from the centroid longitude of each tile.
planet_gdf['utm_zone'] = ((planet_gdf.geometry.centroid.x + 180) / 6 + 1).astype(int)

# `utm_zone` deliberately stays the plain zone number for both hemispheres:
# adding a hemisphere offset to it would produce codes such as EPSG:32778 that
# do not exist, and would stop the zones from matching the `utm_zone` values
# computed on the labeled dataset in the join cell. The 327xx (south) CRS is
# used for every group below; the 326xx/327xx pairs differ only in their false
# northing, so the centroid reprojected back to EPSG:4326 is the same.
results = []

# Compute each tile's centroid in a projected (metric) CRS, one zone at a time,
# then convert the centroid back to geographic coordinates.
for zone, group in tqdm(
    planet_gdf.groupby('utm_zone'), desc="Processing UTM zones"):
    utm_crs = f'EPSG:327{zone:02d}'
    projected_group = group.to_crs(utm_crs)
    projected_group.geometry = projected_group.geometry.centroid
    reprojected_group = projected_group.to_crs('EPSG:4326')
    results.append(reprojected_group)

# One row per Planet tile, with the geometry replaced by its centroid.
planet_centroid_gdf = gpd.GeoDataFrame(pd.concat(results))


Copying gs://demeter-labs/tea/classifier-datasets/tile_classifier_dataset_v2_java-sumatra.parquet...
/ [1 files][  5.0 MiB/  5.0 MiB]                                                
Operation completed over 1 objects/5.0 MiB.                                      

  planet_gdf['utm_zone'] = ((planet_gdf.geometry.centroid.x + 180) / 6 + 1).astype(int)

  if (planet_gdf.geometry.centroid.y < 0).all():
Processing UTM zones: 100%|██████████| 3/3 [00:23<00:00,  7.97s/it]


In [2]:
# Tag every labeled geometry with the UTM zone number of its centroid longitude.
v2_dataset['utm_zone'] = ((v2_dataset.geometry.centroid.x + 180) / 6 + 1).astype(int)

# Collects one joined frame per UTM zone; concatenated after the loop.
final_results = []

for zone, v2_group in v2_dataset.groupby('utm_zone'):
    # Planet centroids that fall in the same zone as this batch of labels.
    results_subset = planet_centroid_gdf[planet_centroid_gdf.utm_zone == zone].copy()
    if not len(results_subset):
        # No Planet centroids in this zone: these labeled rows are left out.
        continue

    # The southern-hemisphere CRS is used only when every label in the group
    # lies south of the equator; otherwise the northern one is used.
    all_south = (v2_group.geometry.centroid.y < 0).all()
    utm_crs = f'EPSG:327{zone:02d}' if all_south else f'EPSG:326{zone:02d}'

    # Nearest-neighbour join in the projected CRS, so `distance` is expressed
    # in that CRS's units; the result is converted back to EPSG:4326.
    joined = gpd.sjoin_nearest(
        v2_group.to_crs(utm_crs),
        results_subset.to_crs(utm_crs),
        how='left',
        distance_col='distance',
    )
    final_results.append(joined.to_crs('EPSG:4326'))

# Stack the per-zone results into a single frame with a fresh index.
final_df = pd.concat(final_results, ignore_index=True)



  v2_dataset['utm_zone'] = ((v2_dataset.geometry.centroid.x + 180) / 6 + 1).astype(int)

  if (v2_group.geometry.centroid.y < 0).all():

  if (v2_group.geometry.centroid.y < 0).all():

  if (v2_group.geometry.centroid.y < 0).all():


In [3]:
# Largest accepted distance between a labeled point and its matched Planet
# centroid. Defined once so the filter and the report below cannot drift apart.
MAX_MATCH_DISTANCE_M = 160

# Keep only the labeled rows whose nearest Planet centroid is close enough.
filtered_df = final_df[final_df['distance'] <= MAX_MATCH_DISTANCE_M]

n_dropped = len(final_df) - len(filtered_df)
# Avoid a ZeroDivisionError when there are no rows to filter.
pct_dropped = n_dropped / len(final_df) * 100 if len(final_df) else 0.0
print(
    f"Dropped {n_dropped} rows ({pct_dropped:.1f}%) due to distance > {MAX_MATCH_DISTANCE_M}m")



Dropped 25968 rows (12.0%) due to distance > 160m


In [4]:
# Read embeddings using predicate pushdown
embeddings_path = "/home/christopher.x.ren/embeddings/ra_tea/planet_embeddings_v2/ps_monthly_sen2_normalized_analytic_8b_sr_subscription_2024_10_mosaic.parquet"

# Several labeled rows can match the same Planet tile, so de-duplicate the ids
# before handing them to the parquet filter.
ids_to_match = filtered_df['id'].unique().tolist()

# Read only the id and embedding columns, restricted to the matched ids.
embeddings_df = pd.read_parquet(
    embeddings_path,
    columns=['id', 'embedding'],
    filters=[('id', 'in', ids_to_match)]
)

# This cell rebinds `final_df`, which the distance-filter cell reads. If that
# cell is re-run afterwards, `filtered_df` already carries an `embedding`
# column and the merge would emit `embedding_x` / `embedding_y`. Dropping any
# stale column first keeps the cell safe to re-run.
labeled_df = filtered_df.drop(columns=['embedding'], errors='ignore')

# NOTE(review): a left merge repeats a labeled row once per matching row in
# `embeddings_df`; confirm that `id` is unique in the embeddings file.
final_df = labeled_df.merge(embeddings_df, on='id', how='left')
final_df

Unnamed: 0,geometry,tile_id,class,label,utm_zone_left,index_right,id,utm_zone_right,distance,embedding
0,POINT (98.77974 2.70397),47NMC_32_16_10_618_472,ei_neg,0,47,4863586,w0qpru1t0p8,47,64.806475,"[-0.06596038, 0.20158483, -0.90695256, 0.03727..."
1,POINT (98.7927 2.69384),47NMC_32_16_10_611_481,ei_neg,0,47,4982624,w0qr2476suh,47,122.106593,"[-0.044717226, 0.19787532, -0.8978772, 0.04486..."
2,POINT (98.7783 2.70397),47NMC_32_16_10_618_471,ei_neg,0,47,4863583,w0qpru0mnrx,47,64.827851,"[-0.07811086, 0.2025042, -0.91841376, 0.044298..."
3,POINT (98.79126 2.69528),47NMC_32_16_10_612_480,ei_neg,0,47,4982609,w0qr24971b5,47,60.475061,"[-0.03977229, 0.20143303, -0.8918887, 0.038123..."
4,POINT (98.7783 2.70107),47NMC_32_16_10_616_471,ei_neg,0,47,4862905,w0qprg8kqrx,47,63.145542,"[-0.048027113, 0.20090581, -0.89744234, 0.0303..."
...,...,...,...,...,...,...,...,...,...,...
200080,POINT (109.10579 -7.2906),49MBM_32_16_10_438_568,ei_pos,1,49,3220251,qqtwdb8hf10,49,98.947921,"[-0.07875076, 0.20877074, -0.95975745, 0.06282..."
200081,POINT (109.1261 -7.28345),49MBM_32_16_10_443_582,ei_pos,1,49,3221376,qqtwe4pxj2d,49,106.225651,"[-0.09031576, 0.21429557, -0.9952133, 0.070712..."
200082,POINT (109.11455 -7.27472),49MBM_32_16_10_449_574,ei_pos,1,49,3222507,qqtwdgy3786,49,93.675178,"[-0.18163538, 0.22114675, -1.0626813, 0.070246..."
200083,POINT (109.12467 -7.2791),49MBM_32_16_10_446_581,ei_pos,1,49,3221659,qqtwe4tzz0d,49,88.092978,"[-0.08802713, 0.21243253, -0.98677, 0.06588699..."


In [7]:
# Shape of the frame once only the first row per Planet tile id is kept
# (keep='first' is the pandas default).
final_df.drop_duplicates(subset=['id']).shape


(189741, 9)

In [9]:
# Write the dataset to a local scratch file; the upload below reads from it.
tmp_path = "/tmp/planet_2024_10_tile_classifier_dataset_v2_embeddings.parquet"
final_df.to_parquet(tmp_path)

# Upload the scratch file to the destination object in the GCS bucket.
from google.cloud import storage
blob = (
    storage.Client()
    .bucket("demeter-labs")
    .blob("tea/classifier-datasets/planet_2024_10_tile_classifier_dataset_v2_embeddings.parquet")
)
blob.upload_from_filename(tmp_path)
