## Local embeddings modeling notebook

In [1]:
%load_ext autoreload
%autoreload 2

import geopandas as gpd
import duckdb
import os
from tqdm import tqdm


from datetime import datetime
import json
import os

import annoy
import geopandas as gpd
import ipyleaflet as ipyl
from IPython.display import display
import ipywidgets as ipyw
import joblib
import numpy as np
import pandas as pd

import sys
sys.path.insert(0, 'src')

from ui import GeoLabeler


Error initializing Earth Engine: None could not be converted to bytes, defaulting to 


In [2]:
# Read the labeling-UI settings for the coffee use case.
with open('config/coffee/coffee_ui_config.json', 'r') as f:
    config = json.load(f)

local_dir = config['local_dir']

# Every artifact used below lives directly under local_dir.
annoy_index_path, tile_centroid_path, duckdb_path, valid_tile_dir = (
    os.path.join(local_dir, name)
    for name in ('embeddings.ann', 'centroid_gdf.parquet', 'embeddings.db', 'tiles')
)

# Nearest-neighbour index over the embeddings (angular metric). The vector size
# comes from the config; the original note mentions 384-d ViT-DINO embeddings.
annoy_index = annoy.AnnoyIndex(config['index_dim'], 'angular')
annoy_index.load(annoy_index_path)

# Tile centroids and the DuckDB database holding the embeddings table.
tile_centroid_gdf = gpd.read_parquet(tile_centroid_path)
embeddings_con = duckdb.connect(duckdb_path)

# Remaining settings that are handed to the labeler when it is constructed.
mgrs_ids, start_date, end_date, imagery = (
    config[key] for key in ('mgrs_ids', 'start_date', 'end_date', 'imagery')
)

# Second, independent read of the same centroid file.
gdf = gpd.read_parquet(tile_centroid_path)

In [3]:

from ui import GeoLabeler

# Boundary GeoJSON, resolved two directory levels above the current working directory.
BOUNDARY_PATH = os.path.join(
    os.path.dirname(os.path.dirname(os.getcwd())),
    "places/costa_rica.geojson",
)
BOUNDARY = gpd.read_file(BOUNDARY_PATH)

# Attribution HTML passed to the labeler (MapTiler + OpenStreetMap credits).
maptiler_attribution = (
    '<a href="https://www.maptiler.com/copyright/" target="_blank">&copy; MapTiler</a> '
    '<a href="https://www.openstreetmap.org/copyright" target="_blank">&copy; OpenStreetMap contributors</a>'
)

# Collect the constructor arguments in one place, then build the labeling UI.
labeler_kwargs = dict(
    gdf=tile_centroid_gdf,
    geojson_path=BOUNDARY_PATH,
    mgrs_ids=mgrs_ids,
    start_date=start_date,
    end_date=end_date,
    imagery=imagery,
    annoy_index=annoy_index,
    duckdb_connection=embeddings_con,
    attribution=maptiler_attribution,
)
labeler = GeoLabeler(**labeler_kwargs)

# Text widget shown with the map; handle_mouse_move writes the status line into it.
label = ipyw.Label()
display(label)

def handle_mouse_move(**kwargs):
    """Write the cursor position and current labeling state into ``label``.

    Used as a map interaction callback, so the event payload arrives as keyword
    arguments. Payloads without a ``coordinates`` entry are ignored.

    Args:
        **kwargs: Interaction event fields; only ``coordinates`` (a lat/lon
            pair) is read.
    """
    coordinates = kwargs.get('coordinates')
    if coordinates is None:
        # .get() yields None when the key is missing; unpacking None would raise
        # a TypeError inside the widget callback.
        return
    lat, lon = coordinates

    # select_val encodes the active labeling action: -100 erase, 0 negative,
    # anything else is reported as positive.
    if labeler.select_val == -100:
        label_type = "Erase"
    elif labeler.select_val == 0:
        label_type = "Negative"
    else:
        label_type = "Positive"

    mode = "lasso" if labeler.lasso_mode else "single"
    label.value = f'Lat/lon: {lat:.4f}, {lon:.4f}. Mode: {mode}. Labeling: {label_type}'

# Hook handle_mouse_move up to the labeler's map so it runs on map interaction events.
labeler.map.on_interaction(handle_mouse_move)

Initializing GeoLabeler...
Adding controls...


VBox(children=(Map(center=[9.997282071690657, -84.065583580083], controls=(ZoomControl(options=['position', 'z…

Label(value='')

## Search
The first search may take a while, as the table is loaded into memory.

In [None]:
# Rows of the labeler's GeoDataFrame currently marked positive / negative.
pos = labeler.gdf.loc[labeler.pos_ids]
neg = labeler.gdf.loc[labeler.neg_ids]

if len(pos) == 0:
    # With no positives the mean below is undefined, so fail with a clear
    # message instead of producing an unusable query vector.
    raise ValueError("Label at least one positive tile on the map before running the search.")

# Columns returned alongside the embedding values that are not part of the vector.
metadata_cols = ['tile_id', 'row_number']

pos_embeddings = labeler.get_embeddings_by_tile_ids(pos['tile_id'].values)
pos_vec = pos_embeddings.drop(columns=metadata_cols).mean(axis=0).values

if len(neg) > 0:
    neg_embeddings = labeler.get_embeddings_by_tile_ids(neg['tile_id'].values)
    neg_vec = neg_embeddings.drop(columns=metadata_cols).mean(axis=0).values
else:
    # No negatives: subtract nothing. Sized from pos_vec rather than from a
    # hard-coded count of metadata columns.
    neg_vec = np.zeros(pos_vec.shape[0])

# Weight the positive centroid twice and move away from the negative centroid.
query_vector = 2 * pos_vec - neg_vec


In [5]:

# Number of nearest neighbours requested from the Annoy index.
n_nbors = 13000
# With include_distances=True the result is a pair: (item ids, distances).
nbors = labeler.annoy_index.get_nns_by_vector(query_vector, n_nbors, include_distances=True)

# Filter out any IDs that are already in positive labels. Membership is tested
# against a set built once, instead of rescanning pos_ids for every neighbour.
positive_ids = set(labeler.pos_ids)
nbors_filtered = [n for n in nbors[0] if n not in positive_ids]

# Look the remaining ids up in the labeler's GeoDataFrame and show them on the map.
detections = labeler.gdf.loc[nbors_filtered]
labeler.detection_gdf = detections[['geometry']]
labeler.update_layer(
    labeler.points, json.loads(detections.geometry.to_json()))

## Export

In [14]:
# Pull the labeled rows and tag them in one step: 1 = positive, 0 = negative.
pos_export = labeler.gdf.loc[labeler.pos_ids].assign(label=1)
neg_export = labeler.gdf.loc[labeler.neg_ids].assign(label=0)

# Stack both groups into a single frame with a fresh 0..n-1 index.
export_gdf = pd.concat((pos_export, neg_export), ignore_index=True)

# Write the combined labels next to the other local artifacts.
export_path = os.path.join(local_dir, 'labels.parquet')
export_gdf.to_parquet(export_path, index=False)


## Load

In [None]:
# Helper function

def display_labels_on_labeler(labeler, labels_gdf):
    """
    Restore saved positive/negative labels onto a GeoLabeler and redraw it.

    Rows of ``labels_gdf`` with ``label == 1`` are treated as positives and rows
    with ``label == 0`` as negatives. Their ``tile_id`` values are matched
    against ``labeler.gdf['tile_id']`` and the matching index values are stored
    in ``labeler.pos_ids`` / ``labeler.neg_ids``.

    Args:
        labeler (GeoLabeler): The GeoLabeler instance to update.
        labels_gdf (GeoDataFrame): Labels with ``tile_id`` and ``label``
            columns, or None when there is nothing to show.
    """
    # Guard clause: nothing was loaded, so only report it.
    if labels_gdf is None:
        print("No labels to display.")
        return

    def _indices_for(label_value):
        # tile_ids carrying this label -> index values of the matching labeler rows
        wanted_tiles = labels_gdf.loc[labels_gdf['label'] == label_value, 'tile_id'].tolist()
        matches = labeler.gdf['tile_id'].isin(wanted_tiles)
        return labeler.gdf[matches].index.tolist()

    # Resolve both groups before touching the labeler's state.
    pos_indices = _indices_for(1)
    neg_indices = _indices_for(0)
    labeler.pos_ids, labeler.neg_ids = pos_indices, neg_indices

    # Redraw the map layers so the restored labels become visible.
    labeler.update_layers()
    print("Labels displayed on labeler.")



In [None]:
# Load previously exported labels

labels_file_path = os.path.join(local_dir, 'labels.parquet')

# Pass None when no export exists yet, so display_labels_on_labeler prints
# "No labels to display." instead of this cell finishing without any feedback.
labels_gdf = gpd.read_parquet(labels_file_path) if os.path.exists(labels_file_path) else None
display_labels_on_labeler(labeler, labels_gdf)

In [None]:
# Add polygons from postprocess_detections.py

# Post-processed detection polygons. The default is a machine-specific absolute
# path; set the DISSOLVED_PREDICTIONS_PATH environment variable to point at the
# file when running elsewhere.
dissolved_path = os.environ.get(
    "DISSOLVED_PREDICTIONS_PATH",
    "/Users/ben/EarthGenome/data/ra_coffee/output/tile_classifier_predictions_0_costa_rica_posw1.0_prob_0.99_postprocess.parquet",
)
dissolved = gpd.read_parquet(dissolved_path)

# Re-running this cell would otherwise stack another copy of the layer on the
# map, so drop the one added by a previous run first.
previous_layer = getattr(labeler, 'dissolve_layer', None)
if previous_layer is not None and previous_layer in labeler.map.layers:
    labeler.map.remove_layer(previous_layer)

labeler.dissolve_layer = ipyl.GeoJSON(
    data=json.loads(dissolved.geometry.to_json()),
    style={'color': 'blue', 'opacity': 0.5, 'weight': 2, 'fillOpacity': 0.1},
    name='Dissolved Polygons'
)

labeler.map.add_layer(labeler.dissolve_layer)