## Download files to local dir

In [None]:
from concurrent.futures import ThreadPoolExecutor
import subprocess
from tqdm import tqdm
import time
import os

def download_file(s3_path):
    local_dir = "/home/christopher.x.ren/embeddings/ra_tea/planet_embeddings_v2"
    filename = s3_path.split('/')[-1]
    local_path = os.path.join(local_dir, filename)
    
    # Create directory if it doesn't exist
    os.makedirs(local_dir, exist_ok=True)
    
    start_time = time.time()
    cmd = f"aws s3 cp {s3_path} {local_path}"
    try:
        result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
        duration = time.time() - start_time
        print(f"Downloaded {filename} in {duration:.1f}s")
        if result.stderr:
            print(f"Warnings for {filename}:\n{result.stderr}")
        return local_path
    except subprocess.CalledProcessError as e:
        print(f"Error downloading {filename}:\n{e.stderr}")
        return None
paths = []
for year in [2023, 2024]:
    start_month = 9 if year == 2023 else 1
    end_month = 12 if year == 2023 else 8
    
    for month in range(start_month, end_month + 1):
        path = f"s3://ra-ei-public/v2/ps_monthly_sen2_normalized_analytic_8b_sr_subscription_{year}_{month:02d}_mosaic.parquet"
        paths.append(path)
print(f"Starting download of {len(paths)} files...")
with ThreadPoolExecutor() as executor:
    local_paths = list(tqdm(
        executor.map(download_file, paths),
        total=len(paths),
        desc="Downloading files"
    ))

local_paths = [p for p in local_paths if p is not None]
print(f"\nDownload complete. {len(local_paths)}/{len(paths)} files downloaded successfully")

In [2]:
import geopandas as gpd
from joblib import Parallel, delayed
import pandas as pd
from pathlib import Path

def process_parquet(file_path):
    tile = gpd.read_parquet(file_path).to_crs(4326)
    mgrs_id = str(file_path).split('/')[-1].split('.')[0][:5]
    return gpd.GeoDataFrame({'mgrs_id': [mgrs_id], 'geometry': [tile.union_all()]})

parquet_dir = Path("/home/christopher.x.ren/embeddings/ra_tea/valid_tiles")
parquet_files = list(parquet_dir.glob("*.parquet"))

mgrs_tiles = Parallel(n_jobs=-1)(
    delayed(process_parquet)(f) for f in parquet_files
)

mgrs_tile_union = gpd.GeoDataFrame(pd.concat(mgrs_tiles, ignore_index=True))



In [32]:
reference_embedding_gdf = gpd.read_parquet(local_paths[0], columns=['geometry', 'id'])

Unnamed: 0,mgrs_id,geometry
0,48MVU,"POLYGON ((104.11012 -5.56685, 104.11012 -5.565..."
1,47NPC,"POLYGON ((99.89765 1.81792, 99.89765 1.81864, ..."
2,47MRS,"POLYGON ((101.69836 -3.4182, 101.69836 -3.4174..."
3,47MQV,"POLYGON ((100.79583 -0.89633, 100.79583 -0.895..."
4,47NLC,"MULTIPOLYGON (((97.20042 1.95958, 97.20041 1.9..."
...,...,...
58,48MUC,"POLYGON ((103.19952 -2.70342, 103.19952 -2.701..."
59,49MCN,"POLYGON ((109.18713 -7.22879, 109.18714 -7.227..."
60,47MPU,"POLYGON ((100.09191 -1.10285, 100.09047 -1.102..."
61,48MSB,"POLYGON ((101.99745 -3.60016, 101.99745 -3.598..."


In [33]:
# Spatial join to find intersecting tiles
intersecting_tiles = gpd.sjoin(reference_embedding_gdf, mgrs_tile_union, how='inner', predicate='intersects')

# Create new dataframe with mgrs_id and id columns
mgrs_id_mapping = intersecting_tiles[['id', 'mgrs_id']].copy()

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

  intersecting_tiles = gpd.sjoin(reference_embedding_gdf, mgrs_tile_union, how='inner', predicate='intersects')


In [36]:
mgrs_id_mapping.to_parquet("gs://demeter-labs/tea/mgrs_id_mapping_tom_tiles.parquet")

In [39]:
print(f"Length before deduplication: {len(mgrs_id_mapping)}")
mgrs_id_mapping = mgrs_id_mapping.drop_duplicates(subset=['id'], keep='first')
print(f"Length after deduplication: {len(mgrs_id_mapping)}")
mgrs_id_mapping.to_parquet("gs://demeter-labs/tea/mgrs_id_mapping_tom_tiles_deduplicated.parquet")


Length before deduplication: 9119038
Length after deduplication: 9119038


In [41]:
import duckdb
from tqdm import tqdm
def process_tile(mgrs_id, local_dir, id_mapping):
    print(f"\nProcessing tile {mgrs_id}...")
    start_time = time.time()
    
    print("Connecting to DuckDB...")
    con = duckdb.connect()
    con.sql("INSTALL spatial; LOAD spatial;")
    
    # Get relevant IDs for this MGRS tile
    tile_ids = id_mapping[id_mapping['mgrs_id'] == mgrs_id]['id'].tolist()
    ids_string = ", ".join([f"'{id}'" for id in tile_ids])
    
    results = []
    for file_path in tqdm(os.listdir(local_dir)):
        if not file_path.endswith('.parquet'):
            continue
            
        full_path = os.path.join(local_dir, file_path)
        # Extract year and month from filename
        parts = file_path.split('_')
        year_month = f"{parts[-3]}_{parts[-2]}"
        
        sql_query = f"""
            SELECT id, embedding, '{year_month}' as time_period
            FROM read_parquet('{full_path}')
            WHERE id IN ({ids_string})
        """
        
        try:
            df = con.execute(sql_query).df()
            if not df.empty:
                # Rename embedding column to include year_month
                df = df.rename(columns={'embedding': f'embedding_{year_month}'})
                results.append(df)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue
    
    con.close()
    
    if results:
        # Merge all results on ID
        final_df = results[0]
        for df in results[1:]:
            final_df = final_df.merge(df[['id', df.columns[-2]]], on='id', how='outer')
        
        print(f"Total processing time for tile {mgrs_id}: {time.time() - start_time:.2f}s")
        return final_df
    else:
        print(f"No data found for tile {mgrs_id}")
        return None

# Example usage:
result = process_tile(
    "48MVU", 
    "/home/christopher.x.ren/embeddings/ra_tea/planet_embeddings_v2",
    mgrs_id_mapping
)


Processing tile 48MVU...
Connecting to DuckDB...


  0%|          | 0/12 [00:00<?, ?it/s]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  8%|▊         | 1/12 [00:28<05:09, 28.10s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 17%|█▋        | 2/12 [00:54<04:31, 27.11s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 25%|██▌       | 3/12 [01:24<04:13, 28.21s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 33%|███▎      | 4/12 [01:50<03:40, 27.52s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 42%|████▏     | 5/12 [02:17<03:10, 27.22s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 50%|█████     | 6/12 [02:45<02:45, 27.54s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 58%|█████▊    | 7/12 [03:18<02:26, 29.34s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 67%|██████▋   | 8/12 [03:45<01:54, 28.53s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 75%|███████▌  | 9/12 [04:11<01:23, 27.87s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 83%|████████▎ | 10/12 [04:44<00:58, 29.30s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 92%|█████████▏| 11/12 [05:12<00:28, 29.00s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

100%|██████████| 12/12 [05:43<00:00, 28.64s/it]


: 

In [1]:
mgrs_id = "48MVU"
id_mapping = mgrs_id_mapping
local_dir = "/home/christopher.x.ren/embeddings/ra_tea/planet_embeddings_v2"

print("Connecting to DuckDB...")
con = duckdb.connect()
con.sql("INSTALL spatial; LOAD spatial;")

# Get relevant IDs for this MGRS tile
tile_ids = id_mapping[id_mapping['mgrs_id'] == mgrs_id]['id'].tolist()
ids_string = ", ".join([f"'{id}'" for id in tile_ids])
file_path = os.path.join(local_dir, "ps_monthly_sen2_normalized_analytic_8b_sr_subscription_2023_09_mosaic.parquet")
# Extract year and month from filename
parts = os.path.basename(file_path).split('_')
year_month = f"{parts[-3]}_{parts[-2]}"

sql_query = f"""
    SELECT id, embedding, '{year_month}' as time_period
    FROM read_parquet('{file_path}')
    WHERE id IN ({ids_string})
"""

try:
    df = con.execute(sql_query).df()
    if not df.empty:
        # Rename embedding column to include year_month
        df = df.rename(columns={'embedding': f'embedding_{year_month}'})
        results = [df]
except Exception as e:
    print(f"Error processing {file_path}: {e}")
    results = []

con.close()


NameError: name 'mgrs_id_mapping' is not defined