In [434]:
import eotdl

# Print the installed eotdl version so the cell output shows which release was used
print(eotdl.__version__)


2025.04.22-3


In this notebook we generate the dataset for the use case

1. Generate the list of Satellogic images to be used (each with its bounding box and acquisition time)
2. Explore available S1/S2 images with different criteria
	- bounding box overlap
	- acquisition time overlap
3. Download matching S1/S2 at given resolution
4. Generate metadata and ingest to EOTDL 

In [435]:
import geopandas as gpd

# Load the Satellogic item catalogue and preview its first rows
items_path = '~/Desktop/EarthPulse_Local_Data/data/satellogic-earthview-items.parquet'
gdf = gpd.read_parquet(items_path)
gdf.head(5)


Unnamed: 0,json_path,zone,region,date,geometry
0,data/json/zone=04N/region=603411_2346301/date=...,04N,603411_2346301,2022-09-15,"POLYGON ((-157.99991 21.21181, -157.99988 21.2..."
1,data/json/zone=04N/region=603411_2346685/date=...,04N,603411_2346685,2022-09-15,"POLYGON ((-157.99988 21.21528, -157.99986 21.2..."
2,data/json/zone=04N/region=603411_2347069/date=...,04N,603411_2347069,2022-09-15,"POLYGON ((-157.99986 21.21875, -157.99984 21.2..."
3,data/json/zone=04N/region=603411_2347453/date=...,04N,603411_2347453,2022-09-15,"POLYGON ((-157.99984 21.22221, -157.99981 21.2..."
4,data/json/zone=04N/region=603411_2347837/date=...,04N,603411_2347837,2022-09-15,"POLYGON ((-157.99981 21.22568, -157.99979 21.2..."


In [436]:
# Size of the item table as a (rows, columns) tuple
gdf.shape

(7095985, 5)

In [437]:
from find_matches import search_matches_by_sentinel
from eotdl.tools import bbox_from_centroid

# Draw one random Satellogic item and look up Sentinel-1 / Sentinel-2 matches for it.
# The names defined here (date, json_path, centroid, s1_matches, s2_matches)
# are reused by the download cell that follows.
sample = gdf.sample(1)

# Prepare sample details
date = sample.date.iloc[0]
json_path = sample.json_path.iloc[0]
centroid = sample.geometry.iloc[0].centroid

# NOTE(review): this cell used to pass IPython's `_` (the previous cell's
# output, i.e. an arbitrary value) as the first tuple element and then assigned
# the first return value back to `_`, which stops IPython from updating its
# output cache. json_path is passed instead, assuming the slot identifies the
# item -- confirm against find_matches.search_matches_by_sentinel.
search_args = (json_path, date, centroid)
_ignored, s1_matches = search_matches_by_sentinel(search_args, collection_id="sentinel-1-grd")
_ignored, s2_matches = search_matches_by_sentinel(search_args, collection_id="sentinel-2-l2a")

# todo: resample if no s1 matches or no s2 matches after cloud filter?

print(f"s1_matches: {s1_matches}\n")
print(f"s2_matches {s2_matches}\n")

s1_matches: [{'id': 'S1A_IW_GRDH_1SDV_20220811T130248_20220811T130313_044505_054F9E_FE0E', 'properties': {'datetime': '2022-08-11T13:02:48Z'}}, {'id': 'S1A_IW_GRDH_1SDV_20220811T010140_20220811T010205_044498_054F5D_AAA9', 'properties': {'datetime': '2022-08-11T01:01:40Z'}}]

s2_matches [{'id': 'S2A_MSIL2A_20220811T173921_N0400_R098_T13SDV_20220812T013858', 'properties': {'datetime': '2022-08-11T17:54:23Z', 'eo:cloud_cover': 24.68}}]



In [438]:
import shutil
import os

from download_images import download_images_to_fastdata, download_sat_image

# Rebuild the local `sample/` folder, download the imagery for the current
# sample, and copy the resulting GeoTIFFs into `sample/<sensor>/`.
# Relies on json_path, date, centroid, s1_matches and s2_matches being defined
# by an earlier cell.

# Remove and recreate the sample folder structure
shutil.rmtree('sample', ignore_errors=True)
os.makedirs('sample/satellogic', exist_ok=True)
os.makedirs('sample/sentinel1', exist_ok=True)
os.makedirs('sample/sentinel2', exist_ok=True)

print("collecting...")

# Base name for all files
base_name = json_path.split('/')[-1].replace('_metadata.json', '')
satellogic_name = f"{base_name}_TOA.tif"
sent1_name = f"{base_name}_S1GRD.tiff"
sent2_name = f"{base_name}_S2L2A.tiff"

# Source directory (local dataset location)
local_data_path = os.path.expanduser("~/Desktop/EarthPulse_Local_Data/data")

# Source paths
src_sat = f"{local_data_path}/tifs/satellogic/{satellogic_name}"
src_s1 = f"{local_data_path}/tifs/sentinel1/{sent1_name}"
src_s2 = f"{local_data_path}/tifs/sentinel2/{sent2_name}"

# Destination paths
dst_sat = f"sample/satellogic/{satellogic_name}"
dst_s1 = f"sample/sentinel1/{sent1_name}"
dst_s2 = f"sample/sentinel2/{sent2_name}"

# download s1, s2, and hr images; the copies below expect them under
# {local_data_path}/tifs.
# NOTE(review): an earlier comment referred to /fastdata/data/tifs -- confirm
# where download_images_to_fastdata actually writes.
download_images_to_fastdata((s1_matches, s2_matches, date, json_path, centroid))

# The Satellogic (HR) image is copied unconditionally: if it is missing,
# shutil.copy2 raises and the cell stops.
shutil.copy2(src_sat, dst_sat)

# Sentinel images are optional. Each one is guarded by the existence of its
# OWN source file (the Sentinel-1 copy used to be guarded by the Sentinel-2 path).
missing = []
for sensor, src_path, dst_path in (
	("sentinel1", src_s1, dst_s1),
	("sentinel2", src_s2, dst_s2),
):
	if os.path.exists(src_path):
		shutil.copy2(src_path, dst_path)
	else:
		missing.append(sensor)

# Only claim full success when nothing was skipped.
if missing:
	print(f"⚠️ Satellogic image copied; no source file found for: {', '.join(missing)}")
else:
	print("✅ All files copied successfully.")
    

collecting...


FileNotFoundError: [Errno 2] No such file or directory: '/Users/benastahl/Desktop/EarthPulse_Local_Data/data/json/zone=13N/region=463235_3976142/date=2022-08-12/20220812_202535_SN30_13N_463235_3976142_metadata.json'

In [None]:
import matplotlib.pyplot as plt
import rasterio as rio
import numpy as np

# Show the Satellogic, Sentinel-2 and Sentinel-1 images of the current sample
# side by side. Sentinel panels stay empty (title only) when the file is absent.
# Every raster is opened in a `with` block so the dataset handle is closed
# as soon as its pixels have been read.
fig, axs = plt.subplots(1, 3, figsize=(12, 4))

# Satellogic (HR): first three bands, scaled by 3000 and clipped to [0, 1]
with rio.open(dst_sat) as src:
	hr_rgb = src.read()[:3, ...].transpose(1, 2, 0)
axs[0].imshow((hr_rgb / 3000).clip(0, 1))
axs[0].set_title("Satellogic (HR)")
axs[0].axis('off')

if os.path.exists(dst_s2):
	# Sentinel-2 (RGB: bands 4, 3, 2 -> zero-based indices 3, 2, 1)
	with rio.open(dst_s2) as src:
		s2_rgb = src.read()[(3, 2, 1), ...].transpose(1, 2, 0)
	axs[1].imshow((s2_rgb / 3000).clip(0, 1))
axs[1].set_title("Sentinel-2 (RGB)")
axs[1].axis('off')

if os.path.exists(dst_s1):
	# Sentinel-1: only band 1 is read and shown in grayscale
	with rio.open(dst_s1) as src:
		band = src.read(1).astype(np.float32)

	# Optional: remove very high outliers for better contrast
	band = np.clip(band, 0, np.percentile(band, 99))

	# Normalize to 0–1 (the small epsilon avoids division by zero on a flat band)
	band = (band - band.min()) / (band.max() - band.min() + 1e-5)

	axs[2].imshow(band, cmap='gray')
axs[2].set_title("Sentinel-1 (normalized)")
axs[2].axis('off')

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt


# Bar chart of how many images fall in each zone
fig, ax = plt.subplots(figsize=(15, 3))
gdf['zone'].value_counts().plot.bar(ax=ax, grid=True)
plt.show()


In [None]:
# Number of distinct zones and distinct regions, as a (zones, regions) tuple
tuple(len(gdf[column].unique()) for column in ('zone', 'region'))

In [None]:
# Image count of the least-populated zone
gdf.zone.value_counts().min()

In [None]:
# plot histogram with number of images per month (all images are from 2022)
# value_counts() orders by frequency, so the counts are re-sorted by month
# number to put the bars in calendar order.
fig = plt.figure(figsize=(15, 3))
ax = fig.gca()
images_per_month = gdf['date'].dt.month.value_counts().sort_index()
images_per_month.plot(kind='bar', ax=ax, grid=True)
ax.set_xlabel("month")
ax.set_ylabel("number of images")
plt.show()


In [None]:
# Take the zone at index 4 of the sorted zone list and draw up to five of its
# rows with a fixed seed
zones = sorted(gdf['zone'].unique())
zone_gdf = gdf.loc[gdf['zone'] == zones[4]]
n = min(len(zone_gdf), 5)
sample = zone_gdf.sample(n=n, random_state=2025)
sample