In [1]:
import eotdl

from eotdl.tutorials.usecases.useCaseA.find_matches import find_sentinel_matches


In this notebook we generate the dataset for the use case

1. Generate the list of Satellogic images to be used (with their bounding box and acquisition time)
2. Explore available S1/S2 images with different criteria
	- bounding box overlap
	- acquisition time overlap
3. Download matching S1/S2 at given resolution
4. Generate metadata and ingest to EOTDL 

In [2]:
import geopandas as gpd

# Two versions of the Satellogic item index are referenced here (column lists taken from the
# original notes; presumably they map to the files as follows — confirm):
#   satellogic-earthview-items.parquet              -> json_path, zone, region, date, geometry
#   satellogic-earthview-items-with-matches.parquet -> json_path, zone, region, date, geometry, matches
# gdf = gpd.read_parquet('/fastdata/Satellogic/data/satellogic-earthview-items.parquet')
gdf = gpd.read_parquet('/fastdata/Satellogic/data/satellogic-earthview-items-with-matches.parquet')
# Preview the first rows to check the columns that were loaded.
gdf.head()


Unnamed: 0,json_path,zone,region,date,geometry,matches
0,data/json/zone=51S/region=357067_6579845/date=...,51S,357067_6579845,2022-10-26,"POLYGON ((121.50829 -30.90962, 121.50835 -30.9...",[{'id': 'S2B_MSIL2A_20221027T015619_N0400_R117...
1,data/json/zone=46N/region=374867_2457086/date=...,46N,374867_2457086,2022-12-25,"POLYGON ((91.7897 22.2112, 91.78967 22.21467, ...",[{'id': 'S2B_MSIL2A_20221227T043209_N0509_R133...
2,data/json/zone=38N/region=601805_4299370/date=...,38N,601805_4299370,2022-10-05,"POLYGON ((46.17736 38.83375, 46.17741 38.83721...",[{'id': 'S2B_MSIL2A_20221008T074809_N0400_R135...
3,data/json/zone=20S/region=598796_7170926/date=...,20S,598796_7170926,2022-09-03,"POLYGON ((-62.01249 -25.57924, -62.01252 -25.5...",[{'id': 'S2B_MSIL2A_20220906T140709_N0400_R110...
4,data/json/zone=50N/region=698978_4416263/date=...,50N,698978_4416263,2022-09-04,"POLYGON ((119.33101 39.86955, 119.33112 39.873...",[{'id': 'S2A_MSIL2A_20220906T025541_N0400_R032...


In [3]:
# (rows, columns) of the loaded item index.
gdf.shape

(1000000, 6)

In [4]:
# Store, for every item, how many entries its `matches` list holds,
# then show how many items fall into each match count.
gdf['num_matches'] = gdf['matches'].map(len)
gdf['num_matches'].value_counts()

num_matches
2     342069
1     284880
3     168153
4     104093
6      75391
8       7456
0       4926
12      4115
9       3116
5       2010
11      1322
10      1273
7        963
15       233
Name: count, dtype: int64

In [5]:
# Look at the raw `matches` value of one randomly drawn item to see its structure
# (a sequence of dicts with 'id' and 'properties').
gdf['matches'].sample(1).values

array([array([{'id': 'S2B_MSIL2A_20221211T173729_N0509_R055_T13QDD_20221211T202142', 'properties': {'datetime': '2022-12-11T17:48:04Z', 'eo:cloud_cover': 0.89}}],
             dtype=object)                                                                                                                                        ],
      dtype=object)

In [6]:
# Draw a single item to walk through the download steps below.
# NOTE(review): no random_state is given, so a different item is drawn on every run.
sample = gdf.sample(n=1)

# One-element array whose only entry is the drawn item's list of matches.
results = sample['matches'].to_numpy()

print(results[0])

[{'id': 'S2B_MSIL2A_20220815T062629_N0400_R077_T41SMS_20220815T081541', 'properties': {'datetime': '2022-08-15T06:40:18Z', 'eo:cloud_cover': 0.0}}
 {'id': 'S2B_MSIL2A_20220815T062629_N0400_R077_T41SMT_20220815T081541', 'properties': {'datetime': '2022-08-15T06:40:04Z', 'eo:cloud_cover': 0.0}}
 {'id': 'S2A_MSIL2A_20220813T063641_N0400_R120_T41SMS_20220813T103103', 'properties': {'datetime': '2022-08-13T06:50:22Z', 'eo:cloud_cover': 0.0}}
 {'id': 'S2A_MSIL2A_20220813T063641_N0400_R120_T41SMT_20220813T103103', 'properties': {'datetime': '2022-08-13T06:50:09Z', 'eo:cloud_cover': 7.0}}
 {'id': 'S2A_MSIL2A_20220810T062641_N0400_R077_T41SMS_20220810T110156', 'properties': {'datetime': '2022-08-10T06:40:27Z', 'eo:cloud_cover': 0.0}}
 {'id': 'S2A_MSIL2A_20220810T062641_N0400_R077_T41SMT_20220810T110156', 'properties': {'datetime': '2022-08-10T06:40:13Z', 'eo:cloud_cover': 0.0}}]


In [7]:
from datetime import datetime
import json
from pathlib import Path
import requests
from eotdl.tools import bbox_from_centroid
from eotdl.access import download_sentinel_imagery
import shutil
import os

# NOTE(review): the eo:cloud_cover values printed for the matches look like percentages
# (e.g. 7.0), so 0.1 keeps only scenes with at most 0.1 % cloud cover, not 10 % — confirm intent.
CLOUD_COVER_THRESHOLD = 0.1
# Size of the Sentinel patch requested around the Satellogic centroid (presumably in pixels — confirm).
WIDTH = 38
HEIGHT = 38
# Seconds to wait on the Satellogic asset server before failing instead of hanging the kernel.
DOWNLOAD_TIMEOUT = 60

# Start from an empty output folder. ignore_errors=True keeps a fresh run
# (no 'sample' folder yet) from raising FileNotFoundError.
shutil.rmtree('sample', ignore_errors=True)
os.makedirs('sample', exist_ok=True)


# Starting from the HR item (Satellogic), take the Sentinel matches stored with it,
# drop the ones above the cloud-cover threshold and keep the one closest in time.

print("collecting...")

if len(results[0]) > 0:

	samp_results = results[0]

	# filter by cloud cover
	results_filtered = [
		r for r in samp_results
		if r['properties']['eo:cloud_cover'] <= CLOUD_COVER_THRESHOLD
	]

	for r in samp_results:
		print(r)

	if len(results_filtered) > 0:

		# Keep the match whose acquisition time is closest to the Satellogic acquisition date.
		# The trailing 'Z' is stripped so that fromisoformat yields a naive datetime.
		date = sample.date.iloc[0]
		closest_match = min(results_filtered, key=lambda x: abs(datetime.fromisoformat(x['properties']['datetime'].replace('Z','')) - date))

		# --- Satellogic (HR data) ---

		# Path of the item's metadata file: the relative 'data/' prefix is mapped onto the
		# local data folder (only the first occurrence is replaced).
		json_path = sample.json_path.iloc[0]
		json_path = json_path.replace('data/', '/fastdata/Satellogic/data/', 1)
		with open(json_path, 'r') as f:
			metadata = json.load(f)

		# The download link of the image is stored in the metadata.
		url = metadata['assets']['analytic']['href']

		# Stream the image to sample/satellogic/<file name from the url> in chunks.
		# The context manager releases the connection even if the transfer fails midway.
		output_path = Path("sample/satellogic") / url.split('/')[-1]
		output_path.parent.mkdir(parents=True, exist_ok=True)
		with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
			response.raise_for_status()
			with open(output_path, 'wb') as f:
				for chunk in response.iter_content(chunk_size=8192):
					f.write(chunk)

		# --- Sentinel-1 / Sentinel-2 ---

		# Both products are requested for the same small box around the centre of the Satellogic footprint.
		centroid = sample.geometry.iloc[0].centroid
		# NOTE(review): `x` receives centroid.y and `y` receives centroid.x — presumably the helper
		# expects (lat, lon); pixel_size=10 is presumably metres per pixel. Confirm against eotdl.
		custom_bbox = bbox_from_centroid(x=centroid.y, y=centroid.x, pixel_size=10, width=WIDTH, height=HEIGHT)
		# print(sample.geometry.iloc[0].bounds)
		acquisition = closest_match['properties']['datetime']

		# (log label, output folder, collection id, file-name suffix)
		sentinel_products = [
			("S1", "sample/sentinel1", "sentinel-1-grd", "_S1GRD"),
			("S2", "sample/sentinel2", "sentinel-2-l2a", "_S2L2A"),
		]
		for label, out_dir, collection, suffix in sentinel_products:
			print(f"downloading {label}...")

			# Output name: the metadata file name with its '_metadata.json' ending swapped for the product suffix.
			name = json_path.split('/')[-1].replace('_metadata.json', suffix)
			# Presumably where download_sentinel_imagery writes the image — confirm.
			dst_path_sentinel = out_dir + "/" + name + '.tif'

			print(custom_bbox, acquisition.split('T')[0])

			# Request the image for that box and acquisition time; this call raised
			# "not logged in ... Sentinel Hub credentials" when run without credentials.
			download_sentinel_imagery(out_dir, acquisition, custom_bbox, collection, name=name)
			# Path(str(dst_path_sentinel).replace('.tif', '.json')).unlink(missing_ok=True)

		# After the loop `name` and `dst_path_sentinel` refer to the Sentinel-2 product.

    

  from .autonotebook import tqdm as notebook_tqdm


collecting...
{'id': 'S2B_MSIL2A_20220815T062629_N0400_R077_T41SMS_20220815T081541', 'properties': {'datetime': '2022-08-15T06:40:18Z', 'eo:cloud_cover': 0.0}}
{'id': 'S2B_MSIL2A_20220815T062629_N0400_R077_T41SMT_20220815T081541', 'properties': {'datetime': '2022-08-15T06:40:04Z', 'eo:cloud_cover': 0.0}}
{'id': 'S2A_MSIL2A_20220813T063641_N0400_R120_T41SMS_20220813T103103', 'properties': {'datetime': '2022-08-13T06:50:22Z', 'eo:cloud_cover': 0.0}}
{'id': 'S2A_MSIL2A_20220813T063641_N0400_R120_T41SMT_20220813T103103', 'properties': {'datetime': '2022-08-13T06:50:09Z', 'eo:cloud_cover': 7.0}}
{'id': 'S2A_MSIL2A_20220810T062641_N0400_R077_T41SMS_20220810T110156', 'properties': {'datetime': '2022-08-10T06:40:27Z', 'eo:cloud_cover': 0.0}}
{'id': 'S2A_MSIL2A_20220810T062641_N0400_R077_T41SMT_20220810T110156', 'properties': {'datetime': '2022-08-10T06:40:13Z', 'eo:cloud_cover': 0.0}}
[62.24540369593239, 33.406532003378786, 62.24946054567369, 33.409984012515494] 2022-08-13


ValueError: Your are not logged in and have not provided Sentinel Hub credentials. Please, crete a .env file with your SH_CLIENT_ID and SH_CLIENT_SECRET, or login

In [25]:
import matplotlib.pyplot as plt
import rasterio as rio

# Side-by-side visual check of the downloaded Satellogic image (left) and Sentinel image (right).
# Each file is opened in a `with` block so the dataset handle is closed once the pixels are read.
fig, axs = plt.subplots(1, 2, figsize=(5, 3))

# First three bands of the Satellogic image, scaled by 3000 and clipped to [0, 1] for display.
with rio.open(output_path) as src:
	sat_rgb = src.read()[:3,...]
axs[0].imshow((sat_rgb.transpose(1, 2, 0) / 3000).clip(0, 1))
axs[0].axis('off')

# Bands at indices 3, 2, 1 of the Sentinel image (presumably red, green, blue — confirm band order).
with rio.open(dst_path_sentinel) as src:
	s2_rgb = src.read()[(3,2,1),...]
axs[1].imshow((s2_rgb.transpose(1, 2, 0) / 3000).clip(0, 1))
axs[1].axis('off')
plt.show()

ValueError: Key backend: 'module://matplotlib_inline.backend_inline' is not a valid value for backend; supported values are ['gtk3agg', 'gtk3cairo', 'gtk4agg', 'gtk4cairo', 'macosx', 'nbagg', 'notebook', 'qtagg', 'qtcairo', 'qt5agg', 'qt5cairo', 'tkagg', 'tkcairo', 'webagg', 'wx', 'wxagg', 'wxcairo', 'agg', 'cairo', 'pdf', 'pgf', 'ps', 'svg', 'template']

In [29]:
import matplotlib.pyplot as plt


# Bar chart of the number of images in each zone.
fig, ax = plt.subplots(figsize=(15, 3))
zone_counts = gdf['zone'].value_counts()
zone_counts.plot(kind='bar', ax=ax, grid=True)
plt.show()


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Number of distinct zones and number of distinct regions in the index.
len(gdf['zone'].unique()), len(gdf['region'].unique())

In [None]:
# Number of images in the least populated zone.
gdf['zone'].value_counts().min()

In [None]:
# plot histogram with number of images per month (all images are from 2022)
# value_counts() orders by frequency; sort_index() puts the bars back in calendar order.
fig = plt.figure(figsize=(15, 3))
gdf['date'].dt.month.value_counts().sort_index().plot(kind='bar', ax=fig.gca(), grid=True)
plt.show()


In [None]:
# Restrict the index to one zone (the fifth in sorted order) and draw five items from it;
# the fixed random_state makes the draw repeatable. Note that this rebinds `sample`.
zones = sorted(gdf['zone'].unique())
zone_gdf = gdf.loc[gdf['zone'] == zones[4]]
sample = zone_gdf.sample(n=5, random_state=2025)
sample

Download the images

In [7]:
!rm -rf /fastdata/Satellogic/data/tifs/satellogic/*
!rm -rf /fastdata/Satellogic/data/tifs/sentinel2/*

/bin/bash: line 1: /usr/bin/rm: Argument list too long
/bin/bash: line 1: /usr/bin/rm: Argument list too long


In [None]:
from dataset import search_matches


# now supports S2 + S1
# For every sampled item, query both Sentinel collections over the item's footprint bounds
# and store the two results joined with `+`.
# NOTE(review): assumes find_sentinel_matches returns something that supports `+` (e.g. a list) — confirm.
matches = []
for row, item in sample.iterrows():
	print(item.json_path)
	bounds = item.geometry.bounds
	sent1_matches, sent2_matches = [
		find_sentinel_matches(row, item.date, bounds, collection)
		for collection in ("sentinel-1-grd", "sentinel-2-l2a")
	]

	matches.append(sent1_matches + sent2_matches)

In [None]:
# Inspect what was collected for the sampled items.
matches

In [None]:
import rasterio as rio

# Show each collected pair side by side: Satellogic image on the left, Sentinel-2 image on the right.
# NOTE(review): this expects every entry of `matches` to be None or an (s2_image, sat_image) pair of
# paths, which does not obviously agree with how `matches` is filled in this notebook — confirm.
for match in matches:
	if match is None: continue
	s2_image, sat_image = match
	fig, axs = plt.subplots(1, 2, figsize=(5, 3))
	# Open each file in a `with` block so its handle is closed before moving to the next pair.
	with rio.open(sat_image) as src:
		sat_rgb = src.read()[:3,...]
	axs[0].imshow((sat_rgb.transpose(1, 2, 0) / 4000).clip(0, 1))
	axs[0].axis('off')
	with rio.open(s2_image) as src:
		s2_rgb = src.read()[(3,2,1),...]
	axs[1].imshow((s2_rgb.transpose(1, 2, 0) / 4000).clip(0, 1))
	axs[1].axis('off')
	plt.show()
	# Release the figure so figures do not pile up in memory across iterations.
	plt.close(fig)

