In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import geopandas as gpd
from datetime import datetime
from shapely.geometry import Polygon
import pyarrow.parquet as pq
import pystac
from tqdm import tqdm
import stac_geoparquet
import requests
from dotenv import load_dotenv
import os

load_dotenv()


True

We have a list of the files ingested in ESA PRR

In [3]:
# Read file and process lines
with open('esawaai.csv', 'r') as f:
    lines = f.readlines()

# Remove leading/trailing whitespace and split on first space
data = []
for line in lines:
    line = line.strip()
    if line:  # Skip empty lines
        size, path = line.split(' ', 1)
        data.append([int(size), path])

# Convert to pandas DataFrame
df = pd.DataFrame(data, columns=['size', 'path'])
df['id'] = df.apply(lambda x: x['path'].split('/')[-1], axis=1)

df

Unnamed: 0,size,path,id
0,1533285,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw1-esawaai-1sdv-20190826t183441-20190826t...
1,185044694,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw2-esawaai-1sdv-20190826t183442-20190826t...
2,168779279,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw3-esawaai-1sdv-20190826t183441-20190826t...
3,144352380,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw1-esawaai-1sdv-20190827t075748-20190827t...
4,209491051,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw2-esawaai-1sdv-20190827t075749-20190827t...
...,...,...,...
18484,185157726,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,s1a-iw2-esawaai-1sdv-20231104t182359-20231104t...
18485,169103739,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,s1a-iw3-esawaai-1sdv-20231104t182359-20231104t...
18486,129979939,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,s1a-iw1-esawaai-1sdv-20231104t182450-20231104t...
18487,184753718,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,s1a-iw2-esawaai-1sdv-20231104t182448-20231104t...


In [4]:
df['size'].sum()

2822481164128

Post to PRR to create the items in the collection

In [5]:
def post_prr(args):
    id, path = args
    response = requests.post(
        'https://eoresults.esa.int/reg-api/collections/ESAWAAI/items',
        auth=(os.getenv('PRR_USER'), os.getenv('PRR_PWD')),
        headers={
            'accept': 'application/json',
            'Content-Type': 'application/json'
        },
        json={
            "type": "Feature",
            "stac_version": "1.0.0",
            "stac_extensions": [
                "https://stac-extensions.github.io/alternate-assets/v1.1.0/schema.json",
                "https://stac-extensions.github.io/storage/v1.0.0/schema.json"
            ],
            "id": id,
            "properties": {
                "datetime": datetime.now().isoformat() + 'Z'
            },
            "assets": {
                "PRODUCT": {
                    "href": path
                }
            }
        }
    )
    return response.json()

In [6]:
from concurrent.futures import ThreadPoolExecutor

args = [(row['id'], row['path']) for _, row in df.iterrows()]
if False:
    with ThreadPoolExecutor() as pool:
        with tqdm(total=len(args)) as progress:
            futures = []

            for arg in args:
                future = pool.submit(post_prr, arg) 
                future.add_done_callback(lambda p: progress.update())
                futures.append(future)

            results = []
            for future in futures:
                result = future.result()
                results.append(result)

We can generate a STAC-compliant parquet file for EOTDL with the links to the PRR items.

In [17]:
data = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    # print(row['path'])
    size = row['size']
    item_id = row['path'] # Do we want to use the path as the item id?
    asset_href = 'https://eoresults.esa.int/d/' + '/'.join(row['path'].split('/')[:4]) + f'/{row["id"]}' + f'/{row["id"]}'
    asset_href = asset_href.replace('/d/esawaai/', '/d/ESAWAAI/')
    # print(asset_href)
    checksum = None # TODO
    data.append({
        'type': 'Feature',
        'stac_version': '1.0.0',
        'stac_extensions': [],
        'datetime': datetime.now(),  # must be native timestamp (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
        'id': item_id,
        'bbox': {
            'xmin': 0.0,
            'ymin': 0.0,
            'xmax': 0.0,
            'ymax': 0.0
        }, 
        'geometry': Polygon(), # empty polygon
        'assets': { 'asset': { 
            'href': asset_href,
            'checksum': checksum,
            'timestamp': datetime.now(),
            'size': size,
        }},
        "links": [],
        # anything below are properties (need at least one!)
        'repository': 'eotdl',		
    })

gdf = gpd.GeoDataFrame(data, geometry='geometry')
gdf.to_parquet('outputs/catalog.parquet')
gdf

100%|██████████| 18489/18489 [00:00<00:00, 55344.34it/s]


Unnamed: 0,type,stac_version,stac_extensions,datetime,id,bbox,geometry,assets,links,repository
0,Feature,1.0.0,[],2025-05-16 09:49:10.476460,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://eoresults.esa.int/...,[],eotdl
1,Feature,1.0.0,[],2025-05-16 09:49:10.476537,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://eoresults.esa.int/...,[],eotdl
2,Feature,1.0.0,[],2025-05-16 09:49:10.476566,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://eoresults.esa.int/...,[],eotdl
3,Feature,1.0.0,[],2025-05-16 09:49:10.476591,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://eoresults.esa.int/...,[],eotdl
4,Feature,1.0.0,[],2025-05-16 09:49:10.476611,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://eoresults.esa.int/...,[],eotdl
...,...,...,...,...,...,...,...,...,...,...
18484,Feature,1.0.0,[],2025-05-16 09:49:10.809711,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://eoresults.esa.int/...,[],eotdl
18485,Feature,1.0.0,[],2025-05-16 09:49:10.809727,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://eoresults.esa.int/...,[],eotdl
18486,Feature,1.0.0,[],2025-05-16 09:49:10.809742,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://eoresults.esa.int/...,[],eotdl
18487,Feature,1.0.0,[],2025-05-16 09:49:10.809756,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://eoresults.esa.int/...,[],eotdl


Assuming the `catalog.parquet` file is correct, we can ingest directly into EOTDL. Otherwise, update the file before ingestion

In [18]:
# create README.md

text = """---
name: ESAWAAI
authors: 
  - ESAWAAI
license: free
source: https://github.com/earthpulse/eotdl/tree/main/tutorials/usecases/ESAWAII
---

# ESAWAAI

Dataset for the ESAWAAI project.
"""

with open("outputs/README.md", "w") as outfile:
    outfile.write(text)

In [22]:
import os 

os.environ['EOTDL_API_URL'] = 'https://api.eotdl.com/'

In [23]:
from eotdl.datasets import ingest_dataset_catalog

ingest_dataset_catalog('outputs')

Ingesting files: 100%|██████████| 18489/18489 [00:00<00:00, 101668.93it/s]


PosixPath('outputs/catalog.parquet')

We can now access the dataset in EOTDL

In [26]:
from eotdl.datasets import retrieve_datasets

retrieve_datasets('ESAWAAI')

['ESAWAAI']

In [28]:
from eotdl.datasets import stage_dataset

stage_dataset('ESAWAAI', path='data')

'data/ESAWAAI'

In [29]:
catalog = gpd.read_parquet('data/ESAWAAI/catalog.v1.parquet')
catalog

Unnamed: 0,type,stac_version,stac_extensions,datetime,id,bbox,geometry,assets,links,repository
0,Feature,1.0.0,[],2025-05-16 09:49:10.476460,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,"{'asset': {'checksum': None, 'href': 'https://...",[],eotdl
1,Feature,1.0.0,[],2025-05-16 09:49:10.476537,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,"{'asset': {'checksum': None, 'href': 'https://...",[],eotdl
2,Feature,1.0.0,[],2025-05-16 09:49:10.476566,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,"{'asset': {'checksum': None, 'href': 'https://...",[],eotdl
3,Feature,1.0.0,[],2025-05-16 09:49:10.476591,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,"{'asset': {'checksum': None, 'href': 'https://...",[],eotdl
4,Feature,1.0.0,[],2025-05-16 09:49:10.476611,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,"{'asset': {'checksum': None, 'href': 'https://...",[],eotdl
...,...,...,...,...,...,...,...,...,...,...
18484,Feature,1.0.0,[],2025-05-16 09:49:10.809711,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,"{'asset': {'checksum': None, 'href': 'https://...",[],eotdl
18485,Feature,1.0.0,[],2025-05-16 09:49:10.809727,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,"{'asset': {'checksum': None, 'href': 'https://...",[],eotdl
18486,Feature,1.0.0,[],2025-05-16 09:49:10.809742,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,"{'asset': {'checksum': None, 'href': 'https://...",[],eotdl
18487,Feature,1.0.0,[],2025-05-16 09:49:10.809756,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,"{'asset': {'checksum': None, 'href': 'https://...",[],eotdl


In [30]:
catalog.assets.iloc[0]

{'asset': {'checksum': None,
  'href': 'https://eoresults.esa.int/d/ESAWAAI/2019/08/26/s1a-iw1-esawaai-1sdv-20190826t183441-20190826t183507-028744-034136-a15.nc/s1a-iw1-esawaai-1sdv-20190826t183441-20190826t183507-028744-034136-a15.nc',
  'size': 1533285,
  'timestamp': datetime.datetime(2025, 5, 16, 9, 49, 10, 476499)}}