In [1]:
# Reload imported modules automatically before running each cell, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import geopandas as gpd
from datetime import datetime
from shapely.geometry import Polygon
import pyarrow.parquet as pq
import pystac
from tqdm import tqdm
import stac_geoparquet
import requests
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment; post_prr later
# reads PRR_USER and PRR_PWD from it via os.getenv.
load_dotenv()


True

We start from a listing of the files already ingested into the ESA PRR (`esawaai.csv`), with one `<size> <path>` entry per line.

In [None]:
# Parse the PRR file listing: every non-empty line is "<size> <path>".
data = []
with open('esawaai.csv', 'r', encoding='utf-8') as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in f:
        line = line.strip()
        if not line:  # Skip empty lines
            continue
        # Split on the first run of whitespace (not a single space) so that
        # tab- or multi-space-separated listings do not leave stray whitespace
        # at the start of the path.
        size, path = line.split(None, 1)
        data.append([int(size), path])

# Convert to pandas DataFrame
df = pd.DataFrame(data, columns=['size', 'path'])
# The item id is the last path component (the file name). The vectorised
# string accessor replaces a row-wise apply and also copes with an empty listing.
df['id'] = df['path'].str.rsplit('/', n=1).str[-1]

df

Unnamed: 0,size,path,id
0,1533285,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw1-esawaai-1sdv-20190826t183441-20190826t...
1,185044694,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw2-esawaai-1sdv-20190826t183442-20190826t...
2,168779279,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw3-esawaai-1sdv-20190826t183441-20190826t...
3,144352380,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw1-esawaai-1sdv-20190827t075748-20190827t...
4,209491051,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...,s1a-iw2-esawaai-1sdv-20190827t075749-20190827t...
...,...,...,...
18484,185157726,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,s1a-iw2-esawaai-1sdv-20231104t182359-20231104t...
18485,169103739,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,s1a-iw3-esawaai-1sdv-20231104t182359-20231104t...
18486,129979939,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,s1a-iw1-esawaai-1sdv-20231104t182450-20231104t...
18487,184753718,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...,s1a-iw2-esawaai-1sdv-20231104t182448-20231104t...


In [14]:
def post_prr(args, timeout=30):
    """Register one item in the ESAWAAI collection of the ESA PRR.

    Args:
        args: ``(id, path)`` tuple: the STAC item id and the href stored on
            the item's ``PRODUCT`` asset. A single tuple is accepted so the
            function can be handed directly to an executor.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so that one stalled connection cannot hang a bulk run forever.

    Returns:
        The decoded JSON body of the response. The HTTP status is not
        checked here, so error responses are returned as JSON as well.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeouts.
        ValueError: if the response body is not valid JSON (raised by
            ``response.json()``).
    """
    # Local import: the notebook only imports `datetime` from the datetime module.
    from datetime import timezone

    item_id, path = args  # do not shadow the built-in `id`

    # The trailing "Z" means UTC, so the timestamp must be taken in UTC rather
    # than in the machine's local time zone.
    timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ')

    response = requests.post(
        'https://eoresults.esa.int/reg-api/collections/ESAWAAI/items',
        # Credentials come from the environment (PRR_USER / PRR_PWD).
        auth=(os.getenv('PRR_USER'), os.getenv('PRR_PWD')),
        headers={
            'accept': 'application/json',
            'Content-Type': 'application/json'
        },
        json={
            "type": "Feature",
            "stac_version": "1.0.0",
            "stac_extensions": [
                "https://stac-extensions.github.io/alternate-assets/v1.1.0/schema.json",
                "https://stac-extensions.github.io/storage/v1.0.0/schema.json"
            ],
            "id": item_id,
            "properties": {
                "datetime": timestamp
            },
            "assets": {
                "PRODUCT": {
                    "href": path
                }
            }
        },
        timeout=timeout,
    )
    return response.json()

In [None]:
from concurrent.futures import ThreadPoolExecutor

# One (id, path) tuple per listed file. Zipping the two columns avoids the
# per-row Series that iterrows() would build.
args = list(zip(df['id'], df['path']))

results = []   # JSON responses of the requests that completed, in submission order
failures = []  # (arg, exception) pairs for the requests that raised

with ThreadPoolExecutor() as pool:
    with tqdm(total=len(args)) as progress:
        futures = []

        for arg in args:
            future = pool.submit(post_prr, arg)  # submit the argument tuple
            # Advance the progress bar as soon as each request finishes.
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for arg, future in zip(args, futures):
            # Record a failing request instead of letting the first exception
            # abort the collection and discard every remaining response.
            try:
                results.append(future.result())
            except Exception as exc:
                failures.append((arg, exc))

if failures:
    print(f'{len(failures)} of {len(args)} requests failed; inspect `failures` for details')

  0%|          | 0/18489 [00:00<?, ?it/s]Process SpawnProcess-1:
Traceback (most recent call last):
  File "/Users/juan/.local/share/uv/python/cpython-3.12.9-macos-aarch64-none/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/juan/.local/share/uv/python/cpython-3.12.9-macos-aarch64-none/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/juan/.local/share/uv/python/cpython-3.12.9-macos-aarch64-none/lib/python3.12/concurrent/futures/process.py", line 252, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/juan/.local/share/uv/python/cpython-3.12.9-macos-aarch64-none/lib/python3.12/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'post_prr' on <module '__main__' (<class '_frozen_importlib.BuiltinI

BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore

We can generate a STAC-compliant parquet file. All the required metadata should be added at this point, because the PRR does not support updating items later.

In [4]:
# Build the catalog rows from the file listing and write them to catalog.parquet.
data = []
for _, entry in df.iterrows():
    asset_href = 'https://...' # TODO
    checksum = None # TODO
    item_id = entry['path'] # Do we want to use the path as the item id?

    # must be native timestamp (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
    item_datetime = datetime.now()

    # Placeholder spatial information: all-zero bbox and an empty polygon.
    placeholder_bbox = dict(xmin=0.0, ymin=0.0, xmax=0.0, ymax=0.0)
    empty_geometry = Polygon()

    asset = {
        'href': asset_href,
        'checksum': checksum,
        'timestamp': datetime.now(),
        'size': entry['size'],
    }

    feature = {
        'type': 'Feature',
        'stac_version': '1.0.0',
        'stac_extensions': [],
        'datetime': item_datetime,
        'id': item_id,
        'bbox': placeholder_bbox,
        'geometry': empty_geometry,
        'assets': {'asset': asset},
        'links': [],
        # anything below are properties (need at least one!)
        'repository': 'eotdl',
    }
    data.append(feature)
    break  # only the first row is processed for now

gdf = gpd.GeoDataFrame(data, geometry='geometry')
gdf.to_parquet('catalog.parquet')
gdf

Unnamed: 0,type,stac_version,stac_extensions,datetime,id,bbox,geometry,assets,links,repository
0,Feature,1.0.0,[],2025-04-08 13:13:44.341092,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,"{'asset': {'href': 'https://...', 'checksum': ...",[],eotdl


Once the parquet file has been generated, we can convert it to STAC items and POST them to the PRR.

In [5]:
# Read the catalog back and rebuild one pystac.Item per row of the table.
table = pq.read_table('catalog.parquet')
items = []
for item_dict in tqdm(stac_geoparquet.arrow.stac_table_to_items(table), total=len(table)):
    item = pystac.Item.from_dict(item_dict)
    # Keep every parsed item so the list is available for the POST step.
    items.append(item)
    # TODO: POST to PRR

100%|██████████| 1/1 [00:00<00:00, 184.09it/s]


Assuming the `catalog.parquet` file is correct, we can ingest it directly into EOTDL. Otherwise, update the file before ingestion.

In [6]:
from eotdl.datasets import ingest_dataset_catalog

# TODO: create README.md
# NOTE: the recorded run of this cell failed with
# "[Errno 2] No such file or directory: 'README.md'" / "Error loading metadata",
# so a README.md has to exist in the ingested folder ('.') before this call.

ingest_dataset_catalog('.')

[Errno 2] No such file or directory: 'README.md'


Exception: Error loading metadata

Once the ingestion has succeeded, we can access the dataset in EOTDL.

In [7]:
from eotdl.datasets import retrieve_datasets

# Presumably looks the dataset up by name; the recorded output is [] (nothing found).
# NOTE(review): the name is spelled 'ESAWAII' here but 'ESAWAAI' in the PRR
# collection URL and in the product file names. Confirm which spelling the
# EOTDL dataset actually uses.
retrieve_datasets('ESAWAII')

[]

In [8]:
from eotdl.datasets import stage_dataset

# Stage the dataset into the local 'output' folder. The recorded run failed
# with "Dataset doesn't exist".
# NOTE(review): 'ESAWAII' differs from the 'ESAWAAI' spelling used for the PRR
# collection and the product file names. Confirm the dataset name.
stage_dataset('ESAWAII', path='output')

Exception: Dataset doesn't exist

In [9]:
# Load the catalog from the 'output' folder (the path passed to stage_dataset)
# as a GeoDataFrame and display it. The file only exists after staging has
# succeeded; the recorded run raised FileNotFoundError.
catalog = gpd.read_parquet('output/catalog.v1.parquet')
catalog

FileNotFoundError: [Errno 2] Failed to open local file 'output/catalog.v1.parquet'. Detail: [errno 2] No such file or directory