In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import geopandas as gpd
from datetime import datetime
from shapely.geometry import Polygon
import pyarrow.parquet as pq
import pystac
from tqdm import tqdm
import stac_geoparquet

We have a list of the files ingested in ESA PRR (`esawaai.csv`), with one `<size> <path>` entry per line.

In [3]:
# Load the raw PRR listing; every non-blank line has the form "<size> <path>"
with open('esawaai.csv', 'r') as listing:
    raw_lines = listing.readlines()

# Trim surrounding whitespace, drop blank lines, and cut each entry at its first space
trimmed = (raw.strip() for raw in raw_lines)
pairs = (entry.split(' ', 1) for entry in trimmed if entry)

# One row per file: integer size plus the remainder of the line as the path
df = pd.DataFrame(
    [[int(size_text), file_path] for size_text, file_path in pairs],
    columns=['size', 'path'],
)

df

Unnamed: 0,size,path
0,1533285,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...
1,185044694,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...
2,168779279,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...
3,144352380,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...
4,209491051,esawaai/2019/08/27/S1A_IW_ESAWAAI__1SDV_201908...
...,...,...
18484,185157726,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...
18485,169103739,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...
18486,129979939,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...
18487,184753718,esawaai/2023/11/04/S1A_IW_ESAWAAI__1SDV_202311...


We can now generate a STAC-compliant parquet file. All the required metadata should be added at this step, because PRR does not support updating it later.

In [4]:
# Assemble stac-geoparquet rows from the listing and write them to 'catalog.parquet'.
records = []
for _, entry in df.iterrows():
    # Item-level STAC fields
    record = {
        'type': 'Feature',
        'stac_version': '1.0.0',
        'stac_extensions': [],
        # must be native timestamp (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
        'datetime': datetime.now(),
        'id': entry['path'],  # Do we want to use the path as the item id?
    }
    # Placeholder footprint: all-zero bbox and an empty polygon
    record['bbox'] = {'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax': 0.0}
    record['geometry'] = Polygon()
    # Single asset describing the listed file
    record['assets'] = {
        'asset': {
            'href': 'https://...',  # TODO
            'checksum': None,  # TODO
            'timestamp': datetime.now(),
            'size': entry['size'],
        }
    }
    record['links'] = []
    # anything below are properties (need at least one!)
    record['repository'] = 'eotdl'
    records.append(record)
    # NOTE: the loop stops here, so only the first listed file ends up in the catalog
    break

gdf = gpd.GeoDataFrame(records, geometry='geometry')
gdf.to_parquet('catalog.parquet')
gdf

Unnamed: 0,type,stac_version,stac_extensions,datetime,id,bbox,geometry,assets,links,repository
0,Feature,1.0.0,[],2025-04-08 13:13:44.341092,esawaai/2019/08/26/S1A_IW_ESAWAAI__1SDV_201908...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,"{'asset': {'href': 'https://...', 'checksum': ...",[],eotdl


Once the parquet file has been generated, we can convert its rows to STAC items and POST them to PRR.

In [5]:
# Read the catalog back and convert every row into a pystac Item.
table = pq.read_table('catalog.parquet')

# Collected Items, kept so the (still pending) POST step has something to send.
# Previously this list was created but never filled, so every converted Item was discarded.
items = []
for item_dict in tqdm(stac_geoparquet.arrow.stac_table_to_items(table), total=len(table)):
    # stac_table_to_items yields plain dicts; pystac parses each one into an Item object
    item = pystac.Item.from_dict(item_dict)
    items.append(item)
    # TODO: POST to PRR

100%|██████████| 1/1 [00:00<00:00, 184.09it/s]


Assuming the `catalog.parquet` file is correct, we can ingest it directly into EOTDL. Otherwise, update the file before ingestion.

In [6]:
# Call EOTDL's `ingest_dataset_catalog` on the current directory ('.').
from eotdl.datasets import ingest_dataset_catalog

# TODO: create README.md
# NOTE(review): the recorded run of this cell failed with
# "No such file or directory: 'README.md'", so a README.md presumably has to
# exist in the ingested folder before this call can succeed — confirm against the eotdl docs.

ingest_dataset_catalog('.')

[Errno 2] No such file or directory: 'README.md'


Exception: Error loading metadata

Once the ingestion has succeeded, we can access the dataset in EOTDL.

In [7]:
from eotdl.datasets import retrieve_datasets

# Query EOTDL for the dataset (presumably a lookup by name — confirm against the eotdl docs).
# The recorded output of this cell is an empty list.
# NOTE(review): 'ESAWAII' is spelled differently from the 'ESAWAAI' token in the
# listed file names — confirm the intended dataset name.
retrieve_datasets('ESAWAII')

[]

In [8]:
from eotdl.datasets import stage_dataset

# Stage the dataset with 'output' passed as the `path` argument.
# The recorded run failed with "Dataset doesn't exist".
# NOTE(review): 'ESAWAII' is spelled differently from the 'ESAWAAI' token in the
# listed file names — confirm the intended dataset name.
stage_dataset('ESAWAII', path='output')

Exception: Dataset doesn't exist

In [9]:
# Load 'output/catalog.v1.parquet' into a GeoDataFrame and display it.
# NOTE(review): presumably this file is written by staging the dataset into 'output';
# the recorded run failed with FileNotFoundError because the file was not there — confirm.
catalog = gpd.read_parquet('output/catalog.v1.parquet')
catalog

FileNotFoundError: [Errno 2] Failed to open local file 'output/catalog.v1.parquet'. Detail: [errno 2] No such file or directory