In [1]:
from minio import Minio
import dotenv
import os
from glob import glob
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from shapely.geometry import Polygon
from pathlib import Path
import hashlib
import geopandas as gpd

# Load environment variables through python-dotenv before anything else runs.
# Later cells read S3_ENDPOINT, ACCESS_KEY_ID, SECRET_ACCESS_KEY and BUCKET
# from os.environ and raise KeyError if any of them is missing.
dotenv.load_dotenv()

True

In [2]:
# Object-storage client used by every upload cell below. Connection details
# and credentials come from the environment, never from literals in the
# notebook.
env = os.environ
client = Minio(
    endpoint=env['S3_ENDPOINT'],
    access_key=env['ACCESS_KEY_ID'],
    secret_key=env['SECRET_ACCESS_KEY'],
    secure=True,  # request a TLS connection (see the MinIO client docs)
)

In [None]:
# Dataset identifier: used as the object-key prefix in the bucket and inside
# the staging URLs written to the catalog.
dataset_id = "685fdfd4acc05f06ec9919e5"
# Local root directory of the dataset. Keep the trailing slash: later cells
# build paths from it by plain string concatenation.
path = "/fastdata/charter-slim-dataset/"

In [35]:
# Recursively list everything below the dataset root. The result mixes files
# and directories; the cells that consume it filter directories out.
search_pattern = path + '**/*'
files = glob(search_pattern, recursive=True)
len(files)

350

In [36]:
# Upload every regular file under `path` to the bucket, keyed as
# "<dataset_id>/<path relative to the dataset root>".
#
# NOTE: every file is uploaded unconditionally; there is no
# skip-if-already-present check (a stat_object-based one was sketched here
# but never enabled).
bucket = os.environ['BUCKET']  # loop-invariant; also fails fast if unset
for file in tqdm(files):
    if os.path.isdir(file):
        continue  # glob returns directories too; only files become objects
    # relpath strips only the leading dataset root. The previous
    # file.replace(path, '') removed *every* occurrence of that substring,
    # which corrupts the key if the root text reappears deeper in the path.
    relative_key = os.path.relpath(file, path).replace(os.sep, '/')
    object_name = dataset_id + '/' + relative_key
    client.fput_object(bucket, object_name, file)

100%|██████████| 350/350 [30:36<00:00,  5.25s/it]  


In [38]:
def calculate_checksum(file_path):
    """Return the SHA-1 hex digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte reads, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.sha1()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:  # empty read means end of file
                break
            digest.update(block)
    return digest.hexdigest()

def create_stac_item(item_id, asset_href):
    """Assemble a STAC-style item record (a plain dict) for one local file.

    Args:
        item_id: Value stored under the item's ``'id'`` key.
        asset_href: Path of the asset. It must be a readable local file,
            because it is hashed with ``calculate_checksum`` and stat'ed
            for its size; it is also stored as the asset's ``'href'``.

    Returns:
        A dict with the fixed fields ``type``, ``stac_version``,
        ``stac_extensions``, ``datetime``, ``id``, an all-zero ``bbox``, an
        empty polygon as ``geometry``, a single entry under
        ``assets['asset']`` (href, checksum, timestamp, size), an empty
        ``links`` list and the ``repository`` property.
    """
    item = {
        'type': 'Feature',
        'stac_version': '1.0.0',
        'stac_extensions': [],
        # must be native timestamp
        # (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
        'datetime': datetime.now(),
        'id': item_id,
    }
    item['bbox'] = dict(xmin=0.0, ymin=0.0, xmax=0.0, ymax=0.0)
    item['geometry'] = Polygon()  # empty polygon

    # Asset fields are filled in a fixed order: href, checksum, timestamp, size.
    asset = {'href': asset_href}
    asset['checksum'] = calculate_checksum(asset_href)
    asset['timestamp'] = datetime.now()
    asset['size'] = Path(asset_href).stat().st_size
    item['assets'] = {'asset': asset}

    item["links"] = []
    # 'collection': 'source',
    # anything below are properties (need at least one!)
    item['repository'] = 'eotdl'
    return item

In [39]:
# Build one catalog row per regular file and save the result as a Parquet
# file inside the dataset root.
catalog_path = Path(path + "catalog.parquet")
# Drop the catalog file itself from the listing so it never describes itself.
files = [candidate for candidate in files if candidate != str(catalog_path)]
data = []
for entry in tqdm(files):
    entry_path = Path(entry)
    if not entry_path.is_file():
        continue  # directories get no catalog row
    relative_path = os.path.relpath(entry_path, catalog_path.parent)
    # The item is created from the local path (needed for hashing and size);
    # its href is then replaced with the staging URL.
    item = create_stac_item(relative_path, str(entry_path))
    item['assets']['asset']['href'] = f"https://api.eotdl.com/datasets/{dataset_id}/stage/{relative_path}"
    data.append(item)
gdf = gpd.GeoDataFrame(data, geometry='geometry')
gdf.to_parquet(catalog_path)

100%|██████████| 350/350 [03:31<00:00,  1.65it/s]


In [40]:
# Display the catalog GeoDataFrame for a visual check.
gdf

Unnamed: 0,type,stac_version,stac_extensions,datetime,id,bbox,geometry,assets,links,repository
0,Feature,1.0.0,[],2025-06-30 14:01:10.593326,README.md,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
1,Feature,1.0.0,[],2025-06-30 14:01:10.594311,charter-dataset.filelist,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
2,Feature,1.0.0,[],2025-06-30 14:01:10.594682,charter-dataset.structure,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
3,Feature,1.0.0,[],2025-06-30 14:01:10.594775,Dataset-details-for-AOI.xlsx,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
4,Feature,1.0.0,[],2025-06-30 14:01:10.594860,catalog.json,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
...,...,...,...,...,...,...,...,...,...,...
256,Feature,1.0.0,[],2025-06-30 14:04:27.852027,Training/Call_920_Sirya/DS_PHR1B_2022122008263...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
257,Feature,1.0.0,[],2025-06-30 14:04:30.201271,Training/Call_920_Sirya/DS_PHR1B_2022122008263...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
258,Feature,1.0.0,[],2025-06-30 14:04:34.354946,Training/Call_920_Sirya/DS_PHR1B_2022122008263...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
259,Feature,1.0.0,[],2025-06-30 14:04:36.967099,Training/Call_920_Sirya/DS_PHR1B_2022122008263...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl


In [45]:
# Upload the locally written catalog. The object key uses the name
# "catalog.v1.parquet", which differs from the local file name.
catalog_object_name = f'{dataset_id}/catalog.v1.parquet'
client.fput_object(
    bucket_name=os.environ['BUCKET'],
    object_name=catalog_object_name,
    file_path=catalog_path,
)

<minio.helpers.ObjectWriteResult at 0x7f9e8259cb60>

In [50]:
# Total of the 'size' entries recorded for every asset in the catalog.
size = sum(record['asset']['size'] for record in gdf.assets)
size

147608990865

In [53]:
# Rebinds `gdf` to a catalog read from disk, replacing the frame built in the
# earlier cell.
# NOTE(review): hard-coded relative path; presumably a downloaded copy of the
# uploaded catalog.v1.parquet — confirm.
gdf = gpd.read_parquet("../kk2/international-charter-eo4ai-etq-challenge/catalog.v1.parquet")

In [57]:
# Show the raw `assets` column (one dict per row) to inspect the stored
# checksum, href, size and timestamp values.
gdf.assets.values

array([{'asset': {'checksum': '8fbe6b97b146115e811e30619b57decab6167798', 'href': 'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/README.md', 'size': 29331, 'timestamp': datetime.datetime(2025, 6, 30, 14, 1, 10, 594245)}},
       {'asset': {'checksum': '6c5d4f69b732e11bf70ed54c33f2aab3a699c366', 'href': 'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/charter-dataset.filelist', 'size': 15743, 'timestamp': datetime.datetime(2025, 6, 30, 14, 1, 10, 594640)}},
       {'asset': {'checksum': 'fea49f952002c879d88a2cbedae1b8b92b240f25', 'href': 'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/charter-dataset.structure', 'size': 15743, 'timestamp': datetime.datetime(2025, 6, 30, 14, 1, 10, 594742)}},
       {'asset': {'checksum': '3b4e221915ab9335ea371f0fc4d6a371021d3df2', 'href': 'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/Dataset-details-for-AOI.xlsx', 'size': 11640, 'timestamp': datetime.datetime(2025, 6, 30, 14, 1, 10, 594820)}}

In [58]:
from eotdl.datasets import stage_dataset_file

# Smoke test: stage a single asset by its catalog href into a local folder.
# The recorded output shows the call returning the local file path.
asset_url = 'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/Testing/Call_1095_Myanmar/WV02N21_849583E096_0173612024123000000000MS00-calibrated/Optical_Calibration/r-blue.tif'
download_dir = '../kk2'
stage_dataset_file(asset_url, download_dir)

'../kk2/Testing/Call_1095_Myanmar/WV02N21_849583E096_0173612024123000000000MS00-calibrated/Optical_Calibration/r-blue.tif'