In [1]:
from minio import Minio
import dotenv
import os
from glob import glob
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from shapely.geometry import Polygon
from pathlib import Path
import hashlib
import geopandas as gpd

dotenv.load_dotenv()

True

In [2]:
# Object-storage client used by every upload/listing cell below.
# Endpoint and credentials are read from environment variables (the first cell
# calls dotenv.load_dotenv()); a missing variable raises KeyError right here.
client = Minio(
	endpoint=os.environ['S3_ENDPOINT'],
	access_key=os.environ['ACCESS_KEY_ID'],
	secret_key=os.environ['SECRET_ACCESS_KEY'],
	secure=True,
)

In [3]:
# Identifier used below as the key prefix of the dataset's objects in the bucket
# and inside the asset URLs written to the catalog.
dataset_id = "685fdfd4acc05f06ec9919e5"
# Local root directory of the dataset. The trailing '/' is required: other cells
# build the glob pattern and the catalog file path by plain string concatenation.
path = "/fastdata/charter-slim-dataset/"

In [8]:
# Dry run: list every object stored under this dataset's prefix and print its key.
# Nothing is removed by this cell. To really delete, sort the listing deepest-first
# (most '/' in object_name) and remove each entry, i.e. re-enable:
#   objects_to_delete.sort(key=lambda obj: obj.object_name.count('/'), reverse=True)
#   client.remove_object(os.environ['BUCKET'], obj.object_name)
objects_to_delete = [
    *client.list_objects(os.environ['BUCKET'], prefix=f"{dataset_id}/", recursive=True)
]
for obj in objects_to_delete:
    print(obj.object_name)

685fdfd4acc05f06ec9919e5/
685fdfd4acc05f06ec9919e5/Earthquake-challenge_v3.ipynb
685fdfd4acc05f06ec9919e5/Phase_1_Dataset_Details.xlsx
685fdfd4acc05f06ec9919e5/README.md
685fdfd4acc05f06ec9919e5/Training/
685fdfd4acc05f06ec9919e5/catalog.json
685fdfd4acc05f06ec9919e5/catalog.parquet
685fdfd4acc05f06ec9919e5/catalog.v1.parquet
685fdfd4acc05f06ec9919e5/charter-dataset.filelist
685fdfd4acc05f06ec9919e5/charter-dataset.structure
685fdfd4acc05f06ec9919e5/example_submission.zip
685fdfd4acc05f06ec9919e5/Training/Annotations/
685fdfd4acc05f06ec9919e5/Training/Call_1075_China/
685fdfd4acc05f06ec9919e5/Training/Call_1095_Myanmar/
685fdfd4acc05f06ec9919e5/Training/Call_919_Turkey/
685fdfd4acc05f06ec9919e5/Training/Call_920_Sirya/
685fdfd4acc05f06ec9919e5/Training/Call_965_Morocco/
685fdfd4acc05f06ec9919e5/Training/Call_970_Afghanistan/
685fdfd4acc05f06ec9919e5/Training/training_dataset.contents
685fdfd4acc05f06ec9919e5/Training/Annotations/Call_1075_China/
685fdfd4acc05f06ec9919e5/Training/Annota

In [10]:
# Recursively collect every path under the dataset root. The result mixes files
# and directories; the cells that consume it filter directories out themselves.
files = glob(path + '**/*', recursive=True)
len(files)

275

In [7]:
# Upload every regular file found under `path` to the bucket, keyed as
# "<dataset_id>/<path relative to the dataset root>".
# No existence check is made: every file is uploaded again on each run.
for file in tqdm(files):
    if os.path.isdir(file):
        continue  # the glob listing also contains directories; only files are uploaded
    # Derive the key with relpath rather than str.replace: replace() would strip
    # every occurrence of `path` inside the string, not only the leading one.
    # Separators are normalised to '/', which is what the object keys use.
    relative_key = os.path.relpath(file, path).replace(os.sep, '/')
    object_name = f"{dataset_id}/{relative_key}"
    client.fput_object(
        os.environ['BUCKET'],
        object_name,
        file,
    )

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  3.43it/s]


In [11]:
def calculate_checksum(file_path, chunk_size=1024 * 1024):
    """Return the SHA-1 hex digest of a file's contents.

    The file is opened in binary mode and hashed incrementally, so it never
    has to fit in memory as a whole.

    Args:
        file_path: Path of the file to hash (anything accepted by ``open``).
        chunk_size: Number of bytes read per iteration. It only influences
            speed and memory use, never the resulting digest.

    Returns:
        The digest as a 40-character lowercase hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately (digest of nothing) and a negative size
    # reads the whole file at once, so both are rejected up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    sha1_hash = hashlib.sha1()
    with open(file_path, "rb") as file:
        # iter(callable, sentinel) stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: file.read(chunk_size), b""):
            sha1_hash.update(chunk)
    return sha1_hash.hexdigest()

def create_stac_item(item_id, asset_href):
	"""Build a STAC-style item dict describing a single local file.

	Args:
		item_id: Value stored under the item's 'id' key.
		asset_href: Path of the file on disk. It is stored as the asset 'href'
			and is also opened and stat'ed to obtain the checksum and size.

	Returns:
		A dict with placeholder spatial information (all-zero bbox, empty
		polygon) and one asset named 'asset' carrying href, SHA-1 checksum,
		creation timestamp and size in bytes.
	"""
	# must be native timestamp (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
	item_datetime = datetime.now()

	# The asset timestamp is taken after hashing and the size is read last.
	asset = {
		'href': asset_href,
		'checksum': calculate_checksum(asset_href),
		'timestamp': datetime.now(),
		'size': Path(asset_href).stat().st_size,
	}

	# No real footprint is known for these files, hence the placeholder values.
	placeholder_bbox = dict(xmin=0.0, ymin=0.0, xmax=0.0, ymax=0.0)
	empty_geometry = Polygon()

	return {
		'type': 'Feature',
		'stac_version': '1.0.0',
		'stac_extensions': [],
		'datetime': item_datetime,
		'id': item_id,
		'bbox': placeholder_bbox,
		'geometry': empty_geometry,
		'assets': {'asset': asset},
		"links": [],
		# every key from here on is a property (at least one is needed)
		'repository': 'eotdl',
	}

In [12]:
# Build one STAC item per regular file under the dataset root and write all of
# them to a parquet catalog stored in that same directory.
catalog_path = Path(f"{path}catalog.parquet")
# A catalog left over from an earlier run must not be listed inside itself.
files = list(filter(lambda f: f != str(catalog_path), files))
data = []
for file in tqdm(files):
    file_path = Path(file)
    if not file_path.is_file():
        continue  # directories get no catalog entry
    relative_path = os.path.relpath(file_path, catalog_path.parent)
    absolute_path = str(file_path)
    stac_item = create_stac_item(relative_path, absolute_path)
    # The item was created from the local path (needed for checksum and size);
    # the href that ends up in the catalog is the dataset's stage URL instead.
    stac_item['assets']['asset'].update(
        href=f"https://api.eotdl.com/datasets/{dataset_id}/stage/{relative_path}"
    )
    data.append(stac_item)
gdf = gpd.GeoDataFrame(data, geometry='geometry')
gdf.to_parquet(catalog_path)

100%|██████████| 274/274 [04:26<00:00,  1.03it/s]


In [13]:
gdf

Unnamed: 0,type,stac_version,stac_extensions,datetime,id,bbox,geometry,assets,links,repository
0,Feature,1.0.0,[],2025-07-11 13:20:04.050435,README.md,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
1,Feature,1.0.0,[],2025-07-11 13:20:04.050968,charter-dataset.filelist,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
2,Feature,1.0.0,[],2025-07-11 13:20:04.051287,charter-dataset.structure,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
3,Feature,1.0.0,[],2025-07-11 13:20:04.051567,Earthquake-challenge_v3.ipynb,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
4,Feature,1.0.0,[],2025-07-11 13:20:04.057726,example_submission.zip,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
...,...,...,...,...,...,...,...,...,...,...
201,Feature,1.0.0,[],2025-07-11 13:24:08.771387,Training/Call_920_Sirya/DS_PHR1B_2022122008263...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
202,Feature,1.0.0,[],2025-07-11 13:24:12.850456,Training/Call_920_Sirya/DS_PHR1B_2022122008263...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
203,Feature,1.0.0,[],2025-07-11 13:24:20.072751,Training/Call_920_Sirya/DS_PHR1B_2022122008263...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
204,Feature,1.0.0,[],2025-07-11 13:24:24.620890,Training/Call_920_Sirya/DS_PHR1B_2022122008263...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl


In [14]:
# Upload the catalog file at catalog_path to the dataset's prefix in the bucket.
# Note the object is stored as "catalog.v1.parquet", whereas the local file is
# named "catalog.parquet".
client.fput_object(
    os.environ['BUCKET'],
    f'{dataset_id}/catalog.v1.parquet',
    catalog_path,
)   

<minio.helpers.ObjectWriteResult at 0x7bc8c8111550>

In [15]:
# Combined size of all catalogued assets, summed from each item's 'size' entry.
size = sum(asset['asset']['size'] for asset in gdf.assets)
size

120877495045

In [16]:
# NOTE(review): this rebinds `gdf` to a catalog read from a different location
# than the one written to catalog_path earlier; cells below see this copy.
gdf = gpd.read_parquet("../kk2/international-charter-eo4ai-etq-challenge/catalog.v1.parquet")

In [17]:
gdf.shape

(206, 10)

In [18]:
gdf.assets.values

array([{'asset': {'checksum': '4e3efaff758931f0d952935dad3d02593e3a7348', 'href': 'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/README.md', 'size': 11830, 'timestamp': datetime.datetime(2025, 7, 11, 13, 20, 4, 50808)}},
       {'asset': {'checksum': '6c5d4f69b732e11bf70ed54c33f2aab3a699c366', 'href': 'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/charter-dataset.filelist', 'size': 15743, 'timestamp': datetime.datetime(2025, 7, 11, 13, 20, 4, 51170)}},
       {'asset': {'checksum': 'fea49f952002c879d88a2cbedae1b8b92b240f25', 'href': 'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/charter-dataset.structure', 'size': 15743, 'timestamp': datetime.datetime(2025, 7, 11, 13, 20, 4, 51461)}},
       {'asset': {'checksum': '1cd6c824b3a9c23933c6e86982bfa5fe3d18b822', 'href': 'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/Earthquake-challenge_v3.ipynb', 'size': 1466852, 'timestamp': datetime.datetime(2025, 7, 11, 13, 20, 4, 57666)}},

In [19]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh kernel shows all dependencies up front.
from eotdl.datasets import stage_dataset_file

# Stage a single file, addressed by the href stored in the catalog, into ../kk2.
# Per the recorded output the call returns the resulting local file path.
stage_dataset_file(
    'https://api.eotdl.com/datasets/685fdfd4acc05f06ec9919e5/stage/Earthquake-challenge_v3.ipynb',
    '../kk2',
)

'../kk2/Earthquake-challenge_v3.ipynb'