In [1]:
from minio import Minio
import dotenv
import os
from glob import glob
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from shapely.geometry import Polygon
from pathlib import Path
import hashlib
import geopandas as gpd

# Load variables from a local .env file into the process environment.
# Later cells read S3_ENDPOINT, ACCESS_KEY_ID, SECRET_ACCESS_KEY and BUCKET from os.environ.
dotenv.load_dotenv()

True

In [2]:
# S3-compatible client used by every upload/delete cell below.
# Credentials come from the environment only; a missing variable raises KeyError here.
client = Minio(
    endpoint=os.environ['S3_ENDPOINT'],
    access_key=os.environ['ACCESS_KEY_ID'],
    secret_key=os.environ['SECRET_ACCESS_KEY'],
    secure=True,  # always talk to the endpoint over TLS
)

In [3]:
# Dataset identifier; it is used as the key prefix for every object written to the bucket.
# Alternative dataset id, kept for switching by hand:
# dataset_id = "685fdfd4acc05f06ec9919e5"
dataset_id = "68b957ec9fc8ac5128705dbf" # phase 2
# Local root directory of the files to upload. Keep the trailing slash: later cells
# build paths by plain string concatenation (path + '**/*', path + "catalog.parquet").
# Alternative root, kept for switching by hand:
# path = "/fastdata/charter-slim-dataset/"
path = "/fastdata/charter-testing-dataset/"

In [None]:
# Remove every object stored under this dataset's prefix, deepest keys first.
#
# This is destructive, so it is opt-in: with DELETE_EXISTING left at False the cell
# only lists the keys that WOULD be removed. Flip it to True, run once, flip it back.
# (The previous version had a `for` loop whose whole body was commented out, which
# is a syntax error and made the cell impossible to run at all.)
DELETE_EXISTING = False

objects_to_delete = list(client.list_objects(os.environ['BUCKET'], prefix=dataset_id + '/', recursive=True))
# Sort by number of '/' in object_name, descending (deepest first)
objects_to_delete.sort(key=lambda obj: obj.object_name.count('/'), reverse=True)
for obj in objects_to_delete:
    if not DELETE_EXISTING:
        # Dry run: show what would be deleted.
        print(obj.object_name)
        continue
    try:
        client.remove_object(os.environ['BUCKET'], obj.object_name)
    except Exception as e:
        # Best effort: report the failing key and carry on with the rest.
        print(f"{obj.object_name}: {e}")

In [10]:
# Recursively list everything under the dataset root. The result contains both
# files and directories; directories are filtered out where the list is consumed.
pattern = path + '**/*'
files = glob(pattern, recursive=True)
len(files)

93

In [11]:
# Upload every regular file found under `path` to the bucket.
# Object key = "<dataset_id>/<path of the file relative to the dataset root>".
bucket = os.environ['BUCKET']  # loop-invariant, looked up once
for file in tqdm(files):
    if os.path.isdir(file):
        continue  # glob also returns directories; only files are uploaded
    # os.path.relpath strips only the leading root. The previous
    # file.replace(path, '') removed EVERY occurrence of the root substring, and
    # the catalog cell already derives item ids/hrefs with relpath, so keys and
    # catalog entries are now computed the same way.
    object_name = dataset_id + '/' + os.path.relpath(file, path)
    # stats = client.stat_object(os.environ['BUCKET'], object_name)
    # if stats: continue
    client.fput_object(bucket, object_name, file)

100%|██████████| 93/93 [06:24<00:00,  4.14s/it]


In [12]:
def calculate_checksum(file_path):
    """Return the SHA-1 hex digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte reads, so it is
    never held in memory as a whole. Errors from ``open`` propagate to the caller.
    """
    digest = hashlib.sha1()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:  # empty bytes means end of file
                break
            digest.update(block)
    return digest.hexdigest()

def create_stac_item(item_id, asset_href):
    """Build a STAC-item-shaped dict describing one local file.

    Args:
        item_id: value stored under ``'id'``.
        asset_href: path of the local file. It is hashed (SHA-1), stat'ed for
            its size, and also stored as the asset ``'href'``.

    Returns:
        A dict with an all-zero bbox, an empty ``Polygon`` geometry and a single
        asset named ``'asset'`` (href, checksum, timestamp, size).

    Raises:
        Whatever ``open``/``Path.stat`` raise when the file cannot be read.
    """
    # must be native timestamp (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
    item_datetime = datetime.now()
    zero_bbox = dict(xmin=0.0, ymin=0.0, xmax=0.0, ymax=0.0)
    empty_geometry = Polygon()  # empty polygon

    # Filled in this order on purpose: hash first, then timestamp, then size.
    asset = {'href': asset_href}
    asset['checksum'] = calculate_checksum(asset_href)
    asset['timestamp'] = datetime.now()
    asset['size'] = Path(asset_href).stat().st_size

    item = {
        'type': 'Feature',
        'stac_version': '1.0.0',
        'stac_extensions': [],
        'datetime': item_datetime,
        'id': item_id,
        'bbox': zero_bbox,
        'geometry': empty_geometry,
        'assets': {'asset': asset},
        "links": [],
        # 'collection': 'source',
        # anything below are properties (need at least one!)
        'repository': 'eotdl',
    }
    return item

In [13]:
# Build the catalog: one item per regular file under the dataset root, written
# as a GeoParquet file next to the data.
catalog_path = Path(path + "catalog.parquet")
catalog_str = str(catalog_path)
# Exclude the catalog file itself from the listing so it does not describe itself.
files = [candidate for candidate in files if candidate != catalog_str]

data = []
for entry in tqdm(files):
    entry_path = Path(entry)
    if not entry_path.is_file():
        continue  # directories get no catalog item
    # Item id = file path relative to the folder that holds the catalog.
    rel = os.path.relpath(entry_path, catalog_path.parent)
    item = create_stac_item(rel, str(entry_path))
    # The item was built from the local path (needed for hashing and size);
    # the stored href is then replaced with the staging URL.
    item['assets']['asset']['href'] = f"https://api.eotdl.com/datasets/{dataset_id}/stage/{rel}"
    data.append(item)

gdf = gpd.GeoDataFrame(data, geometry='geometry')
gdf.to_parquet(catalog_path)

100%|██████████| 92/92 [00:21<00:00,  4.35it/s] 


In [14]:
# Display the catalog GeoDataFrame (one row per catalogued file).
gdf

Unnamed: 0,type,stac_version,stac_extensions,datetime,id,bbox,geometry,assets,links,repository
0,Feature,1.0.0,[],2025-09-04 11:28:56.252078,catalog.json,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
1,Feature,1.0.0,[],2025-09-04 11:28:56.252698,Dataset-details-for-AOI.xlsx,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
2,Feature,1.0.0,[],2025-09-04 11:28:56.252902,charter-testing-dataset.structure,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
3,Feature,1.0.0,[],2025-09-04 11:28:56.253055,README.md,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
4,Feature,1.0.0,[],2025-09-04 11:28:56.253234,charter-testing-dataset.filelist,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
...,...,...,...,...,...,...,...,...,...,...
63,Feature,1.0.0,[],2025-09-04 11:29:15.872850,Testing/Call_919_Turkey/DS_PHR1B_2020100308420...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
64,Feature,1.0.0,[],2025-09-04 11:29:16.091751,Testing/Call_919_Turkey/DS_PHR1B_2020100308420...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
65,Feature,1.0.0,[],2025-09-04 11:29:16.528232,Testing/Call_919_Turkey/DS_PHR1B_2020100308420...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl
66,Feature,1.0.0,[],2025-09-04 11:29:16.964728,Testing/Call_919_Turkey/DS_PHR1B_2020100308420...,"{'xmin': 0.0, 'ymin': 0.0, 'xmax': 0.0, 'ymax'...",POLYGON EMPTY,{'asset': {'href': 'https://api.eotdl.com/data...,[],eotdl


In [15]:
# Upload the catalog file to the bucket under the dataset prefix, using a
# versioned key ("catalog.v1.parquet").
catalog_key = f'{dataset_id}/catalog.v1.parquet'
client.fput_object(os.environ['BUCKET'], catalog_key, catalog_path)

<minio.helpers.ObjectWriteResult at 0x7dda71d86d20>

In [16]:
# Sum of the 'size' field over every catalogued asset (bytes, as reported by stat()).
size = sum(asset['asset']['size'] for asset in gdf.assets)
size

26856678502

In [18]:
# Read a catalog file from a local directory into a GeoDataFrame.
# NOTE(review): this rebinds `gdf`, so the cells below inspect the file read here,
# not the frame built in the catalog cell above.
gdf = gpd.read_parquet("../kk2/charter-eo4ai-etq-challenge-testing/catalog.v1.parquet")

In [19]:
# (rows, columns) of the catalog currently bound to `gdf`.
gdf.shape

(68, 10)

In [20]:
# Raw per-item asset dicts (checksum, href, size, timestamp) as an array.
gdf.assets.values

array([{'asset': {'checksum': '4b816935974362125bbe5b59281ababd7198be3e', 'href': 'https://api.eotdl.com/datasets/68b957ec9fc8ac5128705dbf/stage/catalog.json', 'size': 900, 'timestamp': datetime.datetime(2025, 9, 4, 11, 28, 56, 252586)}},
       {'asset': {'checksum': '3b4e221915ab9335ea371f0fc4d6a371021d3df2', 'href': 'https://api.eotdl.com/datasets/68b957ec9fc8ac5128705dbf/stage/Dataset-details-for-AOI.xlsx', 'size': 11640, 'timestamp': datetime.datetime(2025, 9, 4, 11, 28, 56, 252821)}},
       {'asset': {'checksum': '37b8ac203e786221adb58d0072d0eb25e31d5946', 'href': 'https://api.eotdl.com/datasets/68b957ec9fc8ac5128705dbf/stage/charter-testing-dataset.structure', 'size': 1158, 'timestamp': datetime.datetime(2025, 9, 4, 11, 28, 56, 252981)}},
       {'asset': {'checksum': 'b0d8c35662bcd7eb73296a8ffb842efd7282a687', 'href': 'https://api.eotdl.com/datasets/68b957ec9fc8ac5128705dbf/stage/README.md', 'size': 9830, 'timestamp': datetime.datetime(2025, 9, 4, 11, 28, 56, 253163)}},
      

In [21]:
from eotdl.datasets import stage_dataset_file

# Smoke test: stage one file through its catalog href into a local folder.
# The call's return value is shown as the cell output.
readme_url = 'https://api.eotdl.com/datasets/68b957ec9fc8ac5128705dbf/stage/README.md'
download_dir = '../kk2'
stage_dataset_file(readme_url, download_dir)

'../kk2/README.md'