In [1]:
import pandas as pd
from minio import Minio
from datetime import datetime
import json
from dotenv import load_dotenv
import os

# Load settings from a .env file into the environment; the S3 endpoint, credentials
# and bucket names used by this notebook are read with os.getenv in a later cell.
load_dotenv()

True

In [9]:
def parse_with_dates(json_str):
    """Decode a JSON document held in a string and return the resulting object.

    No date conversion is performed: date-like values come back exactly as
    the JSON decoder produced them.
    """
    # parse dates if needed
    return json.loads(json_str)

# Load the models table, then decode the three JSON-encoded text columns
# into Python objects.
df = pd.read_csv("models.csv")
df = df.assign(**{
    column: df[column].apply(json.loads)
    for column in ('versions', 'files', 'folders')
})

# df

In [3]:
# Bucket names used by the migration: objects are downloaded from the old bucket
# and uploaded to the new one.
old_bucket, new_bucket = (os.getenv(key) for key in ('OLD_BUCKET', 'NEW_BUCKET'))

# Object-storage client; endpoint and credentials come from environment variables.
minio_client = Minio(
	secure=True,
	endpoint=os.getenv('S3_ENDPOINT'),
	access_key=os.getenv('ACCESS_KEY_ID'),
	secret_key=os.getenv('SECRET_ACCESS_KEY'),
)

In [4]:
# from tqdm import tqdm

# for i, row in tqdm(df.iterrows(), total=len(df)):
# 	files = minio_client.list_objects(new_bucket, row.id, recursive=True)
# 	names = [file.object_name for file in files]
# 	try:
# 		assert f'{row.id}/catalog.v1.parquet' in names, f'{row.id} does not have a catalog.v1.parquet file'
# 	except:
# 		print(row.id, row.name)
		

In [6]:
from shapely.geometry import Polygon
import geopandas as gpd
import random
import rasterio as rio

def _as_parsed_json(value):
	"""Return `value` decoded from JSON if it is still raw text, otherwise return it unchanged."""
	return json.loads(value) if isinstance(value, (str, bytes, bytearray)) else value


# For every model: copy each file of each version from the old bucket to the new
# bucket under its new key, and upload one GeoParquet catalog per version.
# `files_map` records (old object key, new object key) for every copied file.
files_map = []
for _, row in df.iterrows():
	dataset_id = row['id']
	# The loading cell already applies json.loads to these columns, and json.loads
	# raises TypeError when handed a list; decode only if the value is still text.
	versions = _as_parsed_json(row['versions'])
	files = _as_parsed_json(row['files'])
	for version in versions:
		print(version)
		data = []
		for file in files:
			# skip files that do not belong to this version
			if version['version_id'] not in file['versions']:
				continue
			print(file)
			item_id = file['name']
			if file['version'] > 1:
				# Later revisions of a file get a random numeric suffix, presumably so
				# their ids do not clash with the first revision's id.
				# NOTE(review): `random` is not seeded here, so ids differ between runs.
				item_id = f'{file["name"]}-{random.randint(1, 1000000)}'
				print(file['name'], '->', item_id)
			# Object keys: the old layout appends the file version, the new one uses the item id.
			# Double quotes inside the f-string keep this valid on Python versions before 3.12.
			old_key = f'{dataset_id}/{file["name"]}_{file["version"]}'
			new_key = f'{dataset_id}/{item_id}'
			stac_item = {
				'type': 'Feature',
				'stac_version': '1.0.0',
				'stac_extensions': [],
				'datetime': datetime.now(),  # must be native timestamp (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp)
				'id': item_id,
				'bbox': {
					'xmin': 0.0,
					'ymin': 0.0,
					'xmax': 0.0,
					'ymax': 0.0
				}, # infer from file or from list of geometries
				'geometry': Polygon(), # empty polygon
				'assets': { 'asset': { # STAC needs this to be a Dict[str, Asset], not list !!! use same key or parquet breaks !!!
					'href': f'https://dev.api.eotdl.com/models/{dataset_id}/stage/{item_id}', # TODO: change to prod
					'checksum': file['checksum'],
					'timestamp': file['createdAt'],
					'size': file['size'],
				}},
				"links": [],
				# 'collection': 'source',
				# anything below are properties (need at least one!)
				'repository': 'eotdl',
			}
			data.append(stac_item)
			# copy file from old bucket to new bucket: download to a local path that
			# mirrors the new key, then upload that local file
			minio_client.fget_object(old_bucket, old_key, new_key)
			minio_client.fput_object(new_bucket, new_key, new_key)
			files_map.append((old_key, new_key))
		if data:
			gdf = gpd.GeoDataFrame(data, geometry='geometry')
			catalog_name = f'catalog.v{version["version_id"]}.parquet'
			gdf.to_parquet(catalog_name)
			# copy parquet to bucket
			minio_client.fput_object(
				new_bucket,
				f'{dataset_id}/{catalog_name}',
				catalog_name
			)

_df = pd.DataFrame(files_map, columns=['old_path', 'new_path'])
# _df.to_csv('files_map.csv', index=False)

{'version_id': 1, 'createdAt': '2023-11-03T16:12:06.732000', 'size': 44718151}
{'name': 'model.onnx', 'size': 44717985, 'checksum': '3eec90939be13a739ce2d70424391a5956c54e95', 'version': 1, 'versions': [1], 'createdAt': '2023-11-03T16:12:06.738000'}
{'version_id': 1, 'createdAt': '2023-11-04T15:10:51.027000', 'size': 331435910}
{'name': 'V7-e6e/yolov7-M1-planeOnly-960--e6e-bs20-150it/yolov7-M1-planeOnly-960--e6e-bs20-150it.pt', 'size': 330804467, 'checksum': '8397db55586fb178599b0b51ac21b9fbca759482', 'version': 1, 'versions': [1], 'createdAt': '2023-11-04T15:10:51.034000'}
{'name': 'Readme.md', 'size': 9229, 'checksum': 'fe102268967b6c5bf81017528a3c1c80566c5208', 'version': 1, 'versions': [1], 'createdAt': '2023-11-04T15:10:51.034000'}
{'name': 'V7-e6e/yolov7-M1-planeOnly-960--e6e-bs20-150it/confusion_matrix.png', 'size': 83333, 'checksum': 'f43c6d69b87f4160c5a28018fe7d2457368ec76d', 'version': 1, 'versions': [1], 'createdAt': '2023-11-04T15:10:51.034000'}
{'name': 'V7-e6e/yolov7-M1-p

In [146]:
# Load the path mapping from disk, replacing the `_df` built in memory by the migration cell.
# NOTE(review): presumably 'files_map.csv' was written by the `_df.to_csv('files_map.csv', index=False)`
# line that is currently commented out — confirm the file exists before running this cell.
_df = pd.read_csv('files_map.csv')
# _df


In [141]:
# data = gpd.read_parquet('catalog.v3.parquet')

# data

In [147]:
# files = minio_client.list_objects(new_bucket)
# for file in files:
# 	print(file.object_name)

In [8]:
# Print the name of every object stored under one model id in the new bucket
# (recursive listing).
files = minio_client.list_objects(
	new_bucket,
	prefix='65f407248e68180349152973',
	recursive=True,
)
for listed_object in files:
	print(listed_object.object_name)


65f407248e68180349152973/
65f407248e68180349152973/catalog.v1.parquet
65f407248e68180349152973/catalog.v2.parquet
65f407248e68180349152973/model.onnx
65f407248e68180349152973/unet-resnet50.onnx
