##### Source 
https://registry.opendata.aws/umbra-open-data/

##### Description
Umbra Spotlight collects, including GEC, SICD, SIDD, and CPHD data products and their metadata.

In [37]:
import json
import os
import re

import pandas as pd
import s3fs

In [9]:
# Root of the public Umbra open-data bucket. The value ends with "/", so
# sub-paths can be appended without adding another separator.
bucket_name = "s3://umbra-open-data-catalog/"

In [13]:
# list data from Umbra S3 bucket
def listFilesInBucket(bucket):
    """Return the entries that ``ls`` reports for ``bucket``.

    Args:
        bucket: Bucket URI or path handed to ``S3FileSystem.ls``.

    Returns:
        A new list holding the entries returned by ``ls``.
    """
    # anon=True: open the filesystem without credentials.
    anonymous_fs = s3fs.S3FileSystem(anon=True)
    return list(anonymous_fs.ls(bucket))

In [14]:
# Print the entries listed for the bucket root.
print(listFilesInBucket(bucket_name))

['umbra-open-data-catalog/', 'umbra-open-data-catalog/error.html', 'umbra-open-data-catalog/index.html', 'umbra-open-data-catalog/list.js', 'umbra-open-data-catalog/sar-data', 'umbra-open-data-catalog/stac', 'umbra-open-data-catalog/verified.txt']


In [24]:
# list sar_data
# Strip any trailing "/" from bucket_name before joining, otherwise the
# resulting path contains "//" and points at a different (empty) key prefix.
for entry in listFilesInBucket(f"{bucket_name.rstrip('/')}/sar-data/tasks/Kalgoorlie Super Pit, Australia/0015903b-453a-4d06-a91f-554408a5f4b7/2025-06-16-14-58-04_UMBRA-08"):
    print(entry)

umbra-open-data-catalog/sar-data/tasks/Kalgoorlie Super Pit, Australia/0015903b-453a-4d06-a91f-554408a5f4b7/2025-06-16-14-58-04_UMBRA-08/2025-06-16-14-58-04_UMBRA-08.stac.v2.json
umbra-open-data-catalog/sar-data/tasks/Kalgoorlie Super Pit, Australia/0015903b-453a-4d06-a91f-554408a5f4b7/2025-06-16-14-58-04_UMBRA-08/2025-06-16-14-58-04_UMBRA-08_GEC.tif
umbra-open-data-catalog/sar-data/tasks/Kalgoorlie Super Pit, Australia/0015903b-453a-4d06-a91f-554408a5f4b7/2025-06-16-14-58-04_UMBRA-08/2025-06-16-14-58-04_UMBRA-08_SICD.nitf
umbra-open-data-catalog/sar-data/tasks/Kalgoorlie Super Pit, Australia/0015903b-453a-4d06-a91f-554408a5f4b7/2025-06-16-14-58-04_UMBRA-08/2025-06-16-14-58-04_UMBRA-08_SIDD.nitf


In [28]:
my_path = 'sar-data/tasks/Kalgoorlie Super Pit, Australia/0015903b-453a-4d06-a91f-554408a5f4b7/2025-06-16-14-58-04_UMBRA-08'

def createFolder(path):
    """Create ``path``, including missing parents, under the working directory.

    An already existing directory is left untouched.

    Args:
        path: Directory path, appended to the current working directory.

    Raises:
        OSError: If the directory cannot be created, e.g. permission is
            denied or a regular file already occupies the location. These
            failures were previously swallowed silently.
    """
    # exist_ok=True tolerates only "directory already exists"; every other
    # failure surfaces instead of being hidden by a blanket except.
    os.makedirs(f"{os.getcwd()}/{path}", exist_ok=True)

In [29]:
# Create the local folder for the collect path stored in my_path.
createFolder(my_path)

In [31]:
# S3 prefix of the "Kalgoorlie Super Pit, Australia" task folder.
s3_prefix = "s3://umbra-open-data-catalog/sar-data/tasks/Kalgoorlie Super Pit, Australia/"
# Filesystem handle used by the cells below; anon=True opens it without credentials.
fs = s3fs.S3FileSystem(anon=True)

In [32]:
# get stac keys
def getSARStacKeys():
    """Return every ``*.stac.v2.json`` key found below ``s3_prefix``.

    Reads the module-level ``fs`` filesystem and ``s3_prefix`` string.

    Returns:
        A list with the paths matched by the recursive glob.
    """
    return list(fs.glob(f"{s3_prefix}**/*.stac.v2.json"))


stac_keys = getSARStacKeys()

In [35]:
# Inspect the first STAC key that was found.
stac_keys[0]

'umbra-open-data-catalog/sar-data/tasks/Kalgoorlie Super Pit, Australia/0015903b-453a-4d06-a91f-554408a5f4b7/2025-06-16-14-58-04_UMBRA-08/2025-06-16-14-58-04_UMBRA-08.stac.v2.json'

In [None]:
# extract timestamp and file path
# Collect timestamp embedded in a key, e.g. ".../2025-06-16-14-58-04_UMBRA-08/..."
_KEY_TIMESTAMP = re.compile(r"/(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})_")


def _asset_uri(asset, fallback):
    """Return a URI for a STAC asset entry, or for ``fallback`` if it is missing."""
    # STAC asset entries are objects that carry their location under "href";
    # a bare string is accepted as well.
    href = asset.get("href") if isinstance(asset, dict) else asset
    location = str(href) if href else fallback
    # Keep anything that already names a scheme (s3://, https://, ...).
    # NOTE(review): relative hrefs are not resolved against the item location.
    return location if "://" in location else f"s3://{location}"


def umbraSARToDataframe(keys, filesystem=None):
    """Build a time-sorted table of collects from STAC item keys.

    Args:
        keys: Iterable of paths to ``*.stac.v2.json`` items.
        filesystem: Object whose ``open(key, "rb")`` returns a readable
            binary file. Defaults to the module-level ``fs``.

    Returns:
        A DataFrame with the columns ``datetime`` (UTC), ``stac``, ``gec``,
        ``sicd`` and ``sidd``, sorted by ``datetime``. Items for which no
        timestamp can be determined are dropped. Empty input yields an
        empty frame with the same columns.
    """
    source = fs if filesystem is None else filesystem
    columns = ["datetime", "stac", "gec", "sicd", "sidd"]
    rows = []

    for key in keys:
        with source.open(key, "rb") as f:
            item = json.load(f)

        dt = (item.get("properties") or {}).get("datetime")
        if dt:
            ts = pd.to_datetime(dt, utc=True)
        else:
            # fallback: .../2025-06-16-14-58-04_UMBRA-08/...
            m = _KEY_TIMESTAMP.search(key)
            # utc=True keeps this tz-aware like the STAC branch, so the two
            # kinds of timestamps can be sorted together.
            ts = pd.to_datetime(m.group(1), format="%Y-%m-%d-%H-%M-%S", utc=True) if m else None

        assets = item.get("assets") or {}
        # Assets are sometimes not listed completely; missing ones are
        # rebuilt from the collect folder name next to the STAC item.
        base_dir = key.rpartition("/")[0]
        stem = f"{base_dir}/{base_dir.rpartition('/')[2]}"

        rows.append({
            "datetime": ts,
            "stac": f"s3://{key}",
            "gec": _asset_uri(assets.get("geotiff") or assets.get("GEC"), f"{stem}_GEC.tif"),
            "sicd": _asset_uri(assets.get("SICD"), f"{stem}_SICD.nitf"),
            "sidd": _asset_uri(assets.get("SIDD"), f"{stem}_SIDD.nitf"),
        })

    # Explicit columns keep dropna/sort_values valid when there are no rows.
    df = pd.DataFrame(rows, columns=columns)
    return df.dropna(subset=["datetime"]).sort_values("datetime").reset_index(drop=True)

    