In [23]:
from dotenv import load_dotenv
import os
from hera.workflows import models, CronWorkflow, script, Artifact, Parameter, DAG, Steps, Step, NoneArchiveStrategy, Workflow
from hera.shared import global_config

# Load variables from a local .env file into the process environment; a later
# cell reads "argo_token_prod" via os.getenv (presumably defined in this file).
# NOTE(review): absolute, user-specific path - this only works on the author's machine.
load_dotenv("/home/otto/s1_zarr/.env")

True

In [75]:
# Hera global defaults used when the workflow below is built and submitted.
global_config.host = "https://services.eodc.eu/workflows/"
global_config.namespace = "inca"
# os.getenv returns None when "argo_token_prod" is unset, so a missing variable
# is not reported here - verify the .env file was loaded before submitting.
global_config.token = os.getenv("argo_token_prod")
# NOTE(review): ":latest" is a moving tag; pin a version for reproducible runs.
global_config.image = "ghcr.io/oscipal/image_zarr:latest"

In [60]:
# Volume backed by the persistent volume claim "eodc-nfs-claim"; the script
# templates below mount it by its name "eodc-mount".
nfs_volume = [
    models.Volume(
        name="eodc-mount",
        persistent_volume_claim={"claimName": "eodc-nfs-claim"},
    )
]

# User and group id passed to the Workflow as its security context.
security_context = dict(runAsUser=59100, runAsGroup=59100)

In [67]:
@script(volume_mounts=[models.VolumeMount(name="eodc-mount", mount_path="/eodc")])
def extend_time_dimension(store_path: str = "/eodc/private/tempearth/s1sig0.zarr"):
    """Grow the time axis of the zarr store up to the current date.

    The number of time steps is the count of whole days between 2014-10-01
    and today. The "time" array and every non-coordinate array (resized along
    its second axis) are enlarged to that length, metadata is consolidated,
    and "time" is filled with 0 .. n_days - 1.

    Args:
        store_path: Path of the zarr store to resize.
    """
    # Imports live inside the function so they are part of the shipped script body.
    import datetime
    import numpy as np
    import zarr

    epoch = np.datetime64("2014-10-01")
    today = np.datetime64(datetime.datetime.now()).astype('datetime64[D]')
    n_days = int((today - epoch).astype(int))
    day_offsets = np.arange(0, n_days, 1)

    local_store = zarr.storage.LocalStore(store_path)
    root = zarr.group(store=local_store)

    coordinate_names = {"time", "x", "y", "relative_orbit_number"}
    variable_names = set(root.array_keys()) - coordinate_names

    root["time"].resize(n_days)
    for name in variable_names:
        # Data arrays are treated as 4-D; only the second axis changes.
        current = root[name].shape
        root[name].resize((current[0], n_days, current[2], current[3]))

    zarr.consolidate_metadata(local_store)

    # Re-open the store after consolidation before writing the time values.
    local_store = zarr.storage.LocalStore(store_path)
    root = zarr.group(store=local_store)
    root["time"][:] = day_offsets


In [None]:
@script(volume_mounts=[models.VolumeMount(name="eodc-mount", mount_path="/eodc")])
def write_data(tile: str, store_path: str = "/eodc/private/tempearth/s1sig0.zarr"):
    """Write Sentinel-1 sigma0 scenes of one Equi7 tile into the zarr store.

    Searches the EODC STAC API for SENTINEL1_SIG0_20M items of the tile
    EU020M_<tile>T3 inside a fixed date range, merges items acquired within
    100 seconds of each other, buckets the merged acquisitions by relative
    orbit and, per orbit, writes VV, VH, sensing_date and
    absolute_orbit_number into the matching (orbit, time, y, x) region.

    Args:
        tile: Equi7 tile name, e.g. "E048N015".
        store_path: Path of the target zarr store.

    Raises:
        IndexError: if an orbit number or the tile's first/last x or y
            coordinate is not found in the store's coordinate arrays.
    """
    print("test1")
    # Imports live inside the function so they are part of the shipped script body.
    import pystac_client as pc
    import xarray as xr
    import zarr
    import numpy as np
    import rioxarray
    import pandas as pd
    from datetime import datetime
    from collections import defaultdict
    import yaml

    def group_by_relative_orbit(items, key="sat:relative_orbit"):
        """Bucket acquisition groups by the relative orbit of their first item."""
        groups = defaultdict(list)
        for it in items:
            groups[it[0].properties[key]].append(it)
        return dict(groups)

    def read_and_pop_yaml(filename):
        """Remove and return the first entry of "ranges" in a YAML file (currently unused)."""
        with open(filename) as f:
            data = yaml.safe_load(f)
        popped = data["ranges"].pop(0)  # remove and get first range
        with open(filename, "w") as f:
            yaml.dump(data, f)
        return popped

    def get_idx(array1, array2):
        """Return the half-open index range in array1 spanned by array2's first and last value."""
        lower = np.where(array1 == array2[0])[0][0]
        upper = np.where(array1 == array2[-1])[0][0] + 1
        return lower, upper

    def item_time(item):
        """Timezone-naive one-element DatetimeIndex of an item's datetime."""
        return pd.to_datetime([item.properties["datetime"]]).tz_convert(None)

    def load_data(item, pols):
        """Read one polarisation (str) or several (iterable) of an item, size-1 dims squeezed."""
        if isinstance(pols, str):
            data = rioxarray.open_rasterio(item.assets[pols].href).compute().expand_dims(time=item_time(item)).rename(pols)
        else:
            data = xr.merge(
                [
                    rioxarray.open_rasterio(item.assets[pol].href).compute().expand_dims(time=item_time(item)).rename(pol)
                    for pol in pols
                ]
            )
        return data.squeeze()

    def get_datetime(item):
        return datetime.strptime(item.properties["datetime"], "%Y-%m-%dT%H:%M:%SZ")

    def group_dates(item_list):
        """Split a time-ordered item list into runs whose neighbours are at most 100 s apart."""
        grouped_items = [[]]
        for item in item_list:
            current = grouped_items[-1]
            if not current or get_datetime(item) - get_datetime(current[-1]) <= pd.Timedelta(seconds=100):
                current.append(item)
            else:
                grouped_items.append([item])
        return grouped_items

    def mosaic(items, pol):
        """Overlay the items of one polarisation; -9999 pixels are filled from later items."""
        data = None
        for item in items:
            ds = load_data(item, pol)
            data = ds if data is None else xr.where(data == -9999, ds, data, keep_attrs=True)
        return data

    def read_and_merge_items(items, pols):
        """Mosaic a group of items into one Dataset holding the requested polarisations."""
        if isinstance(pols, list):
            datasets = []
            for pol in pols:
                data = mosaic(items, pol)
                if "time" not in data.dims:
                    # load_data squeezed the size-1 time axis; restore it from the last item.
                    data = data.expand_dims(time=item_time(items[-1]))
                datasets.append(data)
            data = xr.merge(datasets)
        else:
            data = mosaic(items, pols).to_dataset(name=pols)

        return data.squeeze()

    print("start")
    pc_client = pc.Client.open("https://stac.eodc.eu/api/v1")
    time_range = "2025-01-01/2025-02-01"#read_and_pop_yaml("/eodc/private/tempearth/s1sig0_timesteps.yaml")
    search = pc_client.search(
        collections=["SENTINEL1_SIG0_20M"],
        datetime=time_range,
        query={"Equi7_TileID": {"eq": f"EU020M_{tile}T3"}})

    items_eodc = search.item_collection()
    print("test print")
    if items_eodc:

        # The item collection is reversed before grouping consecutive acquisitions.
        item_list = list(items_eodc)[::-1]
        grouped_items = group_dates(item_list)

        store = zarr.storage.LocalStore(store_path)
        group = zarr.group(store=store)
        x_extent = group["x"][:]
        y_extent = group["y"][:]
        rel_orbit_extent = group["relative_orbit_number"][:]

        sensing_origin = np.datetime64("2014-10-01T00:00:00")
        time_origin = np.datetime64("2014-10-01")

        range_start, range_end = time_range.split("/", 1)
        start = np.datetime64(range_start.strip(), "D")
        end = np.datetime64(range_end.strip(), "D")
        # Daily axis including BOTH endpoints; identical for every orbit.
        full_times = pd.date_range(start=start, end=end, freq='D')

        print("test print")
        grouped_orbits = group_by_relative_orbit(grouped_items)

        for orbit, items_orbits in grouped_orbits.items():
            print(f"{orbit} started")
            orbit_index = int(np.where(rel_orbit_extent == orbit)[0][0])
            datasets_orbits = []

            for items in items_orbits:
                ds = read_and_merge_items(items, ["VV", "VH"])

                # No extra "rel_orbit_number" dimension is added: the orbit is
                # addressed by orbit_index below, so values must stay (time, y, x).
                ds["sensing_date"] = (ds['time'].values.astype("datetime64[s]") - sensing_origin).astype("int64")
                ds["abs_orbit_number"] = ds.attrs["abs_orbit_number"]

                # Truncate the acquisition time to whole days.
                ds['time'] = ds['time'].astype('datetime64[D]')

                datasets_orbits.append(ds)

            combined_orbits = xr.concat(datasets_orbits, dim="time", combine_attrs="override")
            # Each orbit is written straight to the store; results are not
            # accumulated across orbits to keep memory bounded.
            result = combined_orbits.reindex(time=full_times, fill_value=-9999)

            n_time = result.sizes["time"]
            sensing_dates = result["sensing_date"].values.reshape(n_time, 1, 1)
            abs_orbit_numbers = result["abs_orbit_number"].values.reshape(n_time, 1, 1)

            # Shift coordinates by half a pixel so they match the store's x/y values.
            # NOTE(review): assumes 20 m pixels with centre-based raster coordinates - confirm.
            result["x"] = result.x-10
            result["y"] = result.y+10

            x_min, x_max = get_idx(x_extent, result["x"].values)
            y_min, y_max = get_idx(y_extent, result["y"].values)

            time_min = int((result.time.min().values.astype("datetime64[D]") - time_origin).astype("int64"))
            # +1: the slice end is exclusive while full_times includes the last day.
            time_max = int((result.time.max().values.astype("datetime64[D]") - time_origin).astype("int64")) + 1

            region = (orbit_index, slice(time_min, time_max), slice(y_min, y_max), slice(x_min, x_max))
            region_shape = (time_max - time_min, y_max - y_min, x_max - x_min)

            # transpose makes the axis order explicit and fails loudly on unexpected dims.
            group["VH"][region] = result["VH"].transpose("time", "y", "x").values
            group["VV"][region] = result["VV"].transpose("time", "y", "x").values

            # Per-time scalars are broadcast over the whole tile footprint.
            group["sensing_date"][region] = np.broadcast_to(sensing_dates, region_shape)
            group["absolute_orbit_number"][region] = np.broadcast_to(abs_orbit_numbers, region_shape)
            print(f"{orbit} done", time_min, time_max, y_min, y_max, x_min, x_max)
        print("success")

    else:
        print("no items in collection")

In [None]:
@script(volume_mounts=[models.VolumeMount(name="eodc-mount", mount_path="/eodc")])
def test(tile: str, store_path: str = "/eodc/private/tempearth/s1sig0.zarr"):
    """Scratch variant of the tile writer that ignores relative orbits.

    Searches the EODC STAC API for SENTINEL1_SIG0_20M items of the tile
    EU020M_<tile>T3 inside a fixed date range, merges items acquired within
    100 seconds of each other, concatenates all acquisitions along time and
    writes VV, VH, sensing_date and absolute_orbit_number into the store
    using three index axes (time, y, x).

    Args:
        tile: Equi7 tile name, e.g. "E048N015".
        store_path: Path of the target zarr store.

    Raises:
        IndexError: if the tile's first/last x or y coordinate is not found
            in the store's coordinate arrays.
    """
    print("start")
    # Imports live inside the function so they are part of the shipped script body.
    import pystac_client as pc
    import xarray as xr
    import zarr
    import numpy as np
    import rioxarray
    import pandas as pd
    from datetime import datetime

    def get_idx(array1, array2):
        """Return the half-open index range in array1 spanned by array2's first and last value."""
        lower = np.where(array1 == array2[0])[0][0]
        upper = np.where(array1 == array2[-1])[0][0] + 1
        return lower, upper

    def item_time(item):
        """Timezone-naive one-element DatetimeIndex of an item's datetime."""
        return pd.to_datetime([item.properties["datetime"]]).tz_convert(None)

    def load_data(item, pols):
        """Read one polarisation (str) or several (iterable) of an item, size-1 dims squeezed."""
        if isinstance(pols, str):
            data = rioxarray.open_rasterio(item.assets[pols].href).compute().expand_dims(time=item_time(item)).rename(pols)
        else:
            data = xr.merge(
                [
                    rioxarray.open_rasterio(item.assets[pol].href).compute().expand_dims(time=item_time(item)).rename(pol)
                    for pol in pols
                ]
            )
        return data.squeeze()

    def get_datetime(item):
        return datetime.strptime(item.properties["datetime"], "%Y-%m-%dT%H:%M:%SZ")

    def group_dates(item_list):
        """Split a time-ordered item list into runs whose neighbours are at most 100 s apart."""
        grouped_items = [[]]
        for item in item_list:
            current = grouped_items[-1]
            if not current or get_datetime(item) - get_datetime(current[-1]) <= pd.Timedelta(seconds=100):
                current.append(item)
            else:
                grouped_items.append([item])
        return grouped_items

    def mosaic(items, pol):
        """Overlay the items of one polarisation; -9999 pixels are filled from later items."""
        data = None
        for item in items:
            ds = load_data(item, pol)
            data = ds if data is None else xr.where(data == -9999, ds, data, keep_attrs=True)
        return data

    def read_and_merge_items(items, pols):
        """Mosaic a group of items into one Dataset holding the requested polarisations."""
        if isinstance(pols, list):
            datasets = []
            for pol in pols:
                data = mosaic(items, pol)
                if "time" not in data.dims:
                    # load_data squeezed the size-1 time axis; restore it from the last item.
                    data = data.expand_dims(time=item_time(items[-1]))
                datasets.append(data)
            data = xr.merge(datasets)
        else:
            data = mosaic(items, pols).to_dataset(name=pols)

        return data.squeeze()

    print("start")
    pc_client = pc.Client.open("https://stac.eodc.eu/api/v1")
    time_range = "2025-01-01/2025-02-01"#read_and_pop_yaml("/eodc/private/tempearth/s1sig0_timesteps.yaml")
    search = pc_client.search(
        collections=["SENTINEL1_SIG0_20M"],
        datetime=time_range,
        query={"Equi7_TileID": {"eq": f"EU020M_{tile}T3"}})

    items_eodc = search.item_collection()

    if items_eodc:

        # The item collection is reversed before grouping consecutive acquisitions.
        item_list = list(items_eodc)[::-1]
        grouped_items = group_dates(item_list)

        store = zarr.storage.LocalStore(store_path)
        group = zarr.group(store=store)
        x_extent = group["x"][:]
        y_extent = group["y"][:]
        rel_orbit_extent = group["relative_orbit_number"][:]

        datasets_orbits = []
        sensing_origin = np.datetime64("2014-10-01T00:00:00")

        start = np.datetime64(time_range.split("/", 1)[0].strip(), "D")
        end = np.datetime64(time_range.split("/", 1)[1].strip(), "D")

        for items in grouped_items:
            ds = read_and_merge_items(items, ["VV", "VH"])

            ds["sensing_date"] = (ds['time'].values.astype("datetime64[s]") - sensing_origin).astype("int64")
            ds["abs_orbit_number"] = ds.attrs["abs_orbit_number"]
            # Truncate the acquisition time to whole days.
            ds['time'] = ds['time'].astype('datetime64[D]')

            datasets_orbits.append(ds)

        combined_orbits = xr.concat(datasets_orbits, dim="time", combine_attrs="override")
        # Daily axis including both endpoints of the range.
        full_times = pd.date_range(start=start, end=end, freq='D')
        result = combined_orbits.reindex(time=full_times, fill_value=-9999)

        sensing_dates = result["sensing_date"].values.reshape(result.sizes["time"],1,1)
        abs_orbit_numbers = result["abs_orbit_number"].values.reshape(result.sizes["time"],1,1)

        # Shift coordinates by half a pixel so they match the store's x/y values.
        # NOTE(review): assumes 20 m pixels with centre-based raster coordinates - confirm.
        result["x"] = result.x-10
        result["y"] = result.y+10

        x_min, x_max = get_idx(x_extent, result["x"].values)
        y_min, y_max = get_idx(y_extent, result["y"].values)

        time_origin = np.datetime64("2014-10-01")
        time_min = (result.time.min().values.astype("datetime64[D]") - time_origin).astype("int64")
        # +1: the slice end is exclusive while full_times includes the last day.
        time_max = (result.time.max().values.astype("datetime64[D]") - time_origin).astype("int64")+1

        # NOTE(review): the store is indexed with three axes here, whereas
        # extend_time_dimension treats the data arrays as 4-D - confirm the
        # layout of the store this function is meant for.
        group["VH"][time_min:time_max, y_min:y_max, x_min:x_max] = result["VH"].values
        group["VV"][time_min:time_max, y_min:y_max, x_min:x_max] = result["VV"].values

        # Per-time scalars are broadcast over the whole tile footprint. The
        # former broadcast of the never-defined rel_orbit_numbers was removed.
        sensing_dates = np.broadcast_to(sensing_dates, (time_max-time_min, y_max-y_min, x_max-x_min))
        abs_orbit_numbers = np.broadcast_to(abs_orbit_numbers, (time_max-time_min, y_max-y_min, x_max-x_min))

        group["sensing_date"][time_min:time_max, y_min:y_max, x_min:x_max] = sensing_dates
        group["absolute_orbit_number"][time_min:time_max, y_min:y_max, x_min:x_max] = abs_orbit_numbers
        print(time_min, time_max, y_min, y_max, x_min, x_max)
        print("success")

    else:
        print("no items in collection")

In [73]:
# One write_data task per tile, chained so the tiles are processed strictly
# one after another.
with Workflow(
    generate_name="s1sig0-zarr-",
    entrypoint="workflow",
    volumes=nfs_volume,
    security_context=security_context,
) as w:
    with DAG(name="workflow"):
        # The extend_time_dimension step is currently disabled:
        # ext = extend_time_dimension()
        process1, process2, process3, process4, process5 = [
            write_data(name=tile_id, arguments={"tile": tile_id})
            for tile_id in ("E045N015", "E048N015", "E051N015", "E048N012", "E051N012")
        ]

        # When re-enabled, the chain starts with: ext >>
        process1 >> process2 >> process3 >> process4 >> process5

In [74]:
# Submit the workflow to the server configured in global_config; the returned
# Workflow object is displayed as the cell output.
w.create()

Workflow(api_version=None, kind=None, metadata=ObjectMeta(annotations=None, cluster_name=None, creation_timestamp=Time(__root__=datetime.datetime(2025, 8, 14, 9, 20, 41, tzinfo=datetime.timezone.utc)), deletion_grace_period_seconds=None, deletion_timestamp=None, finalizers=None, generate_name='s1sig0-zarr-', generation=1, labels={'workflows.argoproj.io/creator': 'system-serviceaccount-default-jenkins'}, managed_fields=[ManagedFieldsEntry(api_version='argoproj.io/v1alpha1', fields_type='FieldsV1', fields_v1=FieldsV1(), manager='argo', operation='Update', subresource=None, time=Time(__root__=datetime.datetime(2025, 8, 14, 9, 20, 41, tzinfo=datetime.timezone.utc)))], name='s1sig0-zarr-ndt24', namespace='spartacus', owner_references=None, resource_version='42965838', self_link=None, uid='e9e8c15e-cbcc-4b55-b341-b42e4db08d62'), spec=WorkflowSpec(active_deadline_seconds=None, affinity=None, archive_logs=None, arguments=Arguments(artifacts=None, parameters=None), artifact_gc=None, artifact_re