In [1]:
from dotenv import load_dotenv
import os
from hera.workflows import models, CronWorkflow, script, Artifact, Parameter, DAG, Steps, Step, NoneArchiveStrategy, Workflow, Task, WorkflowStatus
from hera.shared import global_config

load_dotenv("/home/otto/s1_zarr/.env")

True

In [2]:
global_config.host = "https://services.eodc.eu/workflows/"
global_config.namespace = "s1sig0"
global_config.token = os.getenv("argo_token_prod")
global_config.image = "ghcr.io/oscipal/image_zarr:latest"

In [3]:
nfs_volume = [models.Volume(
    name="eodc-mount",
    persistent_volume_claim={"claimName": "eodc-nfs-claim"},
    )]

security_context = {"runAsUser": 75000,
                    "runAsGroup": 60028}

In [10]:
@script(volume_mounts=[models.VolumeMount(name="eodc-mount", mount_path="/eodc")])

def get_timerange():
    import yaml

    def read_yaml(filename):
        with open(filename) as f:
            data = yaml.safe_load(f)
        popped = data["ranges"].pop(0)
        with open(filename, "w") as f:
            yaml.dump(data, f)

        data["processing"].append(popped)
        with open(filename, "w") as f:
            yaml.dump(data, f)
        return popped

    time_range = read_yaml("/eodc/private/tempearth/s1sig0_timesteps.yaml")
    print(time_range)

In [5]:
@script(volume_mounts=[models.VolumeMount(name="eodc-mount", mount_path="/eodc")])

def end(time_range):
    import yaml
    def read_and_move_yaml(filename, target):
            with open(filename) as f:
                data = yaml.safe_load(f) or {}

            ranges = data.get("processing", [])
            if target not in ranges:
                raise ValueError(f"{target!r} not found in ranges")

            # remove the specific value
            ranges.remove(target)

            # append to done
            data.setdefault("done", []).append(target)

            # save once
            with open(filename, "w") as f:
                yaml.safe_dump(data, f, sort_keys=False)

            return target

    read_and_move_yaml("/eodc/private/tempearth/s1sig0_timesteps.yaml", time_range)

In [6]:
@script(volume_mounts=[models.VolumeMount(name="eodc-mount", mount_path="/eodc")])

def write_data(tile: str, time_range, store_path: str = "/eodc/products/eodc/sentinel1_sig0/s1sig0.zarr"):
    import pystac_client as pc
    import xarray as xr
    import zarr
    import numpy as np
    import rioxarray
    import pandas as pd
    from datetime import datetime
    from collections import defaultdict

    def group_by_relative_orbit(items, key="sat:relative_orbit"):
        groups = defaultdict(list)
        for it in items:
            groups[it[0].properties[key]].append(it)
        return dict(groups)

    def get_idx(array1, array2):
        min = np.where(array1==array2[0])[0][0]
        max = np.where(array1==array2[-1])[0][0]+1
        return min, max

    def load_data(item, pols):
        if type(pols)==str:
            data = rioxarray.open_rasterio(item.assets[pols].href).compute().expand_dims(time=pd.to_datetime([item.properties["datetime"]]).tz_convert(None)).rename(pols)
        else:
            data = []
            for pol in pols:
                data.append(rioxarray.open_rasterio(item.assets[pol].href).compute().expand_dims(time=pd.to_datetime([item.properties["datetime"]]).tz_convert(None)).rename(pol))
            
            data = xr.merge(data)
        return data.squeeze()

    def get_datetime(item):
        return datetime.strptime(item.properties["datetime"], "%Y-%m-%dT%H:%M:%SZ")

    def group_dates(item_list):
        grouped_items = [[]]
        i=0
        for item in item_list:
            
            if not grouped_items[i]:
                grouped_items[i].append(item)
            
            else: 
                if get_datetime(item) - get_datetime(grouped_items[i][-1]) <= pd.Timedelta(seconds=100):
                    grouped_items[i].append(item)

                else:
                    grouped_items.append([item])
                    i+=1
        return grouped_items

    def read_and_merge_items(items, pols):
        first = True
        if type(pols)==list:
            datasets = []
            for pol in pols:
                for item in items:
                    ds = load_data(item, pol)
                    
                    if first:
                        data = ds
                        first = False
                    
                    else:
                        data = xr.where(data==-9999, ds, data, keep_attrs=True)

                if "time" in data.dims:      
                    datasets.append(data)
                else:
                    datasets.append(data.expand_dims(time=pd.to_datetime([item.properties["datetime"]]).tz_convert(None)))

                first=True
            data = xr.merge(datasets)

        else:
            for item in items:
                ds = load_data(item, pols)
                
                if first:
                    data = ds
                    first = False
                
                else:
                    data = xr.where(data==-9999, ds, data, keep_attrs=True)

            data = data.to_dataset(name=pols)

        return data.squeeze()

    pc_client = pc.Client.open("https://stac.eodc.eu/api/v1")
    search = pc_client.search(
        collections=["SENTINEL1_SIG0_20M"],
        datetime=time_range,
        query={"Equi7_TileID": {"eq": f"EU020M_{tile}T3"}})

    items_eodc = search.item_collection()

    if items_eodc:

        item_list = list(items_eodc)[::-1]
        grouped_items = group_dates(item_list)

        store = zarr.storage.LocalStore(store_path)
        group = zarr.group(store=store)
        x_extent = group["x"][:]
        y_extent = group["y"][:]
        rel_orbit_extent = group["relative_orbit_number"][:]
    
        sensing_origin = np.datetime64("2014-10-01T00:00:00")

        start = np.datetime64(time_range.split("/", 1)[0].strip(), "D")
        end = np.datetime64(time_range.split("/", 1)[1].strip(), "D")
        
        grouped_orbits = group_by_relative_orbit(grouped_items)

        for orbit, items_orbits in grouped_orbits.items():
            print(f"{orbit} started")
            orbit_index = np.where(rel_orbit_extent==orbit)[0][0]
            datasets_orbits = []

            for items in items_orbits:
                try:
                    ds = read_and_merge_items(items, ["VV", "VH"])
                except KeyError:
                    try:
                        ds = read_and_merge_items(items, ["VV"])
                    except KeyError:
                        ds = read_and_merge_items(items, ["VH"])
                
                ds = ds.expand_dims({"rel_orbit_number": [ds.attrs["rel_orbit_number"]]})
                ds["sensing_date"] = (ds['time'].values.astype("datetime64[s]") - sensing_origin).astype("int64")
                ds["abs_orbit_number"] = ds.attrs["abs_orbit_number"]
                
                ds['time'] = ds['time'].astype('datetime64[D]')

                datasets_orbits.append(ds)
                ds = None

            combined_orbits = xr.concat(datasets_orbits, dim="time", combine_attrs="override")
            full_times = pd.date_range(start=start, end=end, freq='D')
            result = combined_orbits.reindex(time=full_times, fill_value=-9999)
            result = result.transpose("rel_orbit_number", "time", "y", "x")
            #datasets.append(result_orbits)

            

            # combined = xr.concat(datasets, dim="rel_orbit_number", combine_attrs="override").sortby("rel_orbit_number")
            # result = combined.reindex(rel_orbit_number=rel_orbit_extent, fill_value=-9999)

            sensing_dates = result["sensing_date"].values.reshape(1,result.sizes["time"],1,1)
            abs_orbit_numbers = result["abs_orbit_number"].values.reshape(1,result.sizes["time"],1,1)

            result["x"] = result.x-10
            result["y"] = result.y+10

            x_min, x_max = get_idx(x_extent, result["x"].values)
            y_min, y_max = get_idx(y_extent, result["y"].values)

            time_origin = np.datetime64("2014-10-01")
            time_min = (result.time.min().values.astype("datetime64[D]") - time_origin).astype("int64")
            time_max = (result.time.max().values.astype("datetime64[D]") - time_origin).astype("int64")+1

            group["VH"][orbit_index:orbit_index+1,time_min:time_max, y_min:y_max, x_min:x_max] = result["VH"].values
            group["VV"][orbit_index:orbit_index+1,time_min:time_max, y_min:y_max, x_min:x_max] = result["VV"].values

            sensing_dates = np.broadcast_to(sensing_dates, (1,time_max-time_min, y_max-y_min, x_max-x_min))
            abs_orbit_numbers = np.broadcast_to(abs_orbit_numbers, (1,time_max-time_min, y_max-y_min, x_max-x_min))
            #rel_orbit_numbers = np.broadcast_to(rel_orbit_numbers, (1,time_max-time_min, y_max-y_min, x_max-x_min))

            group["sensing_date"][orbit_index:orbit_index+1,time_min:time_max, y_min:y_max, x_min:x_max] = sensing_dates
            group["absolute_orbit_number"][orbit_index:orbit_index+1,time_min:time_max, y_min:y_max, x_min:x_max] = abs_orbit_numbers
            print(f"{orbit} done")
        print("success")

    else:
        print("no items in collection")

In [None]:
with CronWorkflow(
    generate_name="s1sig0-zarr-",
    schedule = "0 * * * *",
    volumes = nfs_volume,
    security_context=security_context,
    entrypoint="workflow"
) as w:
    with DAG(name="workflow"):

        tr = get_timerange()
        process1 = write_data(name="E045N015", arguments={"tile":"E045N015", "time_range":tr.result})
        process2 = write_data(name="E048N015", arguments={"tile":"E048N015", "time_range":tr.result})
        process3 = write_data(name="E051N015", arguments={"tile":"E051N015", "time_range":tr.result})
        process4 = write_data(name="E048N012", arguments={"tile":"E048N012", "time_range":tr.result})
        process5 = write_data(name="E051N012", arguments={"tile":"E051N012", "time_range":tr.result})
        ends = end(arguments={"time_range":tr.result})

        tr >> process1 >> process2 >> process3 >> process4 >> process5 >> ends

In [14]:
w.create()

CronWorkflow(api_version=None, kind=None, metadata=ObjectMeta(annotations=None, cluster_name=None, creation_timestamp=Time(__root__=datetime.datetime(2025, 8, 20, 8, 56, 11, tzinfo=datetime.timezone.utc)), deletion_grace_period_seconds=None, deletion_timestamp=None, finalizers=None, generate_name='s1sig0-zarr-', generation=1, labels={'workflows.argoproj.io/creator': 'system-serviceaccount-default-jenkins'}, managed_fields=[ManagedFieldsEntry(api_version='argoproj.io/v1alpha1', fields_type='FieldsV1', fields_v1=FieldsV1(), manager='argo', operation='Update', subresource=None, time=Time(__root__=datetime.datetime(2025, 8, 20, 8, 56, 11, tzinfo=datetime.timezone.utc)))], name='s1sig0-zarr-5qtgq', namespace='s1sig0', owner_references=None, resource_version='46487773', self_link=None, uid='ebe0fae2-0f16-495c-88ea-c387de9b6153'), spec=CronWorkflowSpec(concurrency_policy=None, failed_jobs_history_limit=None, schedule='0 * * * *', starting_deadline_seconds=None, successful_jobs_history_limit=N