# NCML metadata conversion to STAC Item using CWL Application Package annotation

This notebook should be compiled into a standalone *CWL* definition using the following command:

```shell
jupyter-repo2cwl "https://github.com/crim-ca/ncml2stac" -o /tmp
```
(replace the Git repository URL with the local path if the repository is cloned locally)

It can then be deployed in *Weaver* using the CLI:

```shell
weaver deploy -u http://example.com/weaver -i ncml2stac --cwl /tmp/notebooks_ncml2stac.cwl
```

## Define the CWL Inputs for this Notebook

A real notebook does not even need to import `ipython2cwl` types!
It is sufficient to use only the string typing annotation to avoid import errors when this dependency is not installed!

In [85]:
# NOTE:
#  If using an indented code block here (e.g.: 'if TYPE_CHECKING:'),
#  it is important that it contains other things than 'ipython2cwl' imports.
#  When ported into the generated python script, imports from 'ipython2cwl' are removed,
#  which can cause syntax/indent errors.
try:
    # to make optional inputs, define types like so: 'Optional[CWL<type>Input]'
    from typing import Any, MutableSequence, MutableMapping, Optional, TypeAlias, Union
    from ipython2cwl.iotypes import CWLFilePathInput, CWLFilePathOutput

    # JSON-like mapping type; it is only defined when both imports above succeeded,
    # which is why it is referenced through quoted annotations (e.g.: "JsonLike")
    JsonLike: TypeAlias = MutableMapping[
        str,
        Optional[Union[str, float, int, bool, MutableSequence[Any], MutableMapping[str, Any]]]
    ]
except ImportError:
    pass  # ignore explicit typing definitions if modules were not installed (CWL conversion still works)

# NOTE: important part for CWL conversion is to apply the typing definition
# NOTE: application-specific detail: supports NCML URL directly or through a THREDDS catalog URL
# The annotation is a quoted string so that this assignment works even if the import above failed.
# The adjacent string literals below are concatenated into a single URL.
input_ncml: "CWLFilePathInput" = (
    "https://pavics.ouranos.ca/twitcher/ows/proxy/"
    "thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc"
    "?catalog=https%3A%2F%2Fpavics.ouranos.ca%2Ftwitcher%2Fows%2Fproxy%2F"
    "thredds%2Fcatalog%2Fbirdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fcatalog.html"
    "&dataset=birdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fsic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc"
)

## Define the Core functionality of this Notebook

This code assumes that the following reference and all its dependencies are installed:

[https://github.com/crim-ca/stac-populator/tree/weaver-repo2cwl-ncml2stac](
https://github.com/crim-ca/stac-populator/tree/weaver-repo2cwl-ncml2stac
)

Run `make install` to have them automatically installed.

In [86]:
# Start from a clean state: remove any previous copy of the archive, make sure the
# parent directory exists, then clone the archive again.
# NOTE(review): presumably required by the CMIP6 vocabulary handling of 'STACpopulator' - confirm.
!rm -fr ~/.esdoc/pyessv-archive
!mkdir -p ~/.esdoc/
!git clone https://github.com/ES-DOC/pyessv-archive ~/.esdoc/pyessv-archive

Cloning into '/home/francis/.esdoc/pyessv-archive'...
remote: Enumerating objects: 63068, done.[K
remote: Counting objects: 100% (1557/1557), done.[K
remote: Compressing objects: 100% (476/476), done.[K
remote: Total 63068 (delta 1258), reused 1327 (delta 1070), pack-reused 61511[K
Receiving objects: 100% (63068/63068), 6.06 MiB | 5.05 MiB/s, done.
Resolving deltas: 100% (60270/60270), done.

Local identity for pyessv-archive set to "Francis Charette Migneault <francis.charette.migneault@gmail.com>"


In [87]:
import hashlib
import json
import tempfile
from datetime import datetime, date
from enum import Enum

import numpy as np
import pystac
import requests
import xncml
from pydantic.networks import Url

from STACpopulator.extensions import cmip6
from STACpopulator.stac_utils import CFJsonItem, DatacubeExt

In [88]:
# Retrieve the file contents.
# The input is either a local file (absolute path or 'file:///' URI) or a remote URL to download.
if input_ncml.startswith("file:///"):
    # 'open' does not understand URI schemes: strip 'file://' to keep only the absolute path
    # (percent-encoded characters in the URI are not decoded here)
    input_ncml = input_ncml[len("file://"):]
elif not input_ncml.startswith("/"):
    resp = requests.get(input_ncml, headers={"Accept": "text/xml, application/xml"}, timeout=5)
    # Both conditions must hold for the response to be usable: HTTP success AND XML contents.
    # The parentheses are required: 'not a == b and c' is parsed as '(not a == b) and c'.
    if not (resp.status_code == 200 and resp.text.startswith("<?xml")):
        raise ValueError(
            f"Could not retrieve NCML XML file contents from [{input_ncml}]. "
            f"Error: [{resp.status_code}]: {resp.text!s}"
        )
    # 'delete=False' because the file is opened again by name after this block
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as tmp_file:
        tmp_file.write(resp.text)
        input_ncml = tmp_file.name

# for debugging purposes, display the contents:
with open(input_ncml, mode="r", encoding="utf-8") as input_ncml_file:
    print(input_ncml_file.read())

<?xml version="1.0" encoding="UTF-8"?>
<ncml:netcdf xmlns:ncml="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2" location="https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc">
  <ncml:attribute name="CCCma_model_hash" value="fc4bb7db954c862d023b546e19aec6c588bc0552" />
  <ncml:attribute name="CCCma_parent_runid" value="p2-his13" />
  <ncml:attribute name="CCCma_pycmor_hash" value="26c970628162d607fffd14254956ebc6dd3b6f49" />
  <ncml:attribute name="CCCma_runid" value="p2-s4513" />
  <ncml:attribute name="Conventions" value="CF-1.7 CMIP-6.2" />
  <ncml:attribute name="YMDH_branch_time_in_child" value="2015:01:01:00" />
  <ncml:attribute name="YMDH_branch_time_in_parent" value="2015:01:01:00" />
  <ncml:attribute name="activity_id" value="ScenarioMIP" />
  <ncml:attribute name="branch_method" value="Spin-up documentation" />
  <ncml:attribute name="branch_time_in_child" type="double" value="602

In [89]:
# FIXME: duplicate code
# this is defined in:
# https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L102-L116
# but we cannot import it since outside of installed 'STACpopulator' module
def make_cmip6_item_id(_attrs: "JsonLike") -> str:
    """
    Build the identifier of a CMIP6 data item.

    The values of a fixed, ordered list of attributes are joined with underscores,
    and the MD5 hex digest of the UTF-8 encoded result is returned.

    :param _attrs: mapping that must provide a string value for each attribute listed below.
    :returns: hexadecimal MD5 digest (32 characters).
    :raises KeyError: if one of the attributes is missing from the mapping.
    """
    # the order matters: it determines the string that gets hashed
    facet_names = (
        "activity_id",
        "institution_id",
        "source_id",
        "experiment_id",
        "variant_label",
        "table_id",
        "variable_id",
        "grid_label",
    )
    facet_values = []
    for facet in facet_names:
        facet_values.append(_attrs[facet])
    joined = "_".join(facet_values)
    digest = hashlib.md5(joined.encode("utf-8"))
    return digest.hexdigest()


# FIXME: temporary patch of URL/Media-Type
# https://github.com/crim-ca/stac-populator/pull/23#discussion_r1341819744
class CFJsonItemNetCDF(CFJsonItem):
    """
    Variant of ``CFJsonItem`` that links the item to its NetCDF source file.

    NOTE(review): presumably overrides the ``item_link`` of the parent class - confirm.
    """

    def item_link(self) -> pystac.Link:
        """
        Create the ``source`` link pointing at the NetCDF URL of the dataset.

        The link title is ``<service>:<dataset id>``, where the service is taken from
        the URL path that precedes the dataset ID.
        """
        netcdf_url = self.attrs["@location"]  # NetCDF URL
        dataset_id = self.attrs["groups"]["THREDDSMetadata"]["attributes"]["id"]
        # keep only what comes before the first occurrence of the dataset ID
        prefix = netcdf_url.split(dataset_id, 1)[0]
        # drop the empty parts (e.g.: the one produced by a trailing slash)
        segments = [segment for segment in prefix.rsplit("/", 3) if segment]
        service = segments[-2]  # always 1 path part for the service
        return pystac.Link(
            rel="source",
            target=netcdf_url,
            media_type="application/x-netcdf",
            title=f"{service}:{dataset_id}",
        )


# FIXME: partial duplicate code
# https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L138-L165
# should be combined into a single callable function that doesn't depend on the rest of the THREDDS crawling iterator
# load the NCML file and convert its metadata into a dictionary
ds = xncml.Dataset(input_ncml)
attrs = ds.to_cf_dict()

# FIXME: AttributeError
# 'access_urls' is read defensively with 'getattr' and only forwarded when present and non-empty
nc_services = getattr(ds, "access_urls", None)
if nc_services:
    attrs["access_urls"] = nc_services

# the item ID is computed from 'attrs["attributes"]' and stored back into the metadata
stac_item_id = make_cmip6_item_id(attrs["attributes"])
attrs["id"] = stac_item_id
stac_item = CFJsonItemNetCDF(stac_item_id, attrs, cmip6.Properties)
# NOTE(review): presumably applies the datacube extension to 'stac_item' - confirm.
# Being the last expression of the cell, its repr is what the notebook displays.
DatacubeExt(stac_item)

<STACpopulator.stac_utils.DatacubeExt at 0x7fd0dafff9a0>

## Display the result for validation

In [90]:
# dictionary form of the STAC Item, reused for both the display below and the output file
stac_item_data = stac_item.item.to_dict()

def json_encode(obj: "Any") -> "Union[JsonLike, MutableSequence[Any], str, float, int, bool]":
    """
    Fallback encoder for values that :mod:`json` cannot serialize natively.

    Intended to be passed as the ``default`` argument of :func:`json.dump` or :func:`json.dumps`.

    The annotations are quoted on purpose: the ``typing`` names are imported in a guarded
    ``try`` block and may be undefined, which must not break the function definition.

    :param obj: value that the JSON encoder could not serialize.
    :returns:
        - NumPy arrays and scalars: their native Python equivalent (``tolist``).
        - ``Url`` and ``Enum`` instances: their ``str`` representation.
        - ``datetime`` and ``date`` instances: their ISO-8601 string.
    :raises TypeError: if the type of the value is not handled.
    """
    if isinstance(obj, (np.ndarray, np.number)):
        return obj.tolist()
    if isinstance(obj, (Url, Enum)):
        return str(obj)
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")

# 'default=json_encode' converts the values that 'json' cannot serialize natively
stac_item_json = json.dumps(stac_item_data, default=json_encode, indent=2)
print(stac_item_json)

{
  "type": "Feature",
  "stac_version": "1.0.0",
  "id": "36c83a8bb9d382ff2ffed7b9ba422cd3",
  "properties": {
    "start_datetime": "2019-12-06T12:00:00Z",
    "end_datetime": "2020-11-04T12:00:00Z",
    "datetime": null,
    "Conventions": "CF-1.7 CMIP-6.2",
    "activity_id": "ScenarioMIP",
    "creation_date": "2019-09-25T23:01:33Z",
    "data_specs_version": "01.00.30",
    "experiment": "update of RCP4.5 based on SSP2",
    "experiment_id": "ssp245",
    "frequency": "mon",
    "further_info_url": "https://furtherinfo.es-doc.org/CMIP6.CCCma.CanESM5.ssp245.none.r13i1p2f1",
    "grid_label": "gn",
    "institution": "Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada",
    "institution_id": "CCCma",
    "nominal_resolution": "100 km",
    "realm": [
      "seaIce"
    ],
    "source": "CanESM5 (2019): \naerosol: interactive\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude/lat

## Define the CWL Outputs for this Notebook

In [91]:
# NOTE:
#   It is important to define the expected file name with an explicit string as done below.
#   This is to generate the corresponding glob pattern that will collect the output from the CWL execution.
output: "CWLFilePathOutput" = "ncml2stac.json"
# write the STAC Item as compact JSON (no indent), using the same fallback encoder as for the display
with open(output, mode="w", encoding="utf-8") as out_file:
    json.dump(stac_item_data, out_file, default=json_encode)
