# THREDDS Catalog to STAC Items
## Investigating data ingestion pipeline

In order to expose TDS in STAC API, STAC items needs to be generated with appropriate metadata.
Next step in the pipeline would be to keep the STAC API index DB up to data with the static items generated in this notebook.

In [23]:
# Variables

# Since os.getcwd() doesn't works as expected in ipynb, we need to manually define the catalog save path, which is absolute
CATALOG_SAVE_PATH = "TO_DEFINE" 

In [24]:
from siphon.catalog import TDSCatalog
import json

def parse_datasets(catalog):
    """
    Collect all available datasets.
    """
    datasets = []

    for dataset_name, dataset_obj in catalog.datasets.items():
        http_url = dataset_obj.access_urls.get("httpserver", "")
        odap_url = dataset_obj.access_urls.get("opendap", "")
        ncml_url = dataset_obj.access_urls.get("ncml", "")
        uddc_url = dataset_obj.access_urls.get("uddc", "")
        iso_url = dataset_obj.access_urls.get("iso", "")
        wcs_url = dataset_obj.access_urls.get("wcs", "")
        wms_url = dataset_obj.access_urls.get("wms", "")
        
        datasets.append({
            "dataset_name" : dataset_name, 
            "http_url" : http_url,
            "odap_url" : odap_url,
            "ncml_url" : ncml_url,
            "uddc_url" : uddc_url,
            "iso_url" : iso_url,
            "wcs_url" : wcs_url,
            "wms_url" : wms_url
        })

    for catalog_name, catalog_obj in catalog.catalog_refs.items():
        d = parse_datasets(catalog_obj.follow())
        datasets.extend(d)

    return datasets

    
def crawl_tds():
    """
    Crawl TDS.
    """
    
    top_cat = TDSCatalog("https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/cccs_portal/indices/Final/BCCAQv2/tx_mean/catalog.xml")
#     tds_ds = parse_datasets(top_cat)
    tds_ds = [{"dataset_name": "BCCAQv2+ANUSPLIN300_ensemble-percentiles_historical+allrcps_1950-2100_tx_mean_YS.nc", "http_url": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/cccs_portal/indices/Final/BCCAQv2/tx_mean/allrcps_ensemble_stats/YS/BCCAQv2+ANUSPLIN300_ensemble-percentiles_historical+allrcps_1950-2100_tx_mean_YS.nc", "odap_url": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/cccs_portal/indices/Final/BCCAQv2/tx_mean/allrcps_ensemble_stats/YS/BCCAQv2+ANUSPLIN300_ensemble-percentiles_historical+allrcps_1950-2100_tx_mean_YS.nc", "ncml_url": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncml/birdhouse/cccs_portal/indices/Final/BCCAQv2/tx_mean/allrcps_ensemble_stats/YS/BCCAQv2+ANUSPLIN300_ensemble-percentiles_historical+allrcps_1950-2100_tx_mean_YS.nc", "uddc_url": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/birdhouse/cccs_portal/indices/Final/BCCAQv2/tx_mean/allrcps_ensemble_stats/YS/BCCAQv2+ANUSPLIN300_ensemble-percentiles_historical+allrcps_1950-2100_tx_mean_YS.nc", "iso_url": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/iso/birdhouse/cccs_portal/indices/Final/BCCAQv2/tx_mean/allrcps_ensemble_stats/YS/BCCAQv2+ANUSPLIN300_ensemble-percentiles_historical+allrcps_1950-2100_tx_mean_YS.nc", "wcs_url": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/cccs_portal/indices/Final/BCCAQv2/tx_mean/allrcps_ensemble_stats/YS/BCCAQv2+ANUSPLIN300_ensemble-percentiles_historical+allrcps_1950-2100_tx_mean_YS.nc", "wms_url": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/cccs_portal/indices/Final/BCCAQv2/tx_mean/allrcps_ensemble_stats/YS/BCCAQv2+ANUSPLIN300_ensemble-percentiles_historical+allrcps_1950-2100_tx_mean_YS.nc"}]
    
    # cache crawl result data (give option to use it or not)
    
    
    for i, item in enumerate(tds_ds):
        # add metadata attributes to crawl result elements
        item = add_tds_ds_metadata(item)
        
        # STACItemFactory call
        stac_item = get_stac_item(item)
        
        # write STAC item json to file
        write_item(stac_item)
    
    print("finished creating all STAC items")
    
#     json_dump = json.dumps(tds_ds)
    
#     print(json_dump)
    
    
def add_tds_ds_metadata(ds):
    """
    Add extra metadata to item.
    """
    # replace with regexes
    extra_meta = {
        "model" : "BCCAQv2+ANUSPLIN300",
        "experiment" : "ensemble-percentiles",
        "frequency" : "YS",
        "modeling_realm" : "historical+allrcps",
        "mip_table" : "",
        "ensemble_member" : "",
        "version_number" : "",
        "variable_name" : "tx_mean",
        "temporal_subset" : "1950-2100"
    }
    
    return dict(ds, **extra_meta)


def get_stac_item(item):
    """
    
    """
    return item


def write_item(item):
    """
    
    """
    pass
    

crawl_tds()


finished creating all STAC items


In [None]:
# create STAC collection
import pystac
from datetime import datetime
import json
import os

# items
collection_item = pystac.Item(id='local-image-col-1',
                               geometry={},
                               bbox={},
                               datetime=datetime.utcnow(),
                               properties={},
                               stac_extensions=[pystac.Extensions.EO])

collection_item.common_metadata.gsd = 0.3
collection_item.common_metadata.platform = 'Maxar'
collection_item.common_metadata.instruments = ['WorldView3']

# asset = pystac.Asset(href=img_path, 
#                       media_type=pystac.MediaType.GEOTIFF)
# collection_item.add_asset('image', asset)

collection_item2 = pystac.Item(id='local-image-col-2',
                               geometry={},
                               bbox={},
                               datetime=datetime.utcnow(),
                               properties={},
                               stac_extensions=[pystac.Extensions.EO])

collection_item2.common_metadata.gsd = 0.3
collection_item2.common_metadata.platform = 'Maxar'
collection_item2.common_metadata.instruments = ['WorldView3']

# asset2 = pystac.Asset(href=img_path,
#                      media_type=pystac.MediaType.GEOTIFF)
# collection_item2.add_asset('image', asset2)


# extents
sp_extent = pystac.SpatialExtent([None,None,None,None])
capture_date = datetime.strptime('2015-10-22', '%Y-%m-%d')
tmp_extent = pystac.TemporalExtent([(capture_date, None)])
extent = pystac.Extent(sp_extent, tmp_extent)


# collection
catalog = pystac.Catalog(id='bccaqv2', description='BCCAQv2 STAC')
collection = pystac.Collection(id='tx-mean',
                               description='tx mean',
                               extent=extent,
                               license='CC-BY-SA-4.0')

collection.add_items([collection_item, collection_item2])

catalog.clear_items()
catalog.clear_children()
catalog.add_child(collection)

# catalog.describe()

# normalize and save
print("save path : " + CATALOG_SAVE_PATH)
catalog.normalize_hrefs(CATALOG_SAVE_PATH)
# print(catalog.get_self_href())
# print(collection_item2.get_self_href())
catalog.save(catalog_type=pystac.CatalogType.SELF_CONTAINED)

# print(json.dumps(catalog.to_dict(), indent=4))

# label_item = catalog.get_child('tx-mean').get_item('local-image-col-1')
# label_item.to_dict()

In [27]:
with open(catalog.get_self_href()) as f:
    print(f.read())

{
    "id": "bccaqv2",
    "stac_version": "1.0.0-beta.2",
    "description": "BCCAQv2 STAC",
    "links": [
        {
            "rel": "root",
            "href": "./catalog.json",
            "type": "application/json"
        },
        {
            "rel": "child",
            "href": "./tx-mean/collection.json",
            "type": "application/json"
        }
    ]
}


In [28]:
with open(collection.get_self_href()) as f:
    print(f.read())

{
    "id": "tx-mean",
    "stac_version": "1.0.0-beta.2",
    "description": "tx mean",
    "links": [
        {
            "rel": "root",
            "href": "../catalog.json",
            "type": "application/json"
        },
        {
            "rel": "item",
            "href": "./local-image-col-1/local-image-col-1.json",
            "type": "application/json"
        },
        {
            "rel": "item",
            "href": "./local-image-col-2/local-image-col-2.json",
            "type": "application/json"
        },
        {
            "rel": "parent",
            "href": "../catalog.json",
            "type": "application/json"
        }
    ],
    "extent": {
        "spatial": {
            "bbox": [
                [
                    null,
                    null,
                    null,
                    null
                ]
            ]
        },
        "temporal": {
            "interval": [
                [
                    "2015-10-22T00:00:00Z

In [29]:
with open(collection_item.get_self_href()) as f:
    print(f.read())

{
    "type": "Feature",
    "stac_version": "1.0.0-beta.2",
    "id": "local-image-col-1",
    "properties": {
        "gsd": 0.3,
        "platform": "Maxar",
        "instruments": [
            "WorldView3"
        ],
        "datetime": "2021-01-15T16:47:29.020127Z"
    },
    "geometry": {},
    "links": [
        {
            "rel": "root",
            "href": "../collection.json",
            "type": "application/json"
        },
        {
            "rel": "collection",
            "href": "../collection.json",
            "type": "application/json"
        },
        {
            "rel": "parent",
            "href": "../collection.json",
            "type": "application/json"
        }
    ],
    "assets": {},
    "bbox": {},
    "stac_extensions": [
        "eo"
    ],
    "collection": "tx-mean"
}
