In [None]:
#| default_exp metadata

# Metadata
> Various utilities to populate NetCDF global attributes as well as ISO 19115 metadata.

In [None]:
#| export
import pandas as pd
import fastcore.all as fc
from cftime import num2date
from pyzotero import zotero
import json

from marisco.utils import get_bbox, Callback, run_cbs
from marisco.configs import CONFIGS

In [None]:
#| export
class GlobAttrsFeeder:
    "Produce NetCDF global attributes as specified by the callbacks."
    def __init__(self, 
                 dfs:dict, # Dictionary of NetCDF group DataFrames
                 cbs:list=None, # Callbacks (a fresh empty list when omitted)
                 logs:list=None # List of preprocessing steps taken (a fresh empty list when omitted)
                 ): 
        # Default arguments are evaluated once at definition time: a `[]` default
        # would be shared by every instance created without that argument, so
        # anything appended through one feeder would leak into the others.
        cbs = [] if cbs is None else cbs
        logs = [] if logs is None else logs
        # Stores `dfs`, `cbs` and `logs` (with the normalised values above) on `self`.
        fc.store_attr()
        self.attrs = {}  # populated by the callbacks
        
    def callback(self):
        "Run the callbacks in `self.cbs`, passing this feeder to `run_cbs`."
        run_cbs(self.cbs, self)
        
    def __call__(self):
        "Run the callbacks and return the resulting attributes dictionary."
        self.callback()
        return self.attrs

In [None]:
#| export
class BboxCB(Callback):
    "Compute dataset geographical bounding box"
    def __call__(self, obj):
        # `pd.concat` on the `obj.dfs` dictionary stacks all group DataFrames.
        bbox = get_bbox(pd.concat(obj.dfs))
        # Shapely geometries report `bounds` as (minx, miny, maxx, maxy), i.e.
        # (lon_min, lat_min, lon_max, lat_max) when x is longitude and y is latitude.
        # NOTE(review): assumes `get_bbox` returns a shapely geometry built from
        # (lon, lat) points -- confirm against its definition.
        lon_min, lat_min, lon_max, lat_max = [str(bound) for bound in bbox.bounds]
        obj.attrs.update({
            'geospatial_lat_min': lat_min, 
            'geospatial_lat_max': lat_max,
            'geospatial_lon_min': lon_min,
            'geospatial_lon_max': lon_max,
            'geospatial_bounds': bbox.wkt})

In [None]:
#| export
class DepthRangeCB(Callback):
    "Compute depth values range"
    def __call__(self, obj):
        # Concatenate the groups once and reuse the column for both extrema.
        depth = pd.concat(obj.dfs).depth
        min_depth, max_depth = depth.min(), depth.max()
        # Depths are negated before being written (the shallowest depth becomes the
        # vertical max, the deepest the vertical min). Zero is special-cased on both
        # bounds so a float 0.0 is written as '0' rather than '-0.0'.
        obj.attrs.update({
            'geospatial_vertical_max': '0' if min_depth == 0 else str(-min_depth),
            'geospatial_vertical_min': '0' if max_depth == 0 else str(-max_depth)})

In [None]:
#| export
class TimeRangeCB(Callback):
    "Compute time values range"
    def __init__(self, cfg):
        # Keep the configuration on the instance as `self.cfg`.
        fc.store_attr()

    def __call__(self, obj):
        # 'time' column gathered from every group in `obj.dfs`.
        times = pd.concat(obj.dfs)['time']
        earliest, latest = times.min(), times.max()
        time_units = self.cfg['units']['time']
        # Decode both extremes with the configured units and serialise them
        # through `isoformat()`.
        obj.attrs.update({
            'time_coverage_start': num2date(earliest, units=time_units).isoformat(),
            'time_coverage_end': num2date(latest, units=time_units).isoformat()})

In [None]:
#| export
class ZoteroItem:
    "Fetch one item from a Zotero group library and expose selected metadata fields."
    def __init__(self, item_id, cfg):
        # `cfg` must provide the 'lib_id' and 'api_key' entries read by `getItem`.
        self.cfg = cfg
        self.item = self.getItem(item_id)
        
    def getItem(self, item_id):
        "Return item `item_id` from the group library identified by `cfg['lib_id']`."
        zot = zotero.Zotero(self.cfg['lib_id'], 'group', self.cfg['api_key'])
        return zot.item(item_id)
    
    def title(self):
        "Return the item's `title` field."
        return self.item['data']['title']
    
    def summary(self):
        "Return the item's `abstractNote` field."
        return self.item['data']['abstractNote']
    
    def creator_name(self):
        "Return the creators as a '; '-separated string of '<creatorType>: <name>' entries."
        return '; '.join(f'{c["creatorType"]}: {self._display_name(c)}'
                         for c in self.item['data']['creators'])

    @staticmethod
    def _display_name(creator):
        "Return a creator's name for both single-field and two-field creators."
        if 'name' in creator: return creator['name']
        # Two-field creators carry `firstName`/`lastName` instead of `name`;
        # join whichever parts are present and non-empty.
        parts = (creator.get('firstName', ''), creator.get('lastName', ''))
        return ' '.join(p for p in parts if p)
            
    def __repr__(self):
        "Return the raw item pretty-printed as JSON."
        return json.dumps(self.item, indent=4) 

In [None]:
#| export
class ZoteroCB(Callback):
    "Retrieve Zotero metadata"
    def __init__(self, itemId, cfg):
        # Keep both arguments on the instance as `self.itemId` and `self.cfg`.
        fc.store_attr()

    def __call__(self, obj):
        # Fetch the item using the 'zotero' section of the configuration.
        zot_item = ZoteroItem(self.itemId, self.cfg['zotero'])
        # Each attribute name matches the `ZoteroItem` method that produces its value.
        for field in ['title', 'summary', 'creator_name']:
            accessor = getattr(zot_item, field)
            obj.attrs[field] = accessor()

In [None]:
#| export
class KeyValuePairCB(Callback):
    # Callback writing one fixed key/value pair into the attributes dictionary.
    def __init__(self, k, v):
        # Keep the pair on the instance as `self.k` and `self.v`.
        fc.store_attr()

    def __call__(self, obj):
        key, value = self.k, self.v
        obj.attrs[key] = value

## How to use

In [None]:
# Load the pickled test data that is passed as `dfs` to `GlobAttrsFeeder` below.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
dfs = pd.read_pickle('../files/pkl/dfs_test.pkl')

In [None]:
# Keywords joined with ', ' into the `keywords` global attribute below.
# One keyword per element, with hierarchy levels consistently separated by ' > '.
kw = ['oceanography',
      'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers',
      'Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry',
      'Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
# Callbacks for the spatial, vertical and temporal ranges, the Zotero metadata
# and finally the keywords.
feed = GlobAttrsFeeder(
    dfs,
    cbs=[BboxCB(),
         DepthRangeCB(),
         TimeRangeCB(cfg=CONFIGS),
         ZoteroCB('26VMZZ2Q', cfg=CONFIGS),
         KeyValuePairCB('keywords', ', '.join(kw))])

# Run the feeder and display the resulting global attributes.
attrs = feed()
attrs

{'key': '26VMZZ2Q', 'version': 826, 'library': {'type': 'group', 'id': 2432820, 'name': 'MARIS', 'links': {'alternate': {'href': 'https://www.zotero.org/groups/maris', 'type': 'text/html'}}}, 'links': {'self': {'href': 'https://api.zotero.org/groups/2432820/items/26VMZZ2Q', 'type': 'application/json'}, 'alternate': {'href': 'https://www.zotero.org/groups/maris/items/26VMZZ2Q', 'type': 'text/html'}}, 'meta': {'createdByUser': {'id': 5826529, 'username': 'MarisAdmin', 'name': '', 'links': {'alternate': {'href': 'https://www.zotero.org/marisadmin', 'type': 'text/html'}}}, 'creatorSummary': 'HELCOM MORS', 'parsedDate': '2018', 'numChildren': 0}, 'data': {'key': '26VMZZ2Q', 'version': 826, 'itemType': 'document', 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances', 'creators': [{'creatorType': 'author', 'name': 'HELCOM MORS'}], 'abstractNote': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioact

{'id': '',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annually by HELCOM MORS EG.',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments, Earth Scienc