In [None]:
#| default_exp configs

# Configs
> Several dictionaries used to generate `.toml` configuration files copied under the `~/.marisco` folder, and associated utility functions. These `.toml` files can then be adapted to your specific needs if required.



In [None]:
#| export
from pathlib import Path
import re
from functools import partial

from marisco.utils import read_toml, write_toml
import pandas as pd

import fastcore.all as fc

## Configuration files

In [None]:
#| export
# File name of the general settings file (read back by `get_cfgs`).
CFG_FNAME = 'configs.toml'
# File name of the NetCDF CDL settings file (written from `CONFIGS_CDL`).
CDL_FNAME = 'cdl.toml'

In [None]:
#| export
# Hidden `.marisco` folder located directly under the current user's home directory.
BASE_PATH = Path.home().joinpath('.marisco')

By default, we create a folder named `.marisco` under your home directory that will receive all configuration files as defined in `BASE_PATH`:

In [None]:
BASE_PATH

Path('/Users/franckalbinet/.marisco')

In [None]:
#| export
# General settings, grouped by section. Every leaf value is a plain string.
CONFIGS = dict(
    # GitHub coordinates of the project
    gh=dict(owner='franckalbinet', repo='marisco'),
    # File names
    names=dict(nc_template='maris-template.nc'),
    # Local directories, all located under `BASE_PATH`
    dirs=dict(
        lut=str(BASE_PATH.joinpath('lut')),  # Look-up tables
        tmp=str(BASE_PATH.joinpath('tmp')),
    ),
    # Repository-relative paths
    paths=dict(luts='nbs/files/lut'),
    # Reference used for time encoding
    units=dict(time='seconds since 1970-01-01 00:00:00.0'),
    # Zotero access (the API key is a placeholder to be replaced by the user)
    zotero=dict(api_key='your-zotero-api-key', lib_id='2432820'),
)

The `CONFIGS` dictionary defines general settings:

| key     | Value  | Description                                   |
|---------|-------------|--------------------------------------------|
| `dirs/lut`   | `/Users/franckalbinet/.marisco/lut`         | Location & name of the directory receiving lookup tables.          |
| `dirs/tmp`    |  `/Users/franckalbinet/.marisco/tmp`         | Location & name of temporary files.       |
| `gh/owner`     |  `franckalbinet`       | GitHub account owner.                             |
| `gh/repo`      |  `marisco`       |   GitHub repository from which specific files (e.g. lookup tables) are downloaded during installation.                                  |
| `names/nc_template`   | `maris-template.nc` |  Name of the MARIS NetCDF4 template.                         |
| `paths/luts`   |  `nbs/files/lut`        |  GitHub repository directory name containing lookup tables.                             |
| `units/time`   |    `seconds since 1970-01-01 00:00:00.0`     | Reference date and time used for [NetCDF time encoding](https://unidata.github.io/cftime/).        |
| `zotero/api_key`  | `your-zotero-api-key`      |   Zotero API key.                     |
| `zotero/lib_id`  |  `2432820`       | Zotero library ID.                                  |


The main `CONFIGS_CDL` dictionary is used to generate a [NetCDF CDL (Common Data Language)](https://www.unidata.ucar.edu/software/netcdf/workshops/2012/nc3model/Cdl.html) `.toml` file. This file is then used to generate a template MARIS NetCDF file. For further details, refer to the [`configs.ipynb`](https://github.com/franckalbinet/marisco/blob/main/nbs/api/configs.ipynb) file.

Below, the `vars/defaults` section is printed:

In [None]:
#| export
# Description of the MARIS NetCDF template, organised in sections:
#   - `placeholder`:  marker string stored under the `placeholder` key
#   - `grps`:         NetCDF groups (short key -> group name)
#   - `global_attrs`: global attributes (CF / ACDD conventions plus MARIS additions)
#   - `dim`:          the single `sample` dimension
#   - `vars`:         variables: `defaults`, group-specific ones (`bio`, `sed`)
#                     and `suffixes` describing companion variables
#   - `enums`:        enumeration types and the lookup table each one is read from
CONFIGS_CDL = {
    'placeholder': '_to_be_filled_in_',
    'grps': {
        'sea': {
            'name': 'seawater'
        },
        'bio': {
            'name': 'biota'
        },
        'sed': {
            'name': 'sediment'
        },
        'sus': {
            'name': 'suspended-matter'
        }
    },
    'global_attrs': {
        'id': '', # zotero?
        'title': '',
        'summary': '',
        'keywords': '',
        'keywords_vocabulary': 'GCMD Science Keywords',
        'keywords_vocabulary_url': 'https://gcmd.earthdata.nasa.gov/static/kms/',
        'record': '',
        'featureType': '',
        'cdm_data_type': '',

        # Conventions
        'Conventions': 'CF-1.10 ACDD-1.3',

        # Publisher [ACDD1.3]
        'publisher_name': 'Paul MCGINNITY, Iolanda OSVATH, Florence DESCROIX-COMANDUCCI',
        'publisher_email': 'p.mc-ginnity@iaea.org, i.osvath@iaea.org, F.Descroix-Comanducci@iaea.org',
        'publisher_url': 'https://maris.iaea.org',
        'publisher_institution': 'International Atomic Energy Agency - IAEA',

        # Creator info [ACDD1.3]
        'creator_name': '',
        'institution': '',
        'metadata_link': '',
        'creator_email': '',
        'creator_url': '',
        'references': '',
        'license': ' '.join(['Without prejudice to the applicable Terms and Conditions',
                             '(https://nucleus.iaea.org/Pages/Others/Disclaimer.aspx),',
                             'I hereby agree that any use of the data will contain appropriate',
                             'acknowledgement of the data source(s) and the IAEA Marine',
                             'Radioactivity Information System (MARIS).']),
        'comment': '',
        # Dataset info & coordinates [ACDD1.3]
        #'project': '', # Network long name
        #'platform': '', # Should be a long / full name
        'geospatial_lat_min': '',
        'geospatial_lon_min': '',
        'geospatial_lat_max': '',
        'geospatial_lon_max': '',
        'geospatial_vertical_min': '',
        'geospatial_vertical_max': '',
        'geospatial_bounds': '', # wkt representation
        'geospatial_bounds_crs': 'EPSG:4326',

        # Time information
        'time_coverage_start': '',
        'time_coverage_end': '',
        #'time_coverage_resolution': '',
        'local_time_zone': '',
        'date_created': '',
        'date_modified': '',
        #
        # -- Additional metadata (custom to MARIS)
        #
        'publisher_postprocess_logs': ''
        },
    'dim': {
        'name': 'sample',
        'attrs': {
            'long_name': 'Sample ID of measurement'
        },
        'dtype': 'u8'
    },
    'vars': {
        'defaults': {
            # CF conventions: longitude is expressed in `degrees_east` on the X axis
            'lon': {
                'name': 'lon',
                'attrs': {
                    'long_name': 'Measurement longitude',
                    'standard_name': 'longitude',
                    'units': 'degrees_east',
                    'axis': 'X',
                    '_CoordinateAxisType': 'Lon'
                },
                'dtype': 'f4'
            },
            # CF conventions: latitude is expressed in `degrees_north` on the Y axis
            'lat': {
                'name': 'lat',
                'attrs': {
                    'long_name': 'Measurement latitude',
                    'standard_name': 'latitude',
                    'units': 'degrees_north',
                    'axis': 'Y',
                    '_CoordinateAxisType': 'Lat'
                },
                'dtype': 'f4'
            },
            # NOTE(review): `long_name` refers to sea level whereas `standard_name`
            # refers to the sea floor -- confirm which reference is intended.
            'depth': {
                'name': 'depth',
                'attrs': {
                    'long_name': 'Depth below sea level',
                    'standard_name': 'depth_below_sea_floor',
                    'units': 'm',
                    'axis': 'Z'
                },
                'dtype': 'f4'
            },
            'time': {
                'name': 'time',
                'attrs': {
                    'long_name': 'Time of measurement',
                    'standard_name': 'time',
                    'units': 'seconds since 1970-01-01 00:00:00.0',
                    'time_origin': '1970-01-01 00:00:00',
                    'time_zone': 'UTC',
                    'abbreviation': 'Date/Time',
                    'axis': 'T',
                    'calendar': 'gregorian'
                },
                'dtype': 'u8',
            },
        },
        'bio': {
            'bio_group': {
                'name': 'bio_group',
                'attrs': {
                    'long_name': 'Biota group',
                    'standard_name': 'biota_group_tbd'
                },
                'dtype': 'bio_group_t'
            },
            'species_id': {
                'name': 'species_id',
                'attrs': {
                    'long_name': 'Species ID',
                    'standard_name': 'AphiaID'
                },
                'dtype': 'species_t'
            },
            'body_part': {
                'name': 'body_part',
                'attrs': {
                    'long_name': 'Body part',
                    'standard_name': 'body_part_tbd'
                },
                'dtype': 'body_part_t'
            }
        },
        'sed': {
            'sed_type': {
                'name': 'sed_type',
                'attrs': {
                    'long_name': 'Sediment type',
                    'standard_name': 'sediment_type_tbd'
                },
                'dtype': 'sed_type_t'
            }
        },
        # Each suffix entry holds name / long_name / standard_name fragments that
        # start with `_` or a space; presumably appended to a base variable's
        # attributes -- verify against the code consuming `cdl.toml`.
        'suffixes':  {
            'uncertainty': {
                'name': '_unc',
                'attrs': {
                    'long_name': ' uncertainty',
                    'standard_name': '_uncertainty'
                },
                'dtype': 'f4'
            },
            'detection_limit': {
                'name': '_dl',
                'attrs': {
                    'long_name': ' detection limit',
                    'standard_name': '_detection_limit'
                },
                'dtype': 'dl_type_t'
            },
            'volume': {
                'name': '_vol',
                'attrs': {
                    'long_name': ' volume',
                    'standard_name': '_volume'
                },
                'dtype': 'f4'
            },
            'filtered': {
                'name': '_filt',
                'attrs': {
                    'long_name': ' filtered',
                    'standard_name': '_filtered'
                },
                'dtype': 'filt_type_t'
            },
            'counting_method': {
                'name': '_counmet',
                'attrs': {
                    'long_name': ' counting method',
                    'standard_name': '_counting_method'
                },
                'dtype': 'counmet_type_t'
            },
            'unit': {
                'name': '_unit',
                'attrs': {
                    'long_name': ' unit',
                    'standard_name': '_unit'
                },
                'dtype': 'unit_type_t'
            }
        }
    },
    # Each entry: enumeration type `name`, Excel lookup file `fname` and the two
    # columns used as enumeration `key` and `value`.
    'enums': [
        {
            'name': 'bio_group_t',
            'fname': 'dbo_biogroup.xlsx',
            'key': 'biogroup',
            'value':'biogroup_id'
        },
        {
            'name': 'body_part_t',
            'fname': 'dbo_bodypar.xlsx',
            'key': 'bodypar',
            'value':'bodypar_id'
        },
        {
            'name': 'species_t',
            'fname': 'dbo_species.xlsx',
            'key': 'species',
            'value':'species_id'
        },
        {
            'name': 'sed_type_t',
            'fname': 'dbo_sedtype.xlsx',
            'key': 'sedtype',
            'value':'sedtype_id'
        },
        # NOTE(review): `dl_type_t` is declared twice (here and further below with
        # `dbo_detectlimit.xlsx`). `get_enum_dicts` keys its result by `name`, so
        # the later entry replaces this one -- confirm which lookup is intended.
        {
            'name': 'dl_type_t',
            'fname': 'dbo_detection.xlsx',
            'key': 'detection_name',
            'value':'detection_id'
        },
        {
            'name': 'unit_type_t',
            'fname': 'dbo_unit.xlsx',
            'key': 'unit_sanitized',
            'value':'unit_id'
        },
        {
            'name': 'dl_type_t',
            'fname': 'dbo_detectlimit.xlsx',
            'key': 'name_sanitized',
            'value':'id'
        },
        {
            'name': 'filt_type_t',
            'fname': 'dbo_filtered.xlsx',
            'key': 'name',
            'value':'id'
        },
        {
            'name': 'counmet_type_t',
            'fname': 'dbo_counmet.xlsx',
            'key': 'counmet',
            'value':'counmet_id'
        }
        ]
}

In [None]:
fc.AttrDict(CONFIGS_CDL['vars']['defaults'])

```json
{ 'depth': { 'attrs': { 'axis': 'Z',
                        'long_name': 'Depth below seal level',
                        'standard_name': 'depth_below_sea_floor',
                        'units': 'm'},
             'dtype': 'f4',
             'name': 'depth'},
  'lat': { 'attrs': { '_CoordinateAxisType': 'Lat',
                      'axis': 'X',
                      'long_name': 'Measurement latitude',
                      'standard_name': 'latitude',
                      'units': 'degrees_east'},
           'dtype': 'f4',
           'name': 'lat'},
  'lon': { 'attrs': { '_CoordinateAxisType': 'Lon',
                      'axis': 'Y',
                      'long_name': 'Measurement longitude',
                      'standard_name': 'longitude',
                      'units': 'degrees_north'},
           'dtype': 'f4',
           'name': 'lon'},
  'time': { 'attrs': { 'abbreviation': 'Date/Time',
                       'axis': 'T',
                       'calendar': 'gregorian',
                       'long_name': 'Time of measurement',
                       'standard_name': 'time',
                       'time_origin': '1970-01-01 00:00:00',
                       'time_zone': 'UTC',
                       'units': 'seconds since 1970-01-01 00:00:00.0'},
            'dtype': 'u8',
            'name': 'time'}}
```

In [None]:
#| hide
write_toml(Path('./files') / CDL_FNAME, CONFIGS_CDL)

Creating files/cdl.toml


## Utility functions

In [None]:
#| export
# Python built-in type associated with each NetCDF dtype code used in `CONFIGS_CDL`.
NETCDF_TO_PYTHON_TYPE = dict(u8=int, f4=float)

In [None]:
#| export
def name2grp(
    name:str, # Name of the group
    cdl_name:Path = BASE_PATH / CDL_FNAME, # Path to `cdl.toml` file 
    ):
    "Return the `grps` key of `cdl.toml` (e.g. `sea`) whose `name` entry equals `name`."
    grps = read_toml(cdl_name)['grps']
    # Invert the mapping: group name -> group key
    key_by_name = {}
    for grp_key, grp_cfg in grps.items():
        key_by_name[grp_cfg['name']] = grp_key
    return key_by_name[name]

Example:

In [None]:
name2grp('seawater')

'sea'

In [None]:
#| export
def get_nc_tpl_path():
    "Return the path of the MARIS NetCDF template: `BASE_PATH` joined with the `names/nc_template` entry of `configs.toml`."
    # Rely on `CFG_FNAME` instead of repeating the literal file name, so that the
    # settings file name is defined in a single place.
    cfgs = read_toml(BASE_PATH / CFG_FNAME)
    return BASE_PATH / cfgs['names']['nc_template']

In [None]:
get_nc_tpl_path()

Path('/Users/franckalbinet/.marisco/maris-template.nc')

In [None]:
#| export
def get_cfgs(
    key:str=None # `configs.toml` key of interest
    ) -> dict: # `configs.toml` file as dictionary
    "Lookup specific or all `configs.toml` properties."
    # Rely on `CFG_FNAME` instead of repeating the literal file name, so that the
    # settings file name is defined in a single place.
    cfgs = read_toml(BASE_PATH / CFG_FNAME)
    # No key requested: return the whole configuration
    return cfgs if key is None else cfgs[key]

In [None]:
fc.AttrDict(get_cfgs())

```json
{ 'dirs': { 'lut': '/Users/franckalbinet/.marisco/lut',
            'tmp': '/Users/franckalbinet/.marisco/tmp'},
  'gh': {'owner': 'franckalbinet', 'repo': 'marisco'},
  'names': {'nc_template': 'maris-template.nc'},
  'paths': {'luts': 'nbs/files/lut'},
  'units': {'time': 'seconds since 1970-01-01 00:00:00.0'},
  'zotero': {'api_key': 'your-zotero-api-key', 'lib_id': '2432820'}}
```

## Enumeration types

Enumeration types are used to avoid using strings as NetCDF4 variable values. Instead, enumeration types (lookup tables) such as `{'Crustaceans': 2, 'Echinoderms': 3, ...}` are prepended to the NetCDF file template and associated ids (integers) are used as values.

In [None]:
#| export
def sanitize(s:str # String to sanitize
             ) -> str:
    """
    Sanitize dictionary key to comply with NetCDF enumeration type: 
    
        - drop `(`, `)` and `.`
        - replace `/` and `-` by a space
        - strip leading and trailing whitespace
    """
    without_brackets_and_dots = re.sub(r'[().]', '', s)
    with_spaces = re.sub(r'[/-]', ' ', without_brackets_and_dots)
    return with_spaces.strip()

For example:

In [None]:
fc.test_eq(sanitize('key (sanitized)'), 'key sanitized')
fc.test_eq(sanitize('key san.itized'), 'key sanitized')
fc.test_eq(sanitize('key-sanitized'), 'key sanitized')
fc.test_eq(sanitize('key/sanitized'), 'key sanitized')

NetCDF4 enumeration types do not seem to accept keys containing non-alphanumeric characters such as parentheses, dots, slashes, ... As a result, MARIS lookup tables need to be sanitized.

In [None]:
#| export
def get_lut(src_dir:str, # Directory containing lookup tables
            fname:str, # Excel file lookup table name
            key:str, # Excel file column name to be used as dict keys 
            value:str, # Excel file column name to be used as dict values 
            do_sanitize:bool=True # Sanitization required?
            ) -> dict: # MARIS lookup table
    "Read a MARIS db lookup table (Excel file) and return it as a `{'name': id, ...}` dictionary."
    # Only the two columns of interest are loaded; `key` becomes the index
    df = pd.read_excel(Path(src_dir) / fname, index_col=key, usecols=[key, value])
    raw_lut = df[value].to_dict()
    if not do_sanitize:
        return raw_lut
    return {sanitize(name): idx for name, idx in raw_lut.items()}

For example:

In [None]:
lut_src_dir = './files/lut'
get_lut(lut_src_dir, 'dbo_biogroup.xlsx', key='biogroup', value='biogroup_id')

{'Not available': 0,
 'Birds': 1,
 'Crustaceans': 2,
 'Echinoderms': 3,
 'Fish': 4,
 'Mammals': 5,
 'Molluscs': 6,
 'Others': 7,
 'Plankton': 8,
 'Polychaete worms': 9,
 'Reptile': 10,
 'Seaweeds and plants': 11,
 'Cephalopods': 12,
 'Gastropods': 13,
 'Bivalves': 14}

In [None]:
#| export
def get_enum_dicts(
    lut_src_dir:str = get_cfgs()['dirs']['lut'], # Directory containing lookup tables
    cdl_name:Path = BASE_PATH / CDL_FNAME, # Path to `cdl.toml` file
    **kwargs
    ):
    """
    Return a dict of NetCDF enumeration types `{enum name: lookup table, ...}`.

    One lookup table is loaded with `get_lut` for each entry of the `enums`
    section of `cdl.toml`; `**kwargs` are forwarded to `get_lut`. Entries
    sharing the same `name` overwrite each other (the last one wins). A
    `KeyError` is raised if an entry lacks `name`, `fname`, `key` or `value`.
    """
    enums_cfg = read_toml(cdl_name)['enums']
    # Fields are looked up by key rather than unpacked positionally from
    # `.values()`, so the result depends neither on the order of the keys in
    # the TOML file nor on the presence of additional keys.
    return {
        enum['name']: get_lut(lut_src_dir, enum['fname'],
                              key=enum['key'], value=enum['value'], **kwargs)
        for enum in enums_cfg
    }

For example:

In [None]:
#|eval: false
enums = get_enum_dicts(lut_src_dir='./files/lut', cdl_name= './files/cdl.toml'); enums

{'bio_group_t': {'Not available': 0,
  'Birds': 1,
  'Crustaceans': 2,
  'Echinoderms': 3,
  'Fish': 4,
  'Mammals': 5,
  'Molluscs': 6,
  'Others': 7,
  'Plankton': 8,
  'Polychaete worms': 9,
  'Reptile': 10,
  'Seaweeds and plants': 11,
  'Cephalopods': 12,
  'Gastropods': 13,
  'Bivalves': 14},
 'body_part_t': {'Not available': 0,
  'Whole animal': 1,
  'Whole animal eviscerated': 2,
  'Whole animal eviscerated without head': 3,
  'Flesh with bones': 4,
  'Blood': 5,
  'Skeleton': 6,
  'Bones': 7,
  'Exoskeleton': 8,
  'Endoskeleton': 9,
  'Shells': 10,
  'Molt': 11,
  'Skin': 12,
  'Head': 13,
  'Tooth': 14,
  'Otolith': 15,
  'Fins': 16,
  'Faecal pellet': 17,
  'Byssus': 18,
  'Soft parts': 19,
  'Viscera': 20,
  'Stomach': 21,
  'Hepatopancreas': 22,
  'Digestive gland': 23,
  'Pyloric caeca': 24,
  'Liver': 25,
  'Intestine': 26,
  'Kidney': 27,
  'Spleen': 28,
  'Brain': 29,
  'Eye': 30,
  'Fat': 31,
  'Heart': 32,
  'Branchial heart': 33,
  'Muscle': 34,
  'Mantle': 35,
  'G