In [None]:
#| default_exp nc_template

# MARIS NetCDF Template
> Creation of a MARIS NetCDF template based on a "pseudo" Common Data Language (CDL) `.toml` configuration file.

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from typing import Dict, Union
from copy import deepcopy
from functools import partial
import re

from netCDF4 import Dataset
import numpy as np
import pandas as pd
from pathlib import Path
from fastcore.basics import patch, store_attr
from fastcore.test import *

from marisco.utils import read_toml
from marisco.configs import name2grp, get_cfgs

## Enumeration types

In [None]:
#| export
def sanitize(s):
    """Make a lookup-table key usable as a NetCDF enumeration member name.

    Parentheses are removed, the characters `.`, `/` and `-` are each replaced
    by a space, and leading/trailing whitespace is stripped.
    """
    without_parens = re.sub(r'[()]', '', s)
    spaced = re.sub(r'[./-]', ' ', without_parens)
    return spaced.strip()

def get_lut(fname, name, idx):
    """Read a MARIS db lookup table (excel file) into a `{sanitized name: id}` dictionary.

    `fname` is resolved relative to the `lut` directory returned by
    `get_cfgs(key='dirs')`; `name` and `idx` are the columns holding the
    labels and their ids respectively.
    """
    lut_path = Path(get_cfgs(key='dirs')['lut']) / fname
    table = pd.read_excel(lut_path, index_col=name, usecols=[name, idx])
    name_to_id = table[idx].to_dict()

    # Keys become enum member names, hence the sanitization
    cleaned = {}
    for key, value in name_to_id.items():
        cleaned[sanitize(key)] = value
    return cleaned

In [None]:
#| export
# Lookup tables ({sanitized name: id}) read from the MARIS db excel files
enum_bio_group = get_lut(fname='dbo_biogroup.xlsx', name='biogroup', idx='biogroup_id')
enum_body_par = get_lut(fname='dbo_bodypar.xlsx', name='bodypar', idx='bodypar_id')
enum_species = get_lut(fname='dbo_species.xlsx', name='species', idx='species_id')
enum_sed_type = get_lut(fname='dbo_sedtype.xlsx', name='sedtype', idx='sedtype_id')

# NetCDF enumeration type name -> lookup table defining its members
enum_type_lut = dict(
    bio_group_t=enum_bio_group,
    body_part_t=enum_body_par,
    species_t=enum_species,
    sed_type_t=enum_sed_type,
)

## NetCDF template generation

In [None]:
#| export
class NCTemplate:
    """MARIS NetCDF templater.

    Holds the pseudo-CDL configuration and the file locations used to write
    a NetCDF template file.
    """
    def __init__(self, 
                 tpl_fname:str, # Name of the NetCDF template file to generate
                 vars_fname:str, # File name and path of MARIS nuclide look up table (excel file)
                 dest_dir:str, # Destination directory for generated NetCDF template files
                 cdl:Dict, # Pseudo CDL (`.toml`)
                ):
        # fastcore helper: stores the constructor arguments as attributes on `self`
        store_attr()
        # Dimension definition taken from the CDL config
        self.dim = self.cdl['dim']
        # Enum types by name; populated when the template file is generated
        self.enum_types = {}

In [None]:
# Load the pseudo-CDL config and set up a templater targeting `./files/nc/test.nc`
cdl = read_toml(Path('./files') / 'cdl.toml')
nc_tpl = NCTemplate('test.nc',
                     vars_fname='./files/lut/dbo_nuclide.xlsx', 
                     dest_dir='./files/nc',
                     cdl=cdl)

In [None]:
# The `dim` attribute is the `dim` entry of the CDL config
expected = {'name': 'sample', 
            'dtype': 'u8', 
            'attrs': {'long_name': 'Sample ID of measurement'}
           }
test_eq(nc_tpl.dim, expected)

In [None]:
#| export
@patch
def get_analytes(self:NCTemplate,
                 col_varnames:str='nc_name', # Column name containing the NetCDF variable names
                 col_stdnames:str='nusymbol', # Column name containing the NetCDF standard names
                 dtype:str='f4', # Default type
                ):
    "Return one variable definition (`name`, `attrs`, `dtype`) per nuclide listed in `vars_fname`"
    nuclides = pd.read_excel(self.vars_fname, index_col=0)
    # Rows flagged as 'NOT AVAILABLE' are not turned into variables
    nuclides = nuclides[nuclides.nuclide != 'NOT AVAILABLE']

    var_names = nuclides[col_varnames].tolist()
    std_names = nuclides[col_stdnames].tolist()

    def _long_name(row):
        # "<nuclide> <massnb>" with only the first character upper-cased
        return ' '.join(row.values.astype(str)).capitalize()

    long_names = nuclides[['nuclide', 'massnb']].apply(_long_name, axis=1).tolist()

    analytes = []
    for var_name, long_name, std_name in zip(var_names, long_names, std_names):
        analytes.append({
            'name': var_name,
            'attrs': {
                'long_name': long_name,
                'standard_name': std_name
            },
            'dtype': dtype
        })
    return analytes

In [None]:
# Peek at the first two analyte definitions
nc_tpl.get_analytes()[:2]

[{'name': 'h3',
  'attrs': {'long_name': 'Tritium 3', 'standard_name': '3H'},
  'dtype': 'f4'},
 {'name': 'be7',
  'attrs': {'long_name': 'Beryllium 7', 'standard_name': '7Be'},
  'dtype': 'f4'}]

In [None]:
#| export
def derive(
    analyte:dict, # Analyte/nuclide/var name and associated netcdf attributes
    suffix:dict,  # Naming rules as described in CDL
):
    "Derive NetCDF var name & attributes by appending the suffixes defined in CDL"
    # Work on a deep copy so the analyte passed in is left untouched
    derived = deepcopy(analyte)
    for field, addition in suffix.items():
        if field != 'attrs':
            derived[field] += addition
            continue
        # `attrs` is one level deeper: append to each listed attribute
        for attr_name, attr_suffix in addition.items():
            derived['attrs'][attr_name] += attr_suffix
    return derived

In [None]:
# Take the first analyte definition as a working example
analyte = nc_tpl.get_analytes()[0]; analyte

{'name': 'h3',
 'attrs': {'long_name': 'Tritium 3', 'standard_name': '3H'},
 'dtype': 'f4'}

In [None]:
# Add a `units` attribute set to the placeholder value defined in the CDL config
analyte['attrs']['units'] = cdl['placeholder']; analyte

{'name': 'h3',
 'attrs': {'long_name': 'Tritium 3',
  'standard_name': '3H',
  'units': '_to_be_filled_in_'},
 'dtype': 'f4'}

In [None]:
# Suffixes applied to the name and attributes of uncertainty variables
suffix = cdl['vars']['suffixes']['uncertainty']; suffix

{'name': '_unc',
 'attrs': {'long_name': ' uncertainty', 'standard_name': '_uncertainty'}}

In [None]:
# Suffixed fields get the suffix appended; `units` and `dtype` are carried over unchanged
expected = {
    'name': 'h3_unc',
    'attrs': {
        'long_name': 'Tritium 3 uncertainty',
        'standard_name': '3H_uncertainty',
        'units': '_to_be_filled_in_'},
    'dtype': 'f4'
}

test_eq(derive(analyte, suffix), expected)

In [None]:
#| export
@patch
def create_variable(self:NCTemplate, 
               nc, # NetCDF file (or group) in which the variable is created
               var:Dict, # Variable definition with `name`, `attrs` and optionally `dtype`
               dtype:Union[str, None]=None, # Type of the variable; overrides `var['dtype']` when given
           ):
    """Create NetCDF variable with proper types (standard and enums).

    The variable is defined along the template dimension (`self.dim['name']`)
    and receives a copy of `var['attrs']` as NetCDF attributes. Its type is
    `dtype` when provided, otherwise the `dtype` declared in `var`. A type name
    matching an entry of `self.enum_types` is replaced by that enum type.

    Returns `nc`.
    """
    name = var['name']
    # Without an explicit override, honour the type declared by the variable itself;
    # previously it was ignored and `None` was handed to `createVariable`.
    if dtype is None:
        dtype = var.get('dtype')
    attrs = var['attrs'].copy()
    nc_var = nc.createVariable(name, 
                               self.enum_types.get(dtype) or dtype, 
                               self.dim['name'])
    nc_var.setncatts(attrs)    
    return nc

In [None]:
# Example of use
with Dataset('files/nc/test.nc', 'w', format='NETCDF4') as nc:
    nc.createDimension(nc_tpl.dim['name'], None)
    nc_tpl.create_variable(nc, cdl['vars']['defaults']['lon'])
    print(nc.variables['lon'])

f4
<class 'netCDF4._netCDF4.Variable'>
float32 lon(sample)
    long_name: Measurement longitude
    standard_name: longitude
    units: degrees_north
    axis: Y
    _CoordinateAxisType: Lon
unlimited dimensions: sample
current shape = (0,)
filling on, default _FillValue of 9.969209968386869e+36 used


In [None]:
#| export
@patch
def generate(self:NCTemplate,
             common_vars:list=['lon', 'lat', 'depth', 'time'], # Common variables
            ):
    "Generate CDL"
    fname = Path(self.dest_dir)/self.tpl_fname
    
    common_vars = self.cdl['vars']['defaults'].keys()
    
    with Dataset(fname, 'w', format='NETCDF4') as nc:
        # Create dataset attributes
        nc.setncatts(self.cdl['global_attrs']) 
        
        # Create Enum type    
        for name, enum in enum_type_lut.items(): 
            self.enum_types[name] = nc.createEnumType(np.uint16, name, enum)
        
        # Create shared `sample` dimension
        nc.createDimension(self.dim['name'], None)
        
        # Create grps
        grp_names = [v['name'] for k, v in self.cdl['grps'].items()]
        for grp_name in grp_names:
            grp = nc.createGroup(grp_name)

            # Create 'dim' variable
            #self.create_variable(grp, self.dim, 'i4')
            self.create_variable(grp, self.dim)
            
            # Create default variables
            for var in self.cdl['vars']['defaults'].values(): 
                self.create_variable(grp, var)

            # Create group-specific variables
            if name2grp(grp_name) in self.cdl['vars']:
                for var in self.cdl['vars'][name2grp(grp_name)].values(): 
                    self.create_variable(grp, var)
            
            # Create analyte variables
            for analyte in self.get_analytes():
                analyte['units'] = self.cdl['placeholder']
                self.create_variable(grp, analyte)
            
                # Derived uncertainty and detection limit variables
                for k, v in self.cdl['vars']['suffixes'].items():
                    self.create_variable(grp, derive(analyte, v))

In [None]:
# So in summary, to produce a template MARIS NetCDF
nc_tpl = NCTemplate('test.nc',
                    vars_fname='./files/lut/dbo_nuclide.xlsx', 
                    dest_dir='./files/nc',
                    cdl=cdl)

nc_tpl.generate()

u8
f4
f4
f4
u8
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f4
f