In [None]:
#| default_exp nc_template

# MARIS NetCDF Template
> Creation of a MARIS NetCDF template based on a "pseudo" Common Data Language (CDL) `.toml` config file.

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
from typing import Dict, Union
from copy import deepcopy
import re

from netCDF4 import Dataset
import numpy as np
import pandas as pd
from pathlib import Path
# from fastcore.basics import patch, store_attr
import fastcore.all as fc
from fastcore.basics import patch

from marisco.utils import read_toml
from marisco.configs import name2grp, get_cfgs

## NetCDF template generator

Generate a NetCDF4 template from the configurable [`CDL.toml`](https://github.com/franckalbinet/marisco/blob/main/nbs/files/cdl.toml) file, itself generated in [`/api/configs.ipynb`](https://github.com/franckalbinet/marisco/blob/main/nbs/api/configs.ipynb).

In [None]:
#| export
class NCTemplater:
    "MARIS NetCDF template generator."
    def __init__(self, 
                 cdl_fname:str, # File name and path of the "Pseudo CDL" (`.toml`)
                 nuclide_vars_fname:str, # File name and path of MARIS nuclide lookup table containing variable names
                 tpl_fname:str, # File name and path of NetCDF4 file to be generated
                ):
        "Store the file locations and load the CDL configuration."
        # Keep every constructor argument as an attribute of the same name
        # (e.g. `self.nuclide_vars_fname`).
        fc.store_attr()
        # Content of the "Pseudo CDL" `.toml` file as returned by `read_toml`.
        self.cdl = read_toml(cdl_fname)
        # Shortcut to the `dim` section of the CDL.
        self.dim = self.cdl['dim']
        # Starts empty on every new templater.
        self.enum_types = {}

For example, given the `cdl.toml` configuration file below, the templater gets access to, among other things, its `dim` definition section:

In [None]:
# Build a templater from the example configuration files.
templater = NCTemplater(
    cdl_fname='./files/cdl.toml',
    nuclide_vars_fname='./files/lut/dbo_nuclide.xlsx',
    tpl_fname='./files/nc/test.nc',
)

# Expected content of the `dim` section exposed by the templater.
expected = dict(
    name='sample',
    dtype='u8',
    attrs=dict(long_name='Sample ID of measurement'),
)

fc.test_eq(templater.dim, expected)

In [None]:
#| export
@patch
def nuclide_vars(
    self:NCTemplater,
    col_varnames:str='nc_name', # Column name in the Excel lookup file containing the NetCDF variable names
    col_stdnames:str='nusymbol', # Column name in the Excel lookup file containing the NetCDF standard names
    dtype:str='f4', # Default data type
    ) -> list[dict]: # List of nuclide variables (including their names and attributes)
    "Return the radionuclide variables (name, attributes and dtype) listed in the nuclide lookup table."
    lut = pd.read_excel(self.nuclide_vars_fname, index_col=0)
    # Discard the rows whose nuclide is flagged as 'NOT AVAILABLE'.
    available = lut[lut.nuclide != 'NOT AVAILABLE']
    names = available[col_varnames].tolist()
    standard_names = available[col_stdnames].tolist()
    # A label is the `nuclide` and `massnb` cells joined by a space (e.g. "Tritium 3" once capitalized).
    labels = available[['nuclide', 'massnb']].apply(
        lambda row: ' '.join(row.values.astype(str)), axis=1).tolist()

    variables = []
    for name, label, standard_name in zip(names, labels, standard_names):
        variables.append({
            'name': name,
            'attrs': {
                'long_name': label.capitalize(),
                'standard_name': standard_name
            },
            'dtype': dtype
        })
    return variables

For example, to retrieve the NetCDF nuclide names and associated attributes:

In [None]:
# Build a templater from the example configuration files.
templater = NCTemplater(
    cdl_fname='./files/cdl.toml',
    nuclide_vars_fname='./files/lut/dbo_nuclide.xlsx',
    tpl_fname='./files/nc/test.nc',
)

# The first two nuclide variables expected from the lookup table.
expected = [
    dict(name='h3',
         attrs=dict(long_name='Tritium 3', standard_name='3H'),
         dtype='f4'),
    dict(name='be7',
         attrs=dict(long_name='Beryllium 7', standard_name='7Be'),
         dtype='f4'),
]

fc.test_eq(templater.nuclide_vars()[:2], expected)

In [None]:
#| export
# @fc.patch
# def get_analytes(self:NCTemplater,
#                  col_varnames:str='nc_name', # Column name in the Excel lookup file containing the NetCDF variable names
#                  col_stdnames:str='nusymbol', # Column name Excel lookup file containing the NetCDF standard names
#                  dtype:str='f4', # Default type
#                 ):
#     "Return the name of the variables analysed"
#     df = pd.read_excel(self.vars_fname, index_col=0)
#     df = df[df.nuclide != 'NOT AVAILABLE']
#     var_names = df[col_varnames].tolist()
#     std_names = df[col_stdnames].tolist()
#     long_names = df[['nuclide', 'massnb']].apply(lambda row: ' '.join(row.values.astype(str)), 
#                                                  axis=1).tolist()
#     long_names = [name.capitalize() for name in long_names]

#     return [{'name': n,
#              'attrs': {
#                  'long_name': ln,
#                  'standard_name': sn
#              },
#              'dtype': dtype
#             } for n, ln, sn in zip(*(var_names, long_names, std_names))]

In [None]:
#| export
@patch
def derive(
    self:NCTemplater,
    nuclide:dict, # Nuclide variable name and associated netcdf attributes
    suffix:dict,  # Naming rules as described in CDL (e.g `_unc`)
) -> dict: # Derived variable name and associated attributes
    "Derive NetCDF nuclide-dependent variable names & attributes as defined in CDL." 
    suffix_attrs = suffix['attrs']
    return {
        'name': nuclide['name'] + suffix['name'],
        # Append the suffix rule to each nuclide attribute that has one; attributes
        # without a matching rule (e.g. `units`) are kept unchanged instead of
        # raising a `KeyError`.
        'attrs': {key: value + suffix_attrs[key] if key in suffix_attrs else value
                  for key, value in nuclide['attrs'].items()},
        'dtype': suffix['dtype']  # The derived variable takes the dtype of the suffix
        }

Among other things, the `cdl.toml` file defines the naming convention for variables derived from nuclides (e.g. `h3_unc` for the measurement uncertainty of the `h3` nuclide variable).

In [None]:
# Build a templater from the example configuration files.
templater = NCTemplater(
    cdl_fname='./files/cdl.toml',
    nuclide_vars_fname='./files/lut/dbo_nuclide.xlsx',
    tpl_fname='./files/nc/test.nc',
)

Below is the Tritium NetCDF variable, as read from the nuclide lookup table:

In [None]:
# Display the first nuclide variable returned by the templater.
templater.nuclide_vars()[0]

{'name': 'h3',
 'attrs': {'long_name': 'Tritium 3', 'standard_name': '3H'},
 'dtype': 'f4'}

In [None]:
# A suffix rule such as those defined in the .cdl file
suffix = dict(
    name='_unc',
    attrs=dict(
        long_name=' uncertainty',
        standard_name='_uncertainty',
    ),
    dtype='f4',
)

# The variable we expect once the rule is applied to the first nuclide (`h3`)
expected = dict(
    name='h3_unc',
    attrs=dict(
        long_name='Tritium 3 uncertainty',
        standard_name='3H_uncertainty',
    ),
    dtype='f4',
)

fc.test_eq(templater.derive(templater.nuclide_vars()[0], suffix=suffix), expected)

In [None]:
# templater.cdl['placeholder']

'_to_be_filled_in_'

In [None]:
# analyte = templater.nuclide_vars()[0]; analyte

In [None]:
# analyte['attrs']['units'] = cdl['placeholder']; analyte

In [None]:
# suffix = cdl['vars']['suffixes']['uncertainty']; suffix

In [None]:
# expected = {
#     'name': 'h3_unc',
#     'attrs': {
#         'long_name': 'Tritium 3 uncertainty',
#         'standard_name': '3H_uncertainty',
#         'units': '_to_be_filled_in_'},
#     'dtype': 'f4'
# }

# test_eq(derive(analyte, suffix), expected)

In [None]:
#| export
# @fc.patch
# def create_variable(self:NCTemplater, 
#                nc, # NetCDF file
#                var:Dict, # Variable
#                dtype:Union[str, None]=None, # Type of the variable
#            ):
#     """Create NetCDF variable with proper types (standard and enums)"""
#     name = var['name']
#     attrs = var['attrs'].copy()
#     nc_var = nc.createVariable(name, 
#                                self.enum_types.get(dtype) or dtype, 
#                                self.dim['name'])
#     nc_var.setncatts(attrs)    
#     return nc

In [None]:
# Example of use
# with Dataset('files/nc/test.nc', 'w', format='NETCDF4') as nc:
#     nc.createDimension(templater.dim['name'], None)
#     templater.create_variable(nc, cdl['vars']['defaults']['lon'])
#     print(nc.variables['lon'])

In [None]:
#| export
# @fc.patch
# def generate(self:NCTemplater,
#              common_vars:list=['lon', 'lat', 'depth', 'time'], # Common variables
#             ):
#     "Generate CDL"
#     fname = Path(self.dest_dir)/self.tpl_fname
    
#     common_vars = self.cdl['vars']['defaults'].keys()
    
#     with Dataset(fname, 'w', format='NETCDF4') as nc:
#         # Create dataset attributes
#         nc.setncatts(self.cdl['global_attrs']) 
        
#         # Create Enum type    
#         for name, enum in enum_type_lut.items(): 
#             self.enum_types[name] = nc.createEnumType(np.uint16, name, enum)
        
#         # Create shared `sample` dimension
#         nc.createDimension(self.dim['name'], None)
        
#         # Create grps
#         grp_names = [v['name'] for k, v in self.cdl['grps'].items()]
#         for grp_name in grp_names:
#             grp = nc.createGroup(grp_name)

#             # Create 'dim' variable
#             #self.create_variable(grp, self.dim, 'i4')
#             self.create_variable(grp, self.dim)
            
#             # Create default variables
#             for var in self.cdl['vars']['defaults'].values(): 
#                 self.create_variable(grp, var)

#             # Create group-specific variables
#             if name2grp(grp_name) in self.cdl['vars']:
#                 for var in self.cdl['vars'][name2grp(grp_name)].values(): 
#                     self.create_variable(grp, var)
            
#             # Create analyte variables
#             for analyte in self.get_analytes():
#                 analyte['units'] = self.cdl['placeholder']
#                 self.create_variable(grp, analyte)
            
#                 # Derived uncertainty and detection limit variables
#                 for k, v in self.cdl['vars']['suffixes'].items():
#                     self.create_variable(grp, derive(analyte, v))

In [None]:
# So in summary, to produce a template MARIS NetCDF
# templater = NCTemplater('test.nc',
#                     vars_fname='./files/lut/dbo_nuclide.xlsx', 
#                     dest_dir='./files/nc',
#                     cdl=cdl)

# templater.generate()