In [None]:
#| default_exp nc_template

# MARIS NetCDF Template
> Creation of MARIS NetCDF template based on "pseudo" Common Data Language `.toml` config file

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
from netCDF4 import Dataset
import numpy as np
import pandas as pd
from pathlib import Path
from fastcore.basics import patch, store_attr
from fastcore.test import *
from typing import Dict
from copy import deepcopy

from marisco.utils import read_toml
from marisco.configs import BASE_PATH, name2grp

In [None]:
#| export
# from each lut provided, create netcdf enumtype
#enum_dict = {'Altocumulus': 7, 'Missing': 255,
#             'Stratus': 2, 'Clear': 0,
#             'Nimbostratus': 6, 'Cumulus': 4, 'Altostratus': 5, 
#             'Cumulonimbus': 1, 'Stratocumulus': 3}
#df = pd.read_excel(self.vars_fname, index_col=0)

In [None]:
# Peek at the biogroup look-up table
df = pd.read_excel('./files/lut/dbo_biogroup.xlsx', index_col=0)
df.head()

Unnamed: 0_level_0,biogroup,helcom_id,helcom_grp
biogroup_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,(Not available),,
1,Birds,,
2,Crustaceans,B,BENTHIC ANIMAL
3,Echinoderms,B,BENTHIC ANIMAL
4,Fish,F,FISH


In [None]:
#enum_dict = {'Altocumulus': 7, 'Missing': 255,
#             'Stratus': 2, 'Clear': 0,
#             'Nimbostratus': 6, 'Cumulus': 4, 'Altostratus': 5,
#             'Cumulonimbus': 1, 'Stratocumulus': 3}

In [None]:
#enum_bio_group = {b: int(idx) for idx, b in df['biogroup'].items()}; enum_bio_group

{'(Not available)': 0,
 'Birds': 1,
 'Crustaceans': 2,
 'Echinoderms': 3,
 'Fish': 4,
 'Mammals': 5,
 'Molluscs': 6,
 'Others': 7,
 'Plankton': 8,
 'Polychaete worms': 9,
 'Reptile': 10,
 'Seaweeds and plants': 11,
 'Cephalopods': 12,
 'Gastropods': 13,
 'Bivalves': 14}

In [None]:
#enum_bio_group = {'Altocumulus': 7, 'Missing': 255,
#                  'Stratus': 2, 'Clear': 0,
#                  'Nimbostratus': 6, 'Cumulus': 4, 'Altostratus': 5,
#                  'Cumulonimbus': 1, 'Stratocumulus': 3}

In [None]:
#| export
class NCTemplate:
    "Templater producing MARIS NetCDF files from a pseudo CDL description"
    def __init__(self,
                 tpl_fname:str, # Name of the NetCDF template file written under `dest_dir`
                 vars_fname:str, # Path of the MARIS nuclide look-up table (Excel file)
                 dest_dir:str, # Directory receiving the generated NetCDF template
                 cdl:Dict, # Pseudo CDL, as loaded from the `.toml` config file
                ):
        # Keep every constructor argument as an attribute of the same name
        store_attr()
        # Shortcut to the description of the dimension shared by all variables
        self.dim = cdl['dim']

In [None]:
# Load the pseudo CDL and build a templater from it
cdl = read_toml(Path('./files', 'cdl.toml'))
nc_tpl = NCTemplate(tpl_fname='test.nc',
                    vars_fname='./files/lut/dbo_nuclide.xlsx',
                    dest_dir='./files/nc',
                    cdl=cdl)

In [None]:
# The templater exposes the `dim` section of the CDL as-is
expected = dict(
    name='sample',
    dtype='i4',
    attrs=dict(long_name='Sample ID of measurement'),
)
test_eq(nc_tpl.dim, expected)

In [None]:
#| export
@patch
def get_analytes(self:NCTemplate,
                 col_varnames:str='nc_name', # Column name containing the NetCDF variable names
                 col_stdnames:str='nusymbol', # Column name containing the NetCDF standard names
                 dtype:str='f4', # Default type
                ):
    "Return one description (name, attributes, dtype) per nuclide of the look-up table"
    lut = pd.read_excel(self.vars_fname, index_col=0)
    # Discard the placeholder entry of the look-up table
    lut = lut[lut.nuclide != 'NOT AVAILABLE']

    # Long name: nuclide name followed by its mass number, e.g. 'Tritium 3'
    labels = lut[['nuclide', 'massnb']].apply(
        lambda row: ' '.join(row.values.astype(str)),
        axis=1).tolist()

    analytes = []
    for var_name, label, std_name in zip(lut[col_varnames].tolist(),
                                         labels,
                                         lut[col_stdnames].tolist()):
        analytes.append({
            'name': var_name,
            'attrs': {'long_name': label.capitalize(),
                      'standard_name': std_name},
            'dtype': dtype,
        })
    return analytes

In [None]:
nc_tpl.get_analytes()[:2]

[{'name': 'h3',
  'attrs': {'long_name': 'Tritium 3', 'standard_name': '3H'},
  'dtype': 'f4'},
 {'name': 'be7',
  'attrs': {'long_name': 'Beryllium 7', 'standard_name': '7Be'},
  'dtype': 'f4'}]

In [None]:
from copy import deepcopy

In [None]:
#| export
def derive(
    analyte:dict, # Analyte/nuclide/var name and associated netcdf attributes
    suffix:dict,  # Naming rules as described in CDL
):
    "Derive NetCDF var name & attributes as defined in CDL (the input `analyte` is left untouched)"
    def _append(target:dict, additions:dict):
        "Append each value of `additions` to the matching entry of `target`, recursing into nested dicts"
        for key, value in additions.items():
            if isinstance(value, dict):
                # Nested section (e.g. 'attrs'): apply the same rule one level down
                _append(target[key], value)
            else:
                # A key missing from `target` raises KeyError: a suffix can only
                # extend entries the analyte already defines
                target[key] += value

    # Work on a deep copy so nested dicts (e.g. 'attrs') of `analyte` are not modified
    derived = deepcopy(analyte)
    _append(derived, suffix)
    return derived

Example:

In [None]:
analyte = nc_tpl.get_analytes()[0]; analyte

{'name': 'h3',
 'attrs': {'long_name': 'Tritium 3', 'standard_name': '3H'},
 'dtype': 'f4'}

In [None]:
analyte['attrs']['units'] = cdl['placeholder']; analyte

{'name': 'h3',
 'attrs': {'long_name': 'Tritium 3',
  'standard_name': '3H',
  'units': '_to_be_filled_in_'},
 'dtype': 'f4'}

In [None]:
suffix = cdl['vars']['suffixes']['uncertainty']; suffix

{'name': '_unc',
 'attrs': {'long_name': ' uncertainty', 'standard_name': '_uncertainty'}}

In [None]:
# The suffix is appended to the name and to each attribute it lists;
# the remaining attributes and the dtype are carried over unchanged
expected = dict(
    name='h3_unc',
    attrs=dict(
        long_name='Tritium 3 uncertainty',
        standard_name='3H_uncertainty',
        units='_to_be_filled_in_'),
    dtype='f4',
)

test_eq(derive(analyte, suffix), expected)

In [None]:
#| export
@patch
def create_variable(self:NCTemplate,
                    nc, # NetCDF file (or group) in which the variable is created
                    var:Dict, # Variable description with keys `name`, `attrs` and `dtype`
                    dtype:str|None=None, # Type of the variable; defaults to `var['dtype']`
                   ):
    "Create variable `var` along the shared dimension in `nc`, set its attributes and return `nc`"
    name = var['name']
    # An explicitly passed `dtype` takes precedence over the one declared in `var`
    # (previously `None or var['dtype']` silently ignored the argument)
    dtype = dtype or var['dtype']
    # Copy so that the attributes of the description are never shared with the caller
    attrs = var['attrs'].copy()
    nc_var = nc.createVariable(name, dtype, self.dim['name'])
    nc_var.setncatts(attrs)
    return nc

In [None]:
# Example of use: create the shared dimension, add the default `lon` variable
# described in the CDL, then print the resulting NetCDF variable
with Dataset('files/nc/test.nc', 'w', format='NETCDF4') as nc:
    # A size of `None` makes the dimension unlimited
    nc.createDimension(nc_tpl.dim['name'], None)
    nc_tpl.create_variable(nc, cdl['vars']['defaults']['lon'])
    print(nc.variables['lon'])

<class 'netCDF4._netCDF4.Variable'>
float32 lon(sample)
    long_name: Measurement longitude
    standard_name: longitude
    units: degrees_north
    axis: Y
    _CoordinateAxisType: Lon
unlimited dimensions: sample
current shape = (0,)
filling on, default _FillValue of 9.969209968386869e+36 used


In [None]:
#| export
@patch
def generate(self:NCTemplate,
             common_vars:list=['lon', 'lat', 'depth', 'time'], # Kept for backward compatibility; currently ignored (every variable under `cdl['vars']['defaults']` is created)
            ):
    "Generate the NetCDF template file `dest_dir/tpl_fname` as described in the CDL"
    fname = Path(self.dest_dir)/self.tpl_fname
    cdl_vars = self.cdl['vars']
    default_vars = list(cdl_vars['defaults'].values())

    # Read the nuclide look-up table once (not once per group) and attach the
    # `units` placeholder as a NetCDF *attribute*: set at the top level of the
    # description it was ignored by `create_variable` and never written to file.
    analytes = self.get_analytes()
    for analyte in analytes:
        analyte['attrs']['units'] = self.cdl['placeholder']

    with Dataset(fname, 'w', format='NETCDF4') as nc:
        # Create dataset attributes
        nc.setncatts(self.cdl['global_attrs'])

        # TODO: create Enum type from look-up tables, e.g.
        #biogroup_type = nc.createEnumType(np.uint8, 'biogroup_t', enum_bio_group)

        # Create shared `sample` dimension
        nc.createDimension(self.dim['name'], None)

        # Create grps
        grp_names = [grp['name'] for grp in self.cdl['grps'].values()]
        for grp_name in grp_names:
            grp = nc.createGroup(grp_name)

            # Create 'dim' variable
            self.create_variable(grp, self.dim)

            # Create default variables
            for var in default_vars:
                self.create_variable(grp, var)

            # Create group-specific variables
            grp_key = name2grp(grp_name)
            if grp_key in cdl_vars:
                for var in cdl_vars[grp_key].values():
                    self.create_variable(grp, var)

            # Create analyte variables
            for analyte in analytes:
                self.create_variable(grp, analyte)

                # Derived uncertainty and detection limit variables
                # (`derive` works on a copy, so `analyte` can be reused across groups)
                for suffix in cdl_vars['suffixes'].values():
                    self.create_variable(grp, derive(analyte, suffix))

In [None]:
# In summary, producing a MARIS NetCDF template takes two steps:
# build the templater, then generate the file
nc_tpl = NCTemplate(tpl_fname='test.nc',
                    vars_fname='./files/lut/dbo_nuclide.xlsx',
                    dest_dir='./files/nc',
                    cdl=cdl)

nc_tpl.generate()