In [None]:
#| default_exp serializers

# Serializers
> Various utilities to encode MARIS dataset as `NetCDF`, `csv`, ... formats.

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
import netCDF4
from netCDF4 import Dataset
import pandas as pd
from typing import Dict, Callable
import pandas as pd
import numpy as np
from fastcore.basics import patch, store_attr
import fastcore.all as fc
import os

from marisco.configs import (
    NC_DTYPES, 
    NC_VARS, 
    NC_DIM,
    NC_GROUPS,
    lut_path, 
    Enums,
    nc_tpl_path
)

In [None]:
#| exports
class NetCDFEncoder:
    "MARIS NetCDF encoder."
    def __init__(self, 
                 dfs:dict[pd.DataFrame], # dict of Dataframes to encode with group name as key {'sediment': df_sed, ...}
                 dest_fname:str, # Name of output file to produce
                 global_attrs:Dict, # Global attributes
                 src_fname:str = nc_tpl_path(), # File name and path to the MARIS CDL template
                #  enums_xtra:Dict={}, # Enumeration types to overwrite
                 verbose:bool=False, # Print currently written NetCDF group and variable names
                 ):
        store_attr()
        self.enum_dtypes = {}
        self.nc_to_cols = {v:k for k,v in NC_VARS.items()}


In [1010]:
df_seawater = pd.DataFrame({
    'ID': [0, 1, 2], 
    'LON': [141, 142, 143], 
    'LAT': [37.3, 38.3, 39.3], 
    'TIME': [1234, 1235, 1236], 
    'NUCLIDE': [1, 2, 3],
    'VALUE': [0.1, 1.1, 2.1], 
    'AREA': [2374, 2379, 2401],
    })

df_biota = pd.DataFrame({
    'ID': [0, 1, 2, 3], 
    'LON': [141, 142, 143, 144], 
    'LAT': [37.3, 38.3, 39.3, 40.3], 
    'TIME': [1234, 1235, 1236, 1237], 
    'NUCLIDE': [1, 2, 3, 3],
    'VALUE': [0.1, 1.1, 2.1, 3.1], 
    'SPECIES': [1, 2, 3, 3]
    })

dfs = {'SEAWATER': df_seawater, 'BIOTA': df_biota}
attrs = {'id': '123', 'title': 'Test title', 'summary': 'Summary test'}
src = './files/nc/maris-template.nc'
dest = './files/nc/encoding-test.nc'

In [1011]:
#| exports
@patch 
def copy_global_attributes(self:NetCDFEncoder):
    "Update NetCDF template global attributes as specified by `global_attrs` argument."
    self.dest.setncatts(self.src.__dict__)
    for k, v in self.global_attrs.items(): self.dest.setncattr(k, v)

In [1012]:
# @patch
# def copy_dimensions(self:NetCDFEncoder):
#     for name, dimension in self.src.dimensions.items():
#         self.dest.createDimension(name, (len(dimension) if not dimension.isunlimited() else None))

In [1025]:
@patch
def copy_dimensions(self:NetCDFEncoder, grp_dest):
    "Copy dimensions to root and all groups from template."
    # # Copy root dimensions
    # for name, dimension in self.src.dimensions.items():
    #     self.dest.createDimension(name, (len(dimension) if not dimension.isunlimited() else None))
    
    # Copy dimensions to each group
    for grp_name, grp_src in self.src.groups.items():
        # grp_dest = self.dest.createGroup(grp_name)
        for name, dimension in grp_src.dimensions.items():
            group_dest.createDimension(name, (len(dimension) if not dimension.isunlimited() else None))

In [1026]:
#| exports
@patch
def process_groups(self:NetCDFEncoder):
    for grp_name, df in self.dfs.items():
        self.process_group(NC_GROUPS[grp_name], df)

In [1027]:
#| exports
@patch
def process_group(self:NetCDFEncoder, grp_name, df):
    grp_dest = self.dest.createGroup(grp_name)
    self.copy_dimensions(grp_dest)
    self.copy_variables(grp_name, df, grp_dest)

In [1028]:
#| exports
@patch
def copy_variables(self:NetCDFEncoder, grp_name, df, grp_dest):
    cols = [NC_VARS[col] for col in df.columns if col in NC_VARS]
    for var_name, var_src in self.src.groups[grp_name].variables.items():
        if var_name in cols: 
            self.copy_variable(var_name, var_src, df, grp_dest)

In [1029]:
#| exports
@patch
def copy_variable(self:NetCDFEncoder, var_name, var_src, df, grp_dest):
    dtype_name = var_src.datatype.name
    enums_src = self.src.enumtypes
    if self.verbose: 
        print(80*'-')
        print(f'Group: {grp_dest.name}, Variable: {var_name}')
    self._create_and_copy_variable(var_name, var_src, df, grp_dest, dtype_name)
    self.copy_variable_attributes(var_name, var_src, grp_dest)

In [1030]:
#| exports
@patch
def _create_and_copy_variable(self:NetCDFEncoder, var_name, var_src, df, grp_dest, dtype_name):
    variable_type = self.enum_dtypes.get(dtype_name, var_src.datatype)
    grp_dest.createVariable(var_name, variable_type, NC_DIM, compression='zlib', complevel=9)            
    isNotEnum = type(variable_type) != netCDF4._netCDF4.EnumType
    values = df[self.nc_to_cols[var_name]].values
    grp_dest[var_name][:] = values if isNotEnum else self.sanitize_if_enum_and_nan(values)

In [1031]:
#| exports
@patch
def sanitize_if_enum_and_nan(self:NetCDFEncoder, values, fill_value=-1):
    values[np.isnan(values)] = int(fill_value)
    values = values.astype(int)
    return values

In [1032]:
#| exports
@patch
def copy_enum_type(self:NetCDFEncoder, dtype_name):
    # if enum type not already created
    if dtype_name not in self.enum_types:
        enum_info = self.src.enumtypes[dtype_name]
        # If a subset of an enum is defined in enums_xtra (typically for the lengthy species_t)
        if enum_info.name in self.enums_xtra:
            # add "not applicable"
            enum_info.enum_dict = self.enums_xtra[enum_info.name]
            enum_info.enum_dict['Not applicable'] = -1 # TBD
        self.enum_types[dtype_name] = self.dest.createEnumType(enum_info.dtype, 
                                                               enum_info.name, 
                                                               enum_info.enum_dict)

In [1033]:
#| exports
@patch
def copy_variable_attributes(self:NetCDFEncoder, var_name, var_src, grp_dest):
    grp_dest[var_name].setncatts(var_src.__dict__)

In [1034]:
#| exports
@patch
def retrieve_all_cols(self:NetCDFEncoder, 
                      dtypes=NC_DTYPES
                      ):
    "Retrieve all unique columns from the dict of dataframes." 
    return list(set(col for df in self.dfs.values() for col in df.columns if col in dtypes.keys()))

In [1035]:
#| exports
@patch
def create_enums(self:NetCDFEncoder):
    cols = self.retrieve_all_cols()
    enums = Enums(lut_src_dir=lut_path())
    for col in cols:
        name = NC_DTYPES[col]['name']
        dtype = self.dest.createEnumType(np.int64, name, enums.types[col])
        self.enum_dtypes[name] = dtype

## Encode

In [1036]:
#| exports
@patch
def encode(self:NetCDFEncoder):
    "Encode MARIS NetCDF based on template and dataframes."
    with Dataset(self.src_fname, format='NETCDF4') as self.src, Dataset(self.dest_fname, 'w', format='NETCDF4') as self.dest:
        self.copy_global_attributes()
        # self.copy_dimensions()
        self.create_enums()
        self.process_groups()

In [1037]:
#|eval: false
encoder = NetCDFEncoder(dfs, 
                        dest_fname=dest, 
                        global_attrs=attrs,
                        verbose=False
                        )
encoder.encode()

RuntimeError: NetCDF: String match to name in use

In [None]:
# Test that global attributes are copied
# with Dataset(dest, 'r', format='NETCDF4') as nc:
#     for k, v in {'id': '123', 'title': 'Test title', 'summary': 'Summary test'}.items():
#         fc.test_eq(getattr(nc, k), v)

In [None]:
# Test that dimension is `sample` and unlimited
# with Dataset(dest, 'r', format='NETCDF4') as nc:
#     fc.test_eq('sample' in nc.dimensions, True)
#     fc.test_eq(nc.dimensions['sample'].isunlimited(), True)

In [None]:
# Test that groups are created
# with Dataset(dest, 'r', format='NETCDF4') as nc:
#     fc.test_eq(nc.groups.keys(), ['seawater', 'biota'])

In [None]:
# Test that groups are created
# with Dataset(dest, 'r', format='NETCDF4') as nc:
#     fc.test_eq(nc.groups.keys(), ['seawater', 'biota'])

In [None]:
# Test that correct variables are created in groups
# with Dataset(dest, 'r', format='NETCDF4') as nc:
#     fc.test_eq(nc['biota'].variables.keys(), 
#                ['sample', 'lon', 'lat', 'time', 'species', 'i131', 'i131_dl', 'i131_unit'])
    
#     fc.test_eq(nc['seawater'].variables.keys(), 
#                ['sample', 'lon', 'lat', 'time', 'i131', 'i131_dl', 'i131_unit'])

In [None]:
# Test that correct variables are created in groups
# with Dataset(dest, 'r', format='NETCDF4') as nc:
#     print(nc.dimensions.items())
#     print(nc['biota'].dimensions.items())
#     print(nc['seawater'].dimensions.items())

## OpenRefine CSV encoder

In [None]:
#| exports
class OpenRefineCsvEncoder:
    "OpenRefine CSV from NetCDF."
    def __init__(self, 
                 dfs:dict[pd.DataFrame], # dict of Dataframes to encode with group name as key {'sediment': df_sed, ...}
                 dest_fname:str, # Name of output file to produce
                 ref_id = -1, # ref_id to include 
                 verbose:bool=False, # Print 
                 ):
        store_attr()

In [None]:
#| exports
@patch
def process_groups_to_csv(self:OpenRefineCsvEncoder):
    for grp_name, df in self.dfs.items():
        # include ref_id
        if self.ref_id != -1:
            df['ref_id'] = self.ref_id
        self.process_group_to_csv(grp_name, df)

In [None]:
#| exports
@patch
def process_group_to_csv(self:OpenRefineCsvEncoder, group_name, df):
    filename, file_extension=os.path.splitext(self.dest_fname)
    path = filename + '_' + group_name + file_extension
    df.to_csv( path_or_buf= path, sep=',', index=False)

In [None]:
#| exports
@patch
def encode(self:OpenRefineCsvEncoder):
    "Encode OpenRefine CSV based on dataframes from NetCDF."
    # Include ref_id
    
    # Process to csv
    self.process_groups_to_csv()

In [None]:
#|eval: false
dest = '../files/csv/encoding-test.csv'

encoder = OpenRefineCsvEncoder(dfs,  dest_fname=dest)
encoder.encode()