In [None]:
#| default_exp serializers

# Serializers
> Various utilities to encode MARIS dataset as `NetCDF`, `csv`, ... formats.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from netCDF4 import Dataset
from cftime import num2date
import pandas as pd
from typing import Dict, Callable
import re
import unittest

from fastcore.basics import patch, store_attr

In [None]:
#| export
def cast_verbose(df, col):
    """
    Try to cast df column to numeric type:
        - Silently coerce to nan if not possible
        - But log when it failed

    `col` may name a regular column or an index level: the index is flattened
    with `reset_index()` before the lookup. Returns the converted `pd.Series`.
    """
    # Flatten the index once; the original did it twice (each call copies the frame)
    column = df.reset_index()[col]
    converted = pd.to_numeric(column, errors='coerce', downcast=None)

    # Vectorised counts; values that were valid before but NaN after failed to parse
    n_before = int(column.notna().sum())
    n_after = int(converted.notna().sum())
    if n_before != n_after:
        print(f'Failed to convert type of {col} in {n_before - n_after} occurences')

    return converted

In [None]:
# import netCDF4 as nc

# # Open the source NetCDF file
# src_path = 'path/to/source.nc'
# src_nc = nc.Dataset(src_path, 'r')

# # Retrieve the enumeration type from the source file
# enum_type_name = 'your_enum_type_name'  # Replace with your actual enum type name
# enum_type = src_nc.cmptypes[enum_type_name]

# # Open/Create the destination NetCDF file
# dst_path = 'path/to/destination.nc'
# dst_nc = nc.Dataset(dst_path, 'w')

# # Create the enumeration type in the destination file
# dst_nc.createEnumType(enum_type.base_datatype, enum_type_name, enum_type.enum_dict)

# # Copy other relevant data if necessary
# # ...

# # Close both files
# src_nc.close()
# dst_nc.close()

In [None]:
# when I create, copy type, update enum_dict

# df.species -> list of ids
# or just get the full list and just update the enum
# by just filtering by values

In [None]:
#| export
def to_netcdf(
    dfs:Dict[str, pd.DataFrame], # dict of Dataframes to encode with group name as key {'sediment': df_sed, ...}
    src_fname:str, # Input MARIS template NetCDF path and name
    dest_fname:str, # Output NetCDF path and name to produce
    global_attrs:Dict, # Global attributes
    units_fn:Callable, # (group, variable) -> unit look up function
):
    "Encode MARIS dataset (provided as Pandas DataFrame) to NetCDF file"
    with Dataset(src_fname, format='NETCDF4') as src, Dataset(dest_fname, 'w', format='NETCDF4') as dst:
        # copy global attributes all at once via dictionary;
        # caller-supplied attributes are applied after the template's
        dst.setncatts(src.__dict__)
        dst.setncatts(global_attrs)

        # copy dimensions (unlimited dimensions are recreated with size None)
        for name, dimension in src.dimensions.items():
            dst.createDimension(
                name, (len(dimension) if not dimension.isunlimited() else None))

        # copy groups
        for grp_name, df in dfs.items():
            # Fail with an explicit message before creating the destination group
            if grp_name not in src.groups:
                raise KeyError(f"Group '{grp_name}' is not defined in template '{src_fname}'")
            grp_src = src.groups[grp_name]
            grp_dest = dst.createGroup(grp_name)

            # Index levels count as columns too; flatten once per group, not once per variable
            df_columns = df.reset_index().columns

            # copy all variables of interest and fill them
            for name_var_src, var_src in grp_src.variables.items():
                # Only template variables that the DataFrame actually provides
                if name_var_src not in df_columns:
                    continue

                grp_dest.createVariable(name_var_src, var_src.datatype, var_src.dimensions,
                                        compression='zlib', complevel=9)

                df_sanitized = cast_verbose(df, name_var_src)
                grp_dest[name_var_src][:] = df_sanitized.values

                # copy variable attributes all at once via dictionary
                grp_dest[name_var_src].setncatts(var_src.__dict__)
                # Placeholder units in the template are resolved through the lookup function
                if hasattr(var_src, 'units') and var_src.units == '_to_be_filled_in_':
                    grp_dest[name_var_src].units = units_fn(grp_name, name_var_src)

In [None]:
#| export
class NetCDFConverter:
    "Holds the template path, output path, global attributes and unit lookup used by the patched `to_netcdf` methods"
    def __init__(self, 
                 src_fname:str, # File name and path to the MARIS CDL template
                 dest_fname:str, # Name of output file to produce
                 global_attrs:Dict, # Global attributes
                 units_fn:Callable, # (group, variable) -> unit look up function
                 ):
        # fastcore's `store_attr` saves each constructor argument as an attribute of the same name
        store_attr()
        # NOTE: `self.src` / `self.dest` are not initialised here; they are bound
        # by the `with` statement in `to_netcdf` when the datasets are opened.
        # self.src = None
        # self.dest = None

In [None]:
#| export
@patch
def to_netcdf(self:NetCDFConverter,
              dfs:Dict[str, pd.DataFrame], # Dataframes dict to encode with grp name as key {'sediment': df_sed, ...}
              ):
    "Encode `dfs` into the NetCDF file `self.dest_fname`, using `self.src_fname` as template"
    # Attribute names must match the constructor arguments saved by `store_attr`
    # (`src_fname` / `dest_fname`); the write mode is exactly 'w'.
    with Dataset(self.src_fname, format='NETCDF4') as self.src:
        with Dataset(self.dest_fname, 'w', format='NETCDF4') as self.dest:
            self.copy_global_attributes()
            self.copy_dimensions()
            self.process_groups(dfs)

In [None]:
#| export
@patch
def copy_global_attributes(self:NetCDFConverter):
    "Copy the template's global attributes to the destination, then apply `self.global_attrs`"
    # The destination handle is `self.dest` (bound in `to_netcdf`); there is no `dest_nc` attribute
    self.dest.setncatts(self.src.__dict__)
    self.dest.setncatts(self.global_attrs)

In [None]:
# Path to the MARIS NetCDF template used by the cells below
src_fname = './files/nc/maris-template.nc'

In [None]:

# The constructor requires all four arguments; it only stores them, no file is opened here.
# NOTE(review): the destination path, attributes and unit lookup are placeholders — adjust as needed.
converter = NetCDFConverter(
    src_fname,
    dest_fname='./files/nc/maris-output.nc',
    global_attrs={},
    units_fn=lambda grp_name, var_name: '',
)

In [None]:
# create NetCDFConverter
# call copy_global_attributes
# check that global_attrs have been copied


In [None]:
#| export
@patch
def copy_dimensions(self:NetCDFConverter):
    "Recreate every dimension of the source dataset in the destination, keeping unlimited ones unlimited"
    for dim_name, dim in self.src.dimensions.items():
        # netCDF4 expects a size of None for an unlimited dimension
        size = None if dim.isunlimited() else len(dim)
        self.dest.createDimension(dim_name, size)

In [None]:
#| export
@patch
def process_groups(self:NetCDFConverter, dfs):
    "Run `process_group` once per (group name, DataFrame) entry of `dfs`"
    for name_and_df in dfs.items():
        self.process_group(*name_and_df)

In [None]:
#| export
@patch
def process_group(self:NetCDFConverter, group_name, df):
    "Create `group_name` in the destination dataset and fill its variables from `df`"
    self.copy_variables(group_name, df, self.dest.createGroup(group_name))

In [None]:
#| export
@patch
def copy_variables(self:NetCDFConverter, group_name, df, group_dest):
    "Copy into `group_dest` every template variable of `group_name` that `df` provides (as column or index level)"
    src_variables = self.src.groups[group_name].variables
    # Flatten the index once: `reset_index()` copies the frame and does not depend on the loop variable
    df_columns = df.reset_index().columns
    for var_name, var_src in src_variables.items():
        if var_name in df_columns:
            self.copy_variable(var_name, var_src, df, group_dest)

In [None]:
#| export
@patch
def copy_variable(self:NetCDFConverter, var_name, var_src, df, group_dest):
    "Create `var_name` in `group_dest` with the source variable's type and dimensions, fill it from `df`, then copy its attributes"
    zlib_options = dict(compression='zlib', complevel=9)
    group_dest.createVariable(var_name, var_src.datatype, var_src.dimensions, **zlib_options)

    # Non-numeric entries are coerced to NaN (and reported) before being written
    numeric_column = self.cast_verbose_rf(df, var_name)
    dest_var = group_dest[var_name]
    dest_var[:] = numeric_column.values

    self.copy_variable_attributes(var_name, var_src, group_dest)

In [None]:
#| export
@patch
def copy_variable_attributes(self:NetCDFConverter, var_name, var_src, group_dest):
    "Copy all attributes of `var_src` onto the destination variable, resolving placeholder units via `self.units_fn`"
    dest_var = group_dest[var_name]
    dest_var.setncatts(var_src.__dict__)

    # Last path component of the destination group is used as the group name
    group_name = group_dest.path.rsplit('/', 1)[-1]

    has_placeholder_units = hasattr(var_src, 'units') and var_src.units == '_to_be_filled_in_'
    if not has_placeholder_units:
        return
    dest_var.units = self.units_fn(group_name, var_name)

In [None]:
#| export
@patch
def cast_verbose_rf(self:NetCDFConverter, 
                    df, 
                    col):
    """
    Try to cast df column to numeric type:
        - Silently coerce to nan if not possible
        - But log when it failed

    Delegates to the module-level `cast_verbose` so the coercion and logging
    logic lives in one place. Returns the converted `pd.Series`.
    """
    return cast_verbose(df, col)

In [None]:
# converter = NetCDFConverter(fname_cdl, fname_output, global_attrs, units_fn)
# converter.to_netcdf(dfs)

In [None]:
# def to_csv(
#     fname_nc:str,
#     fname_output:str):
#     "Convert MARIS NetCDF filer to `.csv`"
#     fname_nc = './files/nc/tepco-sediments.nc'
#     data_dict = {}
#     with Dataset(fname_nc) as nc:
#         # global attrs
#         for name in nc.ncattrs():
#             pass
#             #print(name)
#         # list of vars   
#         for name in nc.variables:
#             #print(name)
#             variable = nc[name]
#             data_dict[name] = variable[:]
#     return pd.DataFrame(data_dict)

# #df = to_csv('./files/nc/tepco-sediments.nc', '')

In [None]:
# fname_nc = '../../_data/output/helcom.nc'

Questions:
1. all smptype together
2. unit for activity, but sometimes dl or uncertainty are in different units...

In [None]:
#|eval: false
# data = {}
# units = {}
# with Dataset(fname_nc) as nc:
#     #print(nc.ncattrs())
#     print(nc.groups.keys())
#     sw_grp = nc.groups['seawater']
#     for var in sw_grp.variables:
#         if hasattr(sw_grp.variables[var], 'units'):
#             units[var] = sw_grp.variables[var].units
#         data[var] = sw_grp.variables[var][:]

In [None]:
#|eval: false
# df = pd.DataFrame(data); df

In [None]:
#|eval: false
# Decode numeric timestamps, expressed in seconds since 1970-01-01, with cftime's `num2date`.
format_time = lambda x: num2date(x, units="seconds since 1970-01-01 00:00:00.0")
# NOTE(review): `df` is only built in a commented-out cell above, so this cell raises
# NameError on a fresh kernel unless that cell is restored — confirm whether it should stay active.
df['time'] = df['time'].apply(format_time)

In [None]:
#|eval: false
# df_nuc = df.set_index(['sample', 'lon', 'lat', 'depth', 'time']); df_nuc.head()

In [None]:
#|eval: false
# df_nuc.columns

In [None]:
#|eval: false
# def get_multi_index(colnames):
#     arr = []
#     for colname in colnames:
#         if re.search('_unc', colname):
#             arr.append((re.split('_unc', colname)[0], 'uncertainty'))
#         elif re.search('_dl', colname):
#             arr.append((re.split('_dl', colname)[0], 'detection'))
#         else:
#             arr.append((colname, 'activity'))
#     return pd.MultiIndex.from_tuples(arr)

In [None]:
#|eval: false
# get_multi_index(df_nuc.columns)

In [None]:
#|eval: false
# df_nuc.columns = get_multi_index(df_nuc.columns)

In [None]:
#|eval: false
# df_sw = df_nuc.stack(level=0).reset_index().rename(columns={'level_5': 'nucl'}); df_sw

In [None]:
#|eval: false
# df.head()

In [None]:
#|eval: false
# df_sw['unit'] = df_sw['nucl'].replace(units); df_sw