In [None]:
#| default_exp serializers

# Serializers
> Various utilities to encode MARIS datasets in `NetCDF`, `csv`, ... formats.

In [None]:
#| export
from netCDF4 import Dataset
from cftime import num2date, date2num
import pandas as pd
from typing import Dict, Callable
import re

In [None]:
#| export
def to_netcdf(
    dfs:Dict[str, pd.DataFrame], # dict of Dataframes to encode with group name as key {'sediment': df_sed, ...}
    fname_cdl:str, # File name and path to the MARIS CDL template
    fname_output:str, # Name of output file to produce
    cfgs:Dict, # Config file containing global attributes (read from its 'global_attr' key)
    units_fn:Callable, # (group, variable) -> unit look up function
):
    """Encode MARIS dataset (provided as Pandas DataFrame) to NetCDF file.

    Global attributes and dimensions are copied from the template, then for each
    group in `dfs` every template variable that is also a column of the
    (index-reset) DataFrame is created, filled and given the template attributes.
    Values that cannot be converted to numbers are written as missing, and the
    share of values lost that way is printed per group.

    Raises `KeyError` if a group name in `dfs` does not exist in the template.
    """
    with Dataset(fname_cdl, format='NETCDF4') as src, Dataset(fname_output, 'w', format='NETCDF4') as dst:
        # copy global attributes all at once via dictionary
        dst.setncatts(src.__dict__)
        # config attributes are applied last so they override the template ones
        dst.setncatts(cfgs['global_attr'])
        # copy dimensions
        for name, dimension in src.dimensions.items():
            dst.createDimension(
                name, (len(dimension) if not dimension.isunlimited() else None))

        # copy groups
        for grp_name, df in dfs.items():
            if grp_name not in src.groups:
                raise KeyError(f"Group '{grp_name}' not found in template {fname_cdl}")
            grp_src = src.groups[grp_name]
            grp_dest = dst.createGroup(grp_name)

            # Index levels become regular columns; done once per group instead of
            # once per variable access.
            df_flat = df.reset_index()

            n_before = 0
            n_after = 0

            # copy all variables of interest and fill them
            for name_var_src, var_src in grp_src.variables.items():
                if name_var_src not in df_flat.columns:
                    continue
                var_dest = grp_dest.createVariable(name_var_src, var_src.datatype, var_src.dimensions,
                                                   compression='zlib', complevel=9)
                # Sanitize: anything not parseable as a number is coerced to NaN
                col = df_flat[name_var_src]
                n_before += int(col.notna().sum())
                col_sanitized = pd.to_numeric(col, errors='coerce', downcast=None)
                n_after += int(col_sanitized.notna().sum())
                # fill variable
                var_dest[:] = col_sanitized.values
                # copy variable attributes all at once via dictionary
                var_dest.setncatts(var_src.__dict__)
                # placeholder units in the template are resolved through `units_fn`
                if getattr(var_src, 'units', None) == '_to_be_filled_in_':
                    var_dest.units = units_fn(grp_name, name_var_src)

            # Guard against groups with no matching / non-null values, which
            # would otherwise raise ZeroDivisionError.
            pct_discarded = 100*(n_before - n_after)/n_before if n_before else 0.0
            print(f'% of discarded data for grp {grp_name}: {pct_discarded}')

In [None]:
#| export
def to_csv(
    fname_nc:str, # Path of the MARIS NetCDF file to read
    fname_output:str): # Name of output file (not written yet, see note below)
    """Convert MARIS NetCDF file to `.csv`.

    Reads every root-level variable of `fname_nc` and returns them as the
    columns of a DataFrame.
    """
    # NOTE(review): `fname_output` is accepted but nothing is written to disk yet;
    # the function only returns the DataFrame. Confirm the intended CSV layout
    # before adding the export step.
    with Dataset(fname_nc) as nc:
        # `[:]` pulls the values out while the file is still open
        data_dict = {name: nc[name][:] for name in nc.variables}
    return pd.DataFrame(data_dict)

#df = to_csv('./files/nc/tepco-sediments.nc', '')

In [None]:
# Path of the NetCDF file explored in the cells below.
fname_nc = '../../_data/output/helcom.nc'

Questions:
1. Should all sample types (`smptype`) be exported together?
2. A unit is defined for the activity, but the detection limit (`dl`) or the uncertainty are sometimes expressed in different units...

In [None]:
#|eval: false
# Collect every variable of the 'seawater' group into plain dicts:
#   - `data`:  variable name -> values
#   - `units`: variable name -> its `units` attribute (only when it has one)
data = {}
units = {}
with Dataset(fname_nc) as nc:
    #print(nc.ncattrs())
    # Show which groups the file contains
    print(nc.groups.keys())
    sw_grp = nc.groups['seawater']
    for var in sw_grp.variables:
        if hasattr(sw_grp.variables[var], 'units'):
            units[var] = sw_grp.variables[var].units
        # `[:]` slices out all values while the file is still open
        data[var] = sw_grp.variables[var][:]

dict_keys(['seawater', 'sediment', 'biota'])


In [None]:
#|eval: false
# Assemble the collected variables into a DataFrame (one column per variable).
df = pd.DataFrame(data); df

Unnamed: 0,sample,lon,lat,depth,time,h3,h3_unc,k40,k40_unc,mn54,...,pu239,pu239_unc,pu240,pu240_unc,am241,am241_unc,cm242,cm242_unc,cm244,cm244_unc
0,0,11.983300,54.466702,17.0,4.522176e+08,,,,,,...,,,,,,,,,,
1,1,11.083300,54.599998,23.0,4.522176e+08,,,,,,...,,,,,,,,,,
2,2,10.833300,54.866699,38.0,4.522176e+08,,,,,,...,,,,,,,,,,
3,3,12.683300,54.950001,23.0,4.522176e+08,,,,,,...,,,,,,,,,,
4,4,11.050000,55.383331,22.0,4.522176e+08,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4571,4581,24.155001,59.380001,21.0,1.535069e+09,,,2160.000000,4.00,,...,,,,,,,,,,
4572,4582,23.288000,59.305000,88.0,1.535328e+09,,,2380.000000,4.00,,...,,,,,,,,,,
4573,4583,11.245000,58.603298,15.5,1.536278e+09,1000.0,,,,,...,,,,,,,,,,
4574,4584,21.079500,59.033298,172.0,1.536970e+09,1002.0,,2821.310059,7.85,,...,,,,,,,,,,


In [None]:
#|eval: false
# Convert the numeric `time` column to date objects, reading the values as
# seconds elapsed since 1970-01-01.
# NOTE(review): `df['time']` is overwritten in place, so this cell is presumably
# not safe to run twice without rebuilding `df` first — confirm.
format_time = lambda x: num2date(x, units="seconds since 1970-01-01 00:00:00.0")
df['time'] = df['time'].apply(format_time)

In [None]:
#|eval: false
# Move the sample descriptors into the index so that only the measurement
# columns remain as columns.
df_nuc = df.set_index(['sample', 'lon', 'lat', 'depth', 'time']); df_nuc.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,h3,h3_unc,k40,k40_unc,mn54,co60,co60_unc,sr89,sr90,sr90_unc,...,pu239,pu239_unc,pu240,pu240_unc,am241,am241_unc,cm242,cm242_unc,cm244,cm244_unc
sample,lon,lat,depth,time,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0,11.9833,54.466702,17.0,1984-05-01 00:00:00,,,,,,,,,27.0,10.0,...,,,,,,,,,,
1,11.0833,54.599998,23.0,1984-05-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2,10.8333,54.866699,38.0,1984-05-01 00:00:00,,,,,,,,,28.0,10.0,...,,,,,,,,,,
3,12.6833,54.950001,23.0,1984-05-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
4,11.05,55.383331,22.0,1984-05-01 00:00:00,,,,,,,,,28.0,10.0,...,,,,,,,,,,


In [None]:
#|eval: false
# List the column names left after indexing.
df_nuc.columns

Index(['h3', 'h3_unc', 'k40', 'k40_unc', 'mn54', 'co60', 'co60_unc', 'sr89',
       'sr90', 'sr90_unc', 'zr95', 'zr95_unc', 'nb95', 'nb95_unc', 'tc99',
       'tc99_unc', 'ru103', 'ru103_unc', 'ru106', 'ru106_unc', 'ag110m',
       'ag110m_unc', 'sb125', 'sb125_unc', 'cs134', 'cs134_unc', 'cs137',
       'cs137_unc', 'ba140', 'ba140_unc', 'ce144', 'ce144_unc', 'pb210',
       'pb210_unc', 'po210', 'po210_unc', 'u234', 'u238', 'np237', 'np237_unc',
       'pu238', 'pu238_unc', 'pu239', 'pu239_unc', 'pu240', 'pu240_unc',
       'am241', 'am241_unc', 'cm242', 'cm242_unc', 'cm244', 'cm244_unc'],
      dtype='object')

In [None]:
#|eval: false
def get_multi_index(colnames):
    """Turn flat column names into a two-level `pd.MultiIndex`.

    Each name becomes a `(base, kind)` tuple:
      - a name containing `_unc` -> (text before `_unc`, 'uncertainty')
      - otherwise, a name containing `_dl` -> (text before `_dl`, 'detection')
      - anything else -> (name unchanged, 'activity')
    """
    # Checked in order: `_unc` takes precedence over `_dl`.
    markers = (('_unc', 'uncertainty'), ('_dl', 'detection'))

    def _classify(label):
        # Return the (base, kind) tuple for a single column name.
        for marker, kind in markers:
            if marker in label:
                return (label.split(marker)[0], kind)
        return (label, 'activity')

    return pd.MultiIndex.from_tuples([_classify(label) for label in colnames])

In [None]:
#|eval: false
# Preview the MultiIndex derived from the current column names.
get_multi_index(df_nuc.columns)

MultiIndex([(    'h3',    'activity'),
            (    'h3', 'uncertainty'),
            (   'k40',    'activity'),
            (   'k40', 'uncertainty'),
            (  'mn54',    'activity'),
            (  'co60',    'activity'),
            (  'co60', 'uncertainty'),
            (  'sr89',    'activity'),
            (  'sr90',    'activity'),
            (  'sr90', 'uncertainty'),
            (  'zr95',    'activity'),
            (  'zr95', 'uncertainty'),
            (  'nb95',    'activity'),
            (  'nb95', 'uncertainty'),
            (  'tc99',    'activity'),
            (  'tc99', 'uncertainty'),
            ( 'ru103',    'activity'),
            ( 'ru103', 'uncertainty'),
            ( 'ru106',    'activity'),
            ( 'ru106', 'uncertainty'),
            ('ag110m',    'activity'),
            ('ag110m', 'uncertainty'),
            ( 'sb125',    'activity'),
            ( 'sb125', 'uncertainty'),
            ( 'cs134',    'activity'),
            ( 'cs134', 'u

In [None]:
#|eval: false
# Replace the flat column names with the two-level (name, kind) index.
df_nuc.columns = get_multi_index(df_nuc.columns)

In [None]:
#|eval: false
# Stack the first column level into rows, giving a long table with one row per
# (sample, name) pair. The stacked level has no name, so `reset_index` exposes
# it as 'level_5' (it follows the five index levels); rename it to 'nucl'.
df_sw = df_nuc.stack(level=0).reset_index().rename(columns={'level_5': 'nucl'}); df_sw

Unnamed: 0,sample,lon,lat,depth,time,nucl,activity,uncertainty
0,0,11.9833,54.466702,17.0,1984-05-01 00:00:00,cs137,17.750000,10.00
1,0,11.9833,54.466702,17.0,1984-05-01 00:00:00,sr90,27.000000,10.00
2,1,11.0833,54.599998,23.0,1984-05-01 00:00:00,cs137,36.650002,10.00
3,2,10.8333,54.866699,38.0,1984-05-01 00:00:00,cs137,40.500000,10.00
4,2,10.8333,54.866699,38.0,1984-05-01 00:00:00,sr90,28.000000,10.00
...,...,...,...,...,...,...,...,...
10657,4584,21.0795,59.033298,172.0,2018-09-15 00:00:00,sr90,6.795000,4.80
10658,4585,19.5797,61.083302,125.0,2018-09-17 00:00:00,cs137,20.680000,7.50
10659,4585,19.5797,61.083302,125.0,2018-09-17 00:00:00,h3,1002.000000,
10660,4585,19.5797,61.083302,125.0,2018-09-17 00:00:00,k40,1950.165039,7.85


In [None]:
#|eval: false
# Preview the wide frame for comparison with the long `df_sw` table.
df.head()

Unnamed: 0,sample,lon,lat,depth,time,h3,h3_unc,k40,k40_unc,mn54,...,pu239,pu239_unc,pu240,pu240_unc,am241,am241_unc,cm242,cm242_unc,cm244,cm244_unc
0,0,11.9833,54.466702,17.0,1984-05-01 00:00:00,,,,,,...,,,,,,,,,,
1,1,11.0833,54.599998,23.0,1984-05-01 00:00:00,,,,,,...,,,,,,,,,,
2,2,10.8333,54.866699,38.0,1984-05-01 00:00:00,,,,,,...,,,,,,,,,,
3,3,12.6833,54.950001,23.0,1984-05-01 00:00:00,,,,,,...,,,,,,,,,,
4,4,11.05,55.383331,22.0,1984-05-01 00:00:00,,,,,,...,,,,,,,,,,


In [None]:
#|eval: false
# Attach a unit to each row by replacing the `nucl` value with its entry in `units`.
# NOTE(review): `Series.replace` leaves values without a matching key unchanged,
# so such rows would carry the name itself as unit — confirm every name has one.
df_sw['unit'] = df_sw['nucl'].replace(units); df_sw

Unnamed: 0,sample,lon,lat,depth,time,nucl,activity,uncertainty,unit
0,0,11.9833,54.466702,17.0,1984-05-01 00:00:00,cs137,17.750000,10.00,Bq/m³
1,0,11.9833,54.466702,17.0,1984-05-01 00:00:00,sr90,27.000000,10.00,Bq/m³
2,1,11.0833,54.599998,23.0,1984-05-01 00:00:00,cs137,36.650002,10.00,Bq/m³
3,2,10.8333,54.866699,38.0,1984-05-01 00:00:00,cs137,40.500000,10.00,Bq/m³
4,2,10.8333,54.866699,38.0,1984-05-01 00:00:00,sr90,28.000000,10.00,Bq/m³
...,...,...,...,...,...,...,...,...,...
10657,4584,21.0795,59.033298,172.0,2018-09-15 00:00:00,sr90,6.795000,4.80,Bq/m³
10658,4585,19.5797,61.083302,125.0,2018-09-17 00:00:00,cs137,20.680000,7.50,Bq/m³
10659,4585,19.5797,61.083302,125.0,2018-09-17 00:00:00,h3,1002.000000,,Bq/m³
10660,4585,19.5797,61.083302,125.0,2018-09-17 00:00:00,k40,1950.165039,7.85,Bq/m³
