In [None]:
#| default_exp serializers

# Serializers
> Various utilities to encode a MARIS dataset in `NetCDF`, `csv`, ... formats.

In [None]:
#| export
from netCDF4 import Dataset
import pandas as pd
from typing import Dict, Callable

In [None]:
#| export
def to_netcdf3(
    df:pd.DataFrame, # Dataframe to encode
    fname_cdl:str, # File name and path to the MARIS CDL template
    fname_output:str, # Name of output file to produce
    cfgs:Dict, # Config file containing global attributes
):
    """Encode MARIS dataset (provided as Pandas DataFrame) to a `NETCDF3_CLASSIC` file.

    Global attributes and dimensions are copied from the template, then
    overridden/extended with `cfgs['global_attr']`. Every template variable whose
    name matches a column of `df.reset_index()` is created in the output, filled
    with the column coerced to numeric (non-numeric entries become NaN) and given
    the template's variable attributes plus a `unit` attribute taken from
    `cfgs['global_attr']['unit']`.

    Prints the percentage of non-null values lost by the numeric coercion.
    """
    # The index is turned into regular columns once, so that index levels can be
    # matched against template variable names like any other column.
    df_flat = df.reset_index()
    with Dataset(fname_cdl) as src, Dataset(fname_output, 'w', format='NETCDF3_CLASSIC') as dst:
        # copy global attributes all at once via dictionary
        dst.setncatts(src.__dict__)
        # dataset-specific attributes take precedence over the template's
        dst.setncatts(cfgs['global_attr'])
        # copy dimensions (unlimited ones stay unlimited)
        for name, dimension in src.dimensions.items():
            dst.createDimension(
                name, (len(dimension) if not dimension.isunlimited() else None))

        n_before = 0
        n_after = 0
        # copy all variables of interest and fill them
        for name_var_src, var_src in src.variables.items():
            if name_var_src not in df_flat.columns:
                continue
            var_dst = dst.createVariable(name_var_src, var_src.datatype, var_src.dimensions,
                                         compression='zlib', complevel=9)
            # Sanitize: anything that cannot be parsed as a number becomes NaN
            col = df_flat[name_var_src]
            n_before += int(col.notna().sum())
            col_sanitized = pd.to_numeric(col, errors='coerce', downcast=None)
            n_after += int(col_sanitized.notna().sum())
            # fill variable
            var_dst[:] = col_sanitized.values
            # copy variable attributes all at once via dictionary
            var_dst.setncatts(var_src.__dict__)
            var_dst.unit = cfgs['global_attr']['unit']
        # Guard against an empty/all-null input: nothing to discard means 0 %.
        pct_discarded = 100*(n_before - n_after)/n_before if n_before else 0.0
        print(f'% of discarded data: {pct_discarded}')

In [None]:
#| export
def to_netcdf(
    dfs:Dict[str, pd.DataFrame], # dict of Dataframes to encode with group name as key {'sediment': df_sed, ...}
    fname_cdl:str, # File name and path to the MARIS CDL template
    fname_output:str, # Name of output file to produce
    cfgs:Dict, # Config file containing global attributes
    units_fn:Callable, # (group, variable) -> unit look up function
):
    """Encode MARIS dataset (provided as Pandas DataFrames, one per group) to a `NETCDF4` file.

    Global attributes and dimensions are copied from the template, then
    overridden/extended with `cfgs['global_attr']`. For each `(group name, df)`
    pair a group is created in the output; every variable of the template group
    of the same name that matches a column of `df.reset_index()` is created,
    filled with the column coerced to numeric (non-numeric entries become NaN)
    and given the template's variable attributes plus a `unit` attribute
    obtained from `units_fn(group, variable)`.

    Prints, per group, the percentage of non-null values lost by the coercion.
    A group name missing from the template raises `KeyError`.
    """
    with Dataset(fname_cdl, format='NETCDF4') as src, Dataset(fname_output, 'w', format='NETCDF4') as dst:
        # copy global attributes all at once via dictionary
        dst.setncatts(src.__dict__)
        # dataset-specific attributes take precedence over the template's
        dst.setncatts(cfgs['global_attr'])
        # copy dimensions (unlimited ones stay unlimited)
        for name, dimension in src.dimensions.items():
            dst.createDimension(
                name, (len(dimension) if not dimension.isunlimited() else None))

        # copy groups
        for grp_name, df in dfs.items():
            # TBD: asserting group name
            grp_dest = dst.createGroup(grp_name)
            grp_src = src.groups[grp_name]
            # The index is turned into regular columns once per group, so index
            # levels can be matched against template variable names.
            df_flat = df.reset_index()

            n_before = 0
            n_after = 0

            # copy all variables of interest and fill them
            for name_var_src, var_src in grp_src.variables.items():
                if name_var_src not in df_flat.columns:
                    continue
                var_dst = grp_dest.createVariable(name_var_src, var_src.datatype, var_src.dimensions,
                                                  compression='zlib', complevel=9)
                # Sanitize: anything that cannot be parsed as a number becomes NaN
                col = df_flat[name_var_src]
                n_before += int(col.notna().sum())
                col_sanitized = pd.to_numeric(col, errors='coerce', downcast=None)
                n_after += int(col_sanitized.notna().sum())
                # fill variable
                var_dst[:] = col_sanitized.values
                # copy variable attributes all at once via dictionary
                var_dst.setncatts(var_src.__dict__)
                var_dst.unit = units_fn(grp_name, name_var_src)
            # Guard against an empty/all-null group: nothing to discard means 0 %.
            pct_discarded = 100*(n_before - n_after)/n_before if n_before else 0.0
            print(f'% of discarded data for grp {grp_name}: {pct_discarded}')

In [None]:
#| export
def to_csv(
    fname_nc:str, # File name and path of the MARIS NetCDF file to read
    fname_output:str): # Name of the `.csv` file to produce (not written yet, see note)
    """Convert MARIS NetCDF file to `.csv`

    Reads every variable listed in `nc.variables` of `fname_nc` and returns them
    as the columns of a `pd.DataFrame`.

    NOTE(review): `fname_output` is accepted but nothing is written to disk yet;
    the `.csv` layout still has to be decided before adding the export step.
    """
    # Read from the file the caller asked for (previously a hard-coded path
    # silently replaced the `fname_nc` argument).
    with Dataset(fname_nc) as nc:
        # `[:]` loads the data while the file is still open
        data_dict = {name: nc[name][:] for name in nc.variables}
    return pd.DataFrame(data_dict)

#df = to_csv('./files/nc/tepco-sediments.nc', '')

In [None]:
#df.head()

In [None]:
import re

In [None]:
# Scratch check: `.group()` returns the suffix matched in a `_dl` column name.
re.search('_unc|_dl', 'mn54_dl').group()

'_dl'

In [None]:
# Scratch check: `.group(0)` is the whole match, i.e. the matched suffix.
re.search('_unc|_dl', 'mn54_dl').group(0)

'_dl'

In [None]:
# Scratch check: splitting on the suffix yields the base name first, then an empty string.
'pu239_240_tot_dl'.split('_dl')

['pu239_240_tot', '']

In [None]:
# Scratch check: one pattern covers both suffixes; element 0 is the base name.
re.split('_unc|_dl', 'mn54_dl')

['mn54', '']

In [None]:
def get_multi_index(colnames):
    """Build a two-level column `pd.MultiIndex` from flat column names.

    Level 0 is the base name, level 1 tells what the column holds:

    - `<base>_unc` -> `(base, 'un')`
    - `<base>_dl`  -> `(base, 'dl')`
    - anything else -> `(name, name)`

    The markers are only recognised at the very END of the name, so a column
    such as `x_dlevel` is left untouched instead of being truncated to `x`.
    `_unc` is tested before `_dl`.

    `colnames` is an iterable of strings; non-string entries raise `TypeError`.
    """
    # (suffix, level-1 label) pairs, in the order they are tested
    suffix_labels = (('_unc', 'un'), ('_dl', 'dl'))
    pairs = []
    for colname in colnames:
        for suffix, label in suffix_labels:
            # `\Z` anchors the marker to the end of the column name
            if re.search(suffix + r'\Z', colname):
                pairs.append((colname[:-len(suffix)], label))
                break
        else:
            # no marker: the column is its own sub-level
            pairs.append((colname, colname))
    return pd.MultiIndex.from_tuples(pairs)

In [None]:
# NOTE(review): in this view `nucl_cols` is only assigned in a later cell, so running
# the notebook top to bottom on a fresh kernel would hit a NameError here — confirm cell order.
get_multi_index(nucl_cols)

MultiIndex([(           'h3',            'h3'),
            (           'h3',            'dl'),
            (         'mn54',          'mn54'),
            (         'mn54',            'dl'),
            (         'co58',          'co58'),
            (         'co58',            'dl'),
            (         'co60',          'co60'),
            (         'co60',            'dl'),
            (         'sr89',          'sr89'),
            (         'sr89',            'dl'),
            (         'sr90',          'sr90'),
            (         'sr90',            'dl'),
            (        'ru106',         'ru106'),
            (        'ru106',            'dl'),
            (        'sb125',         'sb125'),
            (        'sb125',            'dl'),
            (         'i131',          'i131'),
            (         'i131',            'dl'),
            (        'cs134',         'cs134'),
            (        'cs134',            'dl'),
            (        'cs137',         'c

In [None]:
# Earlier variant (kept for reference): drop the `_unc`/`_dl` columns before selecting.
#colnames = [name for name in df.columns if re.search('_unc|_dl', name) is None]
#nucl_cols = [name for name in df[colnames].columns if name not in ['sample', 'lon', 'lat', 'time']]
# Current variant: every column except the four identifier columns is kept,
# which includes the `_unc`/`_dl` companion columns.
nucl_cols = [name for name in df.columns if name not in ['sample', 'lon', 'lat', 'time']]

TypeError: expected string or bytes-like object

In [None]:
# Replaces the flat column labels in place with the two-level index.
# NOTE(review): not idempotent — once the columns are tuples, re-running this cell passes
# tuples to the regex calls in `get_multi_index`; presumably the cause of the recorded TypeError.
df.columns = get_multi_index(df.columns)

TypeError: expected string or bytes-like object

In [None]:
# Inspect the two-level column labels.
# NOTE(review): the recorded output shows a `detection_limit` label, whereas `get_multi_index`
# as defined in this notebook emits `dl` — the output looks stale.
df.columns

MultiIndex([(       'sample',          'sample'),
            (          'lon',             'lon'),
            (          'lat',             'lat'),
            (         'time',            'time'),
            (           'h3',              'h3'),
            (           'h3', 'detection_limit'),
            (         'mn54',            'mn54'),
            (         'mn54', 'detection_limit'),
            (         'co58',            'co58'),
            (         'co58', 'detection_limit'),
            (         'co60',            'co60'),
            (         'co60', 'detection_limit'),
            (         'sr89',            'sr89'),
            (         'sr89', 'detection_limit'),
            (         'sr90',            'sr90'),
            (         'sr90', 'detection_limit'),
            (        'ru106',           'ru106'),
            (        'ru106', 'detection_limit'),
            (        'sb125',           'sb125'),
            (        'sb125', 'detection_limit'),


In [None]:
# Wide -> long with two-level columns: the identifier columns must be given as full tuples.
# With no `var_name`, the melted column levels come out as `variable_0` / `variable_1`.
# `reset_index()` adds an `index` column that is not listed in `id_vars`, so it gets
# melted as a value variable too (see the first rows of the recorded output).
pd.melt(df.reset_index(), 
        id_vars=[('sample', 'sample'),
                 ('lon', 'lon'),
                 ('lat', 'lat'),
                 ('time', 'time')
                ],
        #value_vars=nucl_cols,
        value_name='activity',
        #var_name='nuclide',
       )

Unnamed: 0,"(sample, sample)","(lon, lon)","(lat, lat)","(time, time)",variable_0,variable_1,activity
0,0,141.029999,37.32,1.300749e+09,index,,0.0
1,1,141.029999,37.32,1.300804e+09,index,,1.0
2,2,141.029999,37.32,1.300888e+09,index,,2.0
3,3,141.029999,37.32,1.300959e+09,index,,3.0
4,4,141.029999,37.32,1.301047e+09,index,,4.0
...,...,...,...,...,...,...,...
987937,21472,141.039993,37.48,1.657621e+09,talpha,detection_limit,
987938,21473,141.039993,37.48,1.657621e+09,talpha,detection_limit,
987939,21474,141.039993,37.48,1.658225e+09,talpha,detection_limit,
987940,21475,141.039993,37.48,1.658225e+09,talpha,detection_limit,


In [None]:
# Wide -> long with flat column names: one row per (sample, value column), sorted by sample.
# `nucl_cols` still contains the `_dl` columns, so they end up in `nuclide` next to the
# real nuclides (e.g. `i132_dl` in the recorded output).
#pd.melt(df[colnames].reset_index(), 
pd.melt(df.reset_index(), 
        id_vars=['sample', 'lon', 'lat', 'time'],
        value_vars=nucl_cols,
        value_name='activity',
        var_name='nuclide',
       ).sort_values(by='sample')

Unnamed: 0,sample,lon,lat,time,nuclide,activity
0,0,141.029999,37.32,1.300749e+09,h3,
816126,0,141.029999,37.32,1.300749e+09,i132_dl,44.0
214770,0,141.029999,37.32,1.300749e+09,sr90,
622833,0,141.029999,37.32,1.300749e+09,pu238_dl,
944988,0,141.029999,37.32,1.300749e+09,talpha_dl,
...,...,...,...,...,...,...
472493,21476,141.039993,37.48,1.658830e+09,cs137_dl,
579878,21476,141.039993,37.48,1.658830e+09,ce144,
902033,21476,141.039993,37.48,1.658830e+09,tbeta,
42953,21476,141.039993,37.48,1.658830e+09,h3_dl,
