In [None]:
#| default_exp handlers.helcom

# HELCOM
> Data pipeline (handler) to convert HELCOM data ([source](https://helcom.fi/about-us)) to `NetCDF` format

The data is provided as a Microsoft Access database. [`Mdbtools`](https://github.com/mdbtools/mdbtools) is used to convert its tables into `.csv` files on Unix-like operating systems.

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
import pandas as pd
from tqdm import tqdm
from functools import partial
import fastcore.all as fc

from pathlib import Path

from marisco.utils import (has_valid_varname, match_worms)
from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)

from marisco.metadata import (GlobAttrsFeeder, BboxCB,
                              DepthRangeCB, TimeRangeCB,
                              ZoteroCB, KeyValuePairCB)

from marisco.configs import base_path, nc_tpl_path, cfg
from marisco.serializers import NetCDFEncoder

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
fname_in = '../../_data/accdb/mors/csv'
fname_out = '../../_data/output/helcom.nc'

## Utils

In [None]:
#| export
def load_data(src_dir,
              smp_types=['SEA', 'SED', 'BIO']):
    """Read the HELCOM csv exports and merge them per sample type.

    For every prefix in `smp_types`, the measurement table `<prefix>02.csv`
    is left-joined with the sample table `<prefix>01.csv` on the `KEY` column.

    Args:
        src_dir: directory holding the csv files.
        smp_types: table prefixes to load (`SEA`, `SED` and/or `BIO`).

    Returns:
        dict mapping 'seawater' / 'sediment' / 'biota' to the merged dataframe.
    """
    group_names = {'SEA': 'seawater', 'SED': 'sediment', 'BIO': 'biota'}

    def _read_merged(prefix):
        # Measurements drive the join: one output row per measurement.
        measurements = pd.read_csv(Path(src_dir) / f'{prefix}02.csv')
        samples = pd.read_csv(Path(src_dir) / f'{prefix}01.csv')
        return measurements.merge(samples, on='KEY', how='left')

    merged_by_group = {}
    for prefix in smp_types:
        merged = _read_merged(prefix)
        merged_by_group[group_names[prefix]] = merged
    return merged_by_group


def rename_cols(cols):
    """Flatten two-level (outer, inner) column labels into plain names.

    - empty inner label     -> the outer label is kept as is
    - outer label 'value'   -> the inner label
    - outer label 'unc'     -> '<inner>_unc'
    Pairs with any other outer label and a non-empty inner label are skipped.
    """
    flat = []
    for level0, level1 in cols:
        if not level1:
            flat.append(level0)
            continue
        if level0 == 'unc':
            flat.append(level1 + '_' + 'unc')
        elif level0 == 'value':
            flat.append(level1)
    return flat

## Load tables

In [None]:
dfs = load_data(fname_in)

In [None]:
dfs['seawater'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/m³,VALUE_Bq/m³,ERROR%_m³,DATE_OF_ENTRY_x,COUNTRY,LABORATORY,SEQUENCE,...,LONGITUDE (ddmmmm),LONGITUDE (dddddd),TDEPTH,SDEPTH,SALIN,TTEMP,FILT,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,WKRIL2012003,CS137,,,5.3,32.0,08/20/14 00:00:00,90,KRIL,2012003,...,29.2,29.3333,,0.0,,,,11,11,08/20/14 00:00:00
1,WKRIL2012004,CS137,,,19.9,20.0,08/20/14 00:00:00,90,KRIL,2012004,...,29.2,29.3333,,29.0,,,,11,11,08/20/14 00:00:00
2,WKRIL2012005,CS137,,,25.5,20.0,08/20/14 00:00:00,90,KRIL,2012005,...,23.09,23.15,,0.0,,,,11,3,08/20/14 00:00:00
3,WKRIL2012006,CS137,,,17.0,29.0,08/20/14 00:00:00,90,KRIL,2012006,...,27.59,27.9833,,0.0,,,,11,11,08/20/14 00:00:00
4,WKRIL2012007,CS137,,,22.2,18.0,08/20/14 00:00:00,90,KRIL,2012007,...,27.59,27.9833,,39.0,,,,11,11,08/20/14 00:00:00


In [None]:
dfs['biota'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,BASIS,ERROR%,NUMBER,DATE_OF_ENTRY_x,COUNTRY,...,BIOTATYPE,TISSUE,NO,LENGTH,WEIGHT,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,BVTIG2012041,CS134,VTIG01,<,0.01014,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
1,BVTIG2012041,K40,VTIG01,,135.3,W,3.57,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
2,BVTIG2012041,CO60,VTIG01,<,0.01398,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
3,BVTIG2012041,CS137,VTIG01,,4.338,W,3.48,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
4,BVTIG2012040,CS134,VTIG01,<,0.009614,W,,,02/27/14 00:00:00,6.0,...,F,5,17.0,45.9,964.0,18.458,92.9,2,16,02/27/14 00:00:00


In [None]:
dfs['sediment'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,ERROR%_kg,< VALUE_Bq/m²,VALUE_Bq/m²,ERROR%_m²,DATE_OF_ENTRY_x,...,LOWSLI,AREA,SEDI,OXIC,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,SUM_LINK,DATE_OF_ENTRY_y
0,SKRIL2012048,RA226,,,35.0,26.0,,,,08/20/14 00:00:00,...,20.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
1,SKRIL2012049,RA226,,,36.0,22.0,,,,08/20/14 00:00:00,...,27.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
2,SKRIL2012050,RA226,,,38.0,24.0,,,,08/20/14 00:00:00,...,2.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
3,SKRIL2012051,RA226,,,36.0,25.0,,,,08/20/14 00:00:00,...,4.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
4,SKRIL2012052,RA226,,,30.0,23.0,,,,08/20/14 00:00:00,...,6.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00


## Data transformation pipeline

### Normalize nuclide names

#### Lower & strip

In [None]:
#| export
class LowerStripRdnNameCB(Callback):
    "Convert nuclide names to lowercase & strip any trailing space(s)"
    # The docstring above is left untouched on purpose.
    # NOTE(review): `tfm.logs` appears to be built from callback docstrings
    # (its entries match them verbatim in this notebook) - confirm before rewording.
    # Note that leading as well as trailing whitespace is removed.

    def __call__(self, tfm):
        # Normalize the `NUCLIDE` column of every group held by the transformer.
        for df in tfm.dfs.values():
            # Vectorised string methods instead of a per-element Python lambda;
            # missing values are passed through as NaN instead of raising.
            df['NUCLIDE'] = df['NUCLIDE'].str.lower().str.strip()

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB()])
print(tfm()['seawater']['NUCLIDE'].unique())

['cs137' 'sr90' 'h3' 'cs134' 'pu238' 'pu239240' 'am241' 'cm242' 'cm244'
 'tc99' 'k40' 'ru103' 'sr89' 'sb125' 'nb95' 'ru106' 'zr95' 'ag110m'
 'cm243244' 'ba140' 'ce144' 'u234' 'u238' 'co60' 'pu239' 'pb210' 'po210'
 'np237' 'pu240' 'mn54']


#### Remap to MARIS nuclide names 

In [None]:
#| export
def get_unique_nuclides(dfs):
    """Get list of unique radionuclide types measured across samples.

    Args:
        dfs: dict of dataframes, each with a `NUCLIDE` column.

    Returns:
        list of nuclide names, each appearing once, in order of first
        appearance across the groups.
    """
    # A dict is used as an ordered set: a nuclide reported in several groups
    # (e.g. seawater and biota) is returned only once.
    seen = {}
    for df in dfs.values():
        seen.update(dict.fromkeys(df['NUCLIDE'].unique().tolist()))
    return list(seen)

In [None]:
# Check whether these variable names are consistent with the MARIS CDL
has_valid_varname(get_unique_nuclides(tfm.dfs), nc_tpl_path())

"pu239240" variable name not found in MARIS CDL
"cm243244" variable name not found in MARIS CDL
"cs134137" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL
"pu238240" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL
"cs134137" variable name not found in MARIS CDL
"k-40" variable name not found in MARIS CDL
"cs138" variable name not found in MARIS CDL
"cs139" variable name not found in MARIS CDL
"cs140" variable name not found in MARIS CDL
"cs141" variable name not found in MARIS CDL
"cs142" variable name not found in MARIS CDL
"cs143" variable name not found in MARIS CDL
"cs144" variable name not found in MARIS CDL
"cs145" variable name not found in MARIS CDL
"cs146" variable name not found in MARIS CDL


False

In [None]:
#| export
# Overrides applied on top of the identity mapping built by `get_varnames_lut`:
# lowercased HELCOM nuclide name -> MARIS variable name.
# NOTE(review): names such as 'cs138' ... 'cs146' are also reported as missing
# from the MARIS CDL by `has_valid_varname` but have no entry here - confirm
# whether they are data-entry errors in the source or need a mapping.
varnames_lut_updates = {
    'k-40': 'k40',
    'cm243244': 'cm243_244_tot',
    'cs134137': 'cs134_137_tot',
    'pu239240': 'pu239_240_tot',
    'pu238240': 'pu238_240_tot'}


In [None]:
#| export
def get_varnames_lut(dfs, lut=varnames_lut_updates):
    """Build the nuclide-name lookup table used to remap to MARIS names.

    Args:
        dfs: dict of dataframes, each with a `NUCLIDE` column.
        lut: overrides `{source name: MARIS name}` applied on top of the
            identity mapping.

    Returns:
        dict mapping every nuclide name found in `dfs` to itself, updated
        with the entries of `lut`.
    """
    # Use a distinct local name: rebinding `lut` here would discard the
    # overrides passed by the caller.
    varnames = {n: n for n in set(get_unique_nuclides(dfs))}
    varnames.update(lut)
    return varnames


In [None]:
varnames_lut = partial(get_varnames_lut, lut=varnames_lut_updates)(tfm.dfs)

In [None]:
# | export
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names"
    # The docstring above is left untouched on purpose.
    # NOTE(review): `tfm.logs` appears to be built from callback docstrings - confirm.

    def __init__(self,
                 fn_lut=partial(get_varnames_lut, lut=varnames_lut_updates)):
        # fn_lut: callable receiving the dict of dataframes and returning the
        # `{source name: MARIS name}` mapping to apply.
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut(tfm.dfs)
        for k in tfm.dfs.keys():
            # Assign the result back instead of calling
            # `replace(..., inplace=True)` on a column selection: that is a
            # chained assignment, which leaves the dataframe unchanged when
            # pandas Copy-on-Write is active.
            tfm.dfs[k]['NUCLIDE'] = tfm.dfs[k]['NUCLIDE'].replace(lut)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB()])

print(tfm()['biota']['NUCLIDE'].unique())

['cs134' 'k40' 'co60' 'cs137' 'sr90' 'ag108m' 'mn54' 'co58' 'ag110m'
 'zn65' 'sb125' 'pu239_240_tot' 'ru106' 'be7' 'ce144' 'pb210' 'po210'
 'sb124' 'sr89' 'zr95' 'te129m' 'ru103' 'nb95' 'ce141' 'la140' 'i131'
 'ba140' 'pu238' 'u235' 'bi214' 'pb214' 'pb212' 'tl208' 'ac228' 'ra223'
 'eu155' 'ra226' 'gd153' 'sn113' 'fe59' 'tc99' 'co57' 'sn117m' 'eu152'
 'sc46' 'rb86' 'ra224' 'th232' 'cs134_137_tot' 'am241' 'ra228' 'th228'
 'cs138' 'cs139' 'cs140' 'cs141' 'cs142' 'cs143' 'cs144' 'cs145' 'cs146']


In [None]:
has_valid_varname(get_unique_nuclides(tfm.dfs), nc_tpl_path())

"cs138" variable name not found in MARIS CDL
"cs139" variable name not found in MARIS CDL
"cs140" variable name not found in MARIS CDL
"cs141" variable name not found in MARIS CDL
"cs142" variable name not found in MARIS CDL
"cs143" variable name not found in MARIS CDL
"cs144" variable name not found in MARIS CDL
"cs145" variable name not found in MARIS CDL
"cs146" variable name not found in MARIS CDL


False

### Parse time

In [None]:
#| export
class ParseTimeCB(Callback):
    # Parse the `DATE` column of every group (expected as `%m/%d/%y %H:%M:%S`)
    # into a datetime column named `time`.
    # NOTE(review): no docstring is added here because `tfm.logs` appears to be
    # built from callback docstrings (this step is absent from the logs shown
    # in this notebook) - confirm before adding one.
    def __call__(self, tfm):
        date_format = '%m/%d/%y %H:%M:%S'
        for df in tfm.dfs.values():
            df['time'] = pd.to_datetime(df['DATE'], format=date_format)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB()])

print(tfm()['seawater']['time'][:5])

0   2012-05-23
1   2012-05-23
2   2012-06-17
3   2012-05-24
4   2012-05-24
Name: time, dtype: datetime64[ns]


### Normalize uncertainty units

In [None]:
#| export
# Make measurement and uncertainty units consistent
def fix_units(df, meas_col, unc_col):
    """Convert a relative uncertainty (in %) into the unit of the measurement.

    Args:
        df: dataframe holding both columns.
        meas_col: name of the measured-value column.
        unc_col: name of the uncertainty column, expressed in % of `meas_col`.

    Returns:
        Series aligned on `df.index` with `unc * value / 100`; NaN wherever
        either input is missing.
    """
    # Column arithmetic instead of a row-wise `apply`: same values, no Python
    # loop over rows, and a Series is returned even when `df` has no rows.
    return df[unc_col] * df[meas_col] / 100

In [None]:
#| export
# Columns of interest for the uncertainty conversion, one tuple per group:
# (group name, measured-value column, uncertainty column given in %).
# NOTE(review): the sediment table also exposes 'VALUE_Bq/m²' / 'ERROR%_m²',
# which are not listed here - confirm this is intended.
coi_units_unc = [('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'),
                 ('biota', 'VALUE_Bq/kg', 'ERROR%'),
                 ('sediment', 'VALUE_Bq/kg', 'ERROR%_kg')]

In [None]:
#| export
class NormalizeUncUnitCB(Callback):
    "Convert uncertainty from % to activity unit"
    # The docstring above is left untouched on purpose.
    # NOTE(review): `tfm.logs` appears to be built from callback docstrings - confirm.
    # NOTE(review): a class with this name is defined again in the next export
    # cell, and that later definition is the one that takes effect; one of the
    # two cells should be removed.

    # coi: iterable of (group name, value column, uncertainty column) tuples.
    def __init__(self, coi=coi_units_unc): fc.store_attr()

    def __call__(self, tfm):
        # Overwrite each uncertainty column (given in %) with its absolute value.
        for grp, val, unc in self.coi:
            tfm.dfs[grp][unc] = self.fix_units(tfm.dfs[grp], val, unc)

    def fix_units(self, df, meas_col, unc_col):
        # Column arithmetic instead of a row-wise `apply`: same values, no
        # Python loop over rows, and a Series even for an empty frame.
        return df[unc_col] * df[meas_col] / 100

In [None]:
#| export
class NormalizeUncUnitCB(Callback):
    "Convert uncertainty from % to activity unit"
    # The docstring above is left untouched on purpose.
    # NOTE(review): `tfm.logs` appears to be built from callback docstrings - confirm.
    # NOTE(review): a class with this name is already defined in the preceding
    # export cell; this later definition is the one that takes effect. One of
    # the two cells should be removed.

    # coi: iterable of (group name, value column, uncertainty column) tuples.
    def __init__(self, coi=coi_units_unc): fc.store_attr()

    def __call__(self, tfm):
        # Overwrite each uncertainty column (given in %) with its absolute value.
        for grp, val, unc in self.coi:
            tfm.dfs[grp][unc] = self.fix_units(tfm.dfs[grp], val, unc)

    def fix_units(self, df, meas_col, unc_col):
        # Column arithmetic instead of a row-wise `apply`: same values, no
        # Python loop over rows, and a Series even for an empty frame.
        return df[unc_col] * df[meas_col] / 100

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB()])

print(tfm()['seawater'][['VALUE_Bq/m³', 'ERROR%_m³']][:5])

   VALUE_Bq/m³  ERROR%_m³
0          5.3      1.696
1         19.9      3.980
2         25.5      5.100
3         17.0      4.930
4         22.2      3.996


### Lookup biota species

In [None]:
df_rubin = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')
df_rubin.head(5)

Unnamed: 0,RUBIN_ID,RUBIN,SCIENTIFIC NAME,ENGLISH NAME
0,11,ABRA BRA,ABRAMIS BRAMA,BREAM
1,12,ANGU ANG,ANGUILLA ANGUILLA,EEL
2,13,ARCT ISL,ARCTICA ISLANDICA,ISLAND CYPRINE
3,14,ASTE RUB,ASTERIAS RUBENS,COMMON STARFISH
4,15,CARD EDU,CARDIUM EDULE,COCKLE


In [None]:
#| export
def get_species_lut(fname_in, overwrite=False):
    """Return the `{RUBIN code: AphiaID}` lookup table for HELCOM species.

    The table is read from `<base_path>/lut/species_helcom.pkl` when that file
    exists and `overwrite` is False. Otherwise it is rebuilt by passing every
    `SCIENTIFIC NAME` of `RUBIN_NAME.csv` to `match_worms`, then pickled both
    to the config location and to `../files/lut/` (relative to the working
    directory).

    Args:
        fname_in: directory containing `RUBIN_NAME.csv`.
        overwrite: rebuild the table even if a cached copy exists.

    Returns:
        dict mapping RUBIN codes to AphiaIDs; -1 marks species for which
        `match_worms` returned -1.
    """
    fname_lut = 'species_helcom.pkl'
    config_path = base_path() / 'lut' / fname_lut
    repo_path = Path('../files/lut') / fname_lut

    # Cached table available and no rebuild requested: load and return it.
    if not overwrite and config_path.exists():
        return fc.load_pickle(config_path)

    species = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')
    lut = {}
    # NOTE(review): keys are stored exactly as read from the csv, whereas
    # `LookupBiotaSpeciesCB` strips the code before the lookup - verify that
    # RUBIN codes in RUBIN_NAME.csv carry no padding.
    pairs = zip(species['RUBIN'], species['SCIENTIFIC NAME'])
    for rubin, sci_name in tqdm(pairs, total=len(species)):
        res = match_worms(sci_name)
        if res == -1:
            print(f"No match for {rubin} ({sci_name})")
            lut[rubin] = -1
            continue
        if len(res[0]) > 1:
            # Ambiguous name: report it, then keep the first candidate.
            print(f"Several matches for {rubin} ({sci_name})")
            print(res)
        lut[rubin] = res[0][0]['AphiaID']

    for destination in (config_path, repo_path):
        fc.save_pickle(destination, lut)
    return lut

In [None]:
species_lut = get_species_lut(fname_in, overwrite=False)

In [None]:
#| export
class LookupBiotaSpeciesCB(Callback):
    'Match "RUBIN" species with WorMS db taxon name (AphiaID)'
    # The docstring above is left untouched on purpose.
    # NOTE(review): `tfm.logs` appears to be built from callback docstrings - confirm.

    # fn_lut: zero-argument callable returning the `{RUBIN code: AphiaID}` dict.
    def __init__(self, fn_lut): fc.store_attr()

    def __call__(self, tfm):
        species_lut = self.fn_lut()
        biota = tfm.dfs['biota']

        def _to_species_id(rubin_code):
            # Codes are stripped before the lookup; an unknown code raises KeyError.
            return species_lut[rubin_code.strip()]

        biota['species_id'] = biota['RUBIN'].apply(_to_species_id)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(partial(get_species_lut, fname_in))])

print(tfm()['biota'][['RUBIN', 'species_id']][:5])

      RUBIN  species_id
0  GADU MOR      126436
1  GADU MOR      126436
2  GADU MOR      126436
3  GADU MOR      126436
4  GADU MOR      126436


### Lookup biota tissues

In [None]:
dfs['biota']['TISSUE'].unique()

array([ 5,  1, 41,  3, 51, 43, 42, 12, 10, 18, 52, 20,  8, 54, 53])

### Rename columns

In [None]:
#| export
# Columns kept for each sample type (group), as named after the upstream
# callbacks have added `time` and, for biota, `species_id`.
# NOTE(review): seawater and sediment take `TDEPTH` while biota takes `SDEPTH`;
# the seawater table exposes both columns - confirm which one should feed `depth`.
# NOTE(review): biota takes the `ddmmmm` coordinate columns while the other
# groups take the `(dddddd)` ones; if `ddmmmm` denotes degrees-minutes these
# values are not decimal degrees - confirm.
coi_grp = {'seawater': ['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time',
                        'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)'],
           'sediment': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time',
                        'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)',
                        'SEDI'],
           'biota': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time',
                     'SDEPTH', 'LATITUDE ddmmmm', 'LONGITUDE ddmmmm',
                     'species_id', 'TISSUE']}


In [None]:
#| export
# Column renaming rules shared by all groups: source column name -> target name.
# Several source columns map to the same target because each group only
# carries one of them (see `coi_grp`).
renaming_rules = {
    'NUCLIDE': 'nuclide',
    'VALUE_Bq/m³': 'value',
    'VALUE_Bq/kg': 'value',
    'ERROR%_m³': 'unc',
    'ERROR%_kg': 'unc',
    'ERROR%': 'unc',
    'TDEPTH': 'depth',
    'SDEPTH': 'depth',
    'LATITUDE (dddddd)': 'lat',
    'LATITUDE ddmmmm': 'lat',
    'LONGITUDE (dddddd)': 'lon',
    'LONGITUDE ddmmmm': 'lon',
    # group specific
    'TISSUE': 'body_part',
    'SEDI': 'sed_type'
}


In [None]:
#| export
class RenameColumnCB(Callback):
    # Keep only the columns of interest of each group, then rename them.
    # NOTE(review): no docstring is added here because `tfm.logs` appears to be
    # built from callback docstrings (this step is absent from the logs shown
    # in this notebook) - confirm before adding one.
    #
    # coi: dict {group name: list of columns to keep}.
    # renaming_rules: dict {source column name: target column name}.
    def __init__(self,
                 coi=coi_grp,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            selected = tfm.dfs[grp].loc[:, self.coi[grp]]
            tfm.dfs[grp] = selected.rename(columns=self.renaming_rules)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(),
                            LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
                            RenameColumnCB()])

print(tfm()['biota'].head(5))

  nuclide       value       unc       time  depth    lat    lon  species_id  \
0   cs134    0.010140       NaN 2012-09-23    NaN  54.17  12.19      126436   
1     k40  135.300000  4.830210 2012-09-23    NaN  54.17  12.19      126436   
2    co60    0.013980       NaN 2012-09-23    NaN  54.17  12.19      126436   
3   cs137    4.338000  0.150962 2012-09-23    NaN  54.17  12.19      126436   
4   cs134    0.009614       NaN 2012-09-23    NaN  54.17  12.19      126436   

   body_part  
0          5  
1          5  
2          5  
3          5  
4          5  


### Reshape: long to wide

In [None]:
#| export
class ReshapeLongToWide(Callback):
    # Pivot every group from long format (one row per nuclide measurement) to
    # wide format (one row per sample, one `<nuclide>` and one `<nuclide>_unc`
    # column per nuclide).
    # NOTE(review): no docstring is added here because `tfm.logs` appears to be
    # built from callback docstrings (this step is absent from the logs shown
    # in this notebook) - confirm before adding one.
    # NOTE(review): `pivot_table` aggregates duplicated (index, nuclide)
    # entries with its default function and presumably drops rows whose index
    # columns contain missing values (e.g. biota rows without depth) - confirm
    # this is acceptable.
    def __init__(self): fc.store_attr()

    def __call__(self, tfm):
        cols = ['nuclide']
        vals = ['value', 'unc']
        for k in tfm.dfs.keys():
            # Every other column identifies a sample. Keep the dataframe's own
            # column order: a set difference yields an order that varies between
            # interpreter runs, and with it the sort order and `sample` numbering
            # of the pivoted rows.
            idx = [c for c in tfm.dfs[k].columns if c not in cols + vals]

            tfm.dfs[k] = tfm.dfs[k].pivot_table(index=idx,
                                                columns=cols,
                                                values=vals).reset_index()

            # Flatten the (value|unc, nuclide) column labels
            tfm.dfs[k].columns = rename_cols(tfm.dfs[k].columns)

            # Name the row index
            tfm.dfs[k].index.name = 'sample'

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(),
                            LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
                            RenameColumnCB(),
                            ReshapeLongToWide()])

print(tfm()['biota'].head(5))

          lon       time  species_id  depth    lat  body_part  ac228_unc  \
sample                                                                     
0        9.41 2011-12-11      126417    2.0  54.31          5        NaN   
1       10.00 2011-12-13      126436    4.0  54.45          5        NaN   
2       10.07 1987-12-08      126436   21.5  54.33          5        NaN   
3       10.07 1987-12-08      126436   21.5  54.33         18        NaN   
4       10.07 1994-10-09      126417   27.0  54.34          1        NaN   

        ag108m_unc  ag110m_unc  am241_unc  ...  sr89    sr90  tc99  te129m  \
sample                                     ...                               
0              NaN         NaN        NaN  ...   NaN     NaN   NaN     NaN   
1              NaN         NaN        NaN  ...   NaN     NaN   NaN     NaN   
2              NaN     0.00858        NaN  ...   NaN  0.0042   NaN     NaN   
3              NaN     0.11470        NaN  ...   NaN     NaN   NaN     NaN   

### Encode time (seconds since ...)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(),
                            LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
                            RenameColumnCB(),
                            ReshapeLongToWide(),
                            EncodeTimeCB(cfg())])

print(tfm()['seawater'].head(5))


        lat  lon        time  depth  ag110m_unc  am241_unc  ba140_unc  \
sample                                                                  
0       0.0  0.0  1429574400   12.0         NaN        NaN        NaN   
1       0.0  0.0  1429747200    4.0         NaN        NaN        NaN   
2       0.0  0.0  1430352000   13.0         NaN        NaN        NaN   
3       0.0  0.0  1431993600   81.0         NaN        NaN        NaN   
4       0.0  0.0  1432080000   69.0         NaN        NaN        NaN   

        ce144_unc  cm242_unc  cm243_244_tot_unc  ...  pu240  ru103  ru106  \
sample                                           ...                        
0             NaN        NaN                NaN  ...    NaN    NaN    NaN   
1             NaN        NaN                NaN  ...    NaN    NaN    NaN   
2             NaN        NaN                NaN  ...    NaN    NaN    NaN   
3             NaN        NaN                NaN  ...    NaN    NaN    NaN   
4             NaN        N

### Sanitize coordinates

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(),
                            LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
                            RenameColumnCB(),
                            ReshapeLongToWide(),
                            EncodeTimeCB(cfg()),
                            SanitizeLonLatCB()])

print(tfm()['seawater'].head(5))


            lat      lon        time  depth  ag110m_unc  am241_unc  ba140_unc  \
sample                                                                          
10      53.9422  14.2578  1339545600   10.0         NaN        NaN        NaN   
11      53.9483  14.2633   870220800   10.0         NaN        NaN        NaN   
12      53.9483  14.2633   966729600   10.0         NaN        NaN        NaN   
13      53.9483  14.2633   992044800   10.0         NaN        NaN        NaN   
14      53.9483  14.2633  1023580800   10.0         NaN        NaN        NaN   

        ce144_unc  cm242_unc  cm243_244_tot_unc  ...  pu240  ru103  ru106  \
sample                                           ...                        
10            NaN        NaN                NaN  ...    NaN    NaN    NaN   
11            NaN        NaN                NaN  ...    NaN    NaN    NaN   
12            NaN        NaN                NaN  ...    NaN    NaN    NaN   
13            NaN        NaN                NaN

## Encode to NetCDF

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(),
                            LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
                            RenameColumnCB(),
                            ReshapeLongToWide(),
                            EncodeTimeCB(cfg()),
                            SanitizeLonLatCB()])

dfs_tfm = tfm()


In [None]:
tfm.logs

['Convert nuclide names to lowercase & strip any trailing space(s)',
 'Remap to MARIS radionuclide names',
 'Convert uncertainty from % to activity unit',
 'Match "RUBIN" species with WorMS db taxon name (AphiaID)',
 'Encode time as `int` representing seconds since xxx',
 'Drop row when both longitude & latitude equal 0']

### Feed global attributes

In [None]:
#| export
# Keywords written to the `keywords` global attribute (joined with ', ' by
# `get_attrs`). Hierarchical entries use ' > ' between levels.
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']


In [None]:
#| export
def get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw):
    """Assemble the global attributes of the output file.

    Args:
        tfm: transformer that has already been run; its `dfs` and `logs`
            attributes are read.
        zotero_key: key handed to `ZoteroCB`.
        kw: list of keywords, joined with ', ' into the `keywords` attribute.

    Returns:
        whatever calling the `GlobAttrsFeeder` instance returns (a dict of
        attributes in the output shown in this notebook).
    """
    callbacks = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(cfg()),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=callbacks)
    return feeder()

In [None]:
get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw)

{'geospatial_lat_min': '31.1667',
 'geospatial_lat_max': '65.6347',
 'geospatial_lon_min': '9.41',
 'geospatial_lon_max': '53.458',
 'geospatial_bounds': 'POLYGON ((9.41 53.458, 31.1667 53.458, 31.1667 65.6347, 9.41 65.6347, 9.41 53.458))',
 'geospatial_vertical_max': '0',
 'geospatial_vertical_min': '-460.0',
 'time_coverage_start': '1984-01-10T00:00:00',
 'time_coverage_end': '2018-12-12T00:00:00',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting d

### Encoding

In [None]:
#| export
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
    """Run the full HELCOM pipeline and hand the result to `NetCDFEncoder`.

    Args:
        fname_in: directory holding the HELCOM csv exports.
        fname_out: passed to the encoder as `dest_fname`.
        nc_tpl_path: passed to the encoder as `src_fname`.
        **kwargs: forwarded to `NetCDFEncoder` (e.g. `verbose=True`).
    """
    raw_dfs = load_data(fname_in)
    pipeline = [
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        NormalizeUncUnitCB(),
        LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
        RenameColumnCB(),
        ReshapeLongToWide(),
        EncodeTimeCB(cfg()),
        SanitizeLonLatCB(),
    ]
    tfm = Transformer(raw_dfs, cbs=pipeline)

    # The transformer must run before the attributes are collected:
    # `get_attrs` reads `tfm.dfs` and `tfm.logs`.
    dfs_tfm = tfm()
    global_attrs = get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw)

    encoder = NetCDFEncoder(dfs_tfm,
                            src_fname=nc_tpl_path,
                            dest_fname=fname_out,
                            global_attrs=global_attrs,
                            **kwargs)
    encoder.encode()

In [None]:
# NOTE(review): the saved output of this cell ends with
# `ValueError: trying to assign illegal value to Enum variable`, so the notebook
# does not run to completion. Presumably a column holding raw source codes
# (e.g. `body_part` from `TISSUE` or `sed_type` from `SEDI`) is written to an
# enum-typed variable without being mapped first - confirm and fix upstream.
encode(fname_in, fname_out, nc_tpl_path(), verbose=True)

Group: seawater, Variable: sample
Group: seawater, Variable: lon
Group: seawater, Variable: lat
Group: seawater, Variable: depth
Group: seawater, Variable: time
Group: seawater, Variable: h3
Group: seawater, Variable: h3_unc
Group: seawater, Variable: k40
Group: seawater, Variable: k40_unc
Group: seawater, Variable: mn54
Group: seawater, Variable: co60
Group: seawater, Variable: co60_unc
Group: seawater, Variable: sr89
Group: seawater, Variable: sr90
Group: seawater, Variable: sr90_unc
Group: seawater, Variable: zr95
Group: seawater, Variable: zr95_unc
Group: seawater, Variable: nb95
Group: seawater, Variable: nb95_unc
Group: seawater, Variable: tc99
Group: seawater, Variable: tc99_unc
Group: seawater, Variable: ru103
Group: seawater, Variable: ru103_unc
Group: seawater, Variable: ru106
Group: seawater, Variable: ru106_unc
Group: seawater, Variable: ag110m
Group: seawater, Variable: ag110m_unc
Group: seawater, Variable: sb125
Group: seawater, Variable: sb125_unc
Group: seawater, Variab

ValueError: trying to assign illegal value to Enum variable