# HELCOM (With callbacks)
> Data pipeline (handler) to convert HELCOM data ([source](https://helcom.fi/about-us)) to `NetCDF` format

The data is provided as a Microsoft Access database. `Mdbtools` (https://github.com/mdbtools/mdbtools) is used to convert tables into `.csv` files on Unix-like OS.

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm
import fastcore.all as fc

from netCDF4 import Dataset
from datetime import datetime, timedelta
from cftime import num2date, date2num
from pathlib import Path
from datetime import datetime
import re

from marisco.utils import (has_valid_varname, match_worms, 
                           Callback, run_cbs)

from marisco.metadata import (GlobAttrsFeeder, BboxCB, 
                              DepthRangeCB, TimeRangeCB, 
                              ZoteroCB, KeyValuePairCB)

from marisco.serializers import to_netcdf
from marisco.configs import get_nc_tpl_path, get_cfgs

NC_TPL_PATH = get_nc_tpl_path()

## Parameters

In [None]:
#| params
fname_in = '../../_data/accdb/mors/csv'
fname_out = '../../_data/output/helcom.nc'

## Utils

In [None]:
def load_helcom(src_dir, 
                smp_types=['SEA', 'SED', 'BIO']):
    "Load HELCOM data and return them as individual dataframe by sample type"
    dfs = {}
    lut_smp_type = {'SEA': 'seawater', 'SED': 'sediment', 'BIO':'biota'}
    for smp_type in smp_types:
        fname_meas = smp_type + '02.csv'
        fname_smp = smp_type + '01.csv'
        df = pd.merge(pd.read_csv(Path(src_dir)/fname_meas), # measurements
                      pd.read_csv(Path(src_dir)/fname_smp), # sample
                      on='KEY', how='left')
        dfs[lut_smp_type[smp_type]] = df
    return dfs

def rename_cols(cols):
    "Flatten multiindex columns"
    new_cols = []
    for outer, inner in cols:
        if not inner:
            new_cols.append(outer)
        else:
            if outer == 'unc':
                new_cols.append(inner + '_' + outer)
            if outer == 'value':
                new_cols.append(inner)
    return new_cols

## Callbacks & Transformer

In [None]:
class Transformer():
    def __init__(self, dfs, cbs=None): 
        fc.store_attr()
        self.logs = []
        
    def callback(self):
        run_cbs(self.cbs, self)
        
    def __call__(self):
        self.callback()
        return self.dfs

## Load tables

In [None]:
dfs = load_helcom(fname_in)

In [None]:
dfs['seawater'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/m³,VALUE_Bq/m³,ERROR%_m³,DATE_OF_ENTRY_x,COUNTRY,LABORATORY,SEQUENCE,...,LONGITUDE (ddmmmm),LONGITUDE (dddddd),TDEPTH,SDEPTH,SALIN,TTEMP,FILT,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,WKRIL2012003,CS137,,,5.3,32.0,08/20/14 00:00:00,90,KRIL,2012003,...,29.2,29.3333,,0.0,,,,11,11,08/20/14 00:00:00
1,WKRIL2012004,CS137,,,19.9,20.0,08/20/14 00:00:00,90,KRIL,2012004,...,29.2,29.3333,,29.0,,,,11,11,08/20/14 00:00:00
2,WKRIL2012005,CS137,,,25.5,20.0,08/20/14 00:00:00,90,KRIL,2012005,...,23.09,23.15,,0.0,,,,11,3,08/20/14 00:00:00
3,WKRIL2012006,CS137,,,17.0,29.0,08/20/14 00:00:00,90,KRIL,2012006,...,27.59,27.9833,,0.0,,,,11,11,08/20/14 00:00:00
4,WKRIL2012007,CS137,,,22.2,18.0,08/20/14 00:00:00,90,KRIL,2012007,...,27.59,27.9833,,39.0,,,,11,11,08/20/14 00:00:00


In [None]:
dfs['biota'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,BASIS,ERROR%,NUMBER,DATE_OF_ENTRY_x,COUNTRY,...,BIOTATYPE,TISSUE,NO,LENGTH,WEIGHT,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,BVTIG2012041,CS134,VTIG01,<,0.01014,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
1,BVTIG2012041,K40,VTIG01,,135.3,W,3.57,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
2,BVTIG2012041,CO60,VTIG01,<,0.01398,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
3,BVTIG2012041,CS137,VTIG01,,4.338,W,3.48,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
4,BVTIG2012040,CS134,VTIG01,<,0.009614,W,,,02/27/14 00:00:00,6.0,...,F,5,17.0,45.9,964.0,18.458,92.9,2,16,02/27/14 00:00:00


In [None]:
dfs['sediment'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,ERROR%_kg,< VALUE_Bq/m²,VALUE_Bq/m²,ERROR%_m²,DATE_OF_ENTRY_x,...,LOWSLI,AREA,SEDI,OXIC,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,SUM_LINK,DATE_OF_ENTRY_y
0,SKRIL2012048,RA226,,,35.0,26.0,,,,08/20/14 00:00:00,...,20.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
1,SKRIL2012049,RA226,,,36.0,22.0,,,,08/20/14 00:00:00,...,27.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
2,SKRIL2012050,RA226,,,38.0,24.0,,,,08/20/14 00:00:00,...,2.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
3,SKRIL2012051,RA226,,,36.0,25.0,,,,08/20/14 00:00:00,...,4.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
4,SKRIL2012052,RA226,,,30.0,23.0,,,,08/20/14 00:00:00,...,6.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00


## Data transformation pipeline

### Normalize nuclide names

#### Lower & strip

In [None]:
class LowerStripRdnNameCB(Callback):
    "Convert nuclide names to lowercase & strip any trailing space(s)"
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k]['NUCLIDE'] = tfm.dfs[k]['NUCLIDE'].apply(lambda x: x.lower().strip())

In [None]:
dfs = load_helcom(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB()])
tfm()['seawater']['NUCLIDE'].unique()

array(['cs137', 'sr90', 'h3', 'cs134', 'pu238', 'pu239240', 'am241',
       'cm242', 'cm244', 'tc99', 'k40', 'ru103', 'sr89', 'sb125', 'nb95',
       'ru106', 'zr95', 'ag110m', 'cm243244', 'ba140', 'ce144', 'u234',
       'u238', 'co60', 'pu239', 'pb210', 'po210', 'np237', 'pu240',
       'mn54'], dtype=object)

#### Remap to MARIS nuclide names 

In [None]:
def get_unique_nuclides(dfs):
    "Get list of unique radionuclide types measured across samples"
    nuclides = []
    for k in dfs.keys():
        nuclides += dfs[k]['NUCLIDE'].unique().tolist()
    return nuclides

get_unique_nuclides(tfm.dfs)[:5]

['cs137', 'sr90', 'h3', 'cs134', 'pu238']

In [None]:
# Check if these variable names consistent with MARIS CDL
has_valid_varname(get_unique_nuclides(tfm.dfs), NC_TPL_PATH)

"pu239240" variable name not found in MARIS CDL
"cm243244" variable name not found in MARIS CDL
"cs134137" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL
"pu238240" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL
"cs134137" variable name not found in MARIS CDL
"k-40" variable name not found in MARIS CDL
"cs138" variable name not found in MARIS CDL
"cs139" variable name not found in MARIS CDL
"cs140" variable name not found in MARIS CDL
"cs141" variable name not found in MARIS CDL
"cs142" variable name not found in MARIS CDL
"cs143" variable name not found in MARIS CDL
"cs144" variable name not found in MARIS CDL
"cs145" variable name not found in MARIS CDL
"cs146" variable name not found in MARIS CDL


False

In [None]:
# Create lut
varnames_lut = {n: n for n in set(get_unique_nuclides(tfm.dfs))}

In [None]:
# Renaming above mentioned nuclides accordingly
varnames_lut['k-40'] = 'k40'
varnames_lut['cm243244'] = 'cm243_244_tot'
varnames_lut['cs134137'] = 'cs134_137_tot'
varnames_lut['pu239240'] = 'pu239_240_tot'
varnames_lut['pu238240'] = 'pu238_240_tot'

In [None]:
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names"
    def __init__(self, lut): fc.store_attr()        
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k]['NUCLIDE'].replace(self.lut, inplace=True)            

In [None]:
dfs = load_helcom(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(varnames_lut)])

tfm()['biota']['NUCLIDE'].unique()

array(['cs134', 'k40', 'co60', 'cs137', 'sr90', 'ag108m', 'mn54', 'co58',
       'ag110m', 'zn65', 'sb125', 'pu239_240_tot', 'ru106', 'be7',
       'ce144', 'pb210', 'po210', 'sb124', 'sr89', 'zr95', 'te129m',
       'ru103', 'nb95', 'ce141', 'la140', 'i131', 'ba140', 'pu238',
       'u235', 'bi214', 'pb214', 'pb212', 'tl208', 'ac228', 'ra223',
       'eu155', 'ra226', 'gd153', 'sn113', 'fe59', 'tc99', 'co57',
       'sn117m', 'eu152', 'sc46', 'rb86', 'ra224', 'th232',
       'cs134_137_tot', 'am241', 'ra228', 'th228', 'cs138', 'cs139',
       'cs140', 'cs141', 'cs142', 'cs143', 'cs144', 'cs145', 'cs146'],
      dtype=object)

In [None]:
has_valid_varname(get_unique_nuclides(tfm.dfs), NC_TPL_PATH);

"cs138" variable name not found in MARIS CDL
"cs139" variable name not found in MARIS CDL
"cs140" variable name not found in MARIS CDL
"cs141" variable name not found in MARIS CDL
"cs142" variable name not found in MARIS CDL
"cs143" variable name not found in MARIS CDL
"cs144" variable name not found in MARIS CDL
"cs145" variable name not found in MARIS CDL
"cs146" variable name not found in MARIS CDL


### Parse time

In [None]:
class ParseTimeCB(Callback):
    def __call__(self, tfm):
        #format_time = lambda x: date2num(x, units=get_cfgs('units')['time'])
        for k in tfm.dfs.keys():
            # Parse
            tfm.dfs[k]['time'] = pd.to_datetime(tfm.dfs[k].DATE, infer_datetime_format=True)

In [None]:
dfs = load_helcom(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(varnames_lut),
                            ParseTimeCB()])

tfm()['seawater']['time'][:5]

0   2012-05-23
1   2012-05-23
2   2012-06-17
3   2012-05-24
4   2012-05-24
Name: time, dtype: datetime64[ns]

### Normalize uncertainty units

In [None]:
# Make measurement and uncertainty units consistent
def fix_units(df, meas_col, unc_col):
    return df.apply(lambda row: row[unc_col] * row[meas_col]/100, axis=1)    

# Columns of interest
coi_units_unc = [('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'), 
                 ('biota', 'VALUE_Bq/kg', 'ERROR%'),
                 ('sediment', 'VALUE_Bq/kg', 'ERROR%_kg')]

In [None]:
class NormalizeUncUnitCB(Callback):
    "Convert uncertainty from % to activity unit"
    def __init__(self, coi): fc.store_attr()
    
    def __call__(self, tfm):
        for grp, val, unc in self.coi: 
            tfm.dfs[grp][unc] = self.fix_units(tfm.dfs[grp], val, unc)
            
    def fix_units(self, df, meas_col, unc_col):
        return df.apply(lambda row: row[unc_col] * row[meas_col]/100, axis=1)    

In [None]:
dfs = load_helcom(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(varnames_lut),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(coi_units_unc)])

tfm()['seawater'][['VALUE_Bq/m³', 'ERROR%_m³']][:5]


Unnamed: 0,VALUE_Bq/m³,ERROR%_m³
0,5.3,1.696
1,19.9,3.98
2,25.5,5.1
3,17.0,4.93
4,22.2,3.996


### Lookup biota species

In [None]:
df_rubin = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv'); df_rubin.head(5)

Unnamed: 0,RUBIN_ID,RUBIN,SCIENTIFIC NAME,ENGLISH NAME
0,11,ABRA BRA,ABRAMIS BRAMA,BREAM
1,12,ANGU ANG,ANGUILLA ANGUILLA,EEL
2,13,ARCT ISL,ARCTICA ISLANDICA,ISLAND CYPRINE
3,14,ASTE RUB,ASTERIAS RUBENS,COMMON STARFISH
4,15,CARD EDU,CARDIUM EDULE,COCKLE


In [None]:
df_rubin = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')

def get_species_lut(df):
    lut = {}
    for _, row in tqdm(df[['RUBIN', 'SCIENTIFIC NAME']].iterrows(), total=df.shape[0]):
        res = match_worms(row['SCIENTIFIC NAME'])
        if (res == -1):
            print(f"No match for {row['RUBIN']} ({row['SCIENTIFIC NAME']})")
            aphia_id = -1
        else:        
            if len(res[0]) > 1 : 
                print(f"Several matches for {row['RUBIN']} ({row['SCIENTIFIC NAME']})")
                print(res)
            aphia_id = res[0][0]['AphiaID']
            
        lut[row['RUBIN']] = aphia_id
    
    return lut

In [None]:
# Match taxon name with WorMS DB if does not exist & loading existing lut if already exists
fname_lut = 'cache/helcom/species_lut.pkl'
if os.path.exists(fname_lut): species_lut = fc.load_pickle(fname_lut)
else:
    species_lut = get_species_lut(df_rubin)    
    fc.save_pickle(fname_lut, species_lut)

print(species_lut)    

{'ABRA BRA': 154281, 'ANGU ANG': 126281, 'ARCT ISL': 138802, 'ASTE RUB': 123776, 'CARD EDU': 152921, 'CH HI;BA': 399467, 'CLAD GLO': 145048, 'CLUP HAR': 126417, 'CRAN CRA': 107552, 'CYPR CAR': 154582, 'ENCH CIM': -1, 'ENGR ENC': 126426, 'ESOX LUC': 154210, 'FISHLARVAE': -1, 'FUCU VES': 145548, 'FURC LUM': 145620, 'GADU MOR': 126436, 'GAST ACU': 126505, 'GYMN CER': 405451, 'LAMI SAC': 145730, 'LIMA LIM': 127139, 'MACO BAL': 141579, 'MERL MNG': 126438, 'MYA ARE': 140430, 'MYOX SCO': 127203, 'MYTI EDU': 140480, 'OSME EPE': 126736, 'PERC FLU': 151353, 'PLANKTON': 747442, 'PLAT FLE': 127141, 'PLEU PLA': 127143, 'POLY FUC': 144639, 'PSET MAX': 154473, 'RHODOPHY': 852, 'RUTI RUT': 154333, 'SADU ENT': 119034, 'SCOM SCO': 127023, 'SOLE SOL': 127160, 'SPRA SPR': 126425, 'STIZ LUC': 321686, 'STUC PEC': 588573, 'ZOAR VIV': 127123, 'ZANN PALU': 416222}


In [None]:
class LookupBiotaSpeciesCB(Callback):
    'Match "RUBIN" species with WorMS db taxon name (AphiaID)'
    def __init__(self, lut): fc.store_attr()
    
    def __call__(self, tfm):
        tfm.dfs['biota']['species_id'] = tfm.dfs['biota']['RUBIN'].apply(lambda x: self.lut[x.strip()])

In [None]:
dfs = load_helcom(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(varnames_lut),
                            ParseTimeCB(), 
                            LookupBiotaSpeciesCB(species_lut)])

tfm()['biota'][['RUBIN', 'species_id']][:5]

Unnamed: 0,RUBIN,species_id
0,GADU MOR,126436
1,GADU MOR,126436
2,GADU MOR,126436
3,GADU MOR,126436
4,GADU MOR,126436


### Lookup biota tissues

In [None]:
dfs['biota']['TISSUE'].unique()

### Rename columns

In [None]:
# Define columns of interest by sample type
coi_grp = {'seawater': ['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time',
                        'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)'],
           'sediment': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time',
                        'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)', 
                        'SEDI'],
            'biota': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time',
                        'SDEPTH', 'LATITUDE ddmmmm', 'LONGITUDE ddmmmm',
                        'species_id', 'TISSUE']}

In [None]:
# Define column names renaming rules
renaming_rules = {
    'NUCLIDE': 'nuclide',
    'VALUE_Bq/m³': 'value',
    'VALUE_Bq/kg': 'value',
    'ERROR%_m³': 'unc',
    'ERROR%_kg': 'unc',
    'ERROR%': 'unc',
    'TDEPTH': 'depth',
    'SDEPTH': 'depth',
    'LATITUDE (dddddd)':'lat',
    'LATITUDE ddmmmm': 'lat',
    'LONGITUDE (dddddd)':'lon',
    'LONGITUDE ddmmmm': 'lon',
    # group specific
    'TISSUE': 'body_part',
    'SEDI': 'sed_type'
}

In [None]:
class RenameColumnCB(Callback):
    def __init__(self, coi, renaming_rules): fc.store_attr()
    
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # Select cols of interest
            print(self.coi[k])
            tfm.dfs[k] = tfm.dfs[k].loc[:, self.coi[k]]
            
            
            # Rename cols
            tfm.dfs[k].rename(columns=self.renaming_rules, inplace=True)

In [None]:
dfs = load_helcom(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(varnames_lut),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(coi_units_unc),
                            LookupBiotaSpeciesCB(species_lut),
                            RenameColumnCB(coi_grp, renaming_rules)])

tfm()['seawater'].head(5)

['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time', 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)']
['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time', 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)', 'SEDI']
['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time', 'SDEPTH', 'LATITUDE ddmmmm', 'LONGITUDE ddmmmm', 'species_id', 'TISSUE']


Unnamed: 0,nuclide,value,unc,time,depth,lat,lon
0,cs137,5.3,1.696,2012-05-23,,60.0833,29.3333
1,cs137,19.9,3.98,2012-05-23,,60.0833,29.3333
2,cs137,25.5,5.1,2012-06-17,,59.4333,23.15
3,cs137,17.0,4.93,2012-05-24,,60.25,27.9833
4,cs137,22.2,3.996,2012-05-24,,60.25,27.9833


### Reshape: long to wide

In [None]:
class ReshapeLongToWide(Callback):
    def __init__(self): fc.store_attr()
    
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            cols = ['nuclide']
            vals = ['value', 'unc']
            idx = list(set(tfm.dfs[k].columns) - set(cols + vals)) # All others

            tfm.dfs[k] = tfm.dfs[k].pivot_table(index=idx, 
                                                columns=cols, 
                                                values=vals).reset_index()

            # Flatten cols name
            tfm.dfs[k].columns = rename_cols(tfm.dfs[k].columns)

            # Set index
            tfm.dfs[k].index.name = 'sample'

In [None]:
dfs = load_helcom(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(varnames_lut),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(coi_units_unc),
                            LookupBiotaSpeciesCB(species_lut),
                            RenameColumnCB(coi_grp, renaming_rules),
                            ReshapeLongToWide()])

tfm()['seawater'].head(5)

['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time', 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)']
['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time', 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)', 'SEDI']
['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time', 'SDEPTH', 'LATITUDE ddmmmm', 'LONGITUDE ddmmmm', 'species_id', 'TISSUE']


Unnamed: 0_level_0,lon,time,depth,lat,ag110m_unc,am241_unc,ba140_unc,ce144_unc,cm242_unc,cm243_244_tot_unc,...,pu240,ru103,ru106,sb125,sr89,sr90,tc99,u234,u238,zr95
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,2015-04-21,12.0,0.0,,,,,,,...,,,,,,5.759,,,,
1,0.0,2015-04-23,4.0,0.0,,,,,,,...,,,,,,6.758,,,,
2,0.0,2015-04-30,13.0,0.0,,,,,,,...,,,,,,,,,,
3,0.0,2015-05-19,81.0,0.0,,,,,,,...,,,,,,,,,,
4,0.0,2015-05-20,69.0,0.0,,,,,,,...,,,,,,7.255,,,,


### Encode time (seconds since ...)

In [None]:
class EncodeTimeCB(Callback):
    "Encode time as `int` representing seconds since xxx"
    def __call__(self, tfm):
        format_time = lambda x: date2num(x, units=get_cfgs('units')['time'])
        for k in tfm.dfs.keys():    
            tfm.dfs[k]['time'] = tfm.dfs[k]['time'].apply(format_time)
  

In [None]:
dfs = load_helcom(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(varnames_lut),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(coi_units_unc),
                            LookupBiotaSpeciesCB(species_lut),
                            RenameColumnCB(coi_grp, renaming_rules),
                            ReshapeLongToWide(),
                            EncodeTimeCB()])

tfm()['seawater'].head(5)

['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time', 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)']
['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time', 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)', 'SEDI']
['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time', 'SDEPTH', 'LATITUDE ddmmmm', 'LONGITUDE ddmmmm', 'species_id', 'TISSUE']


Unnamed: 0_level_0,lon,time,depth,lat,ag110m_unc,am241_unc,ba140_unc,ce144_unc,cm242_unc,cm243_244_tot_unc,...,pu240,ru103,ru106,sb125,sr89,sr90,tc99,u234,u238,zr95
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,1429574400,12.0,0.0,,,,,,,...,,,,,,5.759,,,,
1,0.0,1429747200,4.0,0.0,,,,,,,...,,,,,,6.758,,,,
2,0.0,1430352000,13.0,0.0,,,,,,,...,,,,,,,,,,
3,0.0,1431993600,81.0,0.0,,,,,,,...,,,,,,,,,,
4,0.0,1432080000,69.0,0.0,,,,,,,...,,,,,,7.255,,,,


### Sanitize coordinates

In [None]:
class SanitizeLonLatCB(Callback):
    "Drop row when both longitude & latitude equal 0"
    def __call__(self, tfm):
        tfm.dfs = {grp: (df[(df.lon != 0) & (df.lat != 0)]) 
                   for grp, df in tfm.dfs.items()}

In [None]:
dfs = load_helcom(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(varnames_lut),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(coi_units_unc),
                            LookupBiotaSpeciesCB(species_lut),
                            RenameColumnCB(coi_grp, renaming_rules),
                            ReshapeLongToWide(),
                            EncodeTimeCB(),
                            SanitizeLonLatCB()])

tfm()['seawater'].head(5)

['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time', 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)']
['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time', 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)', 'SEDI']
['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time', 'SDEPTH', 'LATITUDE ddmmmm', 'LONGITUDE ddmmmm', 'species_id', 'TISSUE']


Unnamed: 0_level_0,lon,time,depth,lat,ag110m_unc,am241_unc,ba140_unc,ce144_unc,cm242_unc,cm243_244_tot_unc,...,pu240,ru103,ru106,sb125,sr89,sr90,tc99,u234,u238,zr95
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,9.6333,544838400,16.0,54.8433,,0.002907,,,0.004369,,...,,,,,,18.2,,,,
11,9.6333,597628800,17.0,54.84,,,,,,,...,,,,,,,,,,
12,9.6333,690854400,16.0,54.8417,,,,,,,...,,,,,,,,,,
13,9.6333,724464000,17.0,54.8417,,,,,,,...,,,,,,,,,,
14,9.6352,629596800,17.0,54.8425,,,,,,,...,,,,,,,,,,


In [None]:
# Visualizing transform steps
tfm.logs

['Convert nuclide names to lowercase & strip any trailing space(s)',
 'Remap to MARIS radionuclide names',
 'Convert uncertainty from % to activity unit',
 'Match "RUBIN" species with WorMS db taxon name (AphiaID)',
 'Encode time as `int` representing seconds since xxx',
 'Drop row when both longitude & latitude equal 0']

In [None]:
tfm.dfs['biota'].head()

Unnamed: 0_level_0,lon,body_part,species_id,lat,time,depth,ac228_unc,ag108m_unc,ag110m_unc,am241_unc,...,sr89,sr90,tc99,te129m,th228,th232,tl208,u235,zn65,zr95
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9.41,5,126417,54.31,1323561600,2.0,,,,,...,,,,,,,,,,
1,10.0,5,126436,54.45,1323734400,4.0,,,,,...,,,,,,,,,,
2,10.07,1,126417,54.34,781660800,27.0,,0.0,,0.0,...,,0.0267,,,,,,,,
3,10.07,5,126436,54.33,565920000,21.5,,,0.00858,,...,,0.0042,,,,,,,,
4,10.07,18,126436,54.33,565920000,21.5,,,0.1147,,...,,,,,,,,,,


## Encode to NetCDF

### Feed global attributes

In [None]:
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
feed = GlobAttrsFeeder(dfs, cbs=[BboxCB(), 
                                 DepthRangeCB(),
                                 TimeRangeCB(),
                                 ZoteroCB('26VMZZ2Q'),
                                 KeyValuePairCB('keywords', ', '.join(kw)),
                                 KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
                                 ])
attrs = feed(); attrs

{'id': '',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annually by HELCOM MORS EG.',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments, Earth Scienc

### To NetCDF

In [None]:
def units_fn(grp_name,
             rdn_name):
    lut = {'seawater': 'Bq/m³', 
           'sediment': 'Bq/kg', 
           'biota': 'Bq/kg'}
    return lut[grp_name]

In [None]:
to_netcdf(dfs, NC_TPL_PATH, fname_out, attrs, units_fn)

% of discarded data for grp seawater: 0.0
% of discarded data for grp sediment: 0.0
% of discarded data for grp biota: 0.0
