In [None]:
#| default_exp handlers.helcom

# HELCOM
> Data pipeline (handler) to convert HELCOM data ([source](https://helcom.fi/about-us)) to `NetCDF` format

<!-- ## HELCOM MORS Environment database -->

[Helcom MORS data](https://helcom.fi/about-us) is provided as a Microsoft Access database. 
[`Mdbtools`](https://github.com/mdbtools/mdbtools) can be used to convert the tables of the Microsoft Access database to `.csv` files on Unix-like OS.

Example steps:
1. Download data (e.g. https://metadata.helcom.fi/geonetwork/srv/fin/catalog.search#/metadata/2fdd2d46-0329-40e3-bf96-cb08c7206a24). 
2. Install mdbtools via VScode Terminal 

    ```
    sudo apt-get -y install mdbtools
    ```

3. Install unzip via VScode Terminal 

    ```
    sudo apt-get -y install unzip
    ```

4. In VS code terminal, navigate to the marisco data folder

    ```
    cd /home/marisco/downloads/marisco/_data/accdb/mors_19840101_20211231
    ```

5. Unzip MORS_ENVIRONMENT.zip 

    ```
    unzip MORS_ENVIRONMENT.zip 
    ```

6. Run preprocess.sh to generate the required data files

    ```
    ./preprocess.sh MORS_ENVIRONMENT.zip
    ```
7. Contents of the `preprocess.sh` script:
    ```
    #!/bin/bash

    # Example of use: ./preprocess.sh MORS_ENVIRONMENT.zip
    unzip $1
    dbname=$(ls *.accdb)
    mkdir csv
    for table in $(mdb-tables -1 "$dbname"); do
        echo "Export table $table"
        mdb-export "$dbname" "$table" > "csv/$table.csv"
    done
    ```


Questions:

    - Does renaming 'cs138' → 'cs137', 'cs139' → 'cs137', ... make sense?
    

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
import pandas as pd # Python package that provides fast, flexible, and expressive data structures.
import numpy as np
from tqdm import tqdm # Python Progress Bar Library
from functools import partial # Function which Return a new partial object which when called will behave like func called with the positional arguments args and keyword arguments keywords
import fastcore.all as fc # package that brings fastcore functionality, see https://fastcore.fast.ai/.
from pathlib import Path # This module offers classes representing filesystem paths


from marisco.utils import (has_valid_varname, match_worms, 
                           match_maris_species, match_maris_sediment)
from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)

from marisco.metadata import (GlobAttrsFeeder, BboxCB,
                              DepthRangeCB, TimeRangeCB,
                              ZoteroCB, KeyValuePairCB)

from marisco.configs import base_path, nc_tpl_path, cfg, cache_path, species_lut_path, sediments_lut_path

from marisco.serializers import NetCDFEncoder

In [None]:
import warnings
warnings.filterwarnings('ignore')

Here we define the fname_in and fname_out variables. These variables are paths which are defined as relative paths. These paths are relative to 
the current working directory. Note that fname_in refers to the csv folder that contains the  HELCOM data. fname_out defines the path and filename for the NetCDF output.

In [None]:
fname_in = '../../_data/accdb/mors/csv'
fname_out = '../../_data/output/helcom.nc'

## Utils

In [None]:
#| export
def load_data(src_dir,
                smp_types=['SEA', 'SED', 'BIO']):
    """Load HELCOM csv tables and return one merged dataframe per sample type.

    For every prefix in `smp_types`, the measurement table `<prefix>02.csv` is
    left-joined on `KEY` with the sample table `<prefix>01.csv`, both read
    from `src_dir`. The result is keyed by 'seawater', 'sediment' or 'biota'.
    """
    src_dir = Path(src_dir)
    group_names = {'SEA': 'seawater', 'SED': 'sediment', 'BIO': 'biota'}

    def _read_group(prefix):
        # "02" tables hold the measurements (radioactivity), "01" the sample information
        measurements = pd.read_csv(src_dir / (prefix + '02.csv'))
        samples = pd.read_csv(src_dir / (prefix + '01.csv'))
        return measurements.merge(samples, on='KEY', how='left')

    frames = {}
    for prefix in smp_types:
        merged = _read_group(prefix)
        frames[group_names[prefix]] = merged
    return frames

In [None]:
#| export
def rename_cols(cols):
    """Flatten two-level (outer, inner) column labels into single names.

    - empty `inner`            -> `outer`
    - outer is 'unit' or 'unc' -> '<inner>_<outer>'
    - outer is 'value'         -> `inner`
    Pairs with any other `outer` and a non-empty `inner` produce no entry.
    """
    suffixed = ('unit', 'unc')
    flat = []
    for outer, inner in cols:
        if not inner:
            flat.append(outer)
        elif outer in suffixed:
            flat.append(f'{inner}_{outer}')
        elif outer == 'value':
            flat.append(inner)
    return flat

## Load tables

`dfs` is a dictionary of dataframes created from the HELCOM dataset located at the path `fname_in`. There is one dataframe per sample type, and each dictionary key is the sample type (`seawater`, `sediment`, `biota`).

In [None]:
dfs = load_data(fname_in)
dfs

{'seawater':                 KEY NUCLIDE METHOD < VALUE_Bq/m³  VALUE_Bq/m³  ERROR%_m³  \
 0      WKRIL2012003   CS137    NaN           NaN          5.3  32.000000   
 1      WKRIL2012004   CS137    NaN           NaN         19.9  20.000000   
 2      WKRIL2012005   CS137    NaN           NaN         25.5  20.000000   
 3      WKRIL2012006   CS137    NaN           NaN         17.0  29.000000   
 4      WKRIL2012007   CS137    NaN           NaN         22.2  18.000000   
 ...             ...     ...    ...           ...          ...        ...   
 21211  WSSSM2021005      H3  SSM45           NaN       1030.0  93.203883   
 21212  WSSSM2021006      H3  SSM45           NaN       2240.0  43.303571   
 21213  WSSSM2021007      H3  SSM45           NaN       2060.0  47.087379   
 21214  WSSSM2021008      H3  SSM45           NaN       2300.0  43.478261   
 21215  WSSSM2021004      H3  SSM45             <          NaN        NaN   
 
          DATE_OF_ENTRY_x  COUNTRY LABORATORY   SEQUENCE  ... 

List the keys for the dictionary of dataframes:

In [None]:
keys=dfs.keys()
keys

dict_keys(['seawater', 'sediment', 'biota'])

Show the structure of the `seawater` dataframe:

In [None]:
dfs['seawater'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/m³,VALUE_Bq/m³,ERROR%_m³,DATE_OF_ENTRY_x,COUNTRY,LABORATORY,SEQUENCE,...,LONGITUDE (ddmmmm),LONGITUDE (dddddd),TDEPTH,SDEPTH,SALIN,TTEMP,FILT,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,WKRIL2012003,CS137,,,5.3,32.0,08/20/14 00:00:00,90.0,KRIL,2012003.0,...,29.2,29.3333,,0.0,,,,11.0,11.0,08/20/14 00:00:00
1,WKRIL2012004,CS137,,,19.9,20.0,08/20/14 00:00:00,90.0,KRIL,2012004.0,...,29.2,29.3333,,29.0,,,,11.0,11.0,08/20/14 00:00:00
2,WKRIL2012005,CS137,,,25.5,20.0,08/20/14 00:00:00,90.0,KRIL,2012005.0,...,23.09,23.15,,0.0,,,,11.0,3.0,08/20/14 00:00:00
3,WKRIL2012006,CS137,,,17.0,29.0,08/20/14 00:00:00,90.0,KRIL,2012006.0,...,27.59,27.9833,,0.0,,,,11.0,11.0,08/20/14 00:00:00
4,WKRIL2012007,CS137,,,22.2,18.0,08/20/14 00:00:00,90.0,KRIL,2012007.0,...,27.59,27.9833,,39.0,,,,11.0,11.0,08/20/14 00:00:00


Show the structure of the `biota` dataframe:

In [None]:
dfs['biota'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,BASIS,ERROR%,NUMBER,DATE_OF_ENTRY_x,COUNTRY,...,BIOTATYPE,TISSUE,NO,LENGTH,WEIGHT,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,BVTIG2012041,CS134,VTIG01,<,0.01014,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2.0,16,02/27/14 00:00:00
1,BVTIG2012041,K40,VTIG01,,135.3,W,3.57,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2.0,16,02/27/14 00:00:00
2,BVTIG2012041,CO60,VTIG01,<,0.01398,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2.0,16,02/27/14 00:00:00
3,BVTIG2012041,CS137,VTIG01,,4.338,W,3.48,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2.0,16,02/27/14 00:00:00
4,BVTIG2012040,CS134,VTIG01,<,0.009614,W,,,02/27/14 00:00:00,6.0,...,F,5,17.0,45.9,964.0,18.458,92.9,2.0,16,02/27/14 00:00:00


Show the structure of the `sediment` dataframe: 

In [None]:
dfs['sediment'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,ERROR%_kg,< VALUE_Bq/m²,VALUE_Bq/m²,ERROR%_m²,DATE_OF_ENTRY_x,...,LOWSLI,AREA,SEDI,OXIC,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,SUM_LINK,DATE_OF_ENTRY_y
0,SKRIL2012048,RA226,,,35.0,26.0,,,,08/20/14 00:00:00,...,20.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
1,SKRIL2012049,RA226,,,36.0,22.0,,,,08/20/14 00:00:00,...,27.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
2,SKRIL2012050,RA226,,,38.0,24.0,,,,08/20/14 00:00:00,...,2.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
3,SKRIL2012051,RA226,,,36.0,25.0,,,,08/20/14 00:00:00,...,4.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
4,SKRIL2012052,RA226,,,30.0,23.0,,,,08/20/14 00:00:00,...,6.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00


## Data transformation pipeline

### Normalize nuclide names

#### Lower & strip

Create a callback `LowerStripRdnNameCB` that receives a dictionary of dataframes. For each dataframe in the dictionary, it converts the contents of the nuclide name column, `NUCLIDE`, to lowercase and strips any leading or trailing whitespace.

In [None]:
#| export
class LowerStripRdnNameCB(Callback):
    "Convert nuclide names to lowercase & strip any leading/trailing whitespace"
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # The vectorized `.str` accessor leaves missing values as NaN,
            # whereas a row-wise `x.lower()` raises on them.
            tfm.dfs[k]['NUCLIDE'] = tfm.dfs[k]['NUCLIDE'].str.lower().str.strip()

Here we apply the transform `LowerStripRdnNameCB`, then print the unique nuclide names found in the `NUCLIDE` column of each dataframe included in the dictionary of dataframes, `dfs`.

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB()])
# Run the pipeline once and reuse its result: each `tfm()` call runs the
# callbacks again on the (already transformed) dataframes.
dfs_tfm = tfm()
for grp in ['seawater', 'biota', 'sediment']:
    print(f'{grp} nuclides: ')
    print(dfs_tfm[grp]['NUCLIDE'].unique())

seawater nuclides: 
['cs137' 'sr90' 'h3' 'cs134' 'pu238' 'pu239240' 'am241' 'cm242' 'cm244'
 'tc99' 'k40' 'ru103' 'sr89' 'sb125' 'nb95' 'ru106' 'zr95' 'ag110m'
 'cm243244' 'ba140' 'ce144' 'u234' 'u238' 'co60' 'pu239' 'pb210' 'po210'
 'np237' 'pu240' 'mn54']
biota nuclides: 
['cs134' 'k40' 'co60' 'cs137' 'sr90' 'ag108m' 'mn54' 'co58' 'ag110m'
 'zn65' 'sb125' 'pu239240' 'ru106' 'be7' 'ce144' 'pb210' 'po210' 'sb124'
 'sr89' 'zr95' 'te129m' 'ru103' 'nb95' 'ce141' 'la140' 'i131' 'ba140'
 'pu238' 'u235' 'bi214' 'pb214' 'pb212' 'tl208' 'ac228' 'ra223' 'eu155'
 'ra226' 'gd153' 'sn113' 'fe59' 'tc99' 'co57' 'sn117m' 'eu152' 'sc46'
 'rb86' 'ra224' 'th232' 'cs134137' 'am241' 'ra228' 'th228' 'k-40' 'cs138'
 'cs139' 'cs140' 'cs141' 'cs142' 'cs143' 'cs144' 'cs145' 'cs146']
sediment nuclides: 
['ra226' 'cs137' 'ra228' 'k40' 'sr90' 'cs134137' 'cs134' 'pu239240'
 'pu238' 'co60' 'ru103' 'ru106' 'sb125' 'ag110m' 'ce144' 'am241' 'be7'
 'th228' 'pb210' 'co58' 'mn54' 'zr95' 'ba140' 'po210' 'ra224' 'nb95'
 'p

#### Remap to MARIS nuclide names 

In [None]:
#| export
def get_unique_nuclides(dfs):
    """Collect the nuclide names found in the `NUCLIDE` column of every dataframe.

    Names are unique within each dataframe but the per-dataframe lists are
    simply concatenated, so a nuclide measured in several sample types
    appears several times in the returned list.
    """
    return [nuclide
            for df in dfs.values()
            for nuclide in df['NUCLIDE'].unique().tolist()]

The marisco package includes a template that defines the permitted structure of the data, including the list of permitted nuclide names. This template is located at `nc_tpl_path` and is a `NetCDF` (`*.nc`) file.
This template is created using [`nc_template.ipynb` notebook](../api/nc_template.ipynb). The template is created with reference to two files, `cdl.toml` and `configs.toml`. The `cdl.toml` file includes all the permitted nuclide names. 

Here we check if the nuclide names included in the dfs are listed in the template that defines the permitted structures:

In [None]:
# Check if these variable names consistent with MARIS CDL
has_valid_varname(get_unique_nuclides(tfm.dfs), nc_tpl_path())

"pu239240" variable name not found in MARIS CDL
"cm243244" variable name not found in MARIS CDL
"cs134137" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL
"pu238240" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL
"cs134137" variable name not found in MARIS CDL
"k-40" variable name not found in MARIS CDL
"cs138" variable name not found in MARIS CDL
"cs139" variable name not found in MARIS CDL
"cs140" variable name not found in MARIS CDL
"cs141" variable name not found in MARIS CDL
"cs142" variable name not found in MARIS CDL
"cs143" variable name not found in MARIS CDL
"cs144" variable name not found in MARIS CDL
"cs145" variable name not found in MARIS CDL
"cs146" variable name not found in MARIS CDL


False

Many nuclide names are not listed in the MARIS CDL (i.e. `cdl.toml`).
Create a look up table, `varnames_lut_updates`, which will be used to correct the nuclide names in the dictionary of dataframes (i.e. dfs) that are not compatible with the template at `nc_tpl_path` 

In [None]:
#| export
# Corrections from HELCOM nuclide spellings to the names accepted by the MARIS template.
varnames_lut_updates = {
    # formatting fix
    'k-40': 'k40',
    # combined-isotope totals
    'cm243244': 'cm243_244_tot',
    'cs134137': 'cs134_137_tot',
    'pu239240': 'pu239_240_tot',
    'pu238240': 'pu238_240_tot',
}
# 'cs138' ... 'cs146' are all folded into 'cs137'.
# NOTE(review): presumably these are data-entry artefacts of 'cs137' — confirm with the data provider.
varnames_lut_updates.update({f'cs{mass}': 'cs137' for mass in range(138, 147)})

Create a function, `get_varnames_lut`, which returns a dictionary of nuclide names. This dictionary of nuclide names includes the `NUCLIDE` names in the dictionary of dataframes (i.e. dfs) with the corrections included in varnames_lut_updates:

In [None]:
#| export
def get_varnames_lut(dfs, lut=varnames_lut_updates):
    """Build the nuclide renaming table for `dfs`.

    Every nuclide name found in `dfs` first maps to itself; the entries of
    `lut` (source name -> MARIS name) are then applied on top.

    The `lut` argument is now honoured. Previously it was shadowed and the
    module-level `varnames_lut_updates` was always used, so a caller-supplied
    table was silently ignored.
    """
    varnames = {n: n for n in set(get_unique_nuclides(dfs))}
    varnames.update(lut)
    return varnames

Create a callback that remaps the nuclide names in the dfs to those in `varnames_lut_updates`:

In [None]:
# | export
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names."
    def __init__(self,
                 fn_lut=partial(get_varnames_lut, lut=varnames_lut_updates)):
        # `fn_lut` is called with the dictionary of dataframes and must return
        # a {source name: MARIS name} mapping.
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut(tfm.dfs)
        for k in tfm.dfs.keys():
            # Assign the result back rather than calling `replace(..., inplace=True)`
            # on the selected column: that chained in-place form is not guaranteed
            # to update the parent dataframe (pandas copy-on-write).
            tfm.dfs[k]['NUCLIDE'] = tfm.dfs[k]['NUCLIDE'].replace(lut)

Apply the transformers `LowerStripRdnNameCB` and `RemapRdnNameCB`. Print the unique nuclides for each dataframe included in the dictionary of dataframes:

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB()])

# Run the pipeline once and reuse its result: each `tfm()` call runs the
# callbacks again on the (already transformed) dataframes.
dfs_tfm = tfm()
for grp in ['seawater', 'biota', 'sediment']:
    print(f'{grp} nuclides: ')
    print(dfs_tfm[grp]['NUCLIDE'].unique())

seawater nuclides: 
['cs137' 'sr90' 'h3' 'cs134' 'pu238' 'pu239_240_tot' 'am241' 'cm242'
 'cm244' 'tc99' 'k40' 'ru103' 'sr89' 'sb125' 'nb95' 'ru106' 'zr95'
 'ag110m' 'cm243_244_tot' 'ba140' 'ce144' 'u234' 'u238' 'co60' 'pu239'
 'pb210' 'po210' 'np237' 'pu240' 'mn54']
biota nuclides: 
['cs134' 'k40' 'co60' 'cs137' 'sr90' 'ag108m' 'mn54' 'co58' 'ag110m'
 'zn65' 'sb125' 'pu239_240_tot' 'ru106' 'be7' 'ce144' 'pb210' 'po210'
 'sb124' 'sr89' 'zr95' 'te129m' 'ru103' 'nb95' 'ce141' 'la140' 'i131'
 'ba140' 'pu238' 'u235' 'bi214' 'pb214' 'pb212' 'tl208' 'ac228' 'ra223'
 'eu155' 'ra226' 'gd153' 'sn113' 'fe59' 'tc99' 'co57' 'sn117m' 'eu152'
 'sc46' 'rb86' 'ra224' 'th232' 'cs134_137_tot' 'am241' 'ra228' 'th228']
sediment nuclides: 
['ra226' 'cs137' 'ra228' 'k40' 'sr90' 'cs134_137_tot' 'cs134'
 'pu239_240_tot' 'pu238' 'co60' 'ru103' 'ru106' 'sb125' 'ag110m' 'ce144'
 'am241' 'be7' 'th228' 'pb210' 'co58' 'mn54' 'zr95' 'ba140' 'po210'
 'ra224' 'nb95' 'pu238_240_tot' 'pu241' 'pu239' 'eu155' 'ir192' 'th2

Check that all nuclide varnames are valid. Returns `True` if all are valid:

In [None]:
has_valid_varname(get_unique_nuclides(tfm.dfs), nc_tpl_path())

True

### Parse time

Create a class that remaps the time format in the dictionary of dataframes (i.e. `%m/%d/%y %H:%M:%S`):

In [None]:
#| export
class ParseTimeCB(Callback):
    "Parse the `DATE` column (format `%m/%d/%y %H:%M:%S`) into a datetime `time` column."
    def __call__(self, tfm):
        date_fmt = '%m/%d/%y %H:%M:%S'
        for df in tfm.dfs.values():
            df['time'] = pd.to_datetime(df.DATE, format=date_fmt)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB()])

print(tfm()['seawater']['time'][:5])

0   2012-05-23
1   2012-05-23
2   2012-06-17
3   2012-05-24
4   2012-05-24
Name: time, dtype: datetime64[ns]


### Normalize uncertainty units

In [None]:
#| export
# Make measurement and uncertainty units consistent
def fix_units(df, meas_col, unc_col):
    """Convert a relative error (%) into an uncertainty in the unit of the measurement.

    Returns a Series computed as `df[unc_col] * df[meas_col] / 100`.
    The computation is vectorized: it gives the same values as a row-wise
    `apply`, but is faster and also returns a Series for an empty dataframe.
    """
    return df[unc_col] * df[meas_col] / 100

For each sample type of the Helcom dataset the uncertainty is provided as a relative uncertainty to the value. The column names for each sample type differ.  The `coi_units_unc` defines the column name of the Value and Uncertainty for each sample type.

In [None]:
#| export
# Columns of interest
# Each entry is (group key in the dictionary of dataframes,
#                activity value column, relative error (%) column).
# NOTE(review): the sediment table also has 'VALUE_Bq/m²' / 'ERROR%_m²' columns
# that are not listed here — confirm whether they should be normalized as well.
coi_units_unc = [('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'),
                 ('biota', 'VALUE_Bq/kg', 'ERROR%'),
                 ('sediment', 'VALUE_Bq/kg', 'ERROR%_kg')]

Normalize the uncertainty. The relative error is converted to the uncertainty with the same units as the value. 

In [None]:
#| export
class NormalizeUncUnitCB(Callback):
    """Convert from relative error % to uncertainty of activity unit.

    `coi` is a list of (group, value column, error column) triplets. For each
    of them the error column is overwritten with `error * value / 100`.

    Because the error column is overwritten in place, this callback is NOT
    idempotent: applying it twice to the same dataframes scales the
    uncertainty by `value / 100` a second time.
    """
    def __init__(self, coi=coi_units_unc): fc.store_attr()

    def __call__(self, tfm):
        for grp, val, unc in self.coi:
            tfm.dfs[grp][unc] = self.fix_units(tfm.dfs[grp], val, unc)

    def fix_units(self, df, meas_col, unc_col):
        "Return `df[unc_col] * df[meas_col] / 100` as a Series (vectorized)."
        return df[unc_col] * df[meas_col] / 100

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB()])

# Run the pipeline exactly once. `NormalizeUncUnitCB` overwrites the error
# columns in place, so every extra `tfm()` call multiplies the uncertainty
# by value/100 again and the values shown would be wrong.
dfs_tfm = tfm()
print(dfs_tfm['seawater'][['VALUE_Bq/m³', 'ERROR%_m³']][:5])
print(dfs_tfm['biota'][['VALUE_Bq/kg', 'ERROR%']][:5])
print(dfs_tfm['sediment'][['VALUE_Bq/kg', 'ERROR%_kg']][:5])

   VALUE_Bq/m³  ERROR%_m³
0          5.3      1.696
1         19.9      3.980
2         25.5      5.100
3         17.0      4.930
4         22.2      3.996
   VALUE_Bq/kg    ERROR%
0     0.010140       NaN
1   135.300000  6.535274
2     0.013980       NaN
3     4.338000  0.006549
4     0.009614       NaN
   VALUE_Bq/kg  ERROR%_kg
0         35.0   1.114750
1         36.0   1.026432
2         38.0   1.316928
3         36.0   1.166400
4         30.0   0.621000


### Lookup biota species

Get the list of RUBIN names included in the HELCOM data (i.e. `RUBIN_NAME.csv`):

In [None]:
df_rubin = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')
df_rubin.head(5)

Unnamed: 0,RUBIN_ID,RUBIN,SCIENTIFIC NAME,ENGLISH NAME
0,11,ABRA BRA,ABRAMIS BRAMA,BREAM
1,12,ANGU ANG,ANGUILLA ANGUILLA,EEL
2,13,ARCT ISL,ARCTICA ISLANDICA,ISLAND CYPRINE
3,14,ASTE RUB,ASTERIAS RUBENS,COMMON STARFISH
4,15,CARD EDU,CARDIUM EDULE,COCKLE


In [None]:
dfs['biota']['RUBIN'].unique()

array(['GADU MOR', 'SPRA SPR', 'CLUP HAR', 'MERL MNG', 'LIMA LIM',
       'PLEU PLA', 'PLAT FLE', 'SADU ENT', 'ENGR ENC', 'ESOX LUC',
       'MACO BAL', 'FUCU VES', 'ZOAR VIV', 'OSME EPE', 'MYOX SCO',
       'GYMN CER', 'GAST ACU', 'SCOM SCO', 'MYTI EDU', 'CYPR CAR',
       'ABRA BRA', 'STIZ LUC', 'RUTI RUT', 'PERC FLU', 'MYA ARE',
       'CRAN CRA', 'PLANKTON', 'CARD EDU', 'ARCT ISL', 'CLAD GLO',
       'FURC LUM', 'ANGU ANG', 'FISHLARVAE', 'ENCH CIM', 'ASTE RUB',
       'RHODOPHY', 'LAMI SAC', 'PSET MAX', 'GADU MOR  ', 'POLY FUC',
       'STUC PEC', 'ZANN PALU', 'VERT FUCO', 'CH HI;BA', 'ZOST MARI',
       'CERA DIAP', 'FUCU VES '], dtype=object)

In [None]:
match_maris_species(species_lut_path(), 'PLANKTON')

Unnamed: 0,species_id,species,Taxonname,TaxonDBID,score
281,280,Plankton,Plankton,Q25367,0
696,695,Zooplankton,Zooplankton,Q842627,3
633,632,Palaemon,Palaemon,Q1269791,4
160,159,Neuston,Neuston,Q1552545,5
697,696,Phytoplankton,Phytoplankton,Q184755,5
812,811,Chanos,Chanos,Q15114233,5
220,219,Pagurus,Pagurus,Q2480027,6
867,866,Terapon,Terapon,Q1978136,6
1233,1232,Praunus,Praunus,Q3860707,6
1019,1018,Therapon,Therapon,Q27988534,6


In [None]:
#| export
def get_maris_species(fname_in, fname_cache, overwrite=False, verbose=False):
    """Build (or load from cache) a lookup table from HELCOM `RUBIN` codes to MARIS species.

    Parameters:
        fname_in: folder containing `RUBIN_NAME.csv`.
        fname_cache: file name of the pickled lookup table inside `cache_path()`.
        overwrite: rebuild the table even when the cache file exists.
        verbose: print `source: destination: score` for every species.

    Returns a dict keyed by `RUBIN` code. Each value holds the best match
    (first row returned by `match_maris_species`) as
    `{'id', 'name', 'source', 'status', 'match_type'}`, where `match_type`
    is the matching score.
    """
    fname_cache = cache_path() / fname_cache

    # Serve the cached table when allowed; the csv is then not needed at all.
    if not overwrite and fname_cache.exists():
        return fc.load_pickle(fname_cache)

    df = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')
    lut = {}
    if verbose:
        print('Source:Destination:Score')
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        # The lookup table path is passed first, as in
        # `match_maris_species(species_lut_path(), 'PLANKTON')`.
        best = match_maris_species(species_lut_path(), row['SCIENTIFIC NAME']).iloc[0]
        # Keep the source, destination and score in the lut.
        lut[row['RUBIN']] = {'id': best['species_id'],
                             'name': best['species'],
                             'source': row["SCIENTIFIC NAME"],
                             'status': 'marisco_cdl',
                             'match_type': best['score']}
        if verbose:
            print(f'{row["SCIENTIFIC NAME"]}: {best["species"]}: {best["score"]}')

    fc.save_pickle(fname_cache, lut)
    return lut

Show `species_lut` as a dataframe:

In [None]:
species_lut = get_maris_species(fname_in, 'species_helcom.pkl', overwrite=False, verbose=False)
species_lut_df = pd.DataFrame(species_lut).transpose()
species_lut_df

  0%|          | 0/46 [00:00<?, ?it/s]


KeyError: "['AphiaID', 'scientificname', 'status', 'rank', 'match_type'] not in index"

Show maris_species_lut where match_type is not a perfect match ( i.e. not equal 0).

In [None]:
species_lut_df[species_lut_df['match_type'] != 0]

Unnamed: 0,id,name,source,status,match_type
CARD EDU,988,Cardiidae,CARDIUM EDULE,marisco_cdl,6
CH HI;BA,122,Macoma balthica,CHARA BALTICA,marisco_cdl,6
ENCH CIM,276,Echinodermata,ENCHINODERMATA CIM,marisco_cdl,5
LAMI SAC,149,Laminaria japonica,LAMINARIA SACCHARINA,marisco_cdl,7
MACO BAL,122,Macoma balthica,MACOMA BALTICA,marisco_cdl,1
PSET MAX,675,Pinctada maxima,PSETTA MAXIMA,marisco_cdl,5
STIZ LUC,285,Sander lucioperca,STIZOSTEDION LUCIOPERCA,marisco_cdl,10
STUC PEC,704,Stuckenia pectinata,STUCKENIA PECTINATE,marisco_cdl,1


`get_worms_species` looks up the species listed in `RUBIN_NAME.csv` against the WoRMS database at `https://www.marinespecies.org/rest/AphiaRecordsByMatchNames`. If the `load_lut` parameter equals `True`, an existing lut is first read from `fname_cache`. If a species in `RUBIN_NAME.csv` is already listed in that lut with a perfect score (i.e. 0), then no WoRMS lookup is performed for that species.

In [None]:
#| export
def get_worms_species(fname_in, fname_cache, load_lut=False, overwrite=False):
    """Build (or load from cache) a `RUBIN` code lookup table using WoRMS name matching.

    Parameters:
        fname_in: folder containing `RUBIN_NAME.csv`.
        fname_cache: file name of the pickled lookup table inside `cache_path()`.
        load_lut: seed the table with the existing cache and skip the WoRMS
            lookup for entries whose `match_type` is 0.
        overwrite: rebuild the table even when the cache file exists.

    Returns a dict keyed by `RUBIN` code with values
    `{'id', 'name', 'source', 'status', 'match_type', 'unacceptreason'}`.
    Species without a match get `id == -1`.

    NOTE(review): entries written here store the WoRMS `AphiaID` under 'id',
    while entries kept from a seeded lut keep whatever id they already had.
    Confirm that consumers of 'id' expect this mix.
    """
    fname_cache = cache_path() / fname_cache

    if not overwrite and fname_cache.exists():
        return fc.load_pickle(fname_cache)

    df = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')

    # Seed from the cache only when there is one; loading unconditionally
    # fails on a first run with `load_lut=True`.
    lut = fc.load_pickle(fname_cache) if (load_lut and fname_cache.exists()) else {}

    for _, row in tqdm(df[['RUBIN', 'SCIENTIFIC NAME']].iterrows(), total=df.shape[0]):
        # An already perfect match (score 0) in the seeded lut needs no WoRMS lookup.
        if load_lut and row['RUBIN'] in lut and lut[row['RUBIN']]['match_type'] == 0:
            continue

        res = match_worms(row['SCIENTIFIC NAME'])
        if (res == -1):
            print(f"No match found for {row['RUBIN']} ({row['SCIENTIFIC NAME']})")
            lut[row['RUBIN']] = {'id': -1, 'name': '', 'source': row["SCIENTIFIC NAME"],
                                 'status': 'No match', 'match_type': 'No match',
                                 'unacceptreason': 'No match'}
        else:
            if len(res[0]) > 1:
                print(f"Several matches for {row['RUBIN']} ({row['SCIENTIFIC NAME']})")

            # Only the first returned record is kept.
            first = res[0][0]
            lut[row['RUBIN']] = {'id': first.get('AphiaID'),
                                 'name': first.get('scientificname'),
                                 'source': row["SCIENTIFIC NAME"],
                                 'status': first.get('status'),
                                 'match_type': first.get('match_type'),
                                 'unacceptreason': first.get('unacceptreason')}
    fc.save_pickle(fname_cache, lut)
    return lut

In [None]:
species_lut = get_worms_species(fname_in, 'species_helcom.pkl', load_lut=True, overwrite=True); 

 24%|██▍       | 11/46 [01:11<03:35,  6.17s/it]

No match found for ENCH CIM (ENCHINODERMATA CIM)


100%|██████████| 46/46 [01:23<00:00,  1.81s/it]


Show species_lut as a dataframe after worms lookup. 

In [None]:
species_lut_df=pd.DataFrame(species_lut).transpose()
species_lut_df

Unnamed: 0,id,name,source,status,match_type,unacceptreason
ABRA BRA,271,Abramis brama,ABRAMIS BRAMA,marisco_cdl,0,
ANGU ANG,272,Anguilla anguilla,ANGUILLA ANGUILLA,marisco_cdl,0,
ARCT ISL,273,Arctica islandica,ARCTICA ISLANDICA,marisco_cdl,0,
ASTE RUB,21,Asterias rubens,ASTERIAS RUBENS,marisco_cdl,0,
CARD EDU,152921,Cardium edule,CARDIUM EDULE,superseded combination,exact,original combination
CH HI;BA,399467,Chara baltica,CHARA BALTICA,accepted,exact,
CLAD GLO,290,Cladophora glomerata,CLADOPHORA GLOMERATA,marisco_cdl,0,
CLUP HAR,50,Clupea harengus,CLUPEA HARENGUS,marisco_cdl,0,
CRAN CRA,59,Crangon crangon,CRANGON CRANGON,marisco_cdl,0,
CYPR CAR,275,Cyprinus carpio,CYPRINUS CARPIO,marisco_cdl,0,


Show all rows which were included in the WORMS lookup  

In [None]:
species_lut_df[species_lut_df['match_type']!=0]

Unnamed: 0,id,name,source,status,match_type,unacceptreason
CARD EDU,152921,Cardium edule,CARDIUM EDULE,superseded combination,exact,original combination
CH HI;BA,399467,Chara baltica,CHARA BALTICA,accepted,exact,
ENCH CIM,-1,,ENCHINODERMATA CIM,No match,No match,No match
LAMI SAC,145730,Laminaria saccharina,LAMINARIA SACCHARINA,unaccepted,exact,
MACO BAL,141579,Macoma balthica,MACOMA BALTICA,accepted,phonetic,
PSET MAX,154473,Psetta maxima,PSETTA MAXIMA,unaccepted,exact,
STIZ LUC,321686,Stizostedion lucioperca,STIZOSTEDION LUCIOPERCA,unaccepted,exact,
STUC PEC,588573,Stuckenia pectinata,STUCKENIA PECTINATE,accepted,near_1,


In [None]:
#| export
class LookupBiotaSpeciesCB(Callback):
    """Match species with MARIS database.

    `fn_lut` is a zero-argument callable returning a dict keyed by `RUBIN`
    code whose values expose an 'id' entry. Biota rows whose species id is -1
    are removed.
    """
    def __init__(self, fn_lut): fc.store_attr()
    def __call__(self, tfm):
        lut = self.fn_lut()
        biota = tfm.dfs['biota']
        # RUBIN codes in the data may carry surrounding whitespace (e.g. 'GADU MOR  ').
        biota['species_id'] = biota['RUBIN'].apply(
            lambda x: lut[x.strip()]['id'])
        # Remove data with a species_id of -1. A boolean mask selects exactly the
        # offending rows, whereas dropping by index label would also remove
        # unrelated rows if index labels were not unique.
        tfm.dfs['biota'] = biota[biota['species_id'] != -1].copy()

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(partial(get_maris_species, 
                                                         fname_in, 'species_helcom.pkl'))
                            ])

print(tfm()['biota'][['RUBIN', 'species_id']][:5])

      RUBIN  species_id
0  GADU MOR          99
1  GADU MOR          99
2  GADU MOR          99
3  GADU MOR          99
4  GADU MOR          99


### Lookup biota tissues

In [None]:
dfs['biota']['TISSUE'].unique()

array([ 5,  1, 41,  3, 51, 43, 42, 12, 10, 18, 52, 20,  8, 54, 53, 13])

In [None]:
#| export
def get_bodypart(verbose=False):
    """Naive lut - TO BE REFACTORED.

    Returns a dict mapping HELCOM `TISSUE` codes to MARIS `bodypar_id` values.
    With `verbose=True`, the description of every mapped pair is printed as
    `<HELCOM tissue description>  :  <MARIS body part>`. This reads
    `dbo_bodypar.xlsx` and `TISSUE.csv` from hard-coded relative paths.
    """
    lut={
        5: 52,
        1: 1,
        41: 1,
        3: 3,
        51: 54,
        43: 19,        
        42: 59,
        12: 20,
        10: 7,
        18: 25,
        52: 55,
        20: 38,
        8: 12,
        54: 57,
        53: 56,
        13:21}
    
    if verbose:
        marris_dbo_bodypar=pd.read_excel('../../nbs/files/lut/dbo_bodypar.xlsx')
        helcom_tissue=pd.read_csv('../../_data/accdb/mors/csv/TISSUE.csv')
        # Header order now matches the rows below (HELCOM first, MARIS second);
        # it was previously printed the other way round.
        print ('helcom_tissue  :  marris_dbo_bodypar')
        for k, v in lut.items():
            tissue_desc = helcom_tissue[helcom_tissue.TISSUE==int(k)].TISSUE_DESCRIPTION.values[0]
            bodypar_name = marris_dbo_bodypar[marris_dbo_bodypar.bodypar_id==v].bodypar.values[0]
            print (str(tissue_desc) + '  :  ' + str(bodypar_name))
    return lut

In [None]:
#| export
class LookupBiotaBodyPartCB(Callback):
    'Update bodypart id based on MARIS dbo_bodypar.xlsx'
    def __init__(self, fn_lut): fc.store_attr()
    def __call__(self, tfm):
        # `fn_lut` returns the {TISSUE code: body part id} mapping; a code
        # missing from it raises a KeyError.
        tissue_to_bodypart = self.fn_lut()
        biota = tfm.dfs['biota']
        biota['body_part'] = biota['TISSUE'].apply(
            lambda tissue: tissue_to_bodypart[tissue])

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaBodyPartCB(get_bodypart)
                            ])

print(tfm()['biota'][['TISSUE', 'body_part']][:5])

   TISSUE  body_part
0       5         52
1       5         52
2       5         52
3       5         52
4       5         52


### Lookup sediment types

In [None]:
df_sediment = pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv')
df_sediment.head(5)

Unnamed: 0,SEDI,SEDIMENT TYPE,RECOMMENDED TO BE USED
0,-99,NO DATA,
1,0,GRAVEL,YES
2,1,SAND,YES
3,2,FINE SAND,NO
4,3,SILT,YES


In [None]:
df_sediment['SEDI'].unique()

array([-99,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  20,  21,  22,  23,  24,  25,  30,  31,  32,
        33,  34,  35,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,
        50,  51,  52,  54,  55,  57,  58,  59])

In [None]:
#| export
def get_sediment(verbose=False, src_dir=None):
    """Build a lookup table from HELCOM `SEDI` codes to MARIS sediment type ids.

    Parameters:
        verbose: print `(source id) source name: (destination id) destination name`.
        src_dir: folder containing `SEDIMENT_TYPE.csv`. When omitted, the
            module-level `fname_in` is used, as before.

    Each HELCOM sediment type is assigned the best match returned by
    `match_maris_sediment` (first row: id in column 0, name in column 1).

    NOTE(review): since the best match is always taken, the 'NO DATA' entry
    (-99) is also assigned a real sediment type ('Soft' in the recorded
    output) — confirm this is intended.
    """
    if src_dir is None:
        src_dir = fname_in
    lut = {}
    if verbose: print('Source:Destination')
    df_sediment = pd.read_csv(Path(src_dir) / 'SEDIMENT_TYPE.csv')

    for _, row in df_sediment.iterrows():
        best = match_maris_sediment(row['SEDIMENT TYPE']).iloc[0]
        lut[row['SEDI']] = best.iloc[0]
        if verbose: print(f'({row["SEDI"]}) {row["SEDIMENT TYPE"]}: ({best.iloc[0]}) {best.iloc[1]}')
    return lut

In [None]:
get_sediment(verbose=True)

Source:Destination
(-99) NO DATA: (26) Soft
(0) GRAVEL: (2) Gravel
(1) SAND: (6) Sand
(2) FINE SAND: (7) Fine sand
(3) SILT: (12) Silt
(4) CLAY: (1) Clay
(5) MUD: (4) Mud
(6) GLACIAL: (25) Glacial
(7) SOFT: (26) Soft
(8) SULPHIDIC: (27) Sulphidic
(9) Fe-Mg CONCRETIONS: (28) Fe-Mg concretions
(10) SAND AND GRAVEL: (29) Sand and gravel
(11) PURE SAND: (30) Pure sand
(12) SAND AND FINE SAND: (31) Sand and fine sand
(13) SAND AND SILT: (62) Sand and silt
(14) SAND AND CLAY: (32) Sand and clay
(15) SAND AND MUD: (33) Sand and mud
(20) FINE SAND AND GRAVEL: (34) Fine sand and gravel
(21) FINE SAND AND SAND: (35) Fine sand and sand
(22) PURE FINE SAND: (36) Pure fine sand
(23) FINE SAND AND SILT: (37) Fine sand and silt
(24) FINE SAND AND CLAY: (38) Fine sand and clay
(25) FINE SAND AND MUD: (39) Fine sand and mud
(30) SILT AND GRAVEL: (11) Silt and gravel
(31) SILT AND SAND: (40) Silt and sand
(32) SILT AND FINE SAND: (41) Silt and fine sand
(33) PURE SILT: (42) Pure silt
(34) SILT AND CLAY:

{-99: 26,
 0: 2,
 1: 6,
 2: 7,
 3: 12,
 4: 1,
 5: 4,
 6: 25,
 7: 26,
 8: 27,
 9: 28,
 10: 29,
 11: 30,
 12: 31,
 13: 62,
 14: 32,
 15: 33,
 20: 34,
 21: 35,
 22: 36,
 23: 37,
 24: 38,
 25: 39,
 30: 11,
 31: 40,
 32: 41,
 33: 42,
 34: 10,
 35: 43,
 40: 44,
 41: 45,
 42: 46,
 43: 48,
 44: 47,
 45: 49,
 46: 50,
 47: 51,
 48: 52,
 49: 53,
 50: 54,
 51: 55,
 52: 56,
 54: 57,
 55: 58,
 57: 59,
 58: 60,
 59: 61}

In [None]:
dfs['sediment']['SEDI'].unique()

array([ nan, -99.,   0.,  55.,  11.,  57.,  51.,  52.,  22.,  10.,  44.,
         5.,  50.,  15.,   1.,  40.,  33.,  43.,  59.,  54.,   9.,  45.,
        14.,  41.,  25.,  42.,  24.,  12.,  58.,  13.,   7.,  49.,  48.,
         4.,  47.,  23.,  20.,  46.,   2.,  34.,  32.,  56.,  35.,  73.,
        21.])

In [None]:
lut_sediment = get_sediment(verbose=True)

Source:Destination
(-99) NO DATA: (26) Soft
(0) GRAVEL: (2) Gravel
(1) SAND: (6) Sand
(2) FINE SAND: (7) Fine sand
(3) SILT: (12) Silt
(4) CLAY: (1) Clay
(5) MUD: (4) Mud
(6) GLACIAL: (25) Glacial
(7) SOFT: (26) Soft
(8) SULPHIDIC: (27) Sulphidic
(9) Fe-Mg CONCRETIONS: (28) Fe-Mg concretions
(10) SAND AND GRAVEL: (29) Sand and gravel
(11) PURE SAND: (30) Pure sand
(12) SAND AND FINE SAND: (31) Sand and fine sand
(13) SAND AND SILT: (62) Sand and silt
(14) SAND AND CLAY: (32) Sand and clay
(15) SAND AND MUD: (33) Sand and mud
(20) FINE SAND AND GRAVEL: (34) Fine sand and gravel
(21) FINE SAND AND SAND: (35) Fine sand and sand
(22) PURE FINE SAND: (36) Pure fine sand
(23) FINE SAND AND SILT: (37) Fine sand and silt
(24) FINE SAND AND CLAY: (38) Fine sand and clay
(25) FINE SAND AND MUD: (39) Fine sand and mud
(30) SILT AND GRAVEL: (11) Silt and gravel
(31) SILT AND SAND: (40) Silt and sand
(32) SILT AND FINE SAND: (41) Silt and fine sand
(33) PURE SILT: (42) Pure silt
(34) SILT AND CLAY:

In [None]:
dfs['sediment']['SEDI'].fillna(-99).astype('int').unique()

array([-99,   0,  55,  11,  57,  51,  52,  22,  10,  44,   5,  50,  15,
         1,  40,  33,  43,  59,  54,   9,  45,  14,  41,  25,  42,  24,
        12,  58,  13,   7,  49,  48,   4,  47,  23,  20,  46,   2,  34,
        32,  56,  35,  73,  21])

In [None]:
#| export
class LookupSedimentCB(Callback):
    'Update sediment id  based on MARIS dbo_sedtype.xlsx'
    # NOTE(review): the docstring above is left byte-identical on purpose; the same
    # text shows up in `tfm.logs`, so it looks like callback docstrings are logged.
    #
    # fn_lut: zero-argument callable returning a mapping indexed by the integer
    #         HELCOM `SEDI` code and yielding the MARIS sediment type id.
    def __init__(self, fn_lut): fc.store_attr()
    def __call__(self, tfm):
        lut = self.fn_lut()
        # Read from the transformer's own frames. The previous version read a
        # notebook-level `dfs` global, which does not exist (or is stale) when the
        # callback runs inside `encode` or from the exported module.
        sedi = tfm.dfs['sediment']['SEDI'].fillna(-99).astype('int')
        # To check with Helcom: codes 56 and 73 occur in the data but appear to have
        # no entry in the lookup table, so they are treated as "no data" (-99).
        sedi = sedi.replace([56, 73], -99)
        # Assign the cleaned column back explicitly instead of relying on
        # `inplace=True` over a chained selection.
        tfm.dfs['sediment']['SEDI'] = sedi
        # Any code still missing from `lut` raises KeyError, as before.
        tfm.dfs['sediment']['sed_type'] = sedi.apply(lambda x: lut[x])

In [None]:
# Try the sediment lookup on freshly loaded data and show the first mapped rows.
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        LookupSedimentCB(get_sediment),
    ],
)

print(tfm()['sediment'][['SEDI', 'sed_type']].head(5))

   SEDI  sed_type
0   -99        26
1   -99        26
2   -99        26
3   -99        26
4   -99        26


***

### Capture Units

In [None]:
#| export
# Unit lookup rules: name of the activity value column -> MARIS unit id.
# The closing brace used to sit inside the trailing comment, leaving the dict unclosed.
renaming_unit_rules = {
    'VALUE_Bq/m³': 1,  # 'Bq/m3'
    'VALUE_Bq/kg': 3,  # 'Bq/kg'
}
                  

In [None]:
#| export
class LookupUnitCB(Callback):
    # Adds an integer `unit` column to every group that has at least one of the
    # value columns listed in `renaming_unit_rules`:
    #   - rows where the value column is populated get that column's unit id,
    #   - rows where it is missing get 0.
    # Groups without any listed value column are left untouched.
    #
    # NOTE(review): documented with comments rather than a docstring because callback
    # docstrings appear to be collected into `tfm.logs` -- confirm before converting.
    def __init__(self,
                 renaming_unit_rules=renaming_unit_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for grp, df in tfm.dfs.items():
            matching = [(col, code)
                        for col, code in self.renaming_unit_rules.items()
                        if col in df.columns]
            if not matching:
                continue

            # Start from "no unit" and only fill rows that are still unset, so a
            # second matching value column can no longer wipe the ids written for
            # the first one (previously each rule overwrote the whole column).
            unit = np.zeros(len(df), dtype=np.int64)
            for col, code in matching:
                has_value = df[col].notna().to_numpy()
                unit[has_value & (unit == 0)] = np.int64(code)

            df['unit'] = unit


In [None]:
# Run the pipeline up to the unit lookup on freshly loaded data.
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        NormalizeUncUnitCB(),
        LookupBiotaSpeciesCB(
            partial(get_maris_species, fname_in, 'species_helcom.pkl')
        ),
        LookupBiotaBodyPartCB(get_bodypart),
        LookupSedimentCB(get_sediment),
        LookupUnitCB(),
    ],
)

tfm()

{'seawater':                 KEY NUCLIDE METHOD < VALUE_Bq/m³  VALUE_Bq/m³  ERROR%_m³  \
 0      WKRIL2012003   cs137    NaN           NaN          5.3      1.696   
 1      WKRIL2012004   cs137    NaN           NaN         19.9      3.980   
 2      WKRIL2012005   cs137    NaN           NaN         25.5      5.100   
 3      WKRIL2012006   cs137    NaN           NaN         17.0      4.930   
 4      WKRIL2012007   cs137    NaN           NaN         22.2      3.996   
 ...             ...     ...    ...           ...          ...        ...   
 21211  WSSSM2021005      h3  SSM45           NaN       1030.0    960.000   
 21212  WSSSM2021006      h3  SSM45           NaN       2240.0    970.000   
 21213  WSSSM2021007      h3  SSM45           NaN       2060.0    970.000   
 21214  WSSSM2021008      h3  SSM45           NaN       2300.0   1000.000   
 21215  WSSSM2021004      h3  SSM45             <          NaN        NaN   
 
          DATE_OF_ENTRY_x  COUNTRY LABORATORY   SEQUENCE  ... 

In [None]:
# Sediment values come from `VALUE_Bq/kg`, so populated rows carry unit id 3.
tfm.dfs['sediment']['unit']

0        3
1        3
2        3
3        3
4        3
        ..
39812    3
39813    3
39814    3
39815    3
39816    3
Name: unit, Length: 39817, dtype: int64

***

### Rename columns

In [None]:
#| export
# Columns of interest for each sample type, in the order they are kept.
coi_grp = {
    'seawater': [
        'NUCLIDE',
        'VALUE_Bq/m³',
        'ERROR%_m³',
        'time',
        'TDEPTH',
        'LATITUDE (dddddd)',
        'LONGITUDE (dddddd)',
        'unit',
    ],
    'sediment': [
        'NUCLIDE',
        'VALUE_Bq/kg',
        'ERROR%_kg',
        'time',
        'TDEPTH',
        'LATITUDE (dddddd)',
        'LONGITUDE (dddddd)',
        'sed_type',
        'unit',
    ],
    'biota': [
        'NUCLIDE',
        'VALUE_Bq/kg',
        'ERROR%',
        'time',
        'SDEPTH',
        'LATITUDE ddmmmm',
        'LONGITUDE ddmmmm',
        'species_id',
        'body_part',
        'unit',
    ],
}


In [None]:
#| export
# Column renaming rules, written as target name -> source columns and flattened
# into the usual {source: target} mapping.
renaming_rules = {
    source: target
    for target, sources in (
        ('nuclide', ('NUCLIDE',)),
        ('value', ('VALUE_Bq/m³', 'VALUE_Bq/kg')),
        ('unc', ('ERROR%_m³', 'ERROR%_kg', 'ERROR%')),
        ('depth', ('TDEPTH', 'SDEPTH')),
        ('lat', ('LATITUDE (dddddd)', 'LATITUDE ddmmmm')),
        ('lon', ('LONGITUDE (dddddd)', 'LONGITUDE ddmmmm')),
    )
    for source in sources
}


In [None]:
#| export
class RenameColumnCB(Callback):
    # For every group: keep only the columns listed in `coi` for that group
    # (a group missing from `coi`, or a listed column missing from the frame,
    # raises KeyError), then rename them according to `renaming_rules`.
    def __init__(self,
                 coi=coi_grp,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            selected = tfm.dfs[grp].loc[:, self.coi[grp]]
            tfm.dfs[grp] = selected.rename(columns=self.renaming_rules)

In [None]:
# Run the pipeline up to column selection/renaming on freshly loaded data.
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        NormalizeUncUnitCB(),
        LookupBiotaSpeciesCB(
            partial(get_maris_species, fname_in, 'species_helcom.pkl')
        ),
        LookupBiotaBodyPartCB(get_bodypart),
        LookupSedimentCB(get_sediment),
        LookupUnitCB(),
        RenameColumnCB(),
    ],
)

tfm()

{'seawater':       nuclide   value       unc       time  depth      lat      lon  unit
 0       cs137     5.3     1.696 2012-05-23    NaN  60.0833  29.3333     1
 1       cs137    19.9     3.980 2012-05-23    NaN  60.0833  29.3333     1
 2       cs137    25.5     5.100 2012-06-17    NaN  59.4333  23.1500     1
 3       cs137    17.0     4.930 2012-05-24    NaN  60.2500  27.9833     1
 4       cs137    22.2     3.996 2012-05-24    NaN  60.2500  27.9833     1
 ...       ...     ...       ...        ...    ...      ...      ...   ...
 21211      h3  1030.0   960.000 2021-10-15    NaN  60.5200  18.3572     1
 21212      h3  2240.0   970.000 2021-11-04    NaN  57.4217  17.0000     1
 21213      h3  2060.0   970.000 2021-10-15    NaN  57.2347  11.9452     1
 21214      h3  2300.0  1000.000 2021-05-17    NaN  57.2347  11.9452     1
 21215      h3     NaN       NaN 2021-05-13    NaN  58.6033  11.2450     0
 
 [21216 rows x 8 columns],
 'sediment':       nuclide   value       unc       time  de

In [None]:
# Biota frame after column selection and renaming.
tfm.dfs['biota']

Unnamed: 0,nuclide,value,unc,time,depth,lat,lon,species_id,body_part,unit
0,cs134,0.010140,,2012-09-23,,54.170,12.1900,99,52,3
1,k40,135.300000,4.830210,2012-09-23,,54.170,12.1900,99,52,3
2,co60,0.013980,,2012-09-23,,54.170,12.1900,99,52,3
3,cs137,4.338000,0.150962,2012-09-23,,54.170,12.1900,99,52,3
4,cs134,0.009614,,2012-09-23,,54.170,12.1900,99,52,3
...,...,...,...,...,...,...,...,...,...,...
15822,k40,65.000000,6.630000,2020-10-09,0.0,60.224,18.2374,141579,1,3
15823,cs137,4.500000,0.279000,2020-10-09,0.0,60.224,18.2374,141579,1,3
15824,be7,94.000000,3.196000,2020-10-26,0.0,60.302,18.2200,96,54,3
15825,k40,1100.000000,17.600000,2020-10-26,0.0,60.302,18.2200,96,54,3


***

### Reshape: long to wide

In [None]:
#| export
class ReshapeLongToWide(Callback):
    # Pivots every group from long format (one row per nuclide measurement) to wide
    # format: one row per combination of the remaining columns, with `value`,
    # `unc` and `unit` spread over one column per nuclide.
    #
    # NOTE(review): documented with comments rather than a docstring because callback
    # docstrings appear to be collected into `tfm.logs` -- confirm before converting.
    # NOTE(review): `pivot_table` aggregates duplicates with its default `aggfunc`
    # (mean), which also applies to `unit`, and rows with a missing index value
    # (e.g. NaN depth) are presumably dropped by the grouping -- confirm both are intended.
    def __init__(self): fc.store_attr()

    def __call__(self, tfm):
        cols = ['nuclide']
        vals = ['value', 'unc', 'unit']

        for k in tfm.dfs.keys():
            # Every column that is neither pivoted nor a pivoted value identifies a
            # sample. Sorted, because plain set iteration order depends on string
            # hashing and made the column/row order differ from one run to the next.
            idx = sorted(set(tfm.dfs[k].columns) - set(cols + vals))

            wide = tfm.dfs[k].pivot_table(index=idx,
                                          columns=cols,
                                          values=vals).reset_index()

            # Flatten cols name
            wide.columns = rename_cols(wide.columns)

            # Unit ids must stay integers: nuclides not measured for a sample
            # come out of the pivot as NaN and are mapped to 0.
            unit_cols = [col for col in wide.columns if 'unit' in col]
            wide[unit_cols] = wide[unit_cols].fillna(0).astype('int64')

            # Set index
            wide.index.name = 'sample'
            tfm.dfs[k] = wide

In [None]:
# Run the pipeline up to the long-to-wide reshape on freshly loaded data.
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        NormalizeUncUnitCB(),
        LookupBiotaSpeciesCB(
            partial(get_maris_species, fname_in, 'species_helcom.pkl')
        ),
        LookupBiotaBodyPartCB(get_bodypart),
        LookupSedimentCB(get_sediment),
        LookupUnitCB(),
        RenameColumnCB(),
        ReshapeLongToWide(),
    ],
)

tfm()

{'seawater':         depth      lat      lon       time  ag110m_unc  am241_unc  ba140_unc  \
 sample                                                                         
 0         0.0  54.3500  11.0783 1986-05-09         NaN        NaN        NaN   
 1         0.0  54.5000  10.3167 1986-05-11         NaN        NaN        NaN   
 2         0.0  55.3050  21.0266 2019-02-12         NaN        NaN        NaN   
 3         0.0  55.3483  16.4483 1986-05-09         NaN        NaN        NaN   
 4         0.0  55.5000  18.8800 2010-08-02         NaN        NaN        NaN   
 ...       ...      ...      ...        ...         ...        ...        ...   
 4814    459.0  58.5833  18.2333 1989-06-10         NaN        NaN        NaN   
 4815    460.0  58.5800  18.2333 1994-05-21         NaN   0.000252        NaN   
 4816    460.0  58.5833  18.2333 1993-07-24         NaN        NaN        NaN   
 4817    460.0  58.5833  18.2317 2000-08-03         NaN        NaN        NaN   
 4818    460.0  

***

### Encode time (seconds since ...)

In [None]:
# Run the pipeline up to time encoding on freshly loaded data.
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        NormalizeUncUnitCB(),
        LookupBiotaSpeciesCB(
            partial(get_maris_species, fname_in, 'species_helcom.pkl')
        ),
        LookupBiotaBodyPartCB(get_bodypart),
        LookupSedimentCB(get_sediment),
        LookupUnitCB(),
        RenameColumnCB(),
        ReshapeLongToWide(),
        EncodeTimeCB(cfg()),
    ],
)

tfm()

{'seawater':         depth      lat      lon        time  ag110m_unc  am241_unc  ba140_unc  \
 sample                                                                          
 0         0.0  54.3500  11.0783   515980800         NaN        NaN        NaN   
 1         0.0  54.5000  10.3167   516153600         NaN        NaN        NaN   
 2         0.0  55.3050  21.0266  1549929600         NaN        NaN        NaN   
 3         0.0  55.3483  16.4483   515980800         NaN        NaN        NaN   
 4         0.0  55.5000  18.8800  1280707200         NaN        NaN        NaN   
 ...       ...      ...      ...         ...         ...        ...        ...   
 4814    459.0  58.5833  18.2333   613440000         NaN        NaN        NaN   
 4815    460.0  58.5800  18.2333   769478400         NaN   0.000252        NaN   
 4816    460.0  58.5833  18.2333   743472000         NaN        NaN        NaN   
 4817    460.0  58.5833  18.2317   965260800         NaN        NaN        NaN   
 481

***

### Sanitize coordinates

In [None]:
# Run the pipeline up to coordinate sanitization on freshly loaded data.
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        NormalizeUncUnitCB(),
        LookupBiotaSpeciesCB(
            partial(get_maris_species, fname_in, 'species_helcom.pkl')
        ),
        LookupBiotaBodyPartCB(get_bodypart),
        LookupSedimentCB(get_sediment),
        LookupUnitCB(),
        RenameColumnCB(),
        ReshapeLongToWide(),
        EncodeTimeCB(cfg()),
        SanitizeLonLatCB(),
    ],
)

tfm()


{'seawater':         depth      lat      lon        time  ag110m_unc  am241_unc  ba140_unc  \
 sample                                                                          
 0         0.0  54.3500  11.0783   515980800         NaN        NaN        NaN   
 1         0.0  54.5000  10.3167   516153600         NaN        NaN        NaN   
 2         0.0  55.3050  21.0266  1549929600         NaN        NaN        NaN   
 3         0.0  55.3483  16.4483   515980800         NaN        NaN        NaN   
 4         0.0  55.5000  18.8800  1280707200         NaN        NaN        NaN   
 ...       ...      ...      ...         ...         ...        ...        ...   
 4814    459.0  58.5833  18.2333   613440000         NaN        NaN        NaN   
 4815    460.0  58.5800  18.2333   769478400         NaN   0.000252        NaN   
 4816    460.0  58.5833  18.2333   743472000         NaN        NaN        NaN   
 4817    460.0  58.5833  18.2317   965260800         NaN        NaN        NaN   
 481

***

## Encode to NetCDF

In [None]:
# Full transformation pipeline; the resulting `tfm` is reused by the cells below
# (`tfm.logs`, `get_attrs(tfm, ...)`).
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        NormalizeUncUnitCB(),
        LookupBiotaSpeciesCB(
            partial(get_maris_species, fname_in, 'species_helcom.pkl')
        ),
        LookupBiotaBodyPartCB(get_bodypart),
        LookupSedimentCB(get_sediment),
        LookupUnitCB(),
        RenameColumnCB(),
        ReshapeLongToWide(),
        EncodeTimeCB(cfg()),
        SanitizeLonLatCB(),
    ],
)

tfm()

{'seawater':         depth      lat      lon        time  ag110m_unc  am241_unc  ba140_unc  \
 sample                                                                          
 0         0.0  54.3500  11.0783   515980800         NaN        NaN        NaN   
 1         0.0  54.5000  10.3167   516153600         NaN        NaN        NaN   
 2         0.0  55.3050  21.0266  1549929600         NaN        NaN        NaN   
 3         0.0  55.3483  16.4483   515980800         NaN        NaN        NaN   
 4         0.0  55.5000  18.8800  1280707200         NaN        NaN        NaN   
 ...       ...      ...      ...         ...         ...        ...        ...   
 4814    459.0  58.5833  18.2333   613440000         NaN        NaN        NaN   
 4815    460.0  58.5800  18.2333   769478400         NaN   0.000252        NaN   
 4816    460.0  58.5833  18.2333   743472000         NaN        NaN        NaN   
 4817    460.0  58.5833  18.2317   965260800         NaN        NaN        NaN   
 481

In [None]:
# Log messages recorded during the last `tfm()` run; `get_attrs` joins them into
# the `publisher_postprocess_logs` attribute.
tfm.logs

['Convert nuclide names to lowercase & strip any trailing space(s)',
 'Remap to MARIS radionuclide names.',
 'Convert from relative error % to uncertainty of activity unit',
 'Match species with MARIS database.',
 'Update bodypart id based on MARIS dbo_bodypar.xlsx',
 'Update sediment id  based on MARIS dbo_sedtype.xlsx',
 'Encode time as `int` representing seconds since xxx',
 'Drop row when both longitude & latitude equal 0. Drop unrealistic longitude & latitude values. Convert longitude & latitude `,` separator to `.` separator.']

### Feed global attributes

In [None]:
#| export
# Keywords written to the `keywords` global attribute (joined with ', ' in `get_attrs`).
# NOTE(review): 'Ocean Chemistry> Radionuclides' lacks a space before '>', and two
# entries hold two comma-separated keyword paths in a single string -- left as-is
# so the generated attribute does not change; confirm whether they should be normalized.
kw = [
    'oceanography',
    'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
    'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
    'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
    'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
    'Earth Science > Oceans > Water Quality > Ocean Contaminants',
    'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
    'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
    'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
    'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
    'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)',
]


In [None]:
#| export
def get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw):
    """Build the global attributes for the dataset held by `tfm`.

    Args:
        tfm: transformer whose `dfs` are summarised and whose `logs` are joined
            into the `publisher_postprocess_logs` attribute.
        zotero_key: key handed to `ZoteroCB`.
        kw: iterable of keyword strings, joined with ', ' into `keywords`.

    Returns:
        Whatever calling the configured `GlobAttrsFeeder` returns.
    """
    feeder_cbs = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(cfg()),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=feeder_cbs)
    return feeder()

In [None]:
# Preview the global attributes derived from the transformed data and its logs.
get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw)

{'geospatial_lat_min': '31.1667',
 'geospatial_lat_max': '65.6347',
 'geospatial_lon_min': '9.41',
 'geospatial_lon_max': '53.458',
 'geospatial_bounds': 'POLYGON ((9.41 53.458, 31.1667 53.458, 31.1667 65.6347, 9.41 65.6347, 9.41 53.458))',
 'geospatial_vertical_max': '0',
 'geospatial_vertical_min': '-460.0',
 'time_coverage_start': '1984-01-10T00:00:00',
 'time_coverage_end': '2021-12-06T00:00:00',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting d

### Species lookup used for encoding

In [None]:
# NOTE(review): `species_lut` is not assigned at notebook level in the nearby cells
# (only inside `encode`); presumably it comes from an earlier cell -- confirm this
# cell still works after Restart & Run All.
species_lut

{'ABRA BRA': {'id': 271,
  'name': 'Abramis brama',
  'source': 'ABRAMIS BRAMA',
  'status': 'marisco_cdl',
  'match_type': 0},
 'ANGU ANG': {'id': 272,
  'name': 'Anguilla anguilla',
  'source': 'ANGUILLA ANGUILLA',
  'status': 'marisco_cdl',
  'match_type': 0},
 'ARCT ISL': {'id': 273,
  'name': 'Arctica islandica',
  'source': 'ARCTICA ISLANDICA',
  'status': 'marisco_cdl',
  'match_type': 0},
 'ASTE RUB': {'id': 21,
  'name': 'Asterias rubens',
  'source': 'ASTERIAS RUBENS',
  'status': 'marisco_cdl',
  'match_type': 0},
 'CARD EDU': {'id': 152921,
  'name': 'Cardium edule',
  'source': 'CARDIUM EDULE',
  'status': 'superseded combination',
  'match_type': 'exact',
  'unacceptreason': 'original combination'},
 'CH HI;BA': {'id': 399467,
  'name': 'Chara baltica',
  'source': 'CHARA BALTICA',
  'status': 'accepted',
  'match_type': 'exact',
  'unacceptreason': None},
 'CLAD GLO': {'id': 290,
  'name': 'Cladophora glomerata',
  'source': 'CLADOPHORA GLOMERATA',
  'status': 'marisco_c

In [None]:
# Name -> id mapping for every lookup entry with a non-empty name; `encode` builds
# the same mapping for the `species_t` enum.
{info['name']: info['id'] for info in species_lut.values() if info['name'] != ''}

{'Abramis brama': 271,
 'Anguilla anguilla': 272,
 'Arctica islandica': 273,
 'Asterias rubens': 21,
 'Cardium edule': 152921,
 'Chara baltica': 399467,
 'Cladophora glomerata': 290,
 'Clupea harengus': 50,
 'Crangon crangon': 59,
 'Cyprinus carpio': 275,
 'Engraulis encrasicolus': 84,
 'Esox lucius': 269,
 'Fish larvae': 277,
 'Fucus vesiculosus': 96,
 'Furcellaria lumbricalis': 289,
 'Gadus morhua': 99,
 'Gasterosteus aculeatus': 286,
 'Gymnocephalus cernua': 288,
 'Laminaria saccharina': 145730,
 'Limanda limanda': 270,
 'Macoma balthica': 141579,
 'Merlangius merlangus': 139,
 'Mya arenaria': 120,
 'Myoxocephalus scorpius': 278,
 'Mytilus edulis': 129,
 'Osmerus eperlanus': 279,
 'Perca fluviatilis': 247,
 'Plankton': 280,
 'Platichthys flesus': 191,
 'Pleuronectes platessa': 192,
 'Polysiphonia fucoides': 245,
 'Psetta maxima': 154473,
 'Rhodophyta': 282,
 'Rutilus rutilus': 283,
 'Saduria entomon': 284,
 'Scomber scombrus': 244,
 'Solea solea': 397,
 'Sprattus sprattus': 243,
 'S

### Encoding

In [None]:
#| export
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
    """Convert the HELCOM data found at `fname_in` into a NetCDF file.

    Args:
        fname_in: input location passed to `load_data` and `get_maris_species`.
        fname_out: destination file name handed to `NetCDFEncoder`.
        nc_tpl_path: NetCDF template handed to `NetCDFEncoder` as `src_fname`.
        **kwargs: forwarded unchanged to `NetCDFEncoder`.

    Returns:
        The `NetCDFEncoder` instance, after its `encode()` method has run.
    """
    dfs = load_data(fname_in)
    lookup_species = partial(get_maris_species, fname_in, 'species_helcom.pkl')
    callbacks = [
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        NormalizeUncUnitCB(),
        LookupBiotaSpeciesCB(lookup_species),
        LookupBiotaBodyPartCB(get_bodypart),
        LookupSedimentCB(get_sediment),
        LookupUnitCB(),
        RenameColumnCB(),
        ReshapeLongToWide(),
        EncodeTimeCB(cfg()),
        SanitizeLonLatCB(),
    ]
    tfm = Transformer(dfs, cbs=callbacks)

    # Extra enum passed to the encoder: species name -> id, skipping unnamed entries.
    species_t = {}
    for info in lookup_species().values():
        if info['name'] != '':
            species_t[info['name']] = info['id']

    # Run the pipeline before building the attributes: `get_attrs` reads `tfm.logs`.
    dfs_out = tfm()
    attrs = get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw)

    encoder = NetCDFEncoder(dfs_out,
                            src_fname=nc_tpl_path,
                            dest_fname=fname_out,
                            global_attrs=attrs,
                            enums_xtra={'species_t': species_t},
                            **kwargs)
    encoder.encode()
    return encoder

In [None]:
# Run the full conversion; the cell displays the returned `NetCDFEncoder`.
# NOTE(review): dtype/enum lines are still printed with `verbose=False` -- looks
# like leftover debug output somewhere in the encoder; confirm.
encode(fname_in, fname_out, nc_tpl_path(), verbose=False)

uint64
float32
float32
float32
uint64
float32
float32
<class 'netCDF4._netCDF4.EnumType'>: name = 'unit_type_t', numpy dtype = uint64, fields/values ={'NOT AVAILABLE': 0, 'Bq per m3': 1, 'Bq per m2': 2, 'Bq per kg': 3, 'Bq per kgd': 4, 'Bq per kgw': 5, 'kg per kg': 6, 'TU': 7, 'DELTA per mill': 8, 'atom per kg': 9, 'atom per kgd': 10, 'atom per kgw': 11, 'atom per l': 12, 'Bq per kgC': 13}
float32
float32
<class 'netCDF4._netCDF4.EnumType'>: name = 'unit_type_t', numpy dtype = uint64, fields/values ={'NOT AVAILABLE': 0, 'Bq per m3': 1, 'Bq per m2': 2, 'Bq per kg': 3, 'Bq per kgd': 4, 'Bq per kgw': 5, 'kg per kg': 6, 'TU': 7, 'DELTA per mill': 8, 'atom per kg': 9, 'atom per kgd': 10, 'atom per kgw': 11, 'atom per l': 12, 'Bq per kgC': 13}
float32
<class 'netCDF4._netCDF4.EnumType'>: name = 'unit_type_t', numpy dtype = uint64, fields/values ={'NOT AVAILABLE': 0, 'Bq per m3': 1, 'Bq per m2': 2, 'Bq per kg': 3, 'Bq per kgd': 4, 'Bq per kgw': 5, 'kg per kg': 6, 'TU': 7, 'DELTA per mill': 8,

<marisco.serializers.NetCDFEncoder at 0x7f4b56ebe4c0>