In [None]:
#| default_exp handlers.helcom

# HELCOM

> This data pipeline, known as a "handler" in Marisco terminology, is designed to clean, standardize, and encode [HELCOM data](https://helcom.fi/about-us) into `NetCDF` format. The handler processes raw HELCOM data, applying various transformations and lookups to align it with `MARIS` data standards.

Key functions of this handler:

- **Cleans** and **normalizes** raw HELCOM data
- **Applies standardized nomenclature** and units
- **Encodes the processed data** into `NetCDF` format compatible with MARIS requirements

This handler is a crucial component in the Marisco data processing workflow, ensuring HELCOM data is properly integrated into the MARIS database.



Note: *Additionally, a decoder [link] is provided to process data from `NetCDF` to a `.csv` format compatible with the MARIS master database. This feature is maintained for legacy purposes, as data ingestion was previously performed using OpenRefine.*

:::{.callout-tip}

For new MARIS users, please refer to [Understanding MARIS Data Formats (NetCDF and Open Refine)](https://github.com/franckalbinet/marisco/tree/main/install_configure_guide) for detailed information.

:::

This notebook aims to be an instance of [Literate Programming](https://www.wikiwand.com/en/articles/Literate_programming) in the sense that it is a narrative in which code snippets are interspersed with explanations. When a function or a class needs to be exported to a dedicated Python module (in our case `marisco/handlers/helcom.py`), the code snippet is added to the module using `#| exports`, as provided by the wonderful [nbdev](https://nbdev.readthedocs.io/en/latest/) library.

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import pandas as pd 
import numpy as np
#from functools import partial 
import fastcore.all as fc 
from pathlib import Path 
#from dataclasses import asdict
from typing import List, Dict, Callable, Tuple, Any 
from collections import OrderedDict, defaultdict
import re
from functools import partial

from marisco.utils import (
    #has_valid_varname, 
    #match_worms, 
    Remapper, 
    ddmm_to_dd,
    #match_maris_lut, 
    Match, 
    get_unique_across_dfs
)

from marisco.callbacks import (
    Callback, 
    Transformer, 
    EncodeTimeCB, 
    AddSampleTypeIdColumnCB,
    AddNuclideIdColumnCB, 
    LowerStripNameCB, 
    SanitizeLonLatCB, 
    #ReshapeLongToWide, 
    CompareDfsAndTfmCB, 
    RemapCB
)

from marisco.metadata import (
    GlobAttrsFeeder, 
    BboxCB, 
    DepthRangeCB, 
    TimeRangeCB, 
    ZoteroCB, 
    KeyValuePairCB
)

from marisco.configs import (
    nuc_lut_path, 
    nc_tpl_path, 
    cfg, 
    #cache_path, 
    #cdl_cfg, 
    Enums, 
    lut_path, 
    species_lut_path, 
    sediments_lut_path, 
    bodyparts_lut_path, 
    detection_limit_lut_path, 
    filtered_lut_path, 
    #area_lut_path, 
    get_lut, 
    unit_lut_path,
    base_path, # not needed here, included to troubleshoot cdl_cfg
    prepmet_lut_path,
    sampmet_lut_path,
    counmet_lut_path
)

from marisco.encoders import (
    NetCDFEncoder, 
    OpenRefineCsvEncoder
)

import warnings
warnings.filterwarnings('ignore')

In [None]:
#| hide
pd.set_option('display.max_rows', 100)

## Configuration & file paths

- **fname_in**: path to the folder containing the HELCOM data in CSV format. The path can be defined as a relative path. 

- **fname_out_nc**: path and filename for the NetCDF output. The path can be defined as a relative path. 

- **Zotero key**: used to retrieve attributes related to the dataset from [Zotero](https://www.zotero.org/). The MARIS datasets include a [library](https://maris.iaea.org/datasets) available on [Zotero](https://www.zotero.org/groups/2432820/maris/library). 

- **ref_id**: refers to the location in Archive of the Zotero library.


In [None]:
# | exports
fname_in = '../../_data/accdb/mors/csv'
fname_out_nc = '../../_data/output/100-HELCOM-MORS-2024.nc'
zotero_key ='26VMZZ2Q' # HELCOM MORS zotero key
ref_id = 100 # HELCOM MORS reference id as defined by MARIS

## Load data

[Helcom MORS (Monitoring of Radioactive Substances in the Baltic Sea) data](https://helcom.fi/about-us) is provided as a Microsoft Access database. 
[`Mdbtools`](https://github.com/mdbtools/mdbtools) can be used to convert the tables of the Microsoft Access database to `.csv` files on Unix-like OS.

**Example steps**:


1. [Download data](https://metadata.helcom.fi/geonetwork/srv/fin/catalog.search#/metadata/2fdd2d46-0329-40e3-bf96-cb08c7206a24)

2. Install mdbtools via VScode Terminal: 

    ```
    sudo apt-get -y install mdbtools
    ```

3. Install unzip via VScode Terminal:

    ```
    sudo apt-get -y install unzip
    ```

4. In `VS Code` terminal (for instance), navigate to the marisco data folder:

    ```
    cd /home/marisco/downloads/marisco/_data/accdb/mors_19840101_20211231
    ```

5. Unzip `MORS_ENVIRONMENT.zip`:

    ```
    unzip MORS_ENVIRONMENT.zip 
    ```

6. Run `preprocess.sh` to generate the required data files:

    ```
    ./preprocess.sh MORS_ENVIRONMENT.zip
    ```

7. Content of `preprocess.sh` script:

    ```
    #!/bin/bash

    # Example of use: ./preprocess.sh MORS_ENVIRONMENT.zip
    unzip $1
    dbname=$(ls *.accdb)
    mkdir csv
    for table in $(mdb-tables -1 "$dbname"); do
        echo "Export table $table"
        mdb-export "$dbname" "$table" > "csv/$table.csv"
    done
    ```

Once converted to `.csv` files, the data is ready to be loaded into a dictionary of dataframes.
    

In [None]:
#| exports
# Maps the prefix of the HELCOM CSV file names (e.g. `BIO01.csv`, `BIO02.csv`)
# to the sample type used as key of the dictionary returned by `load_data`.
default_smp_types = {  
    'BIO': 'BIOTA', 
    'SEA': 'SEAWATER', 
    'SED': 'SEDIMENT'
}

In [None]:
#| exports
def load_data(src_dir: str|Path, 
              smp_types: dict = default_smp_types 
             ) -> Dict[str, pd.DataFrame]: 
    "Read the HELCOM CSV files and return one merged dataframe per sample type, keyed by sample type."
    base_dir = Path(src_dir)

    def load_and_merge(file_prefix: str) -> pd.DataFrame:
        # `<prefix>02.csv` is loaded as the measurement table and `<prefix>01.csv` as the
        # sample table; a missing file is reported and yields an empty dataframe.
        try:
            measurements = pd.read_csv(base_dir / f'{file_prefix}02.csv')
            samples = pd.read_csv(base_dir / f'{file_prefix}01.csv')
        except FileNotFoundError as e:
            print(f"Error loading files for {file_prefix}: {e}")
            return pd.DataFrame()
        # Left join: every measurement row is kept and completed with its sample information
        return pd.merge(measurements, samples, on='KEY', how='left')

    loaded = {}
    for file_prefix, smp_type in smp_types.items():
        loaded[smp_type] = load_and_merge(file_prefix)
    return loaded

`dfs` is a dictionary of dataframes created from the Helcom dataset located at the path `fname_in`. The data is split by sample type: each dataframe is stored under a key equal to its sample type. 

In [None]:
dfs = load_data(fname_in)
print('keys/sample types: ', dfs.keys())
# List the columns available for each sample type
for smp_type, df_smp_type in dfs.items():
    print(f'{smp_type} columns: ', df_smp_type.columns)

keys/sample types:  dict_keys(['BIOTA', 'SEAWATER', 'SEDIMENT'])
BIOTA columns:  Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'BASIS',
       'ERROR%', 'NUMBER', 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY',
       'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY', 'STATION',
       'LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm',
       'LONGITUDE dddddd', 'SDEPTH', 'RUBIN', 'BIOTATYPE', 'TISSUE', 'NO',
       'LENGTH', 'WEIGHT', 'DW%', 'LOI%', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN',
       'DATE_OF_ENTRY_y'],
      dtype='object')
SEAWATER columns:  Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/m³', 'VALUE_Bq/m³', 'ERROR%_m³',
       'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR',
       'MONTH', 'DAY', 'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',
       'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'TDEPTH', 'SDEPTH', 'SALIN',
       'TTEMP', 'FILT', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'DATE_OF_ENTRY_y'],
      dtype='object')
SEDIMEN

## Add sample type column (REMOVE STEP)

:::{.callout-tip}

**TODO**: The `samptype_id` column is added to the dataframe for legacy reasons (again Open Refine output). Soon we will use a 'decoder' to replace the open refine encoder and the openrefine csv will be created from the netcdf file. 

:::

The sample types (`SEAWATER`, `BIOTA`, `SEDIMENT`, ...) are encoded group names in the NetCDF file produced.

To maintain compatibility with legacy systems, where we create a `csv` and parse using OpenRefine, sample type IDs are included in each DataFrame. This is achieved using the `AddSampleTypeIdColumnCB` callback.

In [None]:
'''
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(tfm.dfs['SEAWATER'][['KEY', 'samptype_id']].head())
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
'''

"\n#| eval: false\ndfs = load_data(fname_in)\ntfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),\n                            CompareDfsAndTfmCB(dfs)\n                            ])\ntfm()\nprint(tfm.dfs['SEAWATER'][['KEY', 'samptype_id']].head())\nprint(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')\n"

## Normalize nuclide names

### Lower & strip nuclide names

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: Some nuclide names contain one or multiple trailing spaces.

:::

This is demonstrated below for the `NUCLIDE` column:

In [None]:
#| eval: false
df = get_unique_across_dfs(load_data(fname_in), 'NUCLIDE', as_df=True, include_nchars=True)
df['stripped_chars'] = df['value'].str.strip().str.replace(' ', '').str.len()
print(df[df['n_chars'] != df['stripped_chars']])

    index      value  n_chars  stripped_chars
1       1   CS134           8               5
7       7   AM241           8               5
9       9   CS137           8               5
29     29   CO60            8               4
32     32     SR90          6               4
39     39      SR90         5               4
56     56   SR90            8               4
67     67    SR90           7               4
75     75     CS137         6               5
77     77   K40             8               3
85     85  CS137            9               5
88     88    TC99           7               4
90     90   PU238           8               5


To fix this issue, we use the `LowerStripNameCB` callback. For each dataframe in the dictionary of dataframes, it corrects the nuclide name by converting it to lowercase and stripping any leading or trailing whitespace.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE')])

# Run the transformer once and reuse its result: calling `tfm()` in the loop header
# and again inside the loop body would re-apply every callback for each sample type.
dfs_stripped = tfm()
for key, df_stripped in dfs_stripped.items():
    print(f'{key} nuclides: ')
    print(df_stripped['NUCLIDE'].unique())

BIOTA nuclides: 
['cs134' 'k40' 'co60' 'cs137' 'sr90' 'ag108m' 'mn54' 'co58' 'ag110m'
 'zn65' 'sb125' 'pu239240' 'ru106' 'be7' 'ce144' 'pb210' 'po210' 'sb124'
 'sr89' 'zr95' 'te129m' 'ru103' 'nb95' 'ce141' 'la140' 'i131' 'ba140'
 'pu238' 'u235' 'bi214' 'pb214' 'pb212' 'tl208' 'ac228' 'ra223' 'eu155'
 'ra226' 'gd153' 'sn113' 'fe59' 'tc99' 'co57' 'sn117m' 'eu152' 'sc46'
 'rb86' 'ra224' 'th232' 'cs134137' 'am241' 'ra228' 'th228' 'k-40' 'cs138'
 'cs139' 'cs140' 'cs141' 'cs142' 'cs143' 'cs144' 'cs145' 'cs146']
SEAWATER nuclides: 
['cs137' 'sr90' 'h3' 'cs134' 'pu238' 'pu239240' 'am241' 'cm242' 'cm244'
 'tc99' 'k40' 'ru103' 'sr89' 'sb125' 'nb95' 'ru106' 'zr95' 'ag110m'
 'cm243244' 'ba140' 'ce144' 'u234' 'u238' 'co60' 'pu239' 'pb210' 'po210'
 'np237' 'pu240' 'mn54']
SEDIMENT nuclides: 
['ra226' 'cs137' 'ra228' 'k40' 'sr90' 'cs134137' 'cs134' 'pu239240'
 'pu238' 'co60' 'ru103' 'ru106' 'sb125' 'ag110m' 'ce144' 'am241' 'be7'
 'th228' 'pb210' 'co58' 'mn54' 'zr95' 'ba140' 'po210' 'ra224' 'nb95'
 'p

### Remap nuclide names to MARIS data formats

Below, we map nuclide names used by HELCOM to the MARIS standard nuclide names. 

Remapping data provider nomenclatures to MARIS standards is a recurrent operation and is done in a semi-automated manner according to the following pattern:

1. **Inspect** data provider nomenclature:
2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); 
3. **Fix** potential mismatches; 
4. **Apply** the lookup table to the dataframe.

We will refer to this process as **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply).

The `get_unique_across_dfs` function is a utility in MARISCO that retrieves unique values from a specified column across all DataFrames. 
Note that there is one DataFrame for each sample type, such as biota, sediment, etc.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE')])

dfs_output = tfm()

get_unique_across_dfs(dfs_output, col_name='NUCLIDE', as_df=True).T # Transpose to display the dataframe horizontally

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
index,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
value,pu241,bi214,mn54,pb212,po210,ra224,zr95,gd153,th232,cm242,...,eu155,th228,ba140,h3,u234,cs134,co58,pu239,bi212,sr90


Let's now create an instance of a [fuzzy matching algorithm](https://www.wikiwand.com/en/articles/Approximate_string_matching) `Remapper`. This instance will match the nuclide names of the HELCOM dataset to the MARIS standard nuclide names.

In [None]:
#| eval: false
remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs_output, col_name='NUCLIDE', as_df=True),
                    maris_lut_fn=nuc_lut_path,
                    maris_col_id='nuclide_id',
                    maris_col_name='nc_name',
                    provider_col_to_match='value',
                    provider_col_key='value',
                    fname_cache='nuclides_helcom.pkl')

Let's try to match HELCOM nuclide names to MARIS standard nuclide names as automatically as possible. The `match_score` column allows us to assess the results:

In [None]:
#| eval: false
remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=1, verbose=True)

Processing:   0%|          | 0/77 [00:00<?, ?it/s]

Processing: 100%|██████████| 77/77 [00:02<00:00, 27.59it/s]

63 entries matched the criteria, while 14 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pu239240,pu239,pu239240,3
pu238240,pu240,pu238240,3
cs134137,cs137,cs134137,3
cm243244,cm242,cm243244,3
cs145,ce140,cs145,2
cs142,ce140,cs142,2
cs143,ce140,cs143,2
cs141,ce141,cs141,1
cs139,ce139,cs139,1
k-40,k40,k-40,1


We can now manually inspect the unmatched nuclide names and create a table to correct them to the MARIS standard:

In [None]:
#| exports
# Manual corrections passed as `fixes` to `Remapper.generate_lookup_table`:
# HELCOM nuclide name (lowercased and stripped) -> MARIS nuclide name.
# NOTE(review): 'cs138' to 'cs146' are all mapped to 'cs137'; presumably these are
# data-entry artefacts of 'cs137' in the source data - confirm with the data provider.
fixes_nuclide_names = {
    'cs134137': 'cs134_137_tot',
    'cm243244': 'cm243_244_tot',
    'pu239240': 'pu239_240_tot',
    'pu238240': 'pu238_240_tot',
    'cs143': 'cs137',
    'cs145': 'cs137',
    'cs142': 'cs137',
    'cs141': 'cs137',
    'cs144': 'cs137',
    'k-40': 'k40',
    'cs140': 'cs137',
    'cs146': 'cs137',
    'cs139': 'cs137',
    'cs138': 'cs137'
    }

We now include the table `fixes_nuclide_names`, which applies manual corrections to the nuclide names before the remapping process. 
The `generate_lookup_table` function has an `overwrite` parameter (default is `True`), which, when set to `True`, creates a pickle file cache of the lookup table. We can now test the remapping process:

In [None]:
#| eval: false
remapper.generate_lookup_table(as_df=True, fixes=fixes_nuclide_names)
fc.test_eq(len(remapper.select_match(match_score_threshold=1, verbose=True)), 0)

Processing:   0%|          | 0/77 [00:00<?, ?it/s]

Processing: 100%|██████████| 77/77 [00:03<00:00, 23.71it/s]

77 entries matched the criteria, while 0 entries had a match score of 1 or higher.





Test passes! We can now create a callback `RemapNuclideNameCB` to remap the nuclide names. Note that we pass `overwrite=False` to the `generate_lookup_table` method so that the cached version is now used.


In [None]:
#| exports
# Create a lookup table for nuclide names
def lut_nuclides(df: pd.DataFrame # Unique provider nuclide names (column `value`)
                ):
    "Return the lookup table matching HELCOM nuclide names to MARIS nuclide names."
    remapper = Remapper(provider_lut_df=df,
                        maris_lut_fn=nuc_lut_path,
                        maris_col_id='nuclide_id',
                        maris_col_name='nc_name',
                        provider_col_to_match='value',
                        provider_col_key='value',
                        fname_cache='nuclides_helcom.pkl')
    # `overwrite=False`: reuse the lookup table cached in `nuclides_helcom.pkl`
    # instead of regenerating it.
    return remapper.generate_lookup_table(fixes=fixes_nuclide_names, 
                                          as_df=False, overwrite=False)

We now create the callback `RemapNuclideNameCB`, which will remap the nuclide names using the `lut_nuclides` lookup table.

In [None]:
#| exports
class RemapNuclideNameCB(Callback):
    "Replace the data provider nuclide names of the `NUCLIDE` column with MARIS nuclide names."
    def __init__(self, 
                 fn_lut: Callable # Function that returns the lookup table dictionary
                ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        # The lookup table is built from the nuclide names found across all sample types
        provider_names = get_unique_across_dfs(tfm.dfs, col_name='NUCLIDE', as_df=True)
        matches = self.fn_lut(provider_names)
        name_map = {src_name: match.matched_maris_name for src_name, match in matches.items()}
        for df in tfm.dfs.values():
            df['NUCLIDE'] = df['NUCLIDE'].replace(name_map)

Let's see it in action, along with the `LowerStripNameCB` callback:

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE'),
                            RemapNuclideNameCB(lut_nuclides)
                            ])
dfs_out = tfm()

# For instance
dfs_out['BIOTA'].NUCLIDE.unique()

array(['cs134', 'k40', 'co60', 'cs137', 'sr90', 'ag108m', 'mn54', 'co58',
       'ag110m', 'zn65', 'sb125', 'pu239_240_tot', 'ru106', 'be7',
       'ce144', 'pb210', 'po210', 'sb124', 'sr89', 'zr95', 'te129m',
       'ru103', 'nb95', 'ce141', 'la140', 'i131', 'ba140', 'pu238',
       'u235', 'bi214', 'pb214', 'pb212', 'tl208', 'ac228', 'ra223',
       'eu155', 'ra226', 'gd153', 'sn113', 'fe59', 'tc99', 'co57',
       'sn117m', 'eu152', 'sc46', 'rb86', 'ra224', 'th232',
       'cs134_137_tot', 'am241', 'ra228', 'th228'], dtype=object)

### Add Nuclide Id column (REMOVE STEP)

:::{.callout-tip}

**TODO**: The `nuclide_id` column is added to the dataframe for legacy reasons (again Open Refine output). Soon we will use a 'decoder' to replace the open refine encoder and the openrefine csv will be created from the netcdf file. 

:::

The `nuclide_id` column is added to the dataframe for legacy reasons (again Open Refine output).

In [None]:
'''
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE'),
                            RemapNuclideNameCB(lut_nuclides),
                            AddNuclideIdColumnCB(col_value='NUCLIDE')
                            ])
dfs_out = tfm()

# For instance
dfs_out['biota'][['NUCLIDE', 'nuclide_id']]
'''

"\n#| eval: false\ndfs = load_data(fname_in)\ntfm = Transformer(dfs, cbs=[LowerStripNameCB(col_src='NUCLIDE'),\n                            RemapNuclideNameCB(lut_nuclides),\n                            AddNuclideIdColumnCB(col_value='NUCLIDE')\n                            ])\ndfs_out = tfm()\n\n# For instance\ndfs_out['biota'][['NUCLIDE', 'nuclide_id']]\n"

## Standardize Time

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: Time/date is provided in the `DATE`, `YEAR`, `MONTH`, `DAY` columns. Note that the `DATE` column contains missing values, as indicated below. When missing, we fall back on the `YEAR`, `MONTH`, `DAY` columns. Note also that `DAY` and `MONTH` sometimes contain 0. In this case, we systematically set them to 1.

:::

In [None]:
#| eval: false
dfs = load_data(fname_in)
for key in dfs.keys():
    print(f'{key} DATE null values: ', dfs[key]['DATE'].isna().sum())

BIOTA DATE null values:  84
SEAWATER DATE null values:  494
SEDIMENT DATE null values:  741


In [None]:
#| exports
class ParseTimeCB(Callback):
    "Parse and standardize time information in the dataframe."
    def __call__(self, tfm: Transformer):
        for df in tfm.dfs.values():
            self._process_dates(df)
            # self._define_beg_period(df) # REMOVE STEP - we will use an OPEN REFINE decoder to replace the open refine encoder. 

    def _process_dates(self, df: pd.DataFrame) -> None:
        "Create the `TIME` column from `DATE`, falling back on `YEAR`, `MONTH`, `DAY` where `DATE` cannot be used."
        df['TIME'] = self._parse_date(df)
        # DAY/MONTH must be cleaned before they are used to fill the missing `TIME` values
        self._handle_missing_dates(df)
        self._fill_missing_time(df)

    def _parse_date(self, df: pd.DataFrame) -> pd.Series:
        "Parse the `DATE` column; return an all-`NaT` series when the column is absent."
        if 'DATE' not in df.columns:
            # No `DATE` column: leave everything missing so that `_fill_missing_time`
            # falls back on the YEAR, MONTH and DAY columns.
            return pd.Series(pd.NaT, index=df.index, dtype='datetime64[ns]')
        # Values that do not match the expected format are coerced to `NaT`
        return pd.to_datetime(df['DATE'], format='%m/%d/%y %H:%M:%S', errors='coerce')

    def _handle_missing_dates(self, df: pd.DataFrame):
        "Set `DAY`/`MONTH` equal to 0 to 1, and default both to 1 when both are missing but `YEAR` is known."
        df.loc[df["DAY"] == 0, "DAY"] = 1
        df.loc[df["MONTH"] == 0, "MONTH"] = 1
        
        # Only rows where DAY *and* MONTH are both missing are defaulted
        missing_day_month = (df["DAY"].isna()) & (df["MONTH"].isna()) & (df["YEAR"].notna())
        df.loc[missing_day_month, ["DAY", "MONTH"]] = 1

    def _fill_missing_time(self, df: pd.DataFrame) -> None:
        "Fill missing time values using YEAR, MONTH, and DAY columns."
        missing_time = df['TIME'].isna()
        # Dates that cannot be assembled from YEAR/MONTH/DAY remain `NaT` (errors='coerce')
        df.loc[missing_time, 'TIME'] = pd.to_datetime(
            df.loc[missing_time, ['YEAR', 'MONTH', 'DAY']], 
            format='%Y%m%d', 
            errors='coerce'
        )
        
    # REMOVE STEP - we will use an OPEN REFINE decoder to replace the open refine encoder. 
    # def _define_beg_period(self, df: pd.DataFrame) -> None:
    #     "Create a standardized date representation for Open Refine."
    #     df['begperiod'] = df['TIME']

Apply the transformer for callbacks `ParseTimeCB`. Then, print the `TIME` data for `seawater`.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[ParseTimeCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['SEAWATER'][['TIME']])

                           BIOTA  SEAWATER  SEDIMENT
Number of rows in dfs      14893     20318     37347
Number of rows in tfm.dfs  14893     20318     37347
Number of rows removed         0         0         0 

            TIME
0     2012-05-23
1     2012-05-23
2     2012-06-17
3     2012-05-24
4     2012-05-24
...          ...
20313 2015-06-22
20314 2015-06-23
20315 2015-06-23
20316 2015-06-24
20317 2015-06-24

[20318 rows x 1 columns]


The NetCDF time format requires that time be encoded as the number of milliseconds since a specified origin. In our case, the origin is `1970-01-01`, as indicated in the `cdl.toml` file under the `[vars.defaults.time.attrs]` section.

`EncodeTimeCB` converts the HELCOM `time` format to the MARIS NetCDF `time` format.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[ParseTimeCB(),
                            EncodeTimeCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
                            

                           BIOTA  SEAWATER  SEDIMENT
Number of rows in dfs      14893     20318     37347
Number of rows in tfm.dfs  14893     20318     37346
Number of rows removed         0         0         1 



## Sanitize value

We allocate each column containing measurement values (named differently across sample types) into a single column `VALUE` and remove NA where needed.

In [None]:
#| exports
# For each sample type, name of the provider column holding the measured value.
# `SanitizeValue` reads it through the 'VALUE' key and copies it to the `VALUE` column.
coi_val = {'SEAWATER' : {'VALUE': 'VALUE_Bq/m³'},
           'BIOTA':  {'VALUE': 'VALUE_Bq/kg'},
           'SEDIMENT': {'VALUE': 'VALUE_Bq/kg'}}


In [None]:
#| exports
class SanitizeValue(Callback):
    "Drop rows without a measurement and copy the measurement into a common `VALUE` column."
    def __init__(self, 
                 coi: Dict[str, Dict[str, str]] # Columns of interest. Format: {group_name: {'VALUE': 'column_name'}}
                 ): 
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        for grp in tfm.dfs:
            frame, src_col = tfm.dfs[grp], self.coi[grp]['VALUE']
            # Done in place so that the dataframe held by the transformer is the one modified
            frame.dropna(subset=[src_col], inplace=True)
            frame['VALUE'] = frame[src_col]

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[SanitizeValue(coi_val),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')

                           BIOTA  SEAWATER  SEDIMENT
Number of rows in dfs      14893     20318     37347
Number of rows in tfm.dfs  14873     20242     37090
Number of rows removed        20        76       257 



## Normalize uncertainty

Function `unc_rel2stan` converts uncertainty from relative uncertainty to standard uncertainty.

In [None]:
#| exports
def unc_rel2stan(
    df: pd.DataFrame, # DataFrame containing measurement and uncertainty columns
    meas_col: str, # Name of the column with measurement values
    unc_col: str # Name of the column with relative uncertainty values (percentages)
) -> pd.Series: # Series with calculated absolute uncertainties
    "Convert relative uncertainty (in %) to absolute uncertainty."
    # Column-wise arithmetic gives the same values as a row-by-row `apply` without
    # the per-row Python overhead; missing values propagate as NaN.
    return df[unc_col] * df[meas_col] / 100

For each sample type in the Helcom dataset, the `UNCERTAINTY` is provided as a relative uncertainty. The column names for both the `VALUE` and the `UNCERTAINTY` vary by sample type. The `coi_units_unc` dictionary defines the column names for the `VALUE` and `UNCERTAINTY` for each sample type.

In [None]:
#| exports
# Columns of interest, one tuple per sample type:
# (sample type, measurement value column, relative uncertainty (%) column)
coi_units_unc = [('SEAWATER', 'VALUE_Bq/m³', 'ERROR%_m³'),
                 ('BIOTA', 'VALUE_Bq/kg', 'ERROR%'),
                 ('SEDIMENT', 'VALUE_Bq/kg', 'ERROR%_kg')]


NormalizeUncCB callback normalizes the ``UNCERTAINTY`` by converting from relative uncertainty to standard uncertainty. 

In [None]:
#| exports
class NormalizeUncCB(Callback):
    "Compute the `UNCERTAINTY` column from the relative error (%) reported by the data provider."
    def __init__(self, 
                 fn_convert_unc: Callable=unc_rel2stan, # Function converting relative uncertainty to absolute uncertainty
                 coi: List[Tuple[str, str, str]]=coi_units_unc # List of columns of interest
                ):
        fc.store_attr()
    
    def __call__(self, tfm: Transformer):
        for grp, val_col, unc_col in self.coi:
            # Sample types that are not loaded are simply skipped
            if grp not in tfm.dfs:
                continue
            tfm.dfs[grp]['UNCERTAINTY'] = self.fn_convert_unc(tfm.dfs[grp], val_col, unc_col)

Apply the transformer for callback ``NormalizeUncCB``. Then, print the value (i.e. activity per unit ) and standard uncertainty for each sample type.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[NormalizeUncCB(),
                            SanitizeValue(coi_val)])
tfm()
print(tfm.dfs['SEAWATER'][['VALUE', 'UNCERTAINTY']][:5])
print(tfm.dfs['BIOTA'][['VALUE', 'UNCERTAINTY']][:5])
print(tfm.dfs['SEDIMENT'][['VALUE', 'UNCERTAINTY']][:5])

   VALUE  UNCERTAINTY
0    5.3        1.696
1   19.9        3.980
2   25.5        5.100
3   17.0        4.930
4   22.2        3.996
        VALUE  UNCERTAINTY
0    0.010140          NaN
1  135.300000     4.830210
2    0.013980          NaN
3    4.338000     0.150962
4    0.009614          NaN
   VALUE  UNCERTAINTY
0   35.0         9.10
1   36.0         7.92
2   38.0         9.12
3   36.0         9.00
4   30.0         6.90


## Remap Biota species

In the following processing steps, we will use the same approach as described for remapping nuclide names.

First, let's **inspect** the `RUBIN_NAME.csv` file provided by HELCOM, which describes the nomenclature of biota species.

In [None]:
#| eval: false
pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv').head()

Unnamed: 0,RUBIN_ID,RUBIN,SCIENTIFIC NAME,ENGLISH NAME
0,11,ABRA BRA,ABRAMIS BRAMA,BREAM
1,12,ANGU ANG,ANGUILLA ANGUILLA,EEL
2,13,ARCT ISL,ARCTICA ISLANDICA,ISLAND CYPRINE
3,14,ASTE RUB,ASTERIAS RUBENS,COMMON STARFISH
4,15,CARD EDU,CARDIUM EDULE,COCKLE


Now we try to **MATCH** the `SCIENTIFIC NAME` column of HELCOM biota dataset to the `species` column of the MARIS nomenclature, again using a `Remapper` object:

**DISCUSS**: I updated the species_lut_path function to accept a filename as an argument. This will allow us to pass updated look up tables to the remapper. However I am not sure if this is the best way to handle this. We would like control over the lookup table contents and so the lookup table files should be standardized (e.g. dbo_species.xlsx) and only the approved ones should be used. Can we discuss this?

**NOTE**: I included 'from functools import partial'

In [None]:
#| eval: false
remapper = Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv'),
                    maris_lut_fn=partial(species_lut_path, 'dbo_species_2024-11-19.xlsx'),
                    maris_col_id='species_id',
                    maris_col_name='species',
                    provider_col_to_match='SCIENTIFIC NAME',
                    provider_col_key='RUBIN',
                    fname_cache='species_helcom.pkl'
                    )

remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=1, verbose=True)

Processing:   0%|          | 0/43 [00:00<?, ?it/s]

Processing: 100%|██████████| 43/43 [00:08<00:00,  5.08it/s]

35 entries matched the criteria, while 8 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
STIZ LUC,Sander lucioperca,STIZOSTEDION LUCIOPERCA,10
LAMI SAC,Laminaria japonica,LAMINARIA SACCHARINA,7
CARD EDU,Cardiidae,CARDIUM EDULE,6
CH HI;BA,Macoma balthica,CHARA BALTICA,6
ENCH CIM,Echinodermata,ENCHINODERMATA CIM,5
PSET MAX,Pinctada maxima,PSETTA MAXIMA,5
MACO BAL,Macoma balthica,MACOMA BALTICA,1
STUC PEC,Stuckenia pectinata,STUCKENIA PECTINATE,1


Below, we will correct the entries that were not properly matched by the `Remapper` object:

In [None]:
#| exports
# Manual corrections passed as `fixes` to `Remapper.generate_lookup_table`:
# HELCOM `SCIENTIFIC NAME` -> species name to match against the MARIS nomenclature.
fixes_biota_species = {
    'CHARA BALTICA': 'NOT AVAILABLE', # CHARA BALTICA (RUBIN: CH HI;BA) is not listed in the biota data. 
    'CARDIUM EDULE': 'Cerastoderma edule',
    'LAMINARIA SACCHARINA': 'Saccharina latissima',
    'PSETTA MAXIMA': 'Scophthalmus maximus',
    'STIZOSTEDION LUCIOPERCA': 'Sander lucioperca'} # MARIS spelling is 'Sander lucioperca' (no trailing 's')

And give the ``remapper`` another try:

In [None]:
#| eval: false
remapper.generate_lookup_table(fixes=fixes_biota_species)
remapper.select_match(match_score_threshold=1, verbose=True)

Processing:   0%|          | 0/43 [00:00<?, ?it/s]

Processing: 100%|██████████| 43/43 [00:07<00:00,  5.57it/s]

39 entries matched the criteria, while 4 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENCH CIM,Echinodermata,ENCHINODERMATA CIM,5
MACO BAL,Macoma balthica,MACOMA BALTICA,1
STIZ LUC,Sander lucioperca,STIZOSTEDION LUCIOPERCA,1
STUC PEC,Stuckenia pectinata,STUCKENIA PECTINATE,1


***

include tooltip 

 #### CHARA BALTICA review

**NOTE**: CHARA BALTICA is a species that is not in the MARIS nomenclature. 'The species is frequent in the Baltic Sea, and is elsewhere found in coastal lagoons and fiords' (M.D. Guiry in Guiry).


M.D. Guiry in Guiry, M.D. & Guiry, G.M. 31 December 2020. AlgaeBase. World-wide electronic publication, National University of Ireland, Galway. https://www.algaebase.org; searched on 20 November 2024



The corresponding RUBIN code, `'CH HI;BA'`, does not appear in the HELCOM biota dataset:

In [None]:
dfs['BIOTA']['RUBIN'].unique()

array(['GADU MOR', 'SPRA SPR', 'CLUP HAR', 'MERL MNG', 'LIMA LIM',
       'PLEU PLA', 'PLAT FLE', 'SADU ENT', 'ENGR ENC', 'ESOX LUC',
       'MACO BAL', 'FUCU VES', 'ZOAR VIV', 'OSME EPE', 'MYOX SCO',
       'GYMN CER', 'GAST ACU', 'SCOM SCO', 'MYTI EDU', 'CYPR CAR',
       'ABRA BRA', 'STIZ LUC', 'RUTI RUT', 'PERC FLU', 'MYA ARE',
       'CRAN CRA', 'PLANKTON', 'CARD EDU', 'ARCT ISL', 'CLAD GLO',
       'FURC LUM', 'ANGU ANG', 'FISHLARVAE', 'ENCH CIM', 'ASTE RUB',
       'RHODOPHY', 'LAMI SAC', 'PSET MAX', 'GADU MOR  ', 'POLY FUC',
       'STUC PEC', 'ZANN PALU'], dtype=object)

Other unused RUBIN:

In [None]:
# RUBIN codes actually present in the biota measurements
unique_rubin = dfs['BIOTA']['RUBIN'].unique()
unique_rubin_set = set(unique_rubin)
# RUBIN codes declared in the HELCOM lookup table
rubin_lut = list(pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')['RUBIN'])
# Keep the lookup-table order while dropping every code seen in the data
unused_rubins = list(filter(lambda code: code not in unique_rubin_set, rubin_lut))
print("Unused RUBIN names:", unused_rubins)

Unused RUBIN names: ['CH HI;BA', 'SOLE SOL']


**DISCUSS**: How should we handle these unused RUBIN names? 

***

Visual inspection of the remaining imperfectly matched entries shows that they are acceptable, so we can proceed. 

We can now use the generic `RemapCB` callback to remap the `RUBIN` column to the `SPECIES` column, after having defined the lookup table `lut_biota`.

In [None]:
#| exports
def lut_biota():
    "Return the lookup table generated by `Remapper` for HELCOM `RUBIN` codes against the MARIS species nomenclature, applying `fixes_biota_species`."
    rubin_names = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')
    species_remapper = Remapper(
        provider_lut_df=rubin_names,
        maris_lut_fn=species_lut_path,
        maris_col_id='species_id',
        maris_col_name='species',
        provider_col_to_match='SCIENTIFIC NAME',
        provider_col_key='RUBIN',
        fname_cache='species_helcom.pkl',
    )
    return species_remapper.generate_lookup_table(
        fixes=fixes_biota_species, as_df=False, overwrite=False)

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='RUBIN', dest_grps='BIOTA')
    ])
tfm()
# Unique MARIS species ids assigned to the `BIOTA` group
print(tfm.dfs['BIOTA']['SPECIES'].unique())

[  99  243   50  139  270  192  191  284   84  269  122   96  287  279
  278  288  286  244  129  275  271  285  283  247  120   59  280  274
  273  290  289  272  277  276   21  282  110  281  245  704 1524]


## Remap Biota tissues
Let's inspect the `TISSUE.csv` file provided by HELCOM, which describes the tissue nomenclature. Biota tissue is known as `body part` in the MARIS dataset.

In [None]:
#| eval: false
pd.read_csv(Path(fname_in) / 'TISSUE.csv').head()

Unnamed: 0,TISSUE,TISSUE_DESCRIPTION
0,1,WHOLE FISH
1,2,WHOLE FISH WITHOUT ENTRAILS
2,3,WHOLE FISH WITHOUT HEAD AND ENTRAILS
3,4,FLESH WITH BONES
4,5,FLESH WITHOUT BONES (FILETS)


In [None]:
#| eval: false
# Match HELCOM tissue descriptions against the MARIS body part nomenclature.
# The lookup file is read from `fname_in`, like the other HELCOM lookup tables.
remapper = Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'TISSUE.csv'),
                    maris_lut_fn=bodyparts_lut_path,
                    maris_col_id='bodypar_id',
                    maris_col_name='bodypar',
                    provider_col_to_match='TISSUE_DESCRIPTION',
                    provider_col_key='TISSUE',
                    fname_cache='tissues_helcom.pkl'
                    )

remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=1, verbose=True)

Processing:   0%|          | 0/29 [00:00<?, ?it/s]

Processing: 100%|██████████| 29/29 [00:00<00:00, 83.24it/s]

21 entries matched the criteria, while 8 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,Flesh without bones,WHOLE FISH WITHOUT HEAD AND ENTRAILS,20
2,Flesh without bones,WHOLE FISH WITHOUT ENTRAILS,13
8,Soft parts,SKIN/EPIDERMIS,10
5,Flesh without bones,FLESH WITHOUT BONES (FILETS),9
1,Whole animal,WHOLE FISH,5
12,Brain,ENTRAILS,5
15,Stomach and intestine,STOMACH + INTESTINE,3
41,Whole animal,WHOLE ANIMALS,1


We address several entries that were not correctly matched by the `Remapper` object, as detailed below:

In [None]:
#| exports
# Manual corrections for HELCOM tissue descriptions that the `Remapper` matched
# incorrectly; the inline notes give the match reported before the fix.
fixes_biota_tissues = {
    'WHOLE FISH WITHOUT HEAD AND ENTRAILS': 'Whole animal eviscerated without head', # was matched to 'Flesh without bones'
    'ENTRAILS': 'Viscera', # was matched to 'Brain'
    'SKIN/EPIDERMIS': 'Skin'} # was matched to 'Soft parts'

In [None]:
#| eval: false
remapper.generate_lookup_table(as_df=True, fixes=fixes_biota_tissues)
remapper.select_match(match_score_threshold=1, verbose=True)

Processing:   0%|          | 0/29 [00:00<?, ?it/s]

Processing: 100%|██████████| 29/29 [00:00<00:00, 98.79it/s]

24 entries matched the criteria, while 5 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Flesh without bones,WHOLE FISH WITHOUT ENTRAILS,13
5,Flesh without bones,FLESH WITHOUT BONES (FILETS),9
1,Whole animal,WHOLE FISH,5
15,Stomach and intestine,STOMACH + INTESTINE,3
41,Whole animal,WHOLE ANIMALS,1


Visual inspection of the remaining imperfectly matched entries shows that they are acceptable, so we can proceed. 

We can now use the generic `RemapCB` callback to remap the `TISSUE` column to the `BODY_PART` column, after having defined the lookup table `lut_tissues`.

In [None]:
#| exports
# Lookup table generated by `Remapper` for HELCOM `TISSUE` codes against the
# MARIS body part nomenclature, applying `fixes_biota_tissues`.
# `TISSUE.csv` is read from `fname_in`, like the other HELCOM lookup tables,
# rather than from a path relative to the current working directory.
lut_tissues = lambda: Remapper(provider_lut_df=pd.read_csv(Path(fname_in) / 'TISSUE.csv'),
                               maris_lut_fn=bodyparts_lut_path,
                               maris_col_id='bodypar_id',
                               maris_col_name='bodypar',
                               provider_col_to_match='TISSUE_DESCRIPTION',
                               provider_col_key='TISSUE',
                               fname_cache='tissues_helcom.pkl'
                               ).generate_lookup_table(fixes=fixes_biota_tissues, as_df=False, overwrite=False)

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='RUBIN', dest_grps='BIOTA'),
    RemapCB(fn_lut=lut_tissues, col_remap='BODY_PART', col_src='TISSUE', dest_grps='BIOTA'),
    ])

print(tfm()['BIOTA'][['TISSUE', 'BODY_PART']][:5])


   TISSUE  BODY_PART
0       5         52
1       5         52
2       5         52
3       5         52
4       5         52


## Remap biogroup

`lut_biogroup` reads the file at `species_lut_path()` and from the contents of this file creates a dictionary linking `species_id` to `biogroup_id`.

In [None]:
#| exports
def lut_biogroup():
    "Return the `species_id` → `biogroup_id` lookup built by `get_lut` from the file at `species_lut_path()`."
    lut_file = species_lut_path()
    return get_lut(src_dir=lut_file.parent,
                   fname=lut_file.name,
                   key='species_id',
                   value='biogroup_id')

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='RUBIN', dest_grps='BIOTA'),
    RemapCB(fn_lut=lut_tissues, col_remap='BODY_PART', col_src='TISSUE', dest_grps='BIOTA'),
    RemapCB(fn_lut=lut_biogroup, col_remap='BIO_GROUP', col_src='SPECIES', dest_grps='BIOTA')
    ])

print(tfm()['BIOTA']['BIO_GROUP'].unique())


[ 4  2  6 11  8  3]


## Remap Taxon Information (REVIEW)
**Review Note**: The taxon information is not included in the NetCDF encoding. However, it is used for importing into the MARIS master database via OpenRefine. The `SPECIES` column is used to look up the taxon information. This section should be moved to the `open refine decoder notebook`. 

**DISCUSS**: Should we include the `Taxonname`, `TaxonRepName` and `Taxonrank` in the NetCDF encoding?


We first need to retrieve the taxon information from the `dbo_species.xlsx` file.

In [None]:
'''
#| exports
# TODO: Include Commonname field after next MARIS data reconciling process.
def get_taxon_info_lut(
    maris_lut:str # Path to the MARIS lookup table (Excel file)
) -> dict: # A dictionary mapping species_id to biogroup_id
    "Retrieve a lookup table for Taxonname from a MARIS lookup table."
    species = pd.read_excel(maris_lut)
    return species[['species_id', 'Taxonname', 'Taxonrank','TaxonDB','TaxonDBID','TaxonDBURL']].set_index('species_id').to_dict()

lut_taxon = lambda: get_taxon_info_lut(species_lut_path())
'''

'\n#| exports\n# TODO: Include Commonname field after next MARIS data reconciling process.\ndef get_taxon_info_lut(\n    maris_lut:str # Path to the MARIS lookup table (Excel file)\n) -> dict: # A dictionary mapping species_id to biogroup_id\n    "Retrieve a lookup table for Taxonname from a MARIS lookup table."\n    species = pd.read_excel(maris_lut)\n    return species[[\'species_id\', \'Taxonname\', \'Taxonrank\',\'TaxonDB\',\'TaxonDBID\',\'TaxonDBURL\']].set_index(\'species_id\').to_dict()\n\nlut_taxon = lambda: get_taxon_info_lut(species_lut_path())\n'

In [None]:
'''
# | exports
class RemapTaxonInformationCB(Callback):
    "Update taxon information based on MARIS species LUT."
    def __init__(self, fn_lut: Callable):
        self.fn_lut = fn_lut

    def __call__(self, tfm: Transformer):
        lut = self.fn_lut()
        df = tfm.dfs['BIOTA']
        
        df['TaxonRepName'] = df.get('RUBIN', 'Unknown')
        
        taxon_columns = ['Taxonname', 'Taxonrank', 'TaxonDB', 'TaxonDBID', 'TaxonDBURL']
        for col in taxon_columns:
            df[col] = df['SPECIES'].map(lut[col]).fillna('Unknown')
        
        unmatched = df[df['Taxonname'] == 'Unknown']['SPECIES'].unique()
        if len(unmatched) > 0:
            print(f"Unmatched species IDs: {', '.join(unmatched)}")
'''

'\n# | exports\nclass RemapTaxonInformationCB(Callback):\n    "Update taxon information based on MARIS species LUT."\n    def __init__(self, fn_lut: Callable):\n        self.fn_lut = fn_lut\n\n    def __call__(self, tfm: Transformer):\n        lut = self.fn_lut()\n        df = tfm.dfs[\'BIOTA\']\n        \n        df[\'TaxonRepName\'] = df.get(\'RUBIN\', \'Unknown\')\n        \n        taxon_columns = [\'Taxonname\', \'Taxonrank\', \'TaxonDB\', \'TaxonDBID\', \'TaxonDBURL\']\n        for col in taxon_columns:\n            df[col] = df[\'SPECIES\'].map(lut[col]).fillna(\'Unknown\')\n        \n        unmatched = df[df[\'Taxonname\'] == \'Unknown\'][\'SPECIES\'].unique()\n        if len(unmatched) > 0:\n            print(f"Unmatched species IDs: {\', \'.join(unmatched)}")\n'

In [None]:
'''
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[ 
                            RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='RUBIN', dest_grps='BIOTA'),
                            RemapCB(fn_lut=lut_tissues, col_remap='BODY_PART', col_src='TISSUE', dest_grps='BIOTA'),
                            RemapCB(fn_lut=lut_biogroup, col_remap='BIO_GROUP', col_src='SPECIES', dest_grps='BIOTA'),
                            RemapTaxonInformationCB(lut_taxon)
                            ])
tfm()
print(tfm.dfs['biota'][['TaxonRepName', 'Taxonname', 'Taxonrank',
                        'TaxonDB','TaxonDBID','TaxonDBURL']].drop_duplicates().head())
'''

"\n#| eval: false\ndfs = load_data(fname_in)\ntfm = Transformer(dfs, cbs=[ \n                            RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='RUBIN', dest_grps='BIOTA'),\n                            RemapCB(fn_lut=lut_tissues, col_remap='BODY_PART', col_src='TISSUE', dest_grps='BIOTA'),\n                            RemapCB(fn_lut=lut_biogroup, col_remap='BIO_GROUP', col_src='SPECIES', dest_grps='BIOTA'),\n                            RemapTaxonInformationCB(lut_taxon)\n                            ])\ntfm()\nprint(tfm.dfs['biota'][['TaxonRepName', 'Taxonname', 'Taxonrank',\n                        'TaxonDB','TaxonDBID','TaxonDBURL']].drop_duplicates().head())\n"

## Remap Sediment types
Once again, we employ the **IMFA** (Inspect, Match, Fix, Apply) pattern to remap the HELCOM sediment types.

Let's inspect the `SEDIMENT_TYPE.csv` file provided by HELCOM describing the sediment type nomenclature:

In [None]:
#| eval: false
pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv').head()

Unnamed: 0,SEDI,SEDIMENT TYPE,RECOMMENDED TO BE USED
0,-99,NO DATA,
1,30,SILT AND GRAVEL,YES
2,0,GRAVEL,YES
3,1,SAND,YES
4,2,FINE SAND,NO


:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The `SEDI` values `56` and `73` are not found in the `SEDIMENT_TYPE.csv` lookup table provided. Note also there are many `nan` values in the `SEDIMENT_TYPE.csv` file.

We reassign them to `-99` for now but should be clarified/fixed. This is demonstrated below.

:::

In [None]:
#| eval: false
df_sed_lut = pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv')
dfs = load_data(fname_in)

sediment_sedi = set(dfs['SEDIMENT'].SEDI.unique())
lookup_sedi = set(df_sed_lut['SEDI'])
missing = sediment_sedi - lookup_sedi
print(f"Missing SEDI values: {missing if missing else 'None'}")

Missing SEDI values: {56.0, 73.0, nan}


Let's try to match as many as possible:

In [None]:
#| eval: false
remapper = Remapper(provider_lut_df=pd.read_csv(Path(fname_in)/'SEDIMENT_TYPE.csv'),
                    maris_lut_fn=sediments_lut_path,
                    maris_col_id='sedtype_id',
                    maris_col_name='sedtype',
                    provider_col_to_match='SEDIMENT TYPE',
                    provider_col_key='SEDI',
                    fname_cache='sediments_helcom.pkl'
                    )

remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=1, verbose=True)

Processing:   0%|          | 0/47 [00:00<?, ?it/s]

Processing: 100%|██████████| 47/47 [00:00<00:00, 97.43it/s] 

44 entries matched the criteria, while 3 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-99,Soft,NO DATA,5
50,Mud and gravel,MUD AND GARVEL,2
46,Glacial clay,CLACIAL CLAY,1


We address the remaining unmatched values by defining `fixes_sediments`:

In [None]:
#| exports
# Manual correction for the HELCOM sediment type names mismatched by the
# `Remapper`: without it, 'NO DATA' was matched to 'Soft'.
fixes_sediments = {
    'NO DATA': '(Not available)'
}

In [None]:
#| eval: false
remapper.generate_lookup_table(as_df=True, fixes=fixes_sediments)
remapper.select_match(match_score_threshold=1, verbose=True)

Processing:   0%|          | 0/47 [00:00<?, ?it/s]

Processing: 100%|██████████| 47/47 [00:00<00:00, 103.83it/s]

45 entries matched the criteria, while 2 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50,Mud and gravel,MUD AND GARVEL,2
46,Glacial clay,CLACIAL CLAY,1


A visual inspection of the remaining values shows that they are acceptable to proceed.


**DISCUSS** : ``SedRepName`` is used by OpenRefine. ``SedRepName`` is not included in the NetCDF encoding. Description of the `SedRepName` from [MARIS Data Formats
](https://github.com/franckalbinet/marisco/tree/main/install_configure_guide), 'Name of the sediment as reported by the data provider. The sediment name should be stored exactly as provided, without any modifications'. 

This information will be lost with the latest workflow (creating netcdf and decoding to csv) if we do not include strings. What should we do? For now, I will remove the open refine requirement from the RemapSedimentCB callback. I have commented out `SedRepName`.


In [None]:
#| exports
class RemapSedimentCB(Callback):
    "Set the `SED_TYPE` column from `SEDI` codes using the MARIS sediment type LUT (dbo_sedtype.xlsx)."
    
    def __init__(self, 
                 fn_lut: Callable,  # Function that returns the lookup table dictionary
                 sed_grp_name: str = 'SEDIMENT',  # The name of the sediment group
                 replace_lut: dict = None  # Dictionary for replacing SEDI values (`None`: no replacement)
                ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Remap sediment types in the DataFrame using the lookup table and handle specific replacements."
        lut = self.fn_lut()
        
        # Fix inconsistent SEDI values before looking them up
        df = self._fix_inconsistent_sedi(tfm.dfs[self.sed_grp_name], self.replace_lut)
        tfm.dfs[self.sed_grp_name] = df
        
        # Resolve each distinct SEDI value once, then map the whole column
        sediment_mapping = self._get_sediment_types(df['SEDI'].unique(), lut)
        df['SED_TYPE'] = df['SEDI'].map(sediment_mapping)

    def _fix_inconsistent_sedi(self, df: pd.DataFrame, replace_lut: dict) -> pd.DataFrame:
        "Temporary fix for inconsistent SEDI values. Data provider to confirm and clarify."
        # `Series.replace(None)` raises, so skip the step when no replacements
        # are configured (replacing with an empty dict would be a no-op anyway).
        if replace_lut:
            df['SEDI'] = df['SEDI'].replace(replace_lut)
        return df

    def _get_sediment_types(self, unique_sedi: np.ndarray, lut: dict) -> dict:
        "Return a dictionary mapping each unique SEDI value to its matched id (0 when not found in `lut`)."
        not_found = Match(0, None, None, None)  # fallback entry carrying a matched id of 0
        sediment_mapping = {}
        
        for sedi_value in unique_sedi:
            match = lut.get(sedi_value, not_found)
            if match.matched_id == 0:
                self._print_unmatched_sedi(sedi_value)
            sediment_mapping[sedi_value] = match.matched_id
        
        return sediment_mapping

    def _print_unmatched_sedi(self, 
                              sedi_value: int,  # The `SEDI` value from the DataFrame
                             ) -> None:
        "Print a SEDI value reported as unmatched (the caller decides when, i.e. when its matched id is 0)."
        print(f"Unmatched SEDI: {sedi_value}")


In [None]:
#| exports
def lut_sediments():
    "Return the lookup table generated by `Remapper` for HELCOM `SEDI` codes against the MARIS sediment type nomenclature, applying `fixes_sediments`."
    sediment_types = pd.read_csv(Path(fname_in) / 'SEDIMENT_TYPE.csv')
    sediment_remapper = Remapper(
        provider_lut_df=sediment_types,
        maris_lut_fn=sediments_lut_path,
        maris_col_id='sedtype_id',
        maris_col_name='sedtype',
        provider_col_to_match='SEDIMENT TYPE',
        provider_col_key='SEDI',
        fname_cache='sediments_helcom.pkl',
    )
    return sediment_remapper.generate_lookup_table(
        fixes=fixes_sediments, as_df=False, overwrite=False)

Reassign the `SEDI` values of `56`, `73`, and `nan` to `-99`:

In [None]:
# `SEDI` values reassigned to -99: codes 56 and 73, plus missing values (NaN)
sed_replace_lut = dict.fromkeys([56, 73, np.nan], -99)

Apply the transformer with the callback `RemapSedimentCB(fn_lut=lut_sediments, replace_lut=sed_replace_lut)`. Then, display the unique `SED_TYPE` values of the `SEDIMENT` dataframe.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapSedimentCB(fn_lut=lut_sediments, replace_lut=sed_replace_lut)])

tfm()

tfm.dfs['SEDIMENT']['SED_TYPE'].unique()

Unmatched SEDI: -99.0


array([ 0,  2, 58, 30, 59, 55, 56, 36, 29, 47,  4, 54, 33,  6, 44, 42, 48,
       61, 57, 28, 49, 32, 45, 39, 46, 38, 31, 60, 62, 26, 53, 52,  1, 51,
       37, 34, 50,  7, 10, 41, 43, 35])

## Remap units

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The handling of unit types varies between `biota` and `sediment` sample types. For consistency and ease of use, it would be beneficial to have dedicated unit columns for all sample types.

:::

Given the inconsistent handling of units across sample types, we need to define custom mapping rules for standardizing the units. The units available in MARIS are:

In [None]:
#| eval: false
pd.read_excel(unit_lut_path())[['unit_id', 'unit', 'unit_sanitized']]

Unnamed: 0,unit_id,unit,unit_sanitized
0,-1,Not applicable,Not applicable
1,0,NOT AVAILABLE,NOT AVAILABLE
2,1,Bq/m3,Bq per m3
3,2,Bq/m2,Bq per m2
4,3,Bq/kg,Bq per kg
5,4,Bq/kgd,Bq per kgd
6,5,Bq/kgw,Bq per kgw
7,6,kg/kg,kg per kg
8,7,TU,TU
9,8,DELTA/mill,DELTA per mill


We define unit renaming rules for HELCOM in an **ad hoc** way:

In [None]:
#| exports
# MARIS unit ids assigned per sample type (ad hoc rules for HELCOM)
lut_units = dict(
    SEAWATER=1,  # 'Bq/m3'
    SEDIMENT=4,  # 'Bq/kgd' for sediment
    BIOTA=dict(  # keyed by the value of the `BASIS` column
        D=4,  # 'Bq/kgd'
        W=5,  # 'Bq/kgw'
        F=5,  # 'Bq/kgw' (assumed to be 'Fresh', so set to wet)
    ),
)

In [None]:
#| exports
class RemapUnitCB(Callback):
    "Set the `UNIT` id column in the DataFrames based on a lookup table."
    def __init__(self, 
                 lut_units: dict=lut_units # Dictionary containing renaming rules for different unit categories
                ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Assign a constant unit id to `SEAWATER` and `SEDIMENT`; derive it from `BASIS` for any other group."
        for grp in tfm.dfs.keys():
            if grp in ['SEAWATER', 'SEDIMENT']:
                tfm.dfs[grp]['UNIT'] = self.lut_units[grp]
            else:
                # Use the table stored on the instance (not the module-level
                # default) so that a custom `lut_units` is honoured here too.
                basis_to_unit = self.lut_units[grp]
                # `BASIS` values absent from the table get 0
                tfm.dfs[grp]['UNIT'] = tfm.dfs[grp]['BASIS'].apply(lambda x: basis_to_unit.get(x, 0))

Apply the transformer with the callback `RemapUnitCB()`. Then, print the unique `UNIT` values for each sample type.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapUnitCB()])
tfm()  # run the pipeline once instead of once per printed group

for grp in ['BIOTA', 'SEDIMENT', 'SEAWATER']:
    print(f"{grp}: {tfm.dfs[grp]['UNIT'].unique()}")

BIOTA: [5 0 4]
SEDIMENT: [4]
SEAWATER: [1]


## Remap detection limit
Detection limits are encoded as follows in MARIS:

In [None]:
#| eval: false
pd.read_excel(detection_limit_lut_path())

Unnamed: 0,id,name,name_sanitized
0,-1,Not applicable,Not applicable
1,0,Not Available,Not available
2,1,=,Detected value
3,2,<,Detection limit
4,3,ND,Not detected
5,4,DE,Derived


In [None]:
#| exports
def lut_dl():
    "Return a dictionary mapping each detection limit `name` to its `id`, read from the file at `detection_limit_lut_path()`."
    dl_table = pd.read_excel(detection_limit_lut_path(), usecols=['name','id'])
    return dl_table.set_index('name').to_dict()['id']

Based on columns of interest for each sample type:

In [None]:
#| exports
# Columns of interest per sample type:
#   val: activity value, unc: uncertainty, dl: detection limit flag
coi_dl = dict(
    SEAWATER=dict(val='VALUE_Bq/m³', unc='ERROR%_m³', dl='< VALUE_Bq/m³'),
    BIOTA=dict(val='VALUE_Bq/kg', unc='ERROR%', dl='< VALUE_Bq/kg'),
    SEDIMENT=dict(val='VALUE_Bq/kg', unc='ERROR%_kg', dl='< VALUE_Bq/kg'),
)

We follow the following business logic to encode the detection limit:

`RemapDetectionLimitCB` creates a `DL` (detection limit) column with values determined as follows:
1. Look up the value of the sample type's detection limit column (`< VALUE_Bq/m³` or `< VALUE_Bq/kg`) in the table returned by the function `lut_dl`.
2. If that value is missing (NaN) or not found in the lookup table, but both the activity value (`VALUE_Bq/m³` or `VALUE_Bq/kg`) and the standard uncertainty (`ERROR%_m³`, `ERROR%`, or `ERROR%_kg`) are provided, assign the ID `1` (i.e. "Detected value").
3. Set all remaining unmatched values in the `DL` column to `0` (i.e. `Not Available`).

In [None]:
# | exports
class RemapDetectionLimitCB(Callback):
    "Encode the detection limit of each group as a MARIS id in a `DL` column."
    
    def __init__(self, 
                 coi: dict,  # Column names per group, under the keys 'val', 'unc' and 'dl'
                 fn_lut: Callable  # Function that returns a lookup table
                ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Remap detection limits in the DataFrames using the lookup table."
        lut = self.fn_lut()
        
        for grp in tfm.dfs:
            df = tfm.dfs[grp]
            self._update_detection_limit(df, grp, lut)

    def _update_detection_limit(self, 
                                df: pd.DataFrame,  # The DataFrame to modify
                                grp: str,  # The group name to get the column configuration
                                lut: dict  # The lookup table dictionary
                               ) -> None:
        "Update detection limit column in the DataFrame based on lookup table and rules."
        
        # Use the configuration passed to the constructor (not the module-level
        # `coi_dl`), so that a custom `coi` is actually honoured.
        if grp not in self.coi:
            raise ValueError(f"Group '{grp}' not found in coi configuration.")
        
        cols = self.coi[grp]
        
        # Initialize detection limit column from the provider's flag column
        df['DL'] = df[cols['dl']]
        
        # Set detection limits based on conditions
        self._set_detection_limits(df, cols['val'], cols['unc'], lut)

    def _set_detection_limits(self, df: pd.DataFrame, value_col: str, uncertainty_col: str, lut: dict) -> None:
        "Set detection limits based on value and uncertainty columns."
        # Rows whose flag is not a key of the lookup table but that carry both
        # a value and an uncertainty are flagged '='
        condition_eq = df[value_col].notna() & df[uncertainty_col].notna() & ~df['DL'].isin(lut.keys())
        df.loc[condition_eq, 'DL'] = '='

        # Whatever is still not a key of the lookup table becomes 'Not Available'
        # (re-evaluated here because the previous step changed `DL`)
        df.loc[~df['DL'].isin(lut.keys()), 'DL'] = 'Not Available'
        
        # Perform lookup to map detection limits
        df['DL'] = df['DL'].map(lut)

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            NormalizeUncCB(),
                            SanitizeValue(coi_val),                       
                            RemapUnitCB(),
                            RemapDetectionLimitCB(coi_dl, lut_dl)])

# Run the pipeline once: calling `tfm()` inside the loop would re-apply every
# callback to the already transformed data for each printed group.
tfm()

for grp in ['BIOTA', 'SEDIMENT', 'SEAWATER']:
    print(f"{grp}: {tfm.dfs[grp]['DL'].unique()}")

BIOTA: [2 1 0]
SEDIMENT: [1 2 0]
SEAWATER: [1 2 0]


## Remap filtering status

HELCOM filtered status is encoded as follows in the `FILT` column:

In [None]:
#| eval: false
dfs = load_data(fname_in)
get_unique_across_dfs(dfs, col_name='FILT', as_df=True).head(5)

Unnamed: 0,index,value
0,0,N
1,1,
2,2,F
3,3,n


MARIS uses a different encoding for filtered status:

In [None]:
#| eval: false
pd.read_excel(filtered_lut_path())

Unnamed: 0,id,name
0,-1,Not applicable
1,0,Not available
2,1,Yes
3,2,No


With only four categories to remap, the `Remapper` would be overkill. We can use a simple dictionary to map the values instead:

In [None]:
#| exports
# HELCOM `FILT` codes and the MARIS filtered-status ids they are mapped to
lut_filtered = dict(
    N=2,  # No
    n=2,  # No
    F=1,  # Yes
)

`RemapFiltCB` converts the HELCOM `FILT` format to the MARIS `FILT` format.

In [None]:
#| exports
class RemapFiltCB(Callback):
    "Replace the `FILT` codes of every DataFrame that has a `FILT` column by their lookup table value."
    def __init__(self,
                 lut_filtered: dict=lut_filtered, # Dictionary mapping `FILT` codes to their replacement ids
                ):
        fc.store_attr()

    def __call__(self, tfm):
        def remap_code(code):
            # Codes absent from the lookup table are replaced by 0
            return self.lut_filtered.get(code, 0)

        for grp_df in tfm.dfs.values():
            if 'FILT' not in grp_df.columns:
                continue
            grp_df['FILT'] = grp_df['FILT'].map(remap_code)

For instance:

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapFiltCB(lut_filtered)])

print(tfm()['SEAWATER']['FILT'].unique())


[0 2 1]


## Add Sample Laboratory code (REMOVE)

Sample Laboratory code is currently stored in MARIS master DB but not encoded as NetCDF variable. Decision to include it in the NetCDF output is TBD.

**DISCUSS** SMP_ID vs samplabcode. 

SMP_ID is the new variable name to describe the sample ID. It is different to `samplabcode` as `samplabcode` can be a string whilst `SMP_ID` is an integer. Below we will check the uniqueness of the integer part of the `KEY` column.

In [None]:
def check_unique_key_int(tfm, groups=('SEAWATER', 'BIOTA', 'SEDIMENT'), prefix_len=5):
    """
    Group the unique `KEY` values of the given DataFrames by their integer component.

    The integer component of a key is whatever follows its first `prefix_len`
    characters. Keys whose remainder is not made of digits only, as well as
    missing or non-string keys, are ignored.

    Parameters:
    tfm (Transformer): The transformer object containing DataFrames (only `tfm.dfs` is read).
    groups (iterable of str): Names of the groups in `tfm.dfs` to collect `KEY` values from.
    prefix_len (int): Number of leading characters preceding the integer component.

    Returns:
    dict: `{'int_key_map': {int_component: [key, ...]}}`, where each list holds the
    complete keys sharing that integer component, in sorted order.
    """
    # Collect unique keys from each DataFrame; missing values cannot be sliced
    unique_keys = set()
    for grp in groups:
        unique_keys.update(tfm.dfs[grp]['KEY'].dropna().unique())

    # Sorting makes the output independent of the set iteration order
    str_keys = sorted(key for key in unique_keys if isinstance(key, str))

    # Group complete keys by their integer component
    int_key_map = {}
    for key in str_keys:
        suffix = key[prefix_len:]
        if suffix.isdigit():
            int_key_map.setdefault(int(suffix), []).append(key)

    return {
        'int_key_map': int_key_map  # Mapping of integer components to complete keys
    }

In [None]:
check_unique_key_int(tfm)

{'int_key_map': {2001025: ['SCLOR2001025',
   'WIMGW2001025',
   'WRISO2001025',
   'BCLOR2001025',
   'SSTUK2001025',
   'SSSSI2001025',
   'WSTUK2001025'],
  2000153: ['SDHIG2000153'],
  2000101: ['WDHIG2000101', 'SSTUK2000101'],
  2011132: ['SKRIL2011132', 'WDHIG2011132'],
  2010090: ['SKRIL2010090', 'SDHIG2010090', 'SSTUK2010090'],
  2009047: ['BVTIG2009047',
   'SCLOR2009047',
   'WDHIG2009047',
   'WIMGW2009047',
   'SSTUK2009047',
   'WRISO2009047'],
  2014028: ['SSSSM2014028', 'SCLOR2014028', 'WIMGW2014028', 'SSTUK2014028'],
  1986016: ['WRISO1986016',
   'WCLOR1986016',
   'WSAAS1986016',
   'BNCRS1986016',
   'SSTUK1986016',
   'BSAAS1986016',
   'WKRIL1986016',
   'BBFFG1986016',
   'WSTUK1986016',
   'SSAAS1986016',
   'SCLOR1986016',
   'SKRIL1986016',
   'BCLOR1986016'],
  2002158: ['SSTUK2002158', 'WDHIG2002158'],
  2010049: ['SCLOR2010049',
   'SSTUK2010049',
   'WRISO2010049',
   'SKRIL2010049',
   'BVTIG2010049',
   'WIMGW2010049',
   'SDHIG2010049'],
  2012004: ['BCL

The integer component of ``KEY`` is not unique: several keys share the same integer part.

In [None]:
'''
# | exports
class AddSampleLabCodeCB(Callback):
    "Remap `KEY` column to `samplabcode` in each DataFrame."
    def __call__(self, tfm: Transformer):
        for grp in tfm.dfs:
            self._remap_sample_id(tfm.dfs[grp])
    
    def _remap_sample_id(self, df: pd.DataFrame):
        df['samplabcode'] = df['KEY']
'''

'\n# | exports\nclass AddSampleLabCodeCB(Callback):\n    "Remap `KEY` column to `samplabcode` in each DataFrame."\n    def __call__(self, tfm: Transformer):\n        for grp in tfm.dfs:\n            self._remap_sample_id(tfm.dfs[grp])\n    \n    def _remap_sample_id(self, df: pd.DataFrame):\n        df[\'samplabcode\'] = df[\'KEY\']\n'

In [None]:
'''AttributeError
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            AddSampleLabCodeCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

print(tfm()['seawater']['samplabcode'].unique())
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
'''

"AttributeError\n#| eval: false\ndfs = load_data(fname_in)\ntfm = Transformer(dfs, cbs=[\n                            AddSampleLabCodeCB(),\n                            CompareDfsAndTfmCB(dfs)\n                            ])\n\nprint(tfm()['seawater']['samplabcode'].unique())\nprint(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')\n"

## Add measurement note (REMOVE)

The `measurementnote` column is not included in the NetCDF output currently. 

## Add method details 

The NetCDF format includes counting method, sample method and preparation method. The HELCOM dataset includes a look-up table `ANALYSIS_METHOD.csv` capturing the methods used as described by HELCOM. The HELCOM methods provide details on the counting method, sample method and preparation method.

Let's review the HELCOM analysis methods.

In [None]:
# Load the HELCOM look-up table describing each analysis method.
analsis_method_df = pd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv')
# `.head` without parentheses only echoes the bound method; call it to
# actually preview the first rows of the table.
analsis_method_df.head()

<bound method NDFrame.head of     METHOD  COUNTRY                                        DESCRIPTION
0   BFFG01        6  Gammaspectrometric analysis with Germanium det...
1   BFFG02        6  Sr-90, a) Y-90 extraction method dried ash and...
2   CLOR02       67  Radiochemical method Radiocaesium separation f...
3   CLOR03       67  Radiochem. meth.-134+137Cs was measured after ...
4   CLOR04       67  Radiochem. meth of Sr90. Precipation with oxal...
5   CLOR05       67  Dissolved nitric acid samples evaporated and f...
6   CLOR06       67  Radiochem. meth determination of radium-226.Co...
7   CLOR07       67  For tritium liquid scintialtion counting, comb...
8   CLOR08       67  Alpha spectrometry preceded by radiochemical s...
9   DHIG01        6  direct gamma counting using HPGe-detectors; se...
10  EBRS01       91  Pretreatment drying (sediment, biota samples) ...
11  EMHI02       91                                        not defined
12  ERPC02       91  Pretreatment drying, ashin

In [None]:
# Show the full (untruncated) description stored in the row labelled 10.
analsis_method_df.loc[10, 'DESCRIPTION']

'Pretreatment drying (sediment, biota samples) and ashing (biota samples)or vaporization to 1000 ml (sea water samples), measured by gamma-spectrometry using HPGe detectors / SEDIMENT, BIOTA and SEAWATER Cs137, Cs134, K40'

Review the METHODS LUT

In [None]:
# Read the preparation, sampling and counting method look-up tables, in that
# order, from the Excel files whose paths are returned by the `*_lut_path` helpers.
prepmet_lut, sampmet_lut, counmet_lut = [
    pd.read_excel(lut_path_fn())
    for lut_path_fn in (prepmet_lut_path, sampmet_lut_path, counmet_lut_path)
]

Preparation methods LUT

In [None]:
prepmet_lut

Unnamed: 0,prepmet_id,prepmet,code,type_1,type_2,type_3,type_4
0,-1,Not applicable,,,,,
1,0,Not available,0,1.0,3.0,2.0,4.0
2,1,Distillation,DISTS,1.0,0.0,0.0,0.0
3,2,Electrolytic enrichment,ELECE,1.0,0.0,0.0,0.0
4,3,Evaporation only,EVAPS,1.0,0.0,0.0,0.0
5,4,Evaporation with chemistry,EVAPC,1.0,0.0,0.0,0.0
6,5,Precipitation,PRECC,1.0,0.0,0.0,0.0
7,6,Drying only,DRYOS,3.0,2.0,0.0,0.0
8,7,Dry ashing only,DRYAS,3.0,2.0,0.0,0.0
9,8,Dry ashing with chemistry,DRYAD,3.0,2.0,0.0,0.0


Sample methods LUT

In [None]:
sampmet_lut

Unnamed: 0,sampmet_id,sampmet,code,type_1,type_2,type_3,type_4
0,-1,Not applicable,,,,,
1,0,Not available,0,2.0,3.0,4.0,1.0
2,1,Bottle sampling,BOTTL,1.0,4.0,0.0,0.0
3,2,Box corer,BOX,3.0,0.0,0.0,0.0
4,3,Cartridge,CART,4.0,1.0,0.0,0.0
5,4,Dredge,DRE,2.0,0.0,0.0,0.0
6,5,Grab sampler sediment,GRABS,3.0,0.0,0.0,0.0
7,6,Grab sampling water,GRABW,1.0,0.0,0.0,0.0
8,7,Gravity corer,GRAV,3.0,0.0,0.0,0.0
9,8,Gravity corer with piston,GRAVP,3.0,0.0,0.0,0.0


Counting methods LUT

**DISCUSS** the repetition of counting methods in `counmet_lut`. When should we use each of them?

In [None]:
counmet_lut

Unnamed: 0,counmet_id,counmet,code
0,-1,Not applicable,
1,0,Not available,0
2,1,Atomic absorption,AA
3,2,Alpha,ALP
4,3,Alpha ionization chamber spectrometry,ALPI
5,4,Alpha liquid scintillation spectrometry,ALPL
6,5,Alpha semiconductor spectrometry,ALPS
7,6,Alpha total,ALPT
8,7,Accelerator mass spectrometry,AMS
9,8,Beta,BET


Let's create a template dictionary to store the method details. This dictionary will then be manually updated with the counting, sampling and preparation method IDs.

In [None]:
# One entry per HELCOM method code: keep the original description and leave
# the three MARIS method IDs as empty strings, to be filled in by hand.
template_dict = {
    method_code: {
        'DESCRIPTION': description,
        'COUNT_MET_ID': '',
        'SAMP_MET_ID': '',
        'PREP_MET_ID': '',
    }
    for method_code, description in zip(
        analsis_method_df['METHOD'], analsis_method_df['DESCRIPTION']
    )
}
template_dict

{'BFFG01': {'DESCRIPTION': "Gammaspectrometric analysis with Germanium detectors (p-type HGeLi's and HPGe's and 1 n-type HPGe), with efficiency 20-48% Energy resolution 1.8-2.3 keV at 1.33 MeV (not to in use any more)",
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'BFFG02': {'DESCRIPTION': 'Sr-90, a) Y-90 extraction method dried ash and added Y-90 + HCl, Ph adjustment and Y-90 extraction with HDEHP in n-heptane b) Modified version of classic nitric acid method (not to in use any more)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR02': {'DESCRIPTION': 'Radiochemical method Radiocaesium separation from seawater samples.134+137Cs was adsorbed on AMP mat,  dissolved with NaOH and after purification precipitated as chloroplatinate (Cs2PtCl6).Counting with low background anticoincidence beta counter.',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR03': {'DESCRIPTION': 'Radiochem. meth.-134+137Cs was measured after a rad

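Manually update the method dictionary with the method details, i.e. fill in `COUNT_MET_ID`, `SAMP_MET_ID` and `PREP_MET_ID` for each HELCOM method. This mapping is still a work in progress: so far only the `COUNT_MET_ID` of `BFFG01` has been filled in.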

In [None]:
method_dictionary = {'BFFG01': {'DESCRIPTION': "Gammaspectrometric analysis with Germanium detectors (p-type HGeLi's and HPGe's and 1 n-type HPGe), with efficiency 20-48% Energy resolution 1.8-2.3 keV at 1.33 MeV (not to in use any more)",
  'COUNT_MET_ID': 20,
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'BFFG02': {'DESCRIPTION': 'Sr-90, a) Y-90 extraction method dried ash and added Y-90 + HCl, Ph adjustment and Y-90 extraction with HDEHP in n-heptane b) Modified version of classic nitric acid method (not to in use any more)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR02': {'DESCRIPTION': 'Radiochemical method Radiocaesium separation from seawater samples.134+137Cs was adsorbed on AMP mat,  dissolved with NaOH and after purification precipitated as chloroplatinate (Cs2PtCl6).Counting with low background anticoincidence beta counter.',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR03': {'DESCRIPTION': 'Radiochem. meth.-134+137Cs was measured after a radiochemical separation from dissolved in nitric and fluoric acids sediment samples - 134+137Cs were adsorbed on AMP mat and their beta activity was counted in a low-background anticoincidence beta counter',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR04': {'DESCRIPTION': 'Radiochem. meth of Sr90. Precipation with oxalate and separation of calcium, barium, radium and ytrium couting with low background anticoincidence beta counter. 1982-1994',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR05': {'DESCRIPTION': 'Dissolved nitric acid samples evaporated and filtered, dried and heated in muffle furnace and dissolved in HCl.- Fe carrier and NH4OH added and precipitate discarded Filtrate acidified Y carrier added stored, precipitated filtered and dried - Beta count',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR06': {'DESCRIPTION': 'Radiochem. meth determination of radium-226.Concentration of 226Ra was determined radiochemically using emanation method (measurement of 222Rn in Lucas-type scintillation chambers) preceded by separation of radium',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR07': {'DESCRIPTION': 'For tritium liquid scintialtion counting, combined with electrolytic enrichment of analysed water samples, double distilled, before and after electrolysis in cells. Liquid Scintillation spectrometer LKB Wallac model 1410',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR08': {'DESCRIPTION': 'Alpha spectrometry preceded by radiochemical separation Pu was separated by ion exchange, followed by electrodeposition onto stainless steel disks. 242Pu used as an internal tracer for counting alpha activity and chemical recovery- meas. alfa spectrometry',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'DHIG01': {'DESCRIPTION': 'direct gamma counting using HPGe-detectors; sediment and suspended matter undergo freeze-drying beforehand',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'EBRS01': {'DESCRIPTION': 'Pretreatment drying (sediment, biota samples) and ashing (biota samples)or vaporization to 1000 ml (sea water samples), measured by gamma-spectrometry using HPGe detectors / SEDIMENT, BIOTA and SEAWATER Cs137, Cs134, K40',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'EMHI02': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'ERPC02': {'DESCRIPTION': 'Pretreatment drying, ashing. HDEHP-extraction of Y-90 and counting the Cerenkov radiation in a liguid scintillation counter Sediment / Sr-90',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'IMGW01': {'DESCRIPTION': 'Radiochemical method. acidified samples are pre-concentrated using NH4-Pmo separation on Bio rex 40 resin and preparation of Cs Cloropaltinate for beta counter',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'IMGW02': {'DESCRIPTION': 'Classic nitric acid method, Equipment: Llow level beta counter FHT 770T (ESM Eberline)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'IMGW03': {'DESCRIPTION': '137Cs activity concentrations are determined by gamma spectrometry with high purity Germanium detector with energy resolution 1.8 keV for 60Co (1332 keV) and relative efficiency of 18%.',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'JORC01': {'DESCRIPTION': 'Pretreatment drying and ashing (450 deg C), measured by gamma spectrometry using lead-shielded Ge detectors for biota and fish/Cs137 (detailed instructions as a file)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'JORC03': {'DESCRIPTION': 'Pretreatment drying, measured by gamma spectrometry using lead-shielded Ge detectors for sediments/ Cs137, K40, Th232, Ra226',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'JORC04': {'DESCRIPTION': 'Pretreatment ashing. Extraction of Y-90 (with 10% HDEP solution) in n-heptane toluene from dissolved ashed samples. Sr-90 measured as Y-90 by proportional low level counter for biota and fish/Sr-90 (Beta spectrometry)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'JORC05': {'DESCRIPTION': 'Pretreatment concentration of Sr-90 (together with Cs-137) from 20 L of water with HCl, SrCl2, FeCl3, CaCl2K4Fe(CN)6, Na2CO3, adding of stable Sr as carrier.Extraction of Y-90 as in LEPA04 for seawater/Sr-90',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'JORC06': {'DESCRIPTION': 'Pretreatment drying, homogenising and ashing (610 deg C)  Extraction of Y-90 (with 10% HDEP solution) in n-heptane toluene from dissolved ashed samples. Sr-90 measured as Y-90 by proportional low level counter Sediments/Sr90',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'JORC07': {'DESCRIPTION': 'Pretreatment concerning Cs-137 (together with Sr-90) from 20 L of water with HCl, SrCl2, FeCl3, CaCl2K4Fe(CN)6, Na2CO3. Drying of precipitate. Measured by gamma spectrometry using lead-shielded Ge-detectors for seawater/Cs137',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'KRIL01': {'DESCRIPTION': 'Pretreatment drying (105 deg C), milling, measured by gamma spectrometry using lead-shielded Ge detectors; sediment, biota / Mn54, Co57, Co58, Co60, Zn65, Nb95, Ru103, Ru106, Ag110m, Sb125, Cs134, Cs137, Ba140, Ce141, Ce144, Ra226, Ra228, Th232',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'KRIL02': {'DESCRIPTION': 'Precipitation of radioCs on K4(Fe(CN)6)  together  with precipitation of radioSr on Na2CO3,stable Cs and Sr used as yield tracers, isolated radioCs is measured by gamma spectrometry using lead-shielded Ge detectors, isolated….seawater / Cs134, Cs137, Sr90',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'KRIL03': {'DESCRIPTION': ' Pretreatment drying, ashing (650 deg C), radiostrontium isolated is kept for Y-90  ingrowth and Y-90 separated after ingrowth measured by low-level counter Quantulus1220; sediment, biota/ Sr90',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LEPA01': {'DESCRIPTION': 'Pretreatment drying and ashing (450 deg C), measured by gamma spectrometry using lead-shielded Ge detectors for biota and fish/Cs137 (detailed instructions as a file)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LEPA02': {'DESCRIPTION': 'Radiocaseium filtered through Cu2Fe(CN)6 impregnated cartridges, measured by gamma spectormetry using lead-shielded Ge detectors for sewawater/Cs137',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LEPA03': {'DESCRIPTION': 'Pretreatment drying, measured by gamma spectrometry using lead-shielded Ge detectors for sediments/ Cs137, K40, Th232, Ra226',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LEPA04': {'DESCRIPTION': 'Pretreatment ashing. Extraction of Y-90 (with 10% HDEP solution) in n-heptane toluene from dissolved ashed samples. Sr-90 measured as Y-90 by proportional low level counter for biota and fish/Sr-90 (Beta spectrometry)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LEPA06': {'DESCRIPTION': 'Pretreatment drying, homogenising and ashing (610 deg C)  Extraction of Y-90 (with 10% HDEP solution) in n-heptane toluene from dissolved ashed samples. Sr-90 measured as Y-90 by proportional low level counter Sediments/Sr90',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LEPA07': {'DESCRIPTION': 'Pretreatment concerning Cs-137 (together with Sr-90) from 20 L of water with HCl, SrCl2, FeCl3, CaCl2K4Fe(CN)6, Na2CO3. Drying of precipitate. Measured by gamma spectrometry using lead-shielded Ge-detectors for seawater/Cs137',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LREB01': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LREB02': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LVDC01': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LVDC02': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LVEA01': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LVEA04': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'NCRS01': {'DESCRIPTION': 'No preparation before the measurement',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'NCRS41': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample freeze-dried, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'NCRS42': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dried, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'NCRS44': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dried with hot air, ashed, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'RISO01': {'DESCRIPTION': 'Sample drying, freeze-drying and ashing (450°C) measured by gamma spectrometry using lead-shielded Germanium detectors - sediment, biota / Mn54, Co57, Co58, Co60, Zn65, Nb95, Ru103, Ru106, Ag110m, Sb125, Cs134, Cs137, Ba140, Ce141, Ce144, Ra226, Th232',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'RISO02': {'DESCRIPTION': 'Radiocaesium absorbed on AMP (NH4 - MoPO4), Cs-134 used as yield tracer, measured by gamma spectrometry using lead-shielded Ge detectors - seawater / Cs134, Cs137',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'RISO03': {'DESCRIPTION': 'Radiostrontium isolated (classic nitric acid method) and Y-90 separated after ingrowth and measured by gross beta counting, Sr-85 used as yield tracer, Gross beta counting using low-level GM counters - seawater / Sr90',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'RISO04': {'DESCRIPTION': 'Pretreatment drying, freeze drying, Tc-99 isolated by chemical procedures and measured by gross beta counting, Tc-99m used as yield tracer, Gross beta counting using low-level GM counters - biota, seawater / Tc99',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'RISO05': {'DESCRIPTION': 'Pretreatment drying, freeze drying and ashing (450°C), Pu isotopes and Am-241 isolated (anion exchange) and determined by alpha spectrometry, Pu-242 and Am-243 used as yield tracers, Alpha spectrometry using Si detectors (transuranics, polonium) - part mi',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'RISO06': {'DESCRIPTION': 'Po determined by spontaneous deposition on silver disks followed by alpha spectrometry; Pb determined from Po after time for ingrowth; biota/Po210 AND Pb210',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'RISO07': {'DESCRIPTION': 'Transuranics determined by mass spectrometry, ICPMS, after chemical separation, seawater',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SAAS01': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SAAS02': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SAAS03': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSI01': {'DESCRIPTION': 'Seawater analysis: concentrate Cs in water sample by use of Cu2Fe(CN)6 filters spiked with Cs-134 as tracer, followed by gammaspectrometric measurement of filters',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSI03': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dried, ashed, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSI42': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dreid, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSI43': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dried, ashed, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSI44': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dried with hot air, ashed, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSM41': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample freeze-dried, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSM42': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dreid, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSM43': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dried, ashed, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSM44': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dried with hot air, ashed, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSM46': {'DESCRIPTION': 'Cesium analysis of a water sample. The water is shaken with ion exchangers, then the sample is filtrated and scintillation solution is added.',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'STUK01': {'DESCRIPTION': 'Gamma-spectrometric analysis (Pretreatment drying and ashing (450 deg C, Biota samples) or freeze drying (sediment samples) or vaporization to 500 ml (sea water samples), measured by gamma spectrometry using lead-shielded HPGe detectors)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'STUK02': {'DESCRIPTION': 'Strontium analysis (Radiostrontium isolated (classic nitric acid method) and Y-90 separated after ingrowth and measured by gross beta counting, Stable strontium used as yield tracer, Gross beta counting using low-level GM counters)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'STUK03': {'DESCRIPTION': 'Transuranic anal. (Pretreatm. drying, freeze drying and ashing (450 d C), Pu isotopes and Am-241 isolated (anion exchange) & determ. by alpha spectrometry,Pu-242 & Am-243 used as yield tracers,alpha spectromet.using Si detectrs, (transuranics, polonium).',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'STUK04': {'DESCRIPTION': 'Updated tritium analysis (Sea water samples distilled twice with AgNO3 and tritium measured with liquid scintillation counter)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'STUK05': {'DESCRIPTION': 'Modified (-97) from STUK02; Radiostrontium isolated (Sr.Spec-resin) & Y-90 separated after ingrowth & measured by gross beta counting, Stable strontium used as yield tracer, Gross beta counting using low-level GM counters or liquid scintillation counter',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'VTIG01': {'DESCRIPTION': "Gammaspectrometric analysis with Germanium detectors (p-type HGeLi's and HPGe's and 1 n-type HPGe), with efficiency 20-48% Energy resolution 1.8-2.3 keV at 1.33 MeV",
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'VTIG02': {'DESCRIPTION': 'Sr-90, a) Y-90 extraction method dried ash and added Y-90 + HCl, Ph adjustment and Y-90 extraction with HDEHP in n-heptane b) Modified version of classic nitric acid method',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'VTIG03': {'DESCRIPTION': 'Pu238, Pu239241; Ashing and and drying the traces',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'VTIG04': {'DESCRIPTION': 'Am-241',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LEPA08': {'DESCRIPTION': 'Pretreatment drying (sediment samples) or vaporization to 500 ml (sea water samples), measured by gamma-spectrometry using HPGe detectors/SEDIMENT and SEAWATER: Cs137',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LEPA09': {'DESCRIPTION': 'Pretreatment concentration of Sr-90 from 20 L of water with HCl, SrCl2, Na2CO3, adding of stable Sr as carrier. Extraction of Y-90 (with 10% HDEP solution) in n-heptane toluene from dissolved ashed samples. Sr-90 measured as Y-90 by proportional low level counter/SEAWATER:Sr-90',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'CLOR01': {'DESCRIPTION': '137Cs and 40K activity concentrations are determined by gamma spectrometry with high purity Germanium detector with energy resolution 1.8 keV for 60Co (1332 keV) and relative efficiency of 30%.',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'EMHI01': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'ERPC01': {'DESCRIPTION': 'Pretreatment drying (sediment, biota samples) and ashing (biota samples)or vaporization to 1000 ml (sea water samples), measured by gamma-spectrometry using HPGe detectors sediment, biota, sea water /Cs-137, Cs-134, K-40',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'JORC02': {'DESCRIPTION': 'Radiocaesium filtered through Cu2Fe(CN)6 impregnated cartridges, measured by gamma spectormetry using lead-shielded Ge detectors for sewawater/Cs137',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'LEPA05': {'DESCRIPTION': 'Pretreatment concentration of Sr-90 (together with Cs-137) from 20 L of water with HCl, SrCl2, FeCl3, CaCl2K4Fe(CN)6, Na2CO3, adding of stable Sr as carrier.Extraction of Y-90 as in LEPA04 for seawater/Sr-90',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'DHIG02': {'DESCRIPTION': 'determination of Sr-90 via Y-90: after several purification steps Y-90 is measured as yttrium oxide in a low level beta gas flow counter',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'NCRS43': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample dried, ashed, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SAAS04': {'DESCRIPTION': 'not defined',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSI41': {'DESCRIPTION': 'Gamma analysis with high purity Germanium detector, sample freeze-dried, mixed and analysed in “absolut kalibrerad geometri” according to intercalibration with IAEA',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'SSSM45': {'DESCRIPTION': 'Tritium analysis of a water sample. The water is shaken with ion exchangers, then the sample is filtrated and scintillation solution is added. The contents of β-radiation from tritium is measured with a liquid scintillation spectrometer.',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'DHIG03': {'DESCRIPTION': 'absorption of radiocaesium on KNiFC-PAN, gamma spectrometry of KNiFC-PAN afterwards',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'DHIG04': {'DESCRIPTION': 'determination of H-3: distillation of seawater at reduced pressure, electrolytical enrichment of H-3 followed by another distillation at reduced pressure, measurement of distilled sample aliquot in a low level scintillation counter',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'DHIG05': {'DESCRIPTION': 'determination of H-3: distillation of seawater at room temperature and room pressure, measurement of distilled sample aliquot in a low level scintillation counter',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'DHIG06': {'DESCRIPTION': 'determination of Pu-238, Pu-239/240 and Am-241: after several purification steps the transuranic elements are electrolytically deposited on a stainless steel disk and measured by alpha spectrometry',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'BFFG03': {'DESCRIPTION': 'Pu238, Pu239241; Ashing and and drying the traces (not to in use any more)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''},
 'BFFG04': {'DESCRIPTION': 'Am-241 (not to in use any more)',
  'COUNT_MET_ID': '',
  'SAMP_MET_ID': '',
  'PREP_MET_ID': ''}}

The HELCOM dataset includes a look-up table `ANALYSIS_METHOD.csv` capturing the measurement method used as described by HELCOM. For instance:

In [None]:
'''
#| eval: false
pd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv').head()
'''

"\n#| eval: false\npd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv').head()\n"

In [None]:
'''
#| exports
lut_method = lambda: pd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv').set_index('METHOD').to_dict()['DESCRIPTION']
'''

"\n#| exports\nlut_method = lambda: pd.read_csv(Path(fname_in) / 'ANALYSIS_METHOD.csv').set_index('METHOD').to_dict()['DESCRIPTION']\n"

In [None]:
'''
#| exports
class AddMeasurementNoteCB(Callback):
    "Record measurement notes by adding a 'measurenote' column to DataFrames."
    def __init__(self, 
                 fn_lut: Callable # Function that returns the lookup dictionary with `METHOD` as key and `DESCRIPTION` as value
                ):
        fc.store_attr()
        
    def __call__(self, tfm: Transformer):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            if 'METHOD' in df.columns:
                df['measurementnote'] = df['METHOD'].map(lambda x: lut.get(x, 0))
'''

'\n#| exports\nclass AddMeasurementNoteCB(Callback):\n    "Record measurement notes by adding a \'measurenote\' column to DataFrames."\n    def __init__(self, \n                 fn_lut: Callable # Function that returns the lookup dictionary with `METHOD` as key and `DESCRIPTION` as value\n                ):\n        fc.store_attr()\n        \n    def __call__(self, tfm: Transformer):\n        lut = self.fn_lut()\n        for df in tfm.dfs.values():\n            if \'METHOD\' in df.columns:\n                df[\'measurementnote\'] = df[\'METHOD\'].map(lambda x: lut.get(x, 0))\n'

In [None]:
'''
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    AddMeasurementNoteCB(lut_method),
    CompareDfsAndTfmCB(dfs)])

tfm()
print(tfm.dfs['seawater']['measurementnote'].unique()[:5])
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
'''

"\n#| eval: false\ndfs = load_data(fname_in)\ntfm = Transformer(dfs, cbs=[\n    AddMeasurementNoteCB(lut_method),\n    CompareDfsAndTfmCB(dfs)])\n\ntfm()\nprint(tfm.dfs['seawater']['measurementnote'].unique()[:5])\nprint(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')\n"

## Add station (REMOVE)

*For MARIS master DB import only (not included in the NetCDF output).*

In [None]:
'''
#| exports
class RemapStationIdCB(Callback):
    "Remap Station ID to MARIS format."
    def __init__(self):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Iterate through all DataFrames in the transformer object and remap `STATION` to `station_id`."
        for grp in tfm.dfs.keys(): 
            tfm.dfs[grp]['station'] = tfm.dfs[grp]['STATION']
'''

'\n#| exports\nclass RemapStationIdCB(Callback):\n    "Remap Station ID to MARIS format."\n    def __init__(self):\n        fc.store_attr()\n\n    def __call__(self, tfm: Transformer):\n        "Iterate through all DataFrames in the transformer object and remap `STATION` to `station_id`."\n        for grp in tfm.dfs.keys(): \n            tfm.dfs[grp][\'station\'] = tfm.dfs[grp][\'STATION\']\n'

In [None]:
'''#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            RemapStationIdCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
'''

"#| eval: false\ndfs = load_data(fname_in)\ntfm = Transformer(dfs, cbs=[\n                            RemapStationIdCB(),\n                            CompareDfsAndTfmCB(dfs)\n                            ])\ntfm()\nprint(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')\n"

## Add slice position (top and bottom)

In [None]:
#| exports
class RemapSedSliceTopBottomCB(Callback):
    "Remap Sediment slice top and bottom to MARIS format."
    def __call__(self, tfm: Transformer):
        "Copy the `UPPSLI` and `LOWSLI` columns of the `SEDIMENT` DataFrame into `TOP` and `BOTTOM`."
        # Only the `SEDIMENT` group is touched; a missing group or column raises `KeyError`.
        sediment = tfm.dfs['SEDIMENT']
        for dest_col, src_col in (('TOP', 'UPPSLI'), ('BOTTOM', 'LOWSLI')):
            sediment[dest_col] = sediment[src_col]

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapSedSliceTopBottomCB()])
tfm()
print(tfm.dfs['SEDIMENT'][['TOP','BOTTOM']].head())


    TOP  BOTTOM
0  15.0    20.0
1  20.0    27.0
2   0.0     2.0
3   2.0     4.0
4   4.0     6.0


## Add dry to wet ratio (NOT included in NC_VARS, update NC_VARS)

*`DW%` is not included in the NetCDF output currently.* TBD!

HELCOM Description:

**Sediment:**
1. DW%: DRY WEIGHT AS PERCENTAGE (%) OF FRESH WEIGHT.
2. VALUE_Bq/kg: Measured radioactivity concentration in Bq/kg dry wt. in scientific format(e.g. 123 = 1.23E+02, 0.076 = 7.6E-02)

**Biota:**
1. WEIGHT: Average weight (in g) of specimen in the sample
2. DW%: DRY WEIGHT AS PERCENTAGE (%) OF FRESH WEIGHT

In [None]:
#| exports
class LookupDryWetRatio(Callback):
    "Lookup dry-wet ratio and format for MARIS."
    def __call__(self, tfm: Transformer):
        "Apply the dry-wet ratio lookup to every DataFrame of the transformer that has a `DW%` column."
        for grp in tfm.dfs.keys():
            if 'DW%' in tfm.dfs[grp].columns:
                self._apply_dry_wet_ratio(tfm.dfs[grp])

    def _apply_dry_wet_ratio(self, df: pd.DataFrame) -> None:
        "Copy `DW%` into a new `dry_wet_ratio` column (in place) and replace zeros with NaN."
        df['dry_wet_ratio'] = df['DW%']
        # Convert 'DW%' = 0% to NaN.
        # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
        df.loc[df['dry_wet_ratio'] == 0, 'dry_wet_ratio'] = np.nan


In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            LookupDryWetRatio(),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['BIOTA']['dry_wet_ratio'].head())



                           BIOTA  SEAWATER  SEDIMENT
Number of rows in dfs      14893     20318     37347
Number of rows in tfm.dfs  14893     20318     37347
Number of rows removed         0         0         0 

0    18.453
1    18.453
2    18.453
3    18.453
4    18.458
Name: dry_wet_ratio, dtype: float64


## Standardize Coordinates

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: Column names for geographical coordinates are inconsistent across sample types (biota, sediment, seawater). Sometimes using parentheses, sometimes not.

:::

In [None]:
#| eval: false
dfs = load_data(fname_in)
for grp in dfs.keys():
    print(f'{grp}: {[col for col in dfs[grp].columns if "LON" in col or "LAT" in col]}')

BIOTA: ['LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm', 'LONGITUDE dddddd']
SEAWATER: ['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)']
SEDIMENT: ['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)', 'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)']


:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: 

- Geographical coordinates are provided in both decimal degree and degree-minute formats. Some records lack the decimal degree value, which obliges us to fall back on the less precise degree-minute format.
- Also note that latitude values have `,` as decimal separator while longitude values have `.` as decimal separator (see below)

:::

In [None]:
#| eval: false
dfs['SEDIMENT'][['LATITUDE (ddmmmm)', 'LATITUDE (dddddd)']].head()


Unnamed: 0,LATITUDE (ddmmmm),LATITUDE (dddddd)
0,59.4,59.6667
1,59.4,59.6667
2,59.516,59.86
3,59.516,59.86
4,59.516,59.86


In [None]:
#| exports
class ParseCoordinates(Callback):
    """
    Get geographical coordinates from columns expressed in degrees decimal format 
    or from columns in degrees/minutes decimal format where degrees decimal format is missing.
    """
    def __init__(self, 
                 fn_convert_cor: Callable # Function that converts coordinates from degree-minute to decimal degree format
                 ):
        self.fn_convert_cor = fn_convert_cor

    def __call__(self, tfm:Transformer):
        "Add `lat` and `lon` columns to every DataFrame held by the transformer."
        for frame in tfm.dfs.values():
            self._format_coordinates(frame)

    def _format_coordinates(self, df:pd.DataFrame) -> None:
        "Populate `lat`/`lon` in place, then drop the rows where either of them is missing."
        located = self._get_coord_columns(df.columns)

        for axis in ('lat', 'lon'):
            decimal_col = located[f'{axis}_d']
            minute_col = located[f'{axis}_m']

            # The degree-minute source is used wherever the decimal value is NaN or 0.
            needs_fallback = df[decimal_col].isna() | (df[decimal_col] == 0)
            # The conversion runs on every row; `np.where` then picks per row.
            converted = df[minute_col].apply(self._safe_convert)
            df[axis] = np.where(needs_fallback, converted, df[decimal_col])

        df.dropna(subset=['lat', 'lon'], inplace=True)

    def _get_coord_columns(self, columns) -> dict:
        "Map `lon_d`, `lat_d`, `lon_m` and `lat_m` to the matching column names (`None` when there is no match)."
        lookups = (
            ('lon_d', 'LON', 'dddddd'),
            ('lat_d', 'LAT', 'dddddd'),
            ('lon_m', 'LON', 'ddmmmm'),
            ('lat_m', 'LAT', 'ddmmmm'),
        )
        return {key: self._find_coord_column(columns, coord_type, coord_format)
                for key, coord_type, coord_format in lookups}

    def _find_coord_column(self, columns, coord_type, coord_format) -> str:
        "Return the first column name containing `coord_type` followed by `coord_format` (case-insensitive), else `None`."
        pattern = re.compile(f'{coord_type}.*{coord_format}', re.IGNORECASE)
        hits = list(filter(pattern.search, columns))
        return hits[0] if hits else None

    def _safe_convert(self, value) -> str:
        "Convert a single value with `fn_convert_cor`; NaN and values that fail to convert are returned unchanged."
        if pd.isna(value):
            return value
        try:
            converted = self.fn_convert_cor(value)
        except Exception as e:
            # Best effort: report the failure and keep the raw value.
            print(f"Error converting value {value}: {e}")
            return value
        return converted

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[                    
                            ParseCoordinates(ddmm_to_dd),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['BIOTA'][['lat','lon']])

                           BIOTA  SEAWATER  SEDIMENT
Number of rows in dfs      14893     20318     37347
Number of rows in tfm.dfs  14893     20318     37346
Number of rows removed         0         0         1 

             lat        lon
0      54.283333  12.316667
1      54.283333  12.316667
2      54.283333  12.316667
3      54.283333  12.316667
4      54.283333  12.316667
...          ...        ...
14888  54.583300  19.000000
14889  54.333300  15.500000
14890  54.333300  15.500000
14891  54.333300  15.500000
14892  54.363900  19.433300

[14893 rows x 2 columns]


:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: Some samples have (lon, lat): (0, 0) or are outside lon/lat possible values. 

:::

Sanitizing coordinates drops a row when both longitude and latitude equal 0 or when the data contain unrealistic longitude or latitude values. It also converts the `,` decimal separator of longitude and latitude values to `.`.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            ParseCoordinates(ddmm_to_dd),
                            SanitizeLonLatCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['biota'][['lat','lon']])


KeyError: 'LON'

## Review all callbacks

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            AddSampleTypeIdColumnCB(),
                            LowerStripNameCB(col_src='NUCLIDE'),
                            RemapNuclideNameCB(lut_nuclides),
                            AddNuclideIdColumnCB(col_value='NUCLIDE'),
                            ParseTimeCB(),
                            EncodeTimeCB(cfg()),
                            SanitizeValue(coi_val),       
                            NormalizeUncCB(),
                            RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),
                            RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),
                            RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),
                            RemapTaxonInformationCB(lut_taxon),
                            RemapSedimentCB(lut_sediments),
                            RemapUnitCB(),
                            RemapDetectionLimitCB(coi_dl, lut_dl),
                            RemapFiltCB(lut_filtered),
                            AddSampleLabCodeCB(),
                            AddMeasurementNoteCB(lut_method),
                            RemapStationIdCB(),
                            RemapSedSliceTopBottomCB(),
                            LookupDryWetRatio(),
                            ParseCoordinates(ddmm_to_dd),
                            SanitizeLonLatCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')


                                                    seawater  sediment  biota
Number of rows in dfs                                  21216     39817  15827
Number of rows in tfm.dfs                              21114     39531  15798
Number of dropped rows                                   102       286     29
Number of rows in tfm.dfs + Number of dropped rows     21216     39817  15827 



For instance, to inspect dropped rows:

In [None]:
tfm.dfs_dropped['seawater'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/m³,VALUE_Bq/m³,ERROR%_m³,DATE_OF_ENTRY_x,COUNTRY,LABORATORY,SEQUENCE,...,LONGITUDE (ddmmmm),LONGITUDE (dddddd),TDEPTH,SDEPTH,SALIN,TTEMP,FILT,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
13439,WRISO2001025,CS137,RISO02,,,10.0,,26.0,RISO,2001025.0,...,10.5,10.833333,22.0,20.0,0.0,,N,5.0,5.0,
14017,WLEPA2002001,CS134,LEPA02,<,,,,93.0,LEPA,2002001.0,...,21.03,21.05,16.0,0.0,3.77,14.4,N,4.0,9.0,
14020,WLEPA2002002,CS134,LEPA02,<,,,,93.0,LEPA,2002004.0,...,20.574,20.956667,14.0,0.0,6.57,11.95,N,4.0,9.0,
14023,WLEPA2002003,CS134,LEPA02,<,,,,93.0,LEPA,2002007.0,...,19.236,19.393333,73.0,0.0,7.0,9.19,N,4.0,9.0,
14026,WLEPA2002004,CS134,LEPA02,<,,,,93.0,LEPA,2002010.0,...,20.205,20.3417,47.0,0.0,7.06,8.65,N,4.0,9.0,


## Rename columns of interest for NetCDF or Open Refine

> Column names are standardized to MARIS NetCDF format (i.e. PEP8).

In [None]:
#| exports
def get_common_rules(
    vars: dict, # Configuration dictionary
    encoding_type: str # Encoding type (`netcdf` or `openrefine`)
    ) -> dict: # Common renaming rules for NetCDF and OpenRefine.
    "Get common renaming rules for NetCDF and OpenRefine."
    # OpenRefine target names are hard-coded; `vars` is not consulted on this path.
    if encoding_type == 'openrefine':
        return {
            'KEY': 'key',
            'lat': 'latitude',
            'lon': 'longitude',
            'time': 'begperiod',
            'NUCLIDE': 'nuclide_id',
            'detection_limit': 'detection',
            'unit': 'unit_id',
            'value': 'activity',
            'uncertainty': 'uncertaint',
            'SDEPTH': 'sampdepth',
            'TDEPTH': 'totdepth',
            'samptype_id': 'samptype_id',
            'station': 'station',
            'samplabcode': 'samplabcode',
            'SALIN': 'salinity',
            'TTEMP': 'temperatur',
            'FILT': 'filtered',
            'measurenote': 'measurenote',
        }

    # Any other encoding type gets the NetCDF names, read from the configuration.
    def _default(key): return vars['defaults'][key]['name']
    def _suffix(key): return vars['suffixes'][key]['name']

    # Key order is significant: it is the column order of the renamed DataFrames.
    return {
        'KEY': 'key',
        'lat': _default('lat'),
        'lon': _default('lon'),
        'time': _default('time'),
        'NUCLIDE': 'nuclide',
        'detection_limit': _suffix('detection_limit'),
        'unit': _suffix('unit'),
        'value': 'value',
        'uncertainty': _suffix('uncertainty'),
        'SDEPTH': _default('smp_depth'),
        'TDEPTH': _default('tot_depth'),
        'counting_method': _suffix('counting_method'),
        'sampling_method': _suffix('sampling_method'),
        'preparation_method': _suffix('preparation_method'),
        'SALIN': _suffix('salinity'),
        'TTEMP': _suffix('temperature'),
    }

In [None]:
#| exports
def get_specific_rules(
    vars: dict, # Configuration dictionary
    encoding_type: str # Encoding type (`netcdf` or `openrefine`)
    ) -> dict: # Specific renaming rules for NetCDF and OpenRefine.
    "Get specific renaming rules for NetCDF and OpenRefine."
    # OpenRefine target names are hard-coded; `vars` is not consulted on this path.
    if encoding_type == 'openrefine':
        biota_rules = {
            'species': 'species_id',
            'Taxonname': 'Taxonname',
            'TaxonRepName': 'TaxonRepName',
            'Taxonrank': 'Taxonrank',
            'TaxonDB': 'TaxonDB',
            'TaxonDBID': 'TaxonDBID',
            'TaxonDBURL': 'TaxonDBURL',
            'body_part': 'bodypar_id',
            'dry_wet_ratio': 'percentwt',
        }
        sediment_rules = {
            'sed_type': 'sedtype_id',
            'top': 'sliceup',
            'bottom': 'slicedown',
            'SedRepName': 'SedRepName',
            'dry_wet_ratio': 'percentwt',
        }
        return {'biota': biota_rules, 'sediment': sediment_rules}

    # NetCDF names come from the `bio` and `sed` sections of the configuration.
    if encoding_type == 'netcdf':
        return {
            'biota': {field: vars['bio'][field]['name']
                      for field in ('species', 'body_part', 'bio_group')},
            'sediment': {field: vars['sed'][field]['name']
                         for field in ('sed_type', 'top', 'bottom')},
        }

    # Any other encoding type yields no rules (`None`), as no branch matches.
    return None

In [None]:
#| exports
def get_renaming_rules(
    encoding_type: str = 'netcdf' # Encoding type (`netcdf` or `openrefine`)
    ) -> dict: # Renaming rules for NetCDF and OpenRefine.
    "Get renaming rules for NetCDF and OpenRefine."
    vars = cdl_cfg()['vars']

    if encoding_type not in ('netcdf', 'openrefine'):
        raise ValueError("Invalid encoding_type provided. Please use 'netcdf' or 'openrefine'.")

    shared = get_common_rules(vars, encoding_type)
    per_type = get_specific_rules(vars, encoding_type)

    # Each sample type gets its own copy of the shared rules, extended (or overridden)
    # by the rules specific to that type; `seawater` has no specific rules.
    return {sample_type: {**shared, **per_type.get(sample_type, {})}
            for sample_type in ('seawater', 'biota', 'sediment')}

In [None]:
#| exports
class SelectAndRenameColumnCB(Callback):
    "Select and rename columns in a DataFrame based on renaming rules for a specified encoding type."
    def __init__(self, 
                 fn_renaming_rules: Callable, # Function called with the encoding type; returns renaming rules keyed by group
                 encoding_type: str='netcdf', # The encoding type (`netcdf` or `openrefine`) to determine which renaming rules to use
                 verbose: bool=False # Whether to print out renaming rules that were not applied
                 ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Apply column selection and renaming to DataFrames in the transformer, and identify unused rules."
        try:
            renaming_rules = self.fn_renaming_rules(self.encoding_type)
        except ValueError as e:
            # Best effort: report the problem and leave the DataFrames untouched.
            print(f"Error fetching renaming rules: {e}")
            return

        for group in tfm.dfs.keys():
            # Get relevant renaming rules for the current group
            group_rules = self._get_group_rules(renaming_rules, group)

            # Groups without any rule are left as they are.
            if not group_rules:
                continue

            # Apply renaming rules and track keys not found in the DataFrame
            df, not_found_keys = self._apply_renaming(tfm.dfs[group], group_rules)
            tfm.dfs[group] = df

            # Print any renaming rules that were not used
            if not_found_keys and self.verbose:
                print(f"\nGroup '{group}' has the following renaming rules not applied:")
                for old_col in not_found_keys:
                    print(f"Key '{old_col}' from renaming rules was not found in the DataFrame.")

    def _get_group_rules(self, 
                         renaming_rules: OrderedDict, # Renaming rules
                         group: str # Group name to filter rules
                         ) -> OrderedDict: # Renaming rules applicable to the specified group
        "Retrieve and merge renaming rules for the specified group based on the encoding type."
        merged_rules = OrderedDict()
        for key, rules in renaming_rules.items():
            # `in` is a membership test: a substring match when `key` is a string.
            if group in key:
                merged_rules.update(rules)
        return merged_rules

    def _apply_renaming(self, 
                        df: pd.DataFrame, # DataFrame to modify
                        rename_rules: OrderedDict # Renaming rules
                        ) -> tuple: # (Renamed and filtered df, Column names from renaming rules that were not found in the DataFrame)
        """
        Select columns based on renaming rules and apply renaming, only for existing columns
        while maintaining the order of the dictionary columns."""
        existing_columns = set(df.columns)
        valid_rules = OrderedDict((old_col, new_col) for old_col, new_col in rename_rules.items()
                                  if old_col in existing_columns)

        # Source columns first (in rule order), then any column already carrying a target name.
        columns_to_keep = list(valid_rules.keys())
        columns_to_keep += [new_col for new_col in valid_rules.values() if new_col in existing_columns]
        # `OrderedDict.fromkeys` removes duplicates while preserving first-seen order.
        selected = list(OrderedDict.fromkeys(columns_to_keep))

        # Chain select + rename so a new DataFrame is produced instead of
        # renaming a column subset in place.
        df = df[selected].rename(columns=valid_rules)

        # Keep rule order so the verbose report is deterministic from run to run.
        not_found_keys = [old_col for old_col in rename_rules if old_col not in existing_columns]
        return df, not_found_keys


In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),
                            LowerStripNameCB(col_src='NUCLIDE'),
                            RemapNuclideNameCB(lut_nuclides),
                            AddNuclideIdColumnCB(col_value='NUCLIDE'),
                            ParseTimeCB(),
                            EncodeTimeCB(cfg()),
                            SanitizeValue(coi_val),       
                            NormalizeUncCB(),
                             RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),
                            RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),
                            RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),
                            RemapTaxonInformationCB(lut_taxon),
                            RemapSedimentCB(lut_sediments),
                            RemapUnitCB(),
                            RemapDetectionLimitCB(coi_dl, lut_dl),
                            RemapFiltCB(lut_filtered),
                            AddSampleLabCodeCB(),
                            AddMeasurementNoteCB(lut_method),
                            RemapStationIdCB(),
                            RemapSedSliceTopBottomCB(),
                            LookupDryWetRatio(),
                            ParseCoordinates(ddmm_to_dd),
                            SanitizeLonLatCB(),
                            CompareDfsAndTfmCB(dfs),
                            SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),
                            ])

tfm()
for grp in tfm.dfs.keys():
    print(f'{grp} columns:')
    print(tfm.dfs[grp].columns)

seawater columns:
Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',
       'smp_depth', 'tot_depth', '_sal', '_temp'],
      dtype='object')
sediment columns:
Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',
       'tot_depth', 'sed_type', 'top', 'bottom'],
      dtype='object')
biota columns:
Index(['key', 'lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',
       'smp_depth', 'species', 'body_part', 'bio_group'],
      dtype='object')


In [None]:
#| eval: false
result = tfm.dfs['sediment']; result.head()

Unnamed: 0,key,lat,lon,time,nuclide,_dl,_unit,value,_unc,tot_depth,sed_type,top,bottom
0,SKRIL2012048,59.6667,24.0,1339891200,ra226,1,4,35.0,9.1,71.0,0,15.0,20.0
1,SKRIL2012049,59.6667,24.0,1339891200,ra226,1,4,36.0,7.92,71.0,0,20.0,27.0
2,SKRIL2012050,59.86,28.8433,1344556800,ra226,1,4,38.0,9.12,23.0,0,0.0,2.0
3,SKRIL2012051,59.86,28.8433,1344556800,ra226,1,4,36.0,9.0,23.0,0,2.0,4.0
4,SKRIL2012052,59.86,28.8433,1344556800,ra226,1,4,30.0,6.9,23.0,0,4.0,6.0


## Reshape: long to wide

Convert data from long to wide and rename columns to comply with NetCDF format.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),
                            LowerStripNameCB(col_src='NUCLIDE'),
                            RemapNuclideNameCB(lut_nuclides),
                            AddNuclideIdColumnCB(col_value='NUCLIDE'),
                            ParseTimeCB(),
                            EncodeTimeCB(cfg()),
                            SanitizeValue(coi_val),       
                            NormalizeUncCB(),
                            RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),
                            RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),
                            RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),
                            RemapTaxonInformationCB(lut_taxon),
                            RemapSedimentCB(lut_sediments),
                            RemapUnitCB(),
                            RemapDetectionLimitCB(coi_dl, lut_dl),
                            RemapFiltCB(lut_filtered),
                            AddSampleLabCodeCB(),
                            AddMeasurementNoteCB(lut_method),
                            RemapStationIdCB(),
                            RemapSedSliceTopBottomCB(),
                            LookupDryWetRatio(),
                            ParseCoordinates(ddmm_to_dd),
                            SanitizeLonLatCB(),
                            SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),
                            ReshapeLongToWide()
                            ])

tfm()
for grp in tfm.dfs.keys():
    print(f'{grp} columns:')
    print(tfm.dfs[grp].columns)

ValueError: Must produce aggregated value

## NetCDF encoder

### Example change logs

In [None]:
#| eval: false
dfs = load_data(fname_in)

tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),
                            LowerStripNameCB(col_src='NUCLIDE'),
                            RemapNuclideNameCB(lut_nuclides),
                            AddNuclideIdColumnCB(col_value='NUCLIDE'),
                            ParseTimeCB(),
                            EncodeTimeCB(cfg()),
                            SanitizeValue(coi_val),       
                            NormalizeUncCB(),
                            RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),
                            RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),
                            RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),
                            RemapTaxonInformationCB(lut_taxon),
                            RemapSedimentCB(lut_sediments),
                            RemapUnitCB(),
                            RemapDetectionLimitCB(coi_dl, lut_dl),
                            RemapFiltCB(lut_filtered),
                            AddSampleLabCodeCB(),
                            AddMeasurementNoteCB(lut_method),
                            RemapStationIdCB(),
                            RemapSedSliceTopBottomCB(),
                            LookupDryWetRatio(),
                            ParseCoordinates(ddmm_to_dd),
                            SanitizeLonLatCB(),
                            SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),
                            ReshapeLongToWide()
                            ])

tfm()
tfm.logs

["Convert values from 'NUCLIDE' to lowercase, strip spaces, and store in 'None'.",
 'Parse and standardize time information in the dataframe.',
 'Encode time as `int` representing seconds since xxx',
 'Sanitize value/measurement by removing blank entries and populating `value` column.',
 'Convert from relative error % to uncertainty of activity unit.',
 "Remap values from 'RUBIN' to 'species' for groups: b, i, o, t, a.",
 "Remap values from 'TISSUE' to 'body_part' for groups: b, i, o, t, a.",
 "Remap values from 'species' to 'bio_group' for groups: b, i, o, t, a.",
 'Update taxon information based on MARIS species LUT.',
 'Update sediment id based on MARIS species LUT (dbo_sedtype.xlsx).',
 'Set the `unit` id column in the DataFrames based on a lookup table.',
 'Remap value type to MARIS format.',
 'Lookup FILT value in dataframe using the lookup table.',
 'Remap `KEY` column to `samplabcode` in each DataFrame.',
 "Record measurement notes by adding a 'measurenote' column to DataFrames

### Feed global attributes

In [None]:
#| export
# Default keywords for `get_attrs`, which joins them with ', ' into the `keywords`
# global attribute. One keyword path per entry.
kw = ['oceanography',
      'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers',
      'Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry',
      'Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| exports
def get_attrs(
    tfm: Transformer, # Transformer object
    zotero_key: str, # Zotero dataset record key
    kw: list = kw # List of keywords
    ) -> dict: # Global attributes
    "Retrieve all global attributes."
    dfs = tfm.dfs
    # Callbacks are listed in the order they are handed to the feeder.
    feeders = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(cfg()),
        ZoteroCB(zotero_key, cfg=cfg()),
        # Keywords and transformer logs are each flattened into one comma-separated string.
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    return GlobAttrsFeeder(dfs, cbs=feeders)()

In [None]:
#| eval: false
get_attrs(tfm, zotero_key=zotero_key, kw=kw)

{'geospatial_lat_min': '31.17',
 'geospatial_lat_max': '65.75',
 'geospatial_lon_min': '9.6333',
 'geospatial_lon_max': '53.5',
 'geospatial_bounds': 'POLYGON ((9.6333 53.5, 31.17 53.5, 31.17 65.75, 9.6333 65.75, 9.6333 53.5))',
 'time_coverage_start': '1984-01-10T00:00:00',
 'time_coverage_end': '2021-12-15T00:00:00',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annua

In [None]:
#| exports
def enums_xtra(
    tfm: Transformer, # Transformer object
    vars: list # List of variables to extract from the transformer
    ):
    "Retrieve a subset of the lengthy enum as `species_t` for instance."
    enums = Enums(lut_src_dir=lut_path(), cdl_enums=cdl_cfg()['enums'])
    subset = {}
    for name in vars:
        present = tfm.unique(name)
        # NOTE(review): `.any()` is a truthiness test, so a result holding only
        # zeros is skipped as well as an empty one -- confirm this is intended.
        if not present.any():
            continue
        # The enum type is named after the variable, with a `_t` suffix.
        enum_name = f'{name}_t'
        subset[enum_name] = enums.filter(enum_name, present)
    return subset

### <a name="encoding-netcdf"></a>Encoding NetCDF

In [None]:
#| exports
def encode(
    fname_in: str, # Input file name
    fname_out_nc: str, # Output file name
    nc_tpl_path: str, # NetCDF template file name
    **kwargs # Additional arguments (only `verbose` is read, default `False`)
    ) -> None:
    "Encode data to NetCDF."
    dfs = load_data(fname_in)

    # Callbacks run in list order; column selection/renaming and reshaping come last.
    # NOTE(review): the only definition of `RemapStationIdCB` visible in this notebook
    # is wrapped in a string literal -- confirm the name is still in scope here.
    pipeline = [
        AddSampleTypeIdColumnCB(),
        LowerStripNameCB(col_src='NUCLIDE'),
        RemapNuclideNameCB(lut_nuclides),
        AddNuclideIdColumnCB(col_value='NUCLIDE'),
        ParseTimeCB(),
        EncodeTimeCB(cfg()),
        SanitizeValue(coi_val),
        NormalizeUncCB(),
        RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),
        RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),
        RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),
        RemapTaxonInformationCB(lut_taxon),
        RemapSedimentCB(lut_sediments),
        RemapUnitCB(),
        RemapDetectionLimitCB(coi_dl, lut_dl),
        RemapFiltCB(lut_filtered),
        AddSampleLabCodeCB(),
        AddMeasurementNoteCB(lut_method),
        RemapStationIdCB(),
        RemapSedSliceTopBottomCB(),
        LookupDryWetRatio(),
        ParseCoordinates(ddmm_to_dd),
        SanitizeLonLatCB(),
        SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),
        ReshapeLongToWide(),
    ]
    tfm = Transformer(dfs, cbs=pipeline)
    tfm()

    # `zotero_key` and `kw` are module-level names, not parameters of this function.
    global_attrs = get_attrs(tfm, zotero_key=zotero_key, kw=kw)
    verbose = kwargs.get('verbose', False)
    xtras = enums_xtra(tfm, vars=['species', 'body_part'])

    NetCDFEncoder(tfm.dfs,
                  src_fname=nc_tpl_path,
                  dest_fname=fname_out_nc,
                  global_attrs=global_attrs,
                  verbose=verbose,
                  enums_xtra=xtras
                  ).encode()

In [None]:
#| eval: false
encode(fname_in, fname_out_nc, nc_tpl_path(), verbose=False)

## Open Refine Pipeline (WIP)

### Rename columns for Open Refine

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    AddSampleTypeIdColumnCB(),
    LowerStripNameCB(col_src='NUCLIDE'),
    RemapNuclideNameCB(lut_nuclides),
    AddNuclideIdColumnCB(col_value='NUCLIDE'),
    ParseTimeCB(),
    EncodeTimeCB(cfg()),        
    SanitizeValue(coi_val),                       
    NormalizeUncCB(),
    RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),
    RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),
    RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),
    RemapTaxonInformationCB(lut_taxon),
    RemapSedimentCB(lut_sediments),
    RemapUnitCB(),
    RemapDetectionLimitCB(coi_dl, lut_dl),
    RemapFiltCB(lut_filtered),
    AddSampleLabCodeCB(),
    AddMeasurementNoteCB(lut_method),
    RemapStationIdCB(),
    RemapSedSliceTopBottomCB(),
    LookupDryWetRatio(),
    ParseCoordinates(ddmm_to_dd),
    SanitizeLonLatCB(),
    SelectAndRenameColumnCB(get_renaming_rules, encoding_type='openrefine', verbose=True),
    CompareDfsAndTfmCB(dfs)
    ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')


Group 'seawater' has the following renaming rules not applied:
Key 'measurenote' from renaming rules was not found in the DataFrame.

Group 'sediment' has the following renaming rules not applied:
Key 'SDEPTH' from renaming rules was not found in the DataFrame.
Key 'measurenote' from renaming rules was not found in the DataFrame.
Key 'TTEMP' from renaming rules was not found in the DataFrame.
Key 'FILT' from renaming rules was not found in the DataFrame.
Key 'SALIN' from renaming rules was not found in the DataFrame.

Group 'biota' has the following renaming rules not applied:
Key 'TDEPTH' from renaming rules was not found in the DataFrame.
Key 'measurenote' from renaming rules was not found in the DataFrame.
Key 'TTEMP' from renaming rules was not found in the DataFrame.
Key 'FILT' from renaming rules was not found in the DataFrame.
Key 'SALIN' from renaming rules was not found in the DataFrame.
                                                    seawater  sediment  biota
Number of r

**Example of data included in dfs_dropped.**

Main reasons for data to be dropped from dfs:
- No activity value reported (e.g. VALUE_Bq/kg)
- No time value reported. 

In [None]:
#| eval: false
# Group whose dropped rows are displayed; set to 'seawater' or 'biota'
# to inspect the other groups.
grp = 'sediment'

tfm.dfs_dropped[grp]

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,ERROR%_kg,< VALUE_Bq/m²,VALUE_Bq/m²,ERROR%_m²,DATE_OF_ENTRY_x,...,LOWSLI,AREA,SEDI,OXIC,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,SUM_LINK,DATE_OF_ENTRY_y
11784,SLREB1998021,SR90,2,,,,,,,,...,12.0,0.02100,55.0,O,,,14.0,14.0,a,
11824,SLVDC1997023,CS137,1,,,,,,,,...,14.0,0.02100,55.0,O,,,9.0,9.0,a,
11832,SLVDC1997031,CS137,1,,,,,,,,...,14.0,0.02100,55.0,O,,,9.0,9.0,a,
11841,SLVDC1997040,CS137,1,,,,,,,,...,16.0,0.02100,55.0,O,,,9.0,9.0,a,
11849,SLVDC1998011,CS137,1,,,,,,,,...,16.0,0.02100,55.0,O,,,14.0,14.0,a,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,SSSSM2021030,CO60,SSSM43,<,,,<,,,09/06/22 00:00:00,...,2.0,0.01608,,,28.200000,15.0,12.0,12.0,,09/06/22 00:00:00
39774,SSSSM2021030,RA226,SSSM43,<,,,<,,,09/06/22 00:00:00,...,2.0,0.01608,,,28.200000,15.0,12.0,12.0,,09/06/22 00:00:00
39775,SSSSM2021030,RA223,SSSM43,<,,,<,,,09/06/22 00:00:00,...,2.0,0.01608,,,28.200000,15.0,12.0,12.0,,09/06/22 00:00:00
39777,SSSSM2021031,CS137,SSSM43,<,,,<,0.0,,09/06/22 00:00:00,...,2.0,0.01608,,,31.993243,,13.0,13.0,,09/06/22 00:00:00


## Open Refine encoder (WIP)

In [None]:
#| eval: false
def encode_or(
    fname_in: str, # Input file name
    fname_out_csv: str, # Output file name
    ref_id: str, # Reference ID as defined in MARIS master DB
    **kwargs # Additional arguments; only `verbose` (bool, default `True`) is used, others are ignored
    ) -> None:
    """Encode data to Open Refine CSV.

    Loads the data with `load_data`, runs the callbacks listed below through a
    `Transformer`, then passes the resulting `tfm.dfs` to `OpenRefineCsvEncoder`
    and calls its `encode()` method. Nothing is returned.
    """
    # Honour the caller's `verbose` flag rather than hard-coding it. Defaulting to
    # True keeps the previous behaviour when the flag is not supplied.
    verbose = kwargs.get('verbose', True)

    dfs = load_data(fname_in)
    tfm = Transformer(dfs, cbs=[
        AddSampleTypeIdColumnCB(),
        LowerStripNameCB(col_src='NUCLIDE'),
        RemapNuclideNameCB(lut_nuclides),
        AddNuclideIdColumnCB(col_value='NUCLIDE'),
        ParseTimeCB(),
        EncodeTimeCB(cfg()),
        SanitizeValue(coi_val),
        NormalizeUncCB(),
        RemapCB(fn_lut=lut_biota, col_remap='species', col_src='RUBIN', dest_grps='biota'),
        RemapCB(lut_tissues, 'body_part', 'TISSUE', 'biota'),
        RemapCB(lut_biogroup, 'bio_group', 'species', 'biota'),
        RemapTaxonInformationCB(lut_taxon),
        RemapSedimentCB(lut_sediments),
        RemapUnitCB(),
        RemapDetectionLimitCB(coi_dl, lut_dl),
        RemapFiltCB(lut_filtered),
        AddSampleLabCodeCB(),
        AddMeasurementNoteCB(lut_method),
        RemapStationIdCB(),
        RemapSedSliceTopBottomCB(),
        LookupDryWetRatio(),
        ParseCoordinates(ddmm_to_dd),
        SanitizeLonLatCB(),
        # Column selection/renaming uses the 'openrefine' rule set (not the NetCDF one).
        SelectAndRenameColumnCB(get_renaming_rules, encoding_type='openrefine', verbose=verbose),
        CompareDfsAndTfmCB(dfs)
        ])

    tfm()

    encoder = OpenRefineCsvEncoder(tfm.dfs,
                                   dest_fname=fname_out_csv,
                                   ref_id=ref_id,
                                   verbose=verbose
                                   )
    encoder.encode()

In [None]:
#| eval: false
# Run the Open Refine encoding with the input file, output CSV and reference ID
# configured earlier in the notebook.
encode_or(
    fname_in=fname_in,
    fname_out_csv=fname_out_csv,
    ref_id=ref_id,
    verbose=True,
)

### Open Refine variables not included in HELCOM

| Field name      | Full name                | HELCOM     |
|-----------------|--------------------------|------------|
| sampquality     | Sample quality           | N          |
| lab_id          | Laboratory ID            | N          |
| profile_id      | Profile ID               | N          |
| transect_id     | Transect ID              | N          |
| endperiod       | End period               | N          |
| vartype         | Variable type            | N          |
| freq            | Frequency                | N          |
| rl_detection    | Range low detection      | N          |
| rangelow        | Range low                | N          |
| rangeupp        | Range upper              | N          |
| Commonname      | Common name              | N          |
| volume          | Volume                   | N          |
| filtpore        | Filter pore              | N          |
| acid            | Acidified                | N          |
| oxygen          | Oxygen                   | N          |
| samparea        | Sample area              | N          |
| drywt           | Dry weight               | N          |
| wetwt           | Wet weight               | N          |
| sampmet_id      | Sampling method ID       | N          |
| drymet_id       | Drying method ID         | N          |
| prepmet_id      | Preparation method ID    | N          |
| counmet_id      | Counting method ID       | N          |
| refnote         | Reference note           | N          |
| sampnote        | Sample note              | N          |
| gfe             | Good for export          | ?          |

**TODO**:

- Should we use a single encoder for both NetCDF and OpenRefine? If so, should we have a single encode function that accepts an `encoding_type` argument?

TODO: Include FILT for NetCDF

TODO: Check sediment `DW%` values that are less than 1%. Is this realistic? Also check the `DW%` values that are exactly 0%. Run the cells below on data that has not yet been processed by `SelectAndRenameColumnCB`.

In [None]:
#| eval: false
# Reload the data and apply only the NUCLIDE name callback, so that the original
# 'DW%' column is still present for the checks in the following cells.
# NOTE(review): `encode_or` in this notebook uses `LowerStripNameCB`; confirm that
# `LowerStripRdnNameCB` is still available or switch to the current name.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(col_src='NUCLIDE')])
tfm()

{'seawater':                 KEY NUCLIDE METHOD < VALUE_Bq/m³  VALUE_Bq/m³  ERROR%_m³  \
 0      WKRIL2012003   cs137    NaN           NaN          5.3  32.000000   
 1      WKRIL2012004   cs137    NaN           NaN         19.9  20.000000   
 2      WKRIL2012005   cs137    NaN           NaN         25.5  20.000000   
 3      WKRIL2012006   cs137    NaN           NaN         17.0  29.000000   
 4      WKRIL2012007   cs137    NaN           NaN         22.2  18.000000   
 ...             ...     ...    ...           ...          ...        ...   
 21211  WSSSM2021005      h3  SSM45           NaN       1030.0  93.203883   
 21212  WSSSM2021006      h3  SSM45           NaN       2240.0  43.303571   
 21213  WSSSM2021007      h3  SSM45           NaN       2060.0  47.087379   
 21214  WSSSM2021008      h3  SSM45           NaN       2300.0  43.478261   
 21215  WSSSM2021004      h3  SSM45             <          NaN        NaN   
 
          DATE_OF_ENTRY_x  COUNTRY LABORATORY   SEQUENCE  ... 

In [None]:
#| eval: false
grp = 'sediment'
# Keep the rows whose 'DW%' lies strictly between 0.001 and 1.
check_data_sediment = tfm.dfs[grp].loc[
    lambda df: (df['DW%'] > 0.001) & (df['DW%'] < 1)
]
check_data_sediment

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,ERROR%_kg,< VALUE_Bq/m²,VALUE_Bq/m²,ERROR%_m²,DATE_OF_ENTRY_x,...,LOWSLI,AREA,SEDI,OXIC,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,SUM_LINK,DATE_OF_ENTRY_y
30938,SLVEA2010001,cs137,LVEA01,,334.25,1.57,,131.886,41179.0,,...,2.0,0.0151,5.0,O,0.115,0.9,14.0,14.0,,11/11/11 00:00:00
30939,SLVEA2010002,cs137,LVEA01,,343.58,1.49,,132.092,41179.0,,...,4.0,0.0151,5.0,A,0.159,0.8,14.0,14.0,,11/11/11 00:00:00
30940,SLVEA2010003,cs137,LVEA01,,334.69,1.56,,134.39,41179.0,,...,6.0,0.0151,5.0,A,0.189,0.8,14.0,14.0,,11/11/11 00:00:00
30941,SLVEA2010004,cs137,LVEA01,,348.5,1.56,,136.699,41179.0,,...,8.0,0.0151,5.0,A,0.194,0.8,14.0,14.0,,11/11/11 00:00:00
30942,SLVEA2010005,cs137,LVEA01,,258.67,1.73,,104.894,41179.0,,...,10.0,0.0151,5.0,A,0.195,0.8,14.0,14.0,,11/11/11 00:00:00
30943,SLVEA2010006,cs137,LVEA01,,182.02,2.05,,77.523,41179.0,,...,12.0,0.0151,5.0,A,0.221,0.8,14.0,14.0,,11/11/11 00:00:00
30944,SLVEA2010007,cs137,LVEA01,,116.34,2.79,,46.946,41179.0,,...,14.0,0.0151,5.0,A,0.238,0.8,14.0,14.0,,11/11/11 00:00:00
30945,SLVEA2010008,cs137,LVEA01,,94.07,2.61,,38.162,41179.0,,...,16.0,0.0151,5.0,A,0.234,0.8,14.0,14.0,,11/11/11 00:00:00
30946,SLVEA2010009,cs137,LVEA01,,69.7,3.12,,27.444,41179.0,,...,18.0,0.0151,5.0,A,0.242,0.8,14.0,14.0,,11/11/11 00:00:00
30947,SLVEA2010010,cs137,LVEA01,,59.63,3.4,,24.22,41179.0,,...,20.0,0.0151,5.0,A,0.257,0.7,14.0,14.0,,11/11/11 00:00:00


In [None]:
#| eval: false
grp = 'sediment'
# Keep the rows whose 'DW%' is exactly 0.
check_data_sediment = tfm.dfs[grp].loc[lambda df: df['DW%'] == 0]
check_data_sediment

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,ERROR%_kg,< VALUE_Bq/m²,VALUE_Bq/m²,ERROR%_m²,DATE_OF_ENTRY_x,...,LOWSLI,AREA,SEDI,OXIC,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,SUM_LINK,DATE_OF_ENTRY_y
9824,SERPC1997001,cs134,,,3.80,20.0,,5.75,,,...,2.0,0.008,5.0,A,0.0,0.0,11.0,11.0,a,
9825,SERPC1997001,cs137,,,389.00,4.0,,589.00,,,...,2.0,0.008,5.0,A,0.0,0.0,11.0,11.0,a,
9826,SERPC1997002,cs134,,,4.78,13.0,,12.00,,,...,4.0,0.008,5.0,A,0.0,0.0,11.0,11.0,a,
9827,SERPC1997002,cs137,,,420.00,4.0,,1060.00,,,...,4.0,0.008,5.0,A,0.0,0.0,11.0,11.0,a,
9828,SERPC1997003,cs134,,,3.12,17.0,,12.00,,,...,6.0,0.008,5.0,A,0.0,0.0,11.0,11.0,a,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15257,SKRIL1999062,th228,1,,68.00,,,,,,...,15.0,0.006,0.0,O,0.0,0.0,11.0,11.0,a,
15258,SKRIL1999063,k40,1,,1210.00,,,,,,...,21.5,0.006,0.0,O,0.0,0.0,11.0,11.0,a,
15259,SKRIL1999063,ra226,KRIL01,,56.50,,,,,,...,21.5,0.006,0.0,O,0.0,0.0,11.0,11.0,a,
15260,SKRIL1999063,ra228,KRIL01,,72.20,,,,,,...,21.5,0.006,0.0,O,0.0,0.0,11.0,11.0,a,


In [None]:
#| eval: false
grp = 'biota'
# Keep the biota rows whose 'DW%' is exactly 0. The result gets a biota-specific
# name so it neither masquerades as sediment data nor overwrites
# `check_data_sediment`.
check_data_biota = tfm.dfs[grp][tfm.dfs[grp]['DW%'] == 0]
check_data_biota

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,BASIS,ERROR%,NUMBER,DATE_OF_ENTRY_x,COUNTRY,...,BIOTATYPE,TISSUE,NO,LENGTH,WEIGHT,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
5971,BERPC1997002,k40,,,116.0,W,3.0,,,91.0,...,F,5,0.0,0.0,0.0,0.0,0.0,11.0,11,
5972,BERPC1997002,cs137,,,12.6,W,4.0,,,91.0,...,F,5,0.0,0.0,0.0,0.0,0.0,11.0,11,
5973,BERPC1997002,cs134,,,0.14,W,18.0,,,91.0,...,F,5,0.0,0.0,0.0,0.0,0.0,11.0,11,
5974,BERPC1997001,k40,,,116.0,W,4.0,,,91.0,...,F,5,0.0,0.0,0.0,0.0,0.0,11.0,11,
5975,BERPC1997001,cs137,,,12.0,W,4.0,,,91.0,...,F,5,0.0,0.0,0.0,0.0,0.0,11.0,11,
5976,BERPC1997001,cs134,,,0.21,W,24.0,,,91.0,...,F,5,0.0,0.0,0.0,0.0,0.0,11.0,11,
