In [None]:
#| default_exp handlers.ospar

# OSPAR 

> This data pipeline, known as a "handler" in Marisco terminology, is designed to clean, standardize, and encode [OSPAR data](https://odims.ospar.org/en/) into `NetCDF` format. The handler processes raw OSPAR data, applying various transformations and lookups to align it with `MARIS` data standards.

Key functions of this handler:

- **Cleans** and **normalizes** raw OSPAR data
- **Applies standardized nomenclature** and units
- **Encodes the processed data** into `NetCDF` format compatible with MARIS requirements

This handler is a crucial component in the Marisco data processing workflow, ensuring OSPAR data is properly integrated into the MARIS database.

:::{.callout-tip}

For new MARIS users, please refer to [Understanding MARIS Data Formats (NetCDF and Open Refine)](https://github.com/franckalbinet/marisco/tree/main/install_configure_guide) for detailed information.

:::

The present notebook aims to be an instance of [Literate Programming](https://www.wikiwand.com/en/articles/Literate_programming) in the sense that it is a narrative that includes code snippets interspersed with explanations. When a function or a class needs to be exported to a dedicated Python module (in our case `marisco/handlers/ospar.py`), the code snippet is added to the module using `#| exports` as provided by the wonderful [nbdev](https://nbdev.readthedocs.io/en/latest/) library.

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import re
from io import StringIO
from pathlib import Path
from typing import List, Dict, Callable, Tuple, Any
#from functools import partial
#from dataclasses import asdict
#from collections import OrderedDict, defaultdict
#from functools import partial

import numpy as np
import pandas as pd
import fastcore.all as fc
from fastcore.basics import patch, store_attr
from owslib.wfs import WebFeatureService

from marisco.utils import (
    Remapper, 
    ddmm_to_dd,
    Match, 
    get_unique_across_dfs,
    NA,
    nc_to_dfs,
    get_netcdf_properties, 
    get_netcdf_group_properties,
    get_netcdf_variable_properties
)

from marisco.callbacks import (
    Callback, 
    Transformer, 
    EncodeTimeCB, 
    AddSampleTypeIdColumnCB,
    AddNuclideIdColumnCB, 
    LowerStripNameCB, 
    SanitizeLonLatCB, 
    CompareDfsAndTfmCB, 
    RemapCB
)

from marisco.metadata import (
    GlobAttrsFeeder, 
    BboxCB, 
    DepthRangeCB, 
    TimeRangeCB, 
    ZoteroCB, 
    KeyValuePairCB
)

from marisco.configs import (
    nuc_lut_path, 
    nc_tpl_path, 
    cfg, 
    species_lut_path, 
    sediments_lut_path, 
    bodyparts_lut_path, 
    detection_limit_lut_path, 
    filtered_lut_path, 
    get_lut, 
    unit_lut_path,
    prepmet_lut_path,
    sampmet_lut_path,
    counmet_lut_path, 
    lab_lut_path,
    NC_VARS
)

from marisco.encoders import (
    NetCDFEncoder, 
)

from marisco.handlers.data_format_transformation import (
    decode, 
)

import warnings
warnings.filterwarnings('ignore')

In [None]:
#| hide
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)  # Show full column width

## Configuration and File Paths

The handler requires several configuration parameters:

1. **fname_in**: Path to the OSPAR CSV data folder (relative paths supported)
2. **fname_out_nc**: Output path and filename for NetCDF file (relative paths supported) 
3. **zotero_key**: Key for retrieving dataset attributes from [Zotero](https://www.zotero.org/)
4. **ref_id**: Reference ID in the MARIS [Zotero library](https://www.zotero.org/groups/2432820/maris/library)

In [None]:
# | exports
# Folder holding the CSV files exported from the OSPAR Access database (read by `load_data`).
fname_in = '../../_data/accdb/ospar/20241021/csv'
# Destination of the encoded NetCDF file.
fname_out_nc = '../../_data/output/191-OSPAR-2024.nc'
zotero_key ='LQRA4MMK' # OSPAR MORS zotero key
ref_id = 191 # OSPAR reference id as defined by MARIS

In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests
from owslib.wfs import WebFeatureService

## Load data

OSPAR data is provided in many different formats at https://odims.ospar.org/en/submissions/. Data is available for each year from 1995 onwards. Each year can include multiple versions, where a subsequent version supersedes all past versions. Below we identify the relevant OSPAR feature layers exposed by the Web Feature Service (WFS), keep only the latest version of each, and load their data into dataframes.

:::{.callout-tip}

**Feedback to Data Provider:** Please note that we are assuming that a new version supersedes all previous versions. If this assumption is incorrect, please inform us so we can adjust our data handling processes accordingly.

:::

In [None]:
#| export
class WFSProcessor:
    "Processor for Web Feature Service operations, managing feature filtering and data fetching."

    def __init__(self, url, search_params=None, version='2.0.0'):
        "Store the endpoint settings, connect to the WFS and create empty result containers."
        # Keep `url`, `search_params` and `version` as instance attributes.
        fc.store_attr('url,search_params,version')
        self.wfs = WebFeatureService(url=self.url, version=self.version)
        # Both dictionaries are keyed by the keys of `search_params`.
        self.features_df, self.dfs = {}, {}

    def __call__(self):
        "Run every processing step in order and return the fetched dataframes."
        pipeline = (
            self.filter_features,
            self.check_feature_pattern,
            self.extract_version_from_feature_name,
            self.filter_latest_versions,
            self.fetch_and_combine_csv,
        )
        for step in pipeline:
            step()

        return self.dfs

In [None]:
#| exports
@patch
def filter_features(self: WFSProcessor):
    "Filter features based on search parameters, updating the internal state."
    # Names of every feature type advertised by the WFS endpoint.
    available_feature_types = list(self.wfs.contents.keys())
    # `search_params` defaults to None in the constructor: treat it as "nothing to search".
    search_params = self.search_params or {}
    # Building the frame from a dict guarantees a `feature` column even when nothing matches,
    # which the following processing steps index unconditionally.
    self.features_df = {
        key: pd.DataFrame({'feature': [ftype for ftype in available_feature_types if value in ftype]})
        for key, value in search_params.items()
    }


In [None]:
#| exports
@patch
def check_feature_pattern(self: WFSProcessor):
    "Check and retain features conforming to a specific pattern, updating the internal state."
    pattern = re.compile(r'^odims:ospar_(biota|seawater)_(\d{4})_(\d{2})_(\d{3})$')
    for feature_type, df in list(self.features_df.items()):
        if df.empty:
            # No candidate at all: keep an empty frame that still exposes the `feature` column.
            self.features_df[feature_type] = pd.DataFrame({'feature': pd.Series(dtype=object)})
            continue
        conforms = df['feature'].map(lambda name: pattern.match(name) is not None).astype(bool)
        # Copy so that columns added by later steps never write into a slice of the original frame.
        self.features_df[feature_type] = df[conforms].copy()


In [None]:
#| exports
@patch
def extract_version_from_feature_name(self: WFSProcessor):
    "Extract version from feature name, updating the internal state."
    # The underscore-separated parts of a feature name, in order of appearance.
    fields = ('source', 'type', 'year', 'month', 'version')
    for feature_type, df in self.features_df.items():
        # Split each name once instead of once per extracted field.
        parts = df['feature'].str.split('_')
        # `assign` returns a new frame, so no slice of an earlier frame is mutated.
        self.features_df[feature_type] = df.assign(
            **{field: parts.str[i] for i, field in enumerate(fields)}
        )

In [None]:
#| exports
@patch
def filter_latest_versions(self: WFSProcessor):
    "Filter each DataFrame to include only the latest version of each feature, updating the internal state."
    numeric_cols = ['year', 'month', 'version']
    for feature_type, df in self.features_df.items():
        # Cast on a new frame: the version must be compared numerically, not as text.
        df = df.astype({col: int for col in numeric_cols})
        # Row label of the highest version within each (source, type, year, month) group.
        idx = df.groupby(['source', 'type', 'year', 'month'])['version'].idxmax()
        self.features_df[feature_type] = df.loc[idx]

In [None]:
#| exports
@patch
def fetch_and_combine_csv(self: WFSProcessor):
    "Fetch CSV data for each feature from the WFS and combine into a single DataFrame for each feature type."
    for feature_type, df in self.features_df.items():
        # Collect the per-feature frames and concatenate once (avoids quadratic re-copying).
        frames = []
        for feature in df['feature']:
            try:
                response = self.wfs.getfeature(typename=feature, outputFormat='csv')
                csv_data = StringIO(response.read().decode('utf-8'))
                frames.append(pd.read_csv(csv_data))
            except Exception as e:
                # Best effort: a failing layer is reported and skipped, the others are still fetched.
                print(f"Failed to fetch data for {feature}: {e}")
        self.dfs[feature_type] = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
#|eval: false
# Keys name the resulting dataframes; values are the substrings searched in the WFS feature names.
wfs_processor = WFSProcessor(
    url='https://odims.ospar.org/geoserver/odims/wfs',
    search_params={'biota': 'ospar_biota', 'seawater': 'ospar_seawater'},
)
dfs = wfs_processor()

KeyboardInterrupt: 

In [None]:
#|eval: false
# Combined seawater records returned by the WFS processor (key defined in `search_params`).
seawater_df = dfs['seawater']
seawater_df

Unnamed: 0,FID,the_geom,ID,Contractin,RSC_Sub_di,Station_ID,Sample_ID,LatD,LatM,LatS,...,Unit,Data_provi,Measuremen,Sample_Com,Reference,LatDD,LongDD,year,F1,Reference_
0,ospar_seawater_1995_01_003.1,POINT (56.16666666666666 11.78333333333333),45552.0,Denmark,12,HesselÃ¸,H95-22,56,10,0.0,...,Bq/l,RisÃ¸-DTU,,,,56.166667,11.783333,1995.0,,
1,ospar_seawater_1995_01_003.2,POINT (56.16666666666666 11.78333333333333),45553.0,Denmark,12,HesselÃ¸,H95-23,56,10,0.0,...,Bq/l,RisÃ¸-DTU,,,,56.166667,11.783333,1995.0,,
2,ospar_seawater_1995_01_003.3,POINT (56.16666666666666 11.78333333333333),45554.0,Denmark,12,HesselÃ¸,H95-56,56,10,0.0,...,Bq/l,RisÃ¸-DTU,,,,56.166667,11.783333,1995.0,,
3,ospar_seawater_1995_01_003.4,POINT (56.16666666666666 11.78333333333333),45555.0,Denmark,12,HesselÃ¸,H95-57,56,10,0.0,...,Bq/l,RisÃ¸-DTU,,,,56.166667,11.783333,1995.0,,
4,ospar_seawater_1995_01_003.5,POINT (56.11666666666667 11.16666666666667),45556.0,Denmark,12,Kattegat SW,H95-20,56,7,0.0,...,Bq/l,RisÃ¸-DTU,,,,56.116667,11.166667,1995.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19014,ospar_seawater_2022_01_001.707,POINT (54.91633333300007 -0.2801666669999463),,UK,10,13,22-1538,54,54,58.8,...,Bq/l,BEIS,,CEND14/22,,54.916333,-0.280167,,706.0,
19015,ospar_seawater_2022_01_001.708,POINT (53.91250000000008 0.9181666670000368),,UK,10,45,22-1539,53,54,45.0,...,Bq/l,BEIS,,CEND14/22,,53.912500,0.918167,,707.0,
19016,ospar_seawater_2022_01_001.709,POINT (53.93066666700008 1.2753333330000487),,UK,10,8,22-1540,53,55,50.4,...,Bq/l,BEIS,,CEND14/22,,53.930667,1.275333,,708.0,
19017,ospar_seawater_2022_01_001.710,POINT (54.50883333300004 2.716500000000053),,UK,10,16,22-1541,54,30,31.8,...,Bq/l,BEIS,,CEND14/22,,54.508833,2.716500,,709.0,


In [None]:
#|eval: false
# Combined biota records returned by the WFS processor (key defined in `search_params`).
biota_df = dfs['biota']
biota_df

Unnamed: 0,FID,the_geom,ID,Contractin,RSC_Sub_di,Station_ID,Sample_ID,LatD,LatM,LatS,...,Data_provi,Measuremen,Sample_Com,Reference,LatDD,LongDD,year,F1,Sampling_1,Reference_
0,ospar_biota_1995_01_003.1,POINT (55.96666666666667 11.58333333333333),38847.0,Denmark,12,Klint,950089,55,58,0.0,...,RisÃÂ¸-DTU,,,,55.966667,11.583333,1995.0,,,
1,ospar_biota_1995_01_003.2,POINT (55.96666666666667 11.58333333333333),38848.0,Denmark,12,Klint,950229,55,58,0.0,...,RisÃÂ¸-DTU,,,,55.966667,11.583333,1995.0,,,
2,ospar_biota_1995_01_003.3,POINT (55.96666666666667 11.58333333333333),38849.0,Denmark,12,Klint,950360,55,58,0.0,...,RisÃÂ¸-DTU,,,,55.966667,11.583333,1995.0,,,
3,ospar_biota_1995_01_003.4,POINT (55.96666666666667 11.58333333333333),38850.0,Denmark,12,Klint,950359,55,58,0.0,...,RisÃÂ¸-DTU,,,,55.966667,11.583333,1995.0,,,
4,ospar_biota_1995_01_003.5,POINT (55.96666666666667 11.58333333333333),38851.0,Denmark,12,Klint,950489,55,58,0.0,...,RisÃÂ¸-DTU,,,,55.966667,11.583333,1995.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15968,ospar_biota_2022_01_001.707,POINT (54.91633333300007 -0.2801666669999463),,UK,10,13,22-1538,54,54,58.8,...,BEIS,,CEND14/22,,54.916333,-0.280167,,706.0,2022-08-28T00:00:00,
15969,ospar_biota_2022_01_001.708,POINT (53.91250000000008 0.9181666670000368),,UK,10,45,22-1539,53,54,45.0,...,BEIS,,CEND14/22,,53.912500,0.918167,,707.0,2022-08-29T00:00:00,
15970,ospar_biota_2022_01_001.709,POINT (53.93066666700008 1.2753333330000487),,UK,10,8,22-1540,53,55,50.4,...,BEIS,,CEND14/22,,53.930667,1.275333,,708.0,2022-08-29T00:00:00,
15971,ospar_biota_2022_01_001.710,POINT (54.50883333300004 2.716500000000053),,UK,10,16,22-1541,54,30,31.8,...,BEIS,,CEND14/22,,54.508833,2.716500,,709.0,2022-08-29T00:00:00,


The [OSPAR Environmental Monitoring Data](https://odims.ospar.org/en/) is provided as a Microsoft Access database. [`Mdbtools`](https://github.com/mdbtools/mdbtools) can be used to convert the tables of the Microsoft Access database to `.csv` files on Unix-like OS.

**Example steps**:

1. Download data from [OSPAR portal](https://odims.ospar.org/en/).
2. Install `mdbtools` via Terminal:

    ```
    sudo apt-get -y install mdbtools
    ```

3. Install unzip via Terminal:

    ```
    sudo apt-get -y install unzip
    ```

4.  Navigate to data directory:

    ```
    cd /home/marisco/downloads/marisco/_data/accdb/ospar
    ```

5. Unzip `OSPAR_Env_Concentrations_20241021.zip`:

    ```
    unzip OSPAR_Env_Concentrations_20241021.zip
    ```

6. Run `preprocess.sh` to generate the required data files:

    ```
    ./preprocess.sh OSPAR_Env_Concentrations_20241021.zip
    ```

7. Content of `preprocess.sh` script:

    ```
    #!/bin/bash

    # Example of use: ./preprocess.sh OSPAR_Env_Concentrations_20241021.zip
    unzip $1
    dbname=$(ls *.accdb *.mdb)
    mkdir csv
    for table in $(mdb-tables -1 "$dbname"); do
        echo "Export table $table"
        mdb-export "$dbname" "$table" > "csv/$table.csv"
    done
    ```

Once converted to `.csv` files, the data is ready to be loaded into a dictionary of dataframes.
    

Load OSPAR data and return the data in a Python dictionary of dataframes with the dictionary key as the sample type.

In [None]:
#| exports
# CSV file name (without extension) -> sample type used as key of the loaded dictionary of dataframes.
default_smp_types = {'Seawater data': 'SEAWATER', 'Biota data': 'BIOTA'}

In [None]:
#| exports
def load_data(src_dir:str, # Directory where the source CSV files are located
              lut:dict=default_smp_types # A dictionary with the file name as key and the sample type as value
              ) -> dict: # A dictionary with sample types as keys and their corresponding dataframes as values
    "Read one `OSPAR` CSV file per sample type and return the dataframes keyed by sample type."
    base_dir = Path(src_dir)
    dfs = {}
    for file_name, sample_type in lut.items():
        raw = pd.read_csv(base_dir / f'{file_name}.csv', encoding='unicode_escape')
        # Lower-case every column name so the rest of the pipeline deals with a single spelling.
        dfs[sample_type] = raw.rename(columns=str.lower)
    return dfs

`dfs` is a dictionary of dataframes created from the OSPAR dataset located at `fname_in`. The data is split by sample type: each dataframe is stored under a key equal to its sample type.

In [None]:
#|eval: false
dfs = load_data(fname_in)
print('keys/sample types: ', dfs.keys())

# List the (lower-cased) columns available for each sample type.
for key, df in dfs.items():
    print(f'{key} columns: ', df.columns)

keys/sample types:  dict_keys(['SEAWATER', 'BIOTA'])
SEAWATER columns:  Index(['id', 'contracting party', 'rsc sub-division', 'station id',
       'sample id', 'latd', 'latm', 'lats', 'latdir', 'longd', 'longm',
       'longs', 'longdir', 'sample type', 'sampling depth', 'sampling date',
       'nuclide', 'value type', 'activity or mda', 'uncertainty', 'unit',
       'data provider', 'measurement comment', 'sample comment',
       'reference comment'],
      dtype='object')
BIOTA columns:  Index(['id', 'contracting party', 'rsc sub-division', 'station id',
       'sample id', 'latd', 'latm', 'lats', 'latdir', 'longd', 'longm',
       'longs', 'longdir', 'sample type', 'biological group', 'species',
       'body part', 'sampling date', 'nuclide', 'value type',
       'activity or mda', 'uncertainty', 'unit', 'data provider',
       'measurement comment', 'sample comment', 'reference comment'],
      dtype='object')


## Nuclide Name Normalization

### Lower & strip nuclide names

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: Some nuclide names contain one or multiple trailing spaces.

:::

In [None]:
#| eval: false
df = get_unique_across_dfs(load_data(fname_in), 'nuclide', as_df=True, include_nchars=True)
# Length of each name once every space has been removed.
without_spaces = df['value'].str.strip().str.replace(' ', '')
df['stripped_chars'] = without_spaces.str.len()
# A length mismatch reveals names that contain leading, trailing or inner spaces.
has_extra_spaces = df['n_chars'] != df['stripped_chars']
print(df[has_extra_spaces])

    index        value  n_chars  stripped_chars
0       0      137Cs        7.0             5.0
6       6       99Tc        6.0             4.0
7       7  239, 240 Pu     11.0             9.0
8       8      210Po        7.0             5.0
9       9          NaN      NaN             NaN
14     14      99Tc         7.0             4.0


To fix this issue, we use the `LowerStripNameCB` callback. For each dataframe in the dictionary of dataframes, it corrects the nuclide name by converting it to lowercase and stripping any leading or trailing whitespace.

In [None]:
#| eval: false
dfs = load_data(fname_in)
# Overwrite the `nuclide` column with its lower-cased, stripped version.
lower_strip_cb = LowerStripNameCB(col_src='nuclide', col_dst='nuclide')
tfm = Transformer(dfs, cbs=[lower_strip_cb])
dfs_output = tfm()
for key, df in dfs_output.items():
    print(f'{key} nuclides: ')
    print(df['nuclide'].unique())

SEAWATER nuclides: 
['137cs' '239,240pu' '226ra' '228ra' '99tc' '3h' '210po' '210pb' nan]
BIOTA nuclides: 
['137cs' '226ra' '228ra' '239,240pu' '99tc' '210po' '210pb' '3h' 'cs-137'
 '238pu' '239, 240 pu' '241am']


### Remap nuclide names to MARIS data formats

Below, we map nuclide names used by OSPAR to the MARIS standard nuclide names. 

Remapping data provider nomenclatures to MARIS standards is a recurrent operation and is done in a semi-automated manner according to the following pattern:

1. **Inspect** data provider nomenclature:
2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); 
3. **Fix** potential mismatches; 
4. **Apply** the lookup table to the dataframe.

We will refer to this process as **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply).

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The `nuclide` column has inconsistent naming. E.g:

- `Cs-137`,  `137Cs` or `CS-137`
- `239, 240 pu` or `239,240 pu`
- `ra-226` and `226ra` 

See below:

:::

In [None]:
#| eval: false
# Reload the raw data so that the provider's original nuclide spellings are listed.
dfs = load_data(fname_in)
get_unique_across_dfs(dfs, col_name='nuclide', as_df=True)

Unnamed: 0,index,value
0,0,137Cs
1,1,226Ra
2,2,210Po
3,3,"239,240Pu"
4,4,3H
5,5,228Ra
6,6,99Tc
7,7,"239, 240 Pu"
8,8,210Po
9,9,


Let's now create an instance of a [fuzzy matching algorithm](https://www.wikiwand.com/en/articles/Approximate_string_matching) `Remapper`. This instance will match the nuclide names of the OSPAR dataset to the MARIS standard nuclide names.

In [None]:
#| eval: false
# `dfs_output` holds the lower-cased/stripped nuclide names produced by the `LowerStripNameCB` cell above.
remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs_output, col_name='nuclide', as_df=True),
                    maris_lut_fn=nuc_lut_path,
                    maris_col_id='nuclide_id',
                    maris_col_name='nc_name',
                    provider_col_to_match='value',
                    provider_col_key='value',
                    fname_cache='nuclides_ospar.pkl')

Let's try to match OSPAR nuclide names to MARIS standard nuclide names as automatically as possible. The `match_score` column allows us to assess the results:

In [None]:
#| eval: false
# Match without any manual fix, then list the entries whose match score is 1 or higher.
remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=1, verbose=True)

Processing:   0%|          | 0/13 [00:00<?, ?it/s]

Processing: 100%|██████████| 13/13 [00:00<00:00, 44.43it/s]

1 entries matched the criteria, while 12 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"239, 240 pu",pu240,"239, 240 pu",8
"239,240pu",pu240,"239,240pu",6
228ra,u235,228ra,4
241am,pu241,241am,4
210pb,ru106,210pb,4
137cs,i133,137cs,4
210po,ru106,210po,4
226ra,u234,226ra,4
238pu,u238,238pu,3
99tc,tu,99tc,3


We can now manually inspect the unmatched nuclide names and create a table to correct them to the MARIS standard:

In [None]:
#| exports
# Manual corrections applied before fuzzy matching.
# Keys: provider nuclide names after lower-casing/stripping; values: MARIS nuclide names.
fixes_nuclide_names = {
    '99tc': 'tc99',
    '238pu': 'pu238',
    '226ra': 'ra226',
    '210pb': 'pb210',
    '241am': 'am241',
    '228ra': 'ra228',
    '137cs': 'cs137',
    '210po': 'po210',
    '239,240pu': 'pu239_240_tot',
    '239, 240 pu': 'pu239_240_tot',
    'cs-137': 'cs137',
    '3h': 'h3'
    }

We now include the table `fixes_nuclide_names`, which applies manual corrections to the nuclide names before the remapping process. 
The `generate_lookup_table` function has an `overwrite` parameter (default is `True`), which, when set to `True`, creates a pickle file cache of the lookup table. We can now test the remapping process:

In [None]:
#| eval: false
# With the manual fixes applied, no entry should be left with a match score of 1 or higher.
remapper.generate_lookup_table(as_df=True, fixes=fixes_nuclide_names)
fc.test_eq(len(remapper.select_match(match_score_threshold=1)), 0)

Processing:   0%|          | 0/13 [00:00<?, ?it/s]

Processing: 100%|██████████| 13/13 [00:00<00:00, 44.64it/s]


If we want to view all the remapped nuclides we can set the match score threshold to 0; 

In [None]:
#| eval: false
# A threshold of 0 lists every remapped entry, including the ones matched exactly.
remapper.generate_lookup_table(as_df=True, fixes=fixes_nuclide_names)
remapper.select_match(match_score_threshold=0, verbose=True)

Processing:   0%|          | 0/13 [00:00<?, ?it/s]

Processing: 100%|██████████| 13/13 [00:00<00:00, 46.01it/s]

0 entries matched the criteria, while 13 entries had a match score of 0 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
228ra,ra228,228ra,0
241am,am241,241am,0
238pu,pu238,238pu,0
cs-137,cs137,cs-137,0
99tc,tc99,99tc,0
"239, 240 pu",pu239_240_tot,"239, 240 pu",0
,Unknown,,0
210pb,pb210,210pb,0
"239,240pu",pu239_240_tot,"239,240pu",0
137cs,cs137,137cs,0


Values are remapped correctly! We can now create a callback `RemapNuclideNameCB` to remap the nuclide names. Note that we pass `overwrite=False` to `generate_lookup_table` so that the cached version of the lookup table is used.

In [None]:
#| exports
# Create a lookup table for nuclide names
def lut_nuclides(df):
    "Return the nuclide lookup table built from `df`, the dataframe of unique provider nuclide names."
    remapper = Remapper(provider_lut_df=df,
                        maris_lut_fn=nuc_lut_path,
                        maris_col_id='nuclide_id',
                        maris_col_name='nc_name',
                        provider_col_to_match='value',
                        provider_col_key='value',
                        fname_cache='nuclides_ospar.pkl')
    # Manual fixes are applied before matching; `overwrite=False` is passed as in the narrative above.
    return remapper.generate_lookup_table(fixes=fixes_nuclide_names, as_df=False, overwrite=False)

In [None]:
#| exports
class RemapNuclideNameCB(Callback):
    "Remap data provider nuclide names to standardized MARIS nuclide names."
    def __init__(self, 
                 fn_lut: Callable, # Function that returns the lookup table dictionary
                 col_name: str # Column name to remap
                ):
        fc.store_attr('fn_lut,col_name')

    def __call__(self, tfm: Transformer):
        "Write the MARIS nuclide id of every `col_name` value into a new `NUCLIDE` column."
        # Unique provider names found in any of the dataframes.
        uniques = get_unique_across_dfs(tfm.dfs, col_name=self.col_name, as_df=True)
        # Provider name -> matched MARIS id.
        name_to_id = {name: match.matched_id for name, match in self.fn_lut(uniques).items()}
        for df in tfm.dfs.values():
            df['NUCLIDE'] = df[self.col_name].replace(name_to_id)

Let's see it in action, along with the `LowerStripNameCB` callback:

In [None]:
#| eval: false
dfs = load_data(fname_in)
# Names must be normalized first, otherwise the lookup table keys would not match.
cbs = [
    LowerStripNameCB(col_src='nuclide', col_dst='nuclide'),
    RemapNuclideNameCB(lut_nuclides, col_name='nuclide'),
]
tfm = Transformer(dfs, cbs=cbs)
dfs_out = tfm()

# For instance
for key, df_out in dfs_out.items():
    print(f'{key} NUCLIDE unique: ', df_out['NUCLIDE'].unique())

SEAWATER NUCLIDE unique:  [33 77 53 54 15  1 47 41 -1]
BIOTA NUCLIDE unique:  [33 53 54 77 15 47 41  1 67 72]


## Standardize Time

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: `SEAWATER` dataset contains 10 rows with `NaN` values in the `sampling date` column as shown below.

:::

In [None]:
#| eval: false
dfs = load_data(fname_in)
# Count the seawater rows that have no sampling date.
n_missing_dates = dfs['SEAWATER']['sampling date'].isna().sum()
print('Number of NaN values in sampling date: ', n_missing_dates)


Number of NaN values in sampling date:  10


Create a callback that remaps the time format in the dictionary of dataframes (i.e. `%m/%d/%y %H:%M:%S`) and handle missing dates:

In [None]:
#| exports
class ParseTimeCB(Callback):
    "Parse the time format in the dataframe."
    def __init__(self,
                 col_src: str='sampling date', # Column holding the provider's date strings
                 fmt: str='%m/%d/%y %H:%M:%S', # `strptime` format of the provider's dates
                 col_dst: str='TIME' # Column receiving the parsed timestamps
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        "Parse `col_src` into `col_dst` and drop the rows whose date is missing or cannot be parsed."
        for df in tfm.dfs.values():
            # `errors='coerce'` turns missing/unparsable dates into NaT so that they are dropped below.
            df[self.col_dst] = pd.to_datetime(df[self.col_src], format=self.fmt, errors='coerce')
            df.dropna(subset=[self.col_dst], inplace=True)

Apply the transformer for callbacks `ParseTimeCB`. Then, print the `TIME` data for `seawater`.

In [None]:
#|eval: false
dfs = load_data(fname_in)
# `CompareDfsAndTfmCB` receives the untouched `dfs` to report how many rows the parsing step removed.
tfm = Transformer(dfs, cbs=[
    ParseTimeCB(),
    CompareDfsAndTfmCB(dfs)])

tfm()

# Row counts before/after the transformation, per sample type.
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['SEAWATER']['TIME'])

                           SEAWATER  BIOTA
Number of rows in dfs         19193  15951
Number of rows in tfm.dfs     19183  15951
Number of rows removed           10      0 

0       2010-01-27 00:00:00
1       2010-01-27 00:00:00
2       2010-01-27 00:00:00
3       2010-01-27 00:00:00
4       2010-01-26 00:00:00
                ...        
19183   2019-11-13 12:54:00
19184   2019-12-10 11:37:00
19185   2019-12-10 11:37:00
19186   2019-12-10 11:37:00
19187   2019-12-18 14:43:00
Name: TIME, Length: 19183, dtype: datetime64[ns]


The NetCDF time format requires the time to be encoded as the number of seconds since a time of origin. In our case the time of origin is `1970-01-01`, as indicated in the `CONFIGS['units']['time']` dictionary of `configs.ipynb`.

`EncodeTimeCB` converts the OSPAR `time` format to the MARIS NetCDF `time` format.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[ParseTimeCB(),
                            EncodeTimeCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.logs)
                            

                           SEAWATER  BIOTA
Number of rows in dfs         19193  15951
Number of rows in tfm.dfs     19183  15951
Number of rows removed           10      0 

['Parse the time format in the dataframe.', 'Encode time as seconds since epoch.', 'Create a dataframe of dropped data. Data included in the `dfs` not in the `tfm`.']


## Sanitize value

We allocate each column containing measurement values into a single column `VALUE` and remove `NA` where needed.

In [None]:
# | exports
class SanitizeValueCB(Callback):
    "Sanitize value by removing blank entries and populating `value` column."
    def __init__(self, 
                 value_col: str='activity or mda' # Column name to sanitize
                 ):
        fc.store_attr('value_col')

    def __call__(self, tfm):
        "Drop the rows without a measurement, then copy the measurements into `VALUE`."
        source_col = self.value_col
        for frame in tfm.dfs.values():
            # Rows are removed in place, on the transformer's own dataframes.
            frame.dropna(subset=[source_col], inplace=True)
            frame['VALUE'] = frame[source_col]

In [None]:
#|eval: false
dfs = load_data(fname_in)
# `CompareDfsAndTfmCB` reports how many rows were dropped because they had no measurement value.
tfm = Transformer(dfs, cbs=[SanitizeValueCB(),
                            CompareDfsAndTfmCB(dfs)])

tfm()

print('Example of VALUE column:')
print(tfm.dfs['SEAWATER'][['VALUE']].head())
print('\nComparison stats:')
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')

Example of VALUE column:
   VALUE
0   0.20
1   0.27
2   0.26
3   0.25
4   0.20

Comparison stats:
                           SEAWATER  BIOTA
Number of rows in dfs         19193  15951
Number of rows in tfm.dfs     19183  15951
Number of rows removed           10      0 



## Normalize uncertainty

For each sample type in the OSPAR dataset, the reported uncertainty is given as an expanded uncertainty with a coverage factor `𝑘=2`. For further details, refer to the [OSPAR reporting guidelines](https://mcc.jrc.ec.europa.eu/documents/OSPAR/Guidelines_forestimationof_a_%20measurefor_uncertainty_in_OSPARmonitoring.pdf).

**Note**: For MARIS the OSPAR uncertainty values are normalized to standard uncertainty with a coverage factor 
𝑘=1.

`NormalizeUncCB` callback normalizes the uncertainty using the following `lambda` function:

In [None]:
#| exports
def unc_exp2stan(df, unc_col):
    "Return column `unc_col` of `df` divided by 2 (expanded uncertainty, k=2, to standard uncertainty, k=1)."
    expanded = df[unc_col]
    return expanded / 2

In [None]:
#| exports
class NormalizeUncCB(Callback):
    """Normalize uncertainty values in DataFrames."""
    def __init__(self, 
                 col_unc: str='uncertainty', # Column name to normalize
                 fn_convert_unc: Callable=unc_exp2stan, # Function correcting coverage factor
                 ): 
        fc.store_attr('col_unc,fn_convert_unc')

    def __call__(self, tfm):
        "Store the converted uncertainty of every dataframe in a new `UNC` column."
        convert, source_col = self.fn_convert_unc, self.col_unc
        for frame in tfm.dfs.values():
            frame['UNC'] = convert(frame, source_col)

In [None]:
#|eval: false
dfs = load_data(fname_in)
# Values are sanitized first, then the uncertainty is converted into the `UNC` column.
tfm = Transformer(dfs, cbs=[SanitizeValueCB(), NormalizeUncCB()])
tfm()

for grp in ('SEAWATER', 'BIOTA'):
    print(f'\n{grp}:')
    print(tfm.dfs[grp].loc[:, ['VALUE', 'UNC']].head())


SEAWATER:
   VALUE  UNC
0   0.20  NaN
1   0.27  NaN
2   0.26  NaN
3   0.25  NaN
4   0.20  NaN

BIOTA:
      VALUE  UNC
0  0.326416  NaN
1  0.442704  NaN
2  0.412989  NaN
3  0.202768  NaN
4  0.652833  NaN


:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: `SEAWATER` dataset contains rows where the uncertainty is much greater than the value. Although this is not impossible, we think it is worth highlighting these entries.

:::

To show situations where the uncertainty is much greater than the value, we will calculate the relative uncertainty for the seawater dataset. 

In [None]:
#| eval: false
# Not evaluated automatically: relies on `tfm`, which is built in a non-evaluated cell above.
grp = 'SEAWATER'
# Relative uncertainty in percent: standard uncertainty (`UNC`) divided by the measured `VALUE`.
tfm.dfs[grp]['relative_uncertainty'] = tfm.dfs[grp]['UNC'] / tfm.dfs[grp]['VALUE'] * 100

Now we will return all rows where the relative uncertainty is greater than 100% for the seawater dataset.

In [None]:
#| eval: false
# Not evaluated automatically: relies on `tfm` and `grp` defined in the cells above.
threshold = 100  # relative uncertainty, in percent
cols_to_show = ['id', 'contracting party', 'nuclide', 'value type', 'activity or mda', 'uncertainty', 'unit', 'relative_uncertainty']
# Select rows and columns in a single `.loc` call instead of chained indexing.
tfm.dfs[grp].loc[tfm.dfs[grp]['relative_uncertainty'] > threshold, cols_to_show].head()


Unnamed: 0,id,contracting party,nuclide,value type,activity or mda,uncertainty,unit,relative_uncertainty
969,11075,United Kingdom,137Cs,=,0.0028,0.3276,Bq/l,5850.0
971,11077,United Kingdom,137Cs,=,0.0029,0.3364,Bq/l,5800.0
973,11079,United Kingdom,137Cs,=,0.0025,0.3325,Bq/l,6650.0
975,11081,United Kingdom,137Cs,=,0.0025,0.345,Bq/l,6900.0
977,11083,United Kingdom,137Cs,=,0.0038,0.3344,Bq/l,4400.0


:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: `BIOTA` dataset contains rows where the uncertainty is much greater than the value. Although this is not impossible, we think it is worth highlighting these entries.

:::

Include the relative uncertainty for the biota dataset. 

In [None]:
#| eval: false
# Not evaluated automatically: relies on `tfm`, which is built in a non-evaluated cell above.
grp = 'BIOTA'
# Relative uncertainty in percent: standard uncertainty (`UNC`) divided by the measured `VALUE`.
tfm.dfs[grp]['relative_uncertainty'] = tfm.dfs[grp]['UNC'] / tfm.dfs[grp]['VALUE'] * 100

Return all rows where the relative uncertainty is greater than 100% for the biota dataset.

In [None]:
#| eval: false
# Not evaluated automatically: relies on `tfm` and `grp` defined in the cells above.
threshold = 100  # relative uncertainty, in percent
cols_to_show = ['id', 'contracting party', 'nuclide', 'value type', 'activity or mda', 'uncertainty', 'unit', 'relative_uncertainty']
# Select rows and columns in a single `.loc` call instead of chained indexing.
tfm.dfs[grp].loc[tfm.dfs[grp]['relative_uncertainty'] > threshold, cols_to_show].head()

Unnamed: 0,id,contracting party,nuclide,value type,activity or mda,uncertainty,unit,relative_uncertainty
2338,23895,Belgium,226Ra,=,1.4,118.0,Bq/kg f.w.,4214.285714
2693,29984,Belgium,137Cs,=,0.169,27.0,Bq/kg f.w.,7988.16568
3027,35011,Belgium,137Cs,=,0.1619,66.0,Bq/kg f.w.,20382.95244
4442,49221,Sweden,137Cs,=,0.295,2.74,Bq/kg f.w.,464.40678
4447,49226,Sweden,137Cs,=,0.327,1.468,Bq/kg f.w.,224.464832


## Remap units

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: It would be easier to work with the units if they were standardized. The units are not consistent across the dataset, for instance `BQ/L`, `Bq/l` and `Bq/L` are used interchangeably.

:::


:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The `Unit` column contains `NaN` values for the `SEAWATER` dataset, as shown below.
:::


In [None]:
#| eval: false
# Not evaluated automatically: relies on `dfs`, which is loaded in a non-evaluated cell above.
# Seawater rows without a unit; the free-text comment columns are hidden for readability.
missing_unit = dfs['SEAWATER']['unit'].isnull()
dfs['SEAWATER'][missing_unit].drop(columns=['measurement comment','sample comment','reference comment']).head()

Unnamed: 0,id,contracting party,rsc sub-division,station id,sample id,latd,latm,lats,latdir,longd,...,longdir,sample type,sampling depth,sampling date,nuclide,value type,activity or mda,uncertainty,unit,data provider
16161,120369,Ireland,1.0,Salthill,,53,15.0,40.0,N,9,...,W,,,,,,,,,
16162,120370,Ireland,1.0,Woodstown,,52,11.0,55.0,N,6,...,W,,,,,,,,,
16586,120363,Ireland,4.0,N1,,53,25.0,0.0,N,6,...,W,,,,,,,,,
19188,120364,Ireland,4.0,N2,,53,36.0,0.0,N,5,...,W,,,,,,,,,
19189,120365,Ireland,4.0,N3,,53,44.0,0.0,N,5,...,W,,,,,,,,,


Let's inspect the unique units used by OSPAR:

In [None]:
#| eval: false
# Not evaluated automatically: relies on `dfs`, which is loaded in a non-evaluated cell above.
get_unique_across_dfs(dfs, col_name='unit', as_df=True)

Unnamed: 0,index,value
0,0,Bq/kg f.w.
1,1,Bq/L
2,2,
3,3,Bq/l
4,4,BQ/L


We define unit renaming rules for OSPAR dataset:

In [None]:
#| export
# Define unit names renaming rules: provider unit spelling -> integer unit id written to `UNIT`.
# NOTE(review): the three per-litre spellings map to id 1, annotated 'Bq/m3' below. No value
# conversion is visible in this notebook section -- confirm whether id 1 really denotes Bq/m3
# and, if so, where Bq/l values are rescaled.
renaming_unit_rules = {'Bq/l': 1, #'Bq/m3'
                       'Bq/L': 1,
                       'BQ/L': 1,
                       'Bq/kg f.w.': 5, # Bq/kgw
                       } 

In [None]:
#| exports
class RemapUnitCB(Callback):
    """Callback to update DataFrame 'UNIT' columns based on a lookup table.

    For every dataframe in `tfm.dfs`, each value of the provider `unit` column is
    looked up in `lut` and the result is written to a new `UNIT` column. Values
    absent from `lut` (NaN included) are set to the string 'Unknown'.
    """

    def __init__(self,
                 lut: Dict[str, int]  # Provider unit string -> MARIS unit id
                 ):
        fc.store_attr('lut')  # Store the lookup table as an attribute

    def __call__(self, tfm: 'Transformer'):
        for df in tfm.dfs.values():
            # `_apply_default_units` is currently not called: missing units are only reported.
            self._print_na_units(df)
            self._update_units(df)

    def _apply_default_units(self, df: pd.DataFrame):
        "Fill missing `unit` entries with 'Bq/l' (helper kept available, not used by `__call__`)."
        df.loc[df['unit'].isnull(), 'unit'] = 'Bq/l'

    def _print_na_units(self, df: pd.DataFrame):
        "Print the number of rows with a missing `unit`, if there are any."
        na_count = df['unit'].isnull().sum()
        if na_count > 0:
            print(f"Number of rows with NaN in 'unit' column: {na_count}")

    def _update_units(self, df: pd.DataFrame):
        "Create the `UNIT` column from `unit` using the lookup table."
        # NOTE(review): the 'Unknown' fallback is a string while `lut` values are ints,
        # so `UNIT` becomes mixed-type whenever a unit is not in `lut` — confirm this is intended.
        lookup = self.lut.get
        df['UNIT'] = df['unit'].apply(lambda unit: lookup(unit, 'Unknown'))

In [None]:
#|eval: false
# Load the data and run the sanitising and unit-remapping callbacks on it.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[SanitizeValueCB(), # Remove blank value entries (also removes NaN values in Unit column)
                            RemapUnitCB(renaming_unit_rules),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()

# Row counts before/after the transformation, then the distinct `UNIT` ids per group.
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print('Unit unique values:')
for grp in ['BIOTA', 'SEAWATER']:
    print(f"{grp}: {tfm.dfs[grp]['UNIT'].unique()}")

                           SEAWATER  BIOTA
Number of rows in dfs         19193  15951
Number of rows in tfm.dfs     19183  15951
Number of rows removed           10      0 

Unit unique values:
BIOTA: [5]
SEAWATER: [1]


## Remap detection limit

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The `Value type` column contains many `nan` values, see below.
:::

In [None]:
# Count the missing (NaN) entries of the 'value type' column in the 'SEAWATER' dataframe
na_count_seawater = dfs['SEAWATER']['value type'].isnull().sum()
print(f"Number of NaN 'Value type' entries in 'SEAWATER': {na_count_seawater}")

# Same count for the 'BIOTA' dataframe
na_count_biota = dfs['BIOTA']['value type'].isnull().sum()
print(f"Number of NaN 'Value type' entries in 'BIOTA': {na_count_biota}")

Number of NaN 'Value type' entries in 'SEAWATER': 64
Number of NaN 'Value type' entries in 'BIOTA': 23


In the OSPAR dataset the detection limit is encoded as `<`  in the `Value type` column. If a value is `<` then the `Activity or MDA` column contains the detection limit value. If the `Value type` is `=` then the `Activity or MDA` column contains the measurement value.


Let's review the `Value type` column values for the OSPAR dataset:

In [None]:
#| eval: false
# `tfm` is created in a cell marked `eval: false`, so this cell carries the same directive.
# Print the distinct `value type` flags of each transformed dataframe.
for grp, df in tfm.dfs.items():
    print(f'{grp}:')
    print(df['value type'].unique())


SEAWATER:
['<' '=' nan]
BIOTA:
['<' '=' nan]


Detection limits are encoded as follows in MARIS:

In [None]:
#| eval: false
# Display the MARIS detection-limit lookup table stored at `detection_limit_lut_path()`.
pd.read_excel(detection_limit_lut_path())

Unnamed: 0,id,name,name_sanitized
0,-1,Not applicable,Not applicable
1,0,Not Available,Not available
2,1,=,Detected value
3,2,<,Detection limit
4,3,ND,Not detected
5,4,DE,Derived


In [None]:
#| exports
def lut_dl():
    "Read the `name` and `id` columns of the detection-limit Excel file and return them as a `{name: id}` dict."
    df_lut = pd.read_excel(detection_limit_lut_path(), usecols=['name','id'])
    return df_lut.set_index('name').to_dict()['id']

In [None]:
#| exports
# Per-group column configuration: for each group, 'DL' names the provider column
# that carries the detection-limit flag.
coi_dl = {grp: {'DL': 'value type'} for grp in ('SEAWATER', 'BIOTA')}

In [None]:
# | exports
class RemapDetectionLimitCB(Callback):
    "Remap value type to MARIS format."

    def __init__(self,
                 coi: dict,  # Column configuration: {group: {'DL': provider column holding the value type}}
                 fn_lut: Callable  # Zero-argument function returning the lookup table dictionary
                ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        "Remap detection limits in the DataFrames using the lookup table."
        lut = self.fn_lut()  # Built once and reused for every group
        for grp in tfm.dfs:
            df = tfm.dfs[grp]
            self._update_detection_limit(df, grp, lut)

    def _update_detection_limit(self,
                                df: pd.DataFrame,  # The DataFrame to modify
                                grp: str,  # The group name to get the column configuration
                                lut: dict  # The lookup table dictionary
                               ) -> None:
        "Update detection limit column in the DataFrame based on lookup table and rules."
        # Provider column configured for this group; a group missing from `coi` raises KeyError
        detection_col = self.coi[grp]['DL']

        # Start from the raw provider flags, then normalise and encode them
        df['DL'] = df[detection_col]
        self._set_detection_limits(df, lut)

    def _set_detection_limits(self, df: pd.DataFrame, lut: dict) -> None:
        """Set detection limits based on value and uncertainty columns.

        Requires the `VALUE` and `UNC` columns to already exist in `df`.
        """
        # Flags not present in the lookup (NaN included) become '=' when both a value
        # and an uncertainty are available
        condition_eq = (df['VALUE'].notna() &
                        df['UNC'].notna() &
                        ~df['DL'].isin(lut.keys()))

        df.loc[condition_eq, 'DL'] = '='

        # Whatever is still unrecognised after the step above is flagged 'Not Available'
        df.loc[~df['DL'].isin(lut.keys()), 'DL'] = 'Not Available'

        # Replace the textual flags by their ids from the lookup table
        df['DL'] = df['DL'].map(lut)

In [None]:
#| eval: false
# RemapDetectionLimitCB reads the `VALUE` and `UNC` columns, so it is placed last;
# presumably the preceding callbacks create them — TODO confirm.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[SanitizeValueCB(),
                            NormalizeUncCB(),
                            RemapUnitCB(renaming_unit_rules),
                            RemapDetectionLimitCB(coi_dl, lut_dl)])
tfm()
# Distinct detection-limit ids obtained for each group
for grp in ['BIOTA', 'SEAWATER']:
    print(f"{grp}: {tfm.dfs[grp]['DL'].unique()}")

BIOTA: [2 1]
SEAWATER: [2 1]


## Remap Biota species

The OSPAR dataset contains biota species information in the `Species` column of the biota dataframe. To ensure consistency with MARIS standards, we need to remap these species names. We'll use the same approach as the one we employed for standardizing nuclide names:


We first inspect unique `Species` values used by OSPAR:

In [None]:
# Load the data again and list the distinct `species` values across its dataframes.
dfs = load_data(fname_in)
get_unique_across_dfs(dfs, col_name='species', as_df=True)

Unnamed: 0,index,value
0,0,Argentina sphyraena
1,1,Crassostrea gigas
2,2,PATELLA VULGATA
3,3,PLUERONECTES PLATESSA
4,4,Lophius piscatorius
...,...,...
162,162,HIPPOGLOSSOIDES PLATESSOIDES
163,163,DICENTRARCHUS (MORONE) LABRAX
164,164,CYCLOPTERUS LUMPUS
165,165,GADUS MORHUA


We try to remap the `Species` column to the `species` column of the MARIS nomenclature, again using a `Remapper` object:

In [None]:
#| eval: false
# Remapper matching the provider's unique `species` values (column `value`, used both as
# the text to match and as the key) against the `species` column of the MARIS species table.
remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='species', as_df=True),
                    maris_lut_fn=species_lut_path,
                    maris_col_id='species_id',
                    maris_col_name='species',
                    provider_col_to_match='value',
                    provider_col_key='value',
                    fname_cache='species_ospar.pkl')

In this step, we generate a lookup table using the `remapper` object. The lookup table maps data provider entries to MARIS entries using fuzzy matching. After generating the table, we select the matches whose score meets a specified threshold (i.e., greater than or equal to 1), which means that only matches requiring at least one character change are shown.

- **`generate_lookup_table(as_df=True)`**: This method generates the lookup table and returns it as a DataFrame. It uses fuzzy matching to align entries from the data provider with those in the MARIS lookup table.
- **`select_match(match_score_threshold=1)`**: This method filters the generated lookup table to include only those matches with a score greater than or equal to the specified threshold. A threshold of 1 therefore leaves out the perfect matches (score 0) and lists only the imperfect ones that need to be reviewed.

In [None]:
#| eval: false
# `remapper` is created in a cell marked `eval: false`, so this cell carries the same directive.
remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=1, verbose=True).head()

Processing: 100%|██████████| 167/167 [00:23<00:00,  7.10it/s]

129 entries matched the criteria, while 38 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA,Lomentaria catenata,RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA,31
"Mixture of green, red and brown algae",Mercenaria mercenaria,"Mixture of green, red and brown algae",26
SOLEA SOLEA (S.VULGARIS),Loligo vulgaris,SOLEA SOLEA (S.VULGARIS),12
Solea solea (S.vulgaris),Loligo vulgaris,Solea solea (S.vulgaris),12
Cerastoderma (Cardium) Edule,Cerastoderma edule,Cerastoderma (Cardium) Edule,10


Below, we fix the entries that are not properly matched by the `Remapper` object:

In [None]:
#|exports
# Manual corrections passed as `fixes` to `Remapper.generate_lookup_table` for the species remapping.
# Keys are provider species names exactly as spelled in the data; values are either the
# MARIS species name to use instead or `NA` when the entry is dropped.
fixes_biota_species = {
    'PECTINIDAE': NA, # Dropped. In Worms as PECTINIDAE is a family.
    'Unknown': NA,
    'unknown': NA,
    'PALMARIA PALMATA': NA, # Dropped. In Worms 'Palmaria palmata (Linnaeus) F.Weber & D.Mohr, 1805',
    'RAJIDAE/BATOIDEA': NA, # Mix
    'MONODONTA LINEATA': 'Phorcus lineatus',
    'NUCELLA LAPILLUS': NA, # Dropped. In Worms 'Nucella lapillus (Linnaeus, 1758)',
    'SOLEA SOLEA (S.VULGARIS)': 'Solea solea',
    'Solea solea (S.vulgaris)': 'Solea solea',
    'Mixture of green, red and brown algae': NA, # Mix
    'RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA': NA, # Mix
    'Gadiculus argenteus': 'Gadiculus argenteus thori',
    'Gadus sp.': 'Gadus morhua', # NOTE(review): genus-level entry mapped to a single species — confirm
    }

We now attempt remapping again, incorporating the `fixes_biota_species` dictionary:

In [None]:
#| eval: false
# Rebuild the lookup table with the manual fixes applied, then list the entries
# whose match score is still 1 or higher.
remapper.generate_lookup_table(fixes=fixes_biota_species)
remapper.select_match(match_score_threshold=1, verbose=True)

Processing:   0%|          | 0/167 [00:00<?, ?it/s]

Processing: 100%|██████████| 167/167 [00:22<00:00,  7.45it/s]

140 entries matched the criteria, while 27 entries had a match score of 1 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cerastoderma (Cardium) Edule,Cerastoderma edule,Cerastoderma (Cardium) Edule,10
CERASTODERMA (CARDIUM) EDULE,Cerastoderma edule,CERASTODERMA (CARDIUM) EDULE,10
DICENTRARCHUS (MORONE) LABRAX,Dicentrarchus labrax,DICENTRARCHUS (MORONE) LABRAX,9
Pleuronectiformes [order],Pleuronectiformes,Pleuronectiformes [order],8
Rhodymenia spp.,Rhodymenia,Rhodymenia spp.,5
RAJA DIPTURUS BATIS,Dipturus batis,RAJA DIPTURUS BATIS,5
Sepia spp.,Sepia,Sepia spp.,5
Flatfish,Lambia,Flatfish,5
FUCUS SPP.,Fucus,FUCUS SPP.,5
FUCUS spp,Fucus,FUCUS spp,4


Visual inspection of the remaining imperfectly matched entries appears acceptable. We can now proceed with the final remapping process:

1. Create Remapper Lambda Function:

   We'll define a lambda function that instantiates a Remapper object and returns its corrected lookup table.

2. Apply RemapCB: 

   Using the generic `RemapCB` callback, we'll perform the actual remapping.


In [None]:
#| exports
def lut_biota():
    "Build a species `Remapper` and return its lookup table (not as a DataFrame) with `fixes_biota_species` applied."
    # NOTE(review): reads the module-level `dfs`, which must exist when this function is called.
    species_remapper = Remapper(
        provider_lut_df=get_unique_across_dfs(dfs, col_name='species', as_df=True),
        maris_lut_fn=species_lut_path,
        maris_col_id='species_id',
        maris_col_name='species',
        provider_col_to_match='value',
        provider_col_key='value',
        fname_cache='species_ospar.pkl',
    )
    return species_remapper.generate_lookup_table(fixes=fixes_biota_species, as_df=False, overwrite=False)

Putting it all together, we now apply the `RemapCB` to our data. This process results in the addition of a `SPECIES` column to our `BIOTA` dataframe, containing standardized species IDs.


In [None]:
#| eval: false
# Remap the provider `species` column of the BIOTA group into a new `SPECIES` column.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='species', dest_grps='BIOTA')
    ])

# Distinct species ids produced by the remapping
tfm()['BIOTA']['SPECIES'].unique()

array([ 377,  129,   96,   -1,  192,   99,   50,  378,  270,  379,  380,
        381,  382,  383,  384,  385,  244,  386,  387,  388,  389,  390,
        391,  392,  393,  394,  395,  396,  274,  397,  398,  243,  399,
        400,  401,  402,  403,  404,  405,  406,  407,    0,  191,  139,
        408,  410,  148,  412,  413,  272,  414,  415,  416,  417,  418,
        419,  420,  421,  422,  423,  424,  425,  426,  427,  428,  411,
        429,  430,  431,  432,  433,  434,  435,  436,  437,  438,  439,
        440,  441,  442,  443,  444,  294, 1607, 1610, 1609, 1605, 1608,
         23, 1606,  234,  556,  158])

## Enhance Species Data Using Biological group column
The `Biological group` column in the OSPAR dataset provides valuable insights related to species. We will leverage this information to enrich the `species` column. To achieve this, we will employ the generic `RemapCB` callback to create an `enhanced_species` column. Subsequently, this `enhanced_species` column will be used to further enrich the `species` column.

First we inspect the unique values in the `Biological group` column.

In [None]:
# List the distinct `biological group` values across the dataframes in `dfs`.
get_unique_across_dfs(dfs, col_name='biological group', as_df=True)

Unnamed: 0,index,value
0,0,MOLLUSCS
1,1,Seaweeds
2,2,Fish
3,3,Molluscs
4,4,molluscs
5,5,SEAWEED
6,6,Seaweed
7,7,seaweed
8,8,FISH
9,9,fish


We will remap the `Biological group` columns data to the `species` column of the MARIS nomenclature, again using a `Remapper` object:

In [None]:
#| eval: false
# Remapper matching the provider's unique `biological group` values against the
# `species` column of the MARIS species table; results are cached under a separate file name.
remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='biological group', as_df=True),
                    maris_lut_fn=species_lut_path,
                    maris_col_id='species_id',
                    maris_col_name='species',
                    provider_col_to_match='value',
                    provider_col_key='value',
                    fname_cache='enhance_species_ospar.pkl')

Like before, we will generate the lookup table and select the matches whose score meets a specified threshold (i.e., greater than or equal to 1), which means that only matches requiring at least one character change are shown.

In [None]:
#| eval: false
# `remapper` is created in a cell marked `eval: false`, so this cell carries the same directive.
remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=1)

Processing: 100%|██████████| 10/10 [00:01<00:00,  6.99it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fish,Fucus,Fish,4
FISH,Fucus,FISH,4
fish,Fucus,fish,4
MOLLUSCS,Mollusca,MOLLUSCS,1
Seaweeds,Seaweed,Seaweeds,1
Molluscs,Mollusca,Molluscs,1
molluscs,Mollusca,molluscs,1


We can see that some of the entries require manual corrections.

In [None]:
#| exports
# Manual fixes for the `biological group` remapping: every spelling of "fish" is mapped
# to the MARIS entry 'Pisces'. Exported because the exported `lut_biota_enhanced`
# references this name when it is called.
fixes_enhanced_biota_species = {
    'fish': 'Pisces',
    'FISH': 'Pisces',
    'Fish': 'Pisces'
}


Now we will apply the manual corrections to the lookup table and generate the lookup table again.

In [None]:
#| eval: false
# `remapper` is created in a cell marked `eval: false`, so this cell carries the same directive.
# Rebuild the lookup table with the manual fixes, then list the remaining imperfect matches.
remapper.generate_lookup_table(fixes=fixes_enhanced_biota_species)
remapper.select_match(match_score_threshold=1)

Processing:   0%|          | 0/10 [00:00<?, ?it/s]

Processing: 100%|██████████| 10/10 [00:01<00:00,  6.48it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MOLLUSCS,Mollusca,MOLLUSCS,1
Seaweeds,Seaweed,Seaweeds,1
Molluscs,Mollusca,Molluscs,1
molluscs,Mollusca,molluscs,1


Visual inspection of the remaining imperfectly matched entries appears acceptable. We can now proceed with the final remapping process:

1. Create Remapper Lambda Function:

   We'll define a lambda function that instantiates a Remapper object and returns its corrected lookup table.

2. Apply RemapCB: 

   Using the generic `RemapCB` callback, we'll perform the actual remapping.


In [None]:
#| exports
def lut_biota_enhanced():
    "Build a `Remapper` for the `biological group` values and return its lookup table (not as a DataFrame) with `fixes_enhanced_biota_species` applied."
    # NOTE(review): reads the module-level `dfs`, which must exist when this function is called.
    biogroup_remapper = Remapper(
        provider_lut_df=get_unique_across_dfs(dfs, col_name='biological group', as_df=True),
        maris_lut_fn=species_lut_path,
        maris_col_id='species_id',
        maris_col_name='species',
        provider_col_to_match='value',
        provider_col_key='value',
        fname_cache='enhance_species_ospar.pkl',
    )
    return biogroup_remapper.generate_lookup_table(fixes=fixes_enhanced_biota_species, as_df=False, overwrite=False)

Now lets see the species that are not matched by the `LookupBiogroupCB` callback. 

Putting it all together, we now apply the `RemapCB` to our data. This process results in the addition of an `enhanced_species` column to our `BIOTA` dataframe, containing standardized species IDs.

In [None]:
#| eval: false
# Remap the `biological group` column of the BIOTA group into a new `enhanced_species` column.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemapCB(fn_lut=lut_biota_enhanced, col_remap='enhanced_species', col_src='biological group', dest_grps='BIOTA')
    ])

# Distinct ids obtained from the biological groups
tfm()['BIOTA']['enhanced_species'].unique()

array([ 873, 1059,  712])

Now that we have the `enhanced_species` column, we can use it to enrich the `species` column. We will use the enhanced species column in the absence of a species match if the enhanced species column is valid. 

In [None]:
# | export
class EnhanceSpeciesCB(Callback):
    """Enhance the 'SPECIES' column using the 'enhanced_species' column if conditions are met.

    Only the 'BIOTA' dataframe is processed. A row's `SPECIES` is replaced by its
    `enhanced_species` when `SPECIES` is -1 or 0 and `enhanced_species` is not null;
    every other row is left untouched.
    """

    def __init__(self):
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        self._enhance_species(tfm.dfs['BIOTA'])

    def _enhance_species(self, df: pd.DataFrame):
        "Overwrite unmatched `SPECIES` ids in place with the available `enhanced_species` ids."
        # NOTE(review): -1 and 0 are presumably the ids meaning "no usable species match" — confirm.
        # A boolean mask is used instead of a row-wise `apply`, which also keeps the
        # method well-defined on an empty dataframe.
        needs_fallback = df['SPECIES'].isin([-1, 0]) & df['enhanced_species'].notna()
        df.loc[needs_fallback, 'SPECIES'] = df.loc[needs_fallback, 'enhanced_species']

In [None]:
#| eval: false
# EnhanceSpeciesCB reads both `SPECIES` and `enhanced_species`, so the two RemapCB
# callbacks that create those columns are listed before it.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='species', dest_grps='BIOTA'),
    RemapCB(fn_lut=lut_biota_enhanced, col_remap='enhanced_species', col_src='biological group', dest_grps='BIOTA'),
    EnhanceSpeciesCB()
    ])

tfm()['BIOTA']['SPECIES'].unique()

array([ 377,  129,   96,  712,  192,   99,   50,  378,  270,  379,  380,
        381,  382,  383,  384,  385,  244,  386,  387,  388,  389,  390,
        391,  392,  393,  394,  395,  396,  274,  397,  398,  243,  399,
        400,  401,  402,  403,  404,  405,  406,  407, 1059,  191,  139,
        408,  410,  148,  412,  413,  272,  414,  415,  416,  417,  418,
        419,  420,  421,  422,  423,  424,  425,  426,  427,  428,  411,
        429,  430,  431,  432,  433,  434,  435,  436,  437,  438,  439,
        440,  441,  442,  443,  444,  294, 1607, 1610, 1609, 1605, 1608,
         23, 1606,  234,  556,  873,  158])

All entries are matched for the `SPECIES` column.

## Remap Biota tissues

The OSPAR dataset includes entries where the `Body Part` is labeled as `whole`. However, the MARIS data standard requires a more specific distinction in the `body_part` field, differentiating between `Whole animal` and `Whole plant`. Fortunately, the OSPAR data provides a `Biological group` field that allows us to make this distinction.

To address this discrepancy and ensure compatibility with MARIS standards, we will:

1. Create a temporary column `body_part_temp` that combines information from both `Body Part` and `Biological group`.
2. Use this temporary column to perform the lookup using our `Remapper` object.

Let's create the temporary column, `body_part_temp`, that combines `Body Part` and `Biological group`.

In [None]:
#| exports
class AddBodypartTempCB(Callback):
    "Add a temporary column with the body part and biological group combined."
    def __call__(self, tfm):
        biota = tfm.dfs['BIOTA']
        # "<body part> <biological group>", then trimmed at both ends and lower-cased
        combined = biota['body part'] + ' ' + biota['biological group']
        biota['body_part_temp'] = combined.str.strip().str.lower()

In [None]:
#|eval: false
# Build the temporary `body_part_temp` column and keep the result in `dfs_test`,
# which the following cells use to build the lookup table.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            AddBodypartTempCB(),
                            ])
dfs_test = tfm()
dfs_test['BIOTA']['body_part_temp'].unique()


array(['whole animal molluscs', 'whole plant seaweed', 'whole fish fish',
       'flesh without bones fish', 'whole animal fish', 'muscle fish',
       'head fish', 'soft parts molluscs', 'growing tips seaweed',
       'soft parts fish', 'unknown fish', 'flesh without bone fish',
       'flesh fish', 'flesh with scales fish', 'liver fish',
       'flesh without bones seaweed', 'whole  fish',
       'flesh without bones molluscs', 'whole  seaweed',
       'whole plant seaweeds', 'whole fish', 'whole without head fish',
       'mix of muscle and whole fish without liver fish',
       'whole fisk fish', 'muscle  fish', 'cod medallion fish',
       'tail and claws fish'], dtype=object)

To align the ``body_part_temp`` column with the ``bodypar`` column in the MARIS nomenclature, we utilize a Remapper object. Since the OSPAR dataset does not include a predefined lookup table for the ``body_part`` column, we first create a lookup table by extracting unique values from the ``body_part_temp`` column.

In [None]:
#| eval: false
# `dfs_test` is created in a cell marked `eval: false`, so this cell carries the same directive.
get_unique_across_dfs(dfs_test, col_name='body_part_temp', as_df=True).head()

Unnamed: 0,index,value
0,0,whole fish
1,1,whole fisk fish
2,2,muscle fish
3,3,flesh with scales fish
4,4,whole without head fish


We try to remap the `body_part_temp` column to the `bodypar` column of the MARIS nomenclature, again using a `Remapper` object:

In [None]:
#| eval: false
# Remapper matching the unique `body_part_temp` values against the `bodypar`
# column of the MARIS body parts table.
remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs_test, col_name='body_part_temp', as_df=True),
                    maris_lut_fn=bodyparts_lut_path,
                    maris_col_id='bodypar_id',
                    maris_col_name='bodypar',
                    provider_col_to_match='value',
                    provider_col_key='value',
                    fname_cache='tissues_ospar.pkl'
                    )

# A threshold of 0 keeps every entry, so the head shows the worst-scoring matches.
remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=0, verbose=True).head()

Processing:   0%|          | 0/27 [00:00<?, ?it/s]

Processing: 100%|██████████| 27/27 [00:00<00:00, 96.35it/s]

0 entries matched the criteria, while 27 entries had a match score of 0 or higher.





Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mix of muscle and whole fish without liver fish,Flesh without bones,mix of muscle and whole fish without liver fish,31
cod medallion fish,Old leaf,cod medallion fish,13
whole without head fish,Flesh without bones,whole without head fish,13
tail and claws fish,Stomach and intestine,tail and claws fish,13
whole fisk fish,Whole animal,whole fisk fish,9


Many of the lookup entries are sufficient for our needs. However, for values that don't find a match, we can use the `fixes_biota_tissues` dictionary to apply manual corrections. First we will create the dictionary.

In [None]:
#|exports
# Manual corrections passed as `fixes` to `Remapper.generate_lookup_table` for the body part remapping.
# Keys are `body_part_temp` values; values are the MARIS body part name to use, or `NA`.
# Each key appears once: repeated keys in a dict literal silently override each other.
# NOTE(review): the data shows 'whole  seaweed' with two spaces, which the single-spaced
# key below would not match exactly — confirm whether the key should be adjusted.
fixes_biota_tissues = {
    'whole seaweed' : 'Whole plant',
    'flesh fish': 'Flesh with bones', # We assume it as the category 'Flesh with bones' also exists
    'unknown fish' : NA,
    'cod medallion fish' : NA, # TO BE DETERMINED
    'mix of muscle and whole fish without liver fish' : NA, # TO BE DETERMINED
    'whole without head fish' : NA, # TO BE DETERMINED
    'flesh without bones seaweed' : NA, # TO BE DETERMINED
    'tail and claws fish' : NA # TO BE DETERMINED
}

Now we will generate the lookup table and apply the manual corrections of the ``fixes_biota_tissues`` dictionary.


In [None]:
#| eval: false
# Rebuild the lookup table with the manual fixes applied, then list the entries
# whose match score is still 1 or higher.
remapper.generate_lookup_table(fixes=fixes_biota_tissues)
remapper.select_match(match_score_threshold=1)

Processing:   0%|          | 0/27 [00:00<?, ?it/s]

Processing: 100%|██████████| 27/27 [00:00<00:00, 98.54it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
soft parts molluscs,Soft parts,soft parts molluscs,9
flesh without bones molluscs,Flesh without bones,flesh without bones molluscs,9
whole plant seaweeds,Whole plant,whole plant seaweeds,9
whole animal molluscs,Whole animal,whole animal molluscs,9
whole fish fish,Whole animal,whole fish fish,9
whole fisk fish,Whole animal,whole fisk fish,9
growing tips seaweed,Growing tips,growing tips seaweed,8
whole plant seaweed,Whole plant,whole plant seaweed,8
whole seaweed,Whole plant,whole seaweed,7
muscle fish,Muscle,muscle fish,6


At this stage, the majority of entries have been successfully matched to MARIS nomenclature. For those entries that remain unmatched, they are appropriately marked as not available. We can now proceed with the final remapping process:

1. Create Remapper Lambda Function:

   We'll define a lambda function that instantiates a Remapper object and returns its corrected lookup table.

2. Apply RemapCB: 

   Using the generic `RemapCB` callback, we'll perform the actual remapping.

In [None]:
#| exports
def lut_bodyparts():
    "Build a body part `Remapper` and return its lookup table (not as a DataFrame) with `fixes_biota_tissues` applied."
    # NOTE(review): reads the module-level `tfm` (its `dfs` must already contain the
    # `body_part_temp` column), so `tfm` must exist when this function is called.
    bodypart_remapper = Remapper(
        provider_lut_df=get_unique_across_dfs(tfm.dfs, col_name='body_part_temp', as_df=True),
        maris_lut_fn=bodyparts_lut_path,
        maris_col_id='bodypar_id',
        maris_col_name='bodypar',
        provider_col_to_match='value',
        provider_col_key='value',
        fname_cache='tissues_ospar.pkl',
    )
    return bodypart_remapper.generate_lookup_table(fixes=fixes_biota_tissues, as_df=False, overwrite=False)

Putting it all together, we now apply the `RemapCB` to our data. This process results in the addition of a `BODY_PART` column to our `BIOTA` dataframe, containing standardized body part IDs.

In [None]:
#|eval: false
# AddBodypartTempCB creates `body_part_temp`, which RemapCB then maps into `BODY_PART`.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            AddBodypartTempCB(),
                            RemapCB(fn_lut=lut_bodyparts, col_remap='BODY_PART', col_src='body_part_temp' , dest_grps='BIOTA')
                            ])
tfm()
# Distinct body part ids produced by the remapping
tfm.dfs['BIOTA']['BODY_PART'].unique()

array([ 1, 40, 52, 34, 13, 19, 56,  0,  4, 60, 25])

## Remap biogroup

The MARIS species lookup table includes a ``biogroup_id`` column that associates each species with its corresponding ``biogroup``. We will leverage this relationship to populate a ``bio_group`` column in the biota DataFrame.

In [None]:
#| exports
def lut_biogroup_from_biota():
    "Return the `species_id` -> `biogroup_id` lookup obtained from the MARIS species table via `get_lut`."
    lut_path = species_lut_path()
    return get_lut(src_dir=lut_path.parent, fname=lut_path.name,
                   key='species_id', value='biogroup_id')

In [None]:
#| eval: false
# The last RemapCB derives `BIO_GROUP` from the `SPECIES` column, so it runs after
# the callbacks that create and enhance `SPECIES`.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='species', dest_grps='BIOTA'),
    RemapCB(fn_lut=lut_biota_enhanced, col_remap='enhanced_species', col_src='biological group', dest_grps='BIOTA'),
    EnhanceSpeciesCB(),
    RemapCB(fn_lut=lut_biogroup_from_biota, col_remap='BIO_GROUP', col_src='SPECIES', dest_grps='BIOTA')
    ])

print(tfm()['BIOTA']['BIO_GROUP'].unique())


[14 11  4 13 12  2  5  6]


## Add Laboratory ID (REVIEW)

See helcom.ipynb for details regarding the review of the laboratory ID column.

Lets use the utility `get_unique_across_dfs` function to review the unique laboratory IDs in the OSPAR dataset:

In [None]:
#| eval: false
# `tfm` is created in a cell marked `eval: false`, so this cell carries the same directive.
tfm.dfs['BIOTA'].columns

Index(['id', 'contracting party', 'rsc sub-division', 'station id',
       'sample id', 'latd', 'latm', 'lats', 'latdir', 'longd', 'longm',
       'longs', 'longdir', 'sample type', 'biological group', 'species',
       'body part', 'sampling date', 'nuclide', 'value type',
       'activity or mda', 'uncertainty', 'unit', 'data provider',
       'measurement comment', 'sample comment', 'reference comment', 'SPECIES',
       'enhanced_species', 'BIO_GROUP'],
      dtype='object')

In [None]:
#| eval: false
# `tfm` is created in a cell marked `eval: false`, so this cell carries the same directive.
# Distinct (data provider, contracting party) pairs found in the BIOTA dataframe.
tfm.dfs['BIOTA'][['data provider','contracting party']].drop_duplicates().head(5)

Unnamed: 0,data provider,contracting party
0,SCKâ¢CEN,Belgium
172,RisÃ¸-DTU,Denmark
205,Johann Heinrich von ThÃ¼nen Institute (vTI),Germany
236,"Institute for Energy Technology, Kjeller, Norway",Norway
249,Institute of Marine Research/Norwegian Radiation Protection Authority,Norway


The `LAB` information could be included with a little work. 

## Add Sample ID (REVIEW)

See helcom.ipynb for details regarding the review of the sample ID (i.e. ``SMP_ID``) column.


The OSPAR dataset includes an `ID` column, which we will use to create the `SMP_ID` column.

In [None]:
#| exports
class AddSampleIdCB(Callback):
    "Create a SMP_ID column from the ID column"
    def __call__(self, tfm):
        # Dataframes lacking an `id` column are left untouched
        dfs_with_id = (df for df in tfm.dfs.values() if 'id' in df.columns)
        for df in dfs_with_id:
            df['SMP_ID'] = df['id']

In [None]:
#| eval: false
# Add the `SMP_ID` column and check that no rows were removed in the process.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            AddSampleIdCB(),
                            CompareDfsAndTfmCB(dfs)

                            ])
tfm()
for grp in ['BIOTA', 'SEAWATER']:
    print(f"{grp}: {tfm.dfs[grp]['SMP_ID'].unique()}")

# Row counts before/after the transformation
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
    

BIOTA: [    1     2     3 ... 98060 98061 98062]
SEAWATER: [     1      2      3 ... 120366 120367 120368]
                           SEAWATER  BIOTA
Number of rows in dfs         19193  15951
Number of rows in tfm.dfs     19193  15951
Number of rows removed            0      0 



## Add depth

The OSPAR dataset includes a column for the sampling depth (`Sampling depth`) for the `SEAWATER` dataset. In this section, we will create a callback to incorporate the sampling depth (`SMP_DEPTH`) into the MARIS dataset.

In [None]:
class AddDepthCB(Callback):
    "Ensure depth values are floats and add 'SMP_DEPTH' columns."
    # NOTE(review): this cell carries no nbdev export directive — confirm whether the class should be exported.
    def __call__(self, tfm: Transformer):
        for grp in tfm.dfs:
            df = tfm.dfs[grp]
            # Groups without a `sampling depth` column get no `SMP_DEPTH`
            if 'sampling depth' not in df.columns:
                continue
            df['SMP_DEPTH'] = df['sampling depth'].astype(float)

In [None]:
#| eval: false
# Add `SMP_DEPTH` where a sampling depth is available and show its distinct values.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    AddDepthCB()
    ])
tfm()
for grp in tfm.dfs.keys():
    # Only groups that received the column are printed
    if 'SMP_DEPTH' in tfm.dfs[grp].columns:
        print(f'{grp}:', tfm.dfs[grp][['SMP_DEPTH']].drop_duplicates())

SEAWATER:        SMP_DEPTH
0            3.0
80           2.0
81          21.0
85          31.0
87          32.0
...          ...
16022       71.0
16023       66.0
16025       81.0
16385     1660.0
16389     1500.0

[134 rows x 1 columns]


## Standardize Coordinates

The OSPAR dataset offers coordinates in degrees, minutes, and seconds (DMS). The following callback is designed to convert DMS to decimal degrees. 

In [None]:
# | export
class ConvertLonLatCB(Callback):
    """Convert Coordinates to decimal degrees (DDD.DDDDD°)."""
    # NOTE: keep the docstring above unchanged; it appears to be the sentence
    # reported for this callback in the transformation logs (`tfm.logs`).
    def __init__(self):
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        "Add decimal-degree `LAT` and `LON` columns to every DataFrame in `tfm.dfs`."
        for df in tfm.dfs.values():
            df['LAT'] = self._convert_latitude(df)
            df['LON'] = self._convert_longitude(df)

    def _convert_latitude(self, df: pd.DataFrame) -> np.ndarray:
        "Return decimal latitudes from `latd`/`latm`/`lats`, negated where `latdir` is 'S'."
        lat = self._dms_to_decimal(df['latd'], df['latm'], df['lats'])
        # Compute the magnitude once and flip the sign for the southern hemisphere.
        return np.where(df['latdir'].isin(['S']), lat * -1, lat)

    def _convert_longitude(self, df: pd.DataFrame) -> np.ndarray:
        "Return decimal longitudes from `longd`/`longm`/`longs`, negated where `longdir` is 'W'."
        lon = self._dms_to_decimal(df['longd'], df['longm'], df['longs'])
        # Compute the magnitude once and flip the sign for the western hemisphere.
        return np.where(df['longdir'].isin(['W']), lon * -1, lon)

    def _dms_to_decimal(self, degrees: pd.Series, minutes: pd.Series, seconds: pd.Series) -> pd.Series:
        "Combine degrees, minutes and seconds into unsigned decimal degrees."
        return degrees + minutes / 60 + seconds / 3600



In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[ConvertLonLatCB()])
tfm()
# Display the converted coordinates next to the DMS columns they were derived from.
coord_cols = ['LAT','latd', 'latm', 'lats', 'LON', 'latdir', 'longd', 'longm','longs', 'longdir']
tfm.dfs['SEAWATER'][coord_cols]

Unnamed: 0,LAT,latd,latm,lats,LON,latdir,longd,longm,longs,longdir
0,51.375278,51,22.0,31.0,3.188056,N,3,11.0,17.0,E
1,51.223611,51,13.0,25.0,2.859444,N,2,51.0,34.0,E
2,51.184444,51,11.0,4.0,2.713611,N,2,42.0,49.0,E
3,51.420278,51,25.0,13.0,3.262222,N,3,15.0,44.0,E
4,51.416111,51,24.0,58.0,2.809722,N,2,48.0,35.0,E
...,...,...,...,...,...,...,...,...,...,...
19188,53.600000,53,36.0,0.0,-5.933333,N,5,56.0,0.0,W
19189,53.733333,53,44.0,0.0,-5.416667,N,5,25.0,0.0,W
19190,53.650000,53,39.0,0.0,-5.233333,N,5,14.0,0.0,W
19191,53.883333,53,53.0,0.0,-5.550000,N,5,33.0,0.0,W


`SanitizeLonLatCB` drops a row when both longitude and latitude equal 0, or when the row contains unrealistic longitude or latitude values. It also converts a `,` decimal separator to `.` in longitude and latitude values.

In [None]:
#|eval: false
dfs = load_data(fname_in)
# Convert first, then sanitize; CompareDfsAndTfmCB reports how many rows were dropped.
tfm = Transformer(dfs, cbs=[
    ConvertLonLatCB(),
    SanitizeLonLatCB(),
    CompareDfsAndTfmCB(dfs),
])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')

biota_coords = tfm.dfs['BIOTA'][['LAT','LON']]
print(biota_coords)


                           SEAWATER  BIOTA
Number of rows in dfs         19193  15951
Number of rows in tfm.dfs     19193  15951
Number of rows removed            0      0 

             LAT        LON
0      51.393333   4.031111
1      51.393333   4.031111
2      51.393333   4.031111
3      51.393333   4.031111
4      51.393333   4.031111
...          ...        ...
15946  57.252500  12.087778
15947  57.306389  12.107500
15948  58.603333  11.245000
15949  57.302500  11.905278
15950  57.335278  12.076667

[15951 rows x 2 columns]


## Review all callbacks

In [None]:
#|eval: false
dfs = load_data(fname_in)
# NOTE(review): `encode` (defined further down) additionally applies
# `RemapCB(fn_lut=lut_biogroup_from_biota, col_remap='BIO_GROUP', ...)`,
# which is not part of this review pipeline.
tfm = Transformer(dfs, cbs=[
                            # Normalize 'nuclide' in place: RemapNuclideNameCB below reads
                            # that same column (a 'Nuclide' destination would be ignored).
                            LowerStripNameCB(col_src='nuclide', col_dst='nuclide'),
                            RemapNuclideNameCB(lut_nuclides, col_name='nuclide'),
                            ParseTimeCB(),
                            EncodeTimeCB(),
                            SanitizeValueCB(),
                            NormalizeUncCB(),
                            RemapUnitCB(renaming_unit_rules),
                            RemapDetectionLimitCB(coi_dl, lut_dl),
                            RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='species', dest_grps='BIOTA'),
                            RemapCB(fn_lut=lut_biota_enhanced, col_remap='enhanced_species', col_src='biological group', dest_grps='BIOTA'),
                            EnhanceSpeciesCB(),
                            AddBodypartTempCB(),
                            RemapCB(fn_lut=lut_bodyparts, col_remap='BODY_PART', col_src='body_part_temp' , dest_grps='BIOTA'),
                            AddSampleIdCB(),
                            AddDepthCB(),
                            ConvertLonLatCB(),
                            SanitizeLonLatCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')

                           SEAWATER  BIOTA
Number of rows in dfs         19193  15951
Number of rows in tfm.dfs     19183  15951
Number of rows removed           10      0 



### Example change logs

Review the change logs for the NetCDF encoding.

In [None]:
#|eval: false
# Run the cleaning pipeline without CompareDfsAndTfmCB, then inspect `tfm.logs`.
# Judging by the output below, each callback contributes its docstring as one log entry.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            LowerStripNameCB(col_src='nuclide', col_dst='nuclide'),
                            RemapNuclideNameCB(lut_nuclides, col_name='nuclide'),
                            ParseTimeCB(),
                            EncodeTimeCB(),
                            SanitizeValueCB(),
                            NormalizeUncCB(),
                            RemapUnitCB(renaming_unit_rules),
                            RemapDetectionLimitCB(coi_dl, lut_dl),
                            RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='species', dest_grps='BIOTA'),    
                            RemapCB(fn_lut=lut_biota_enhanced, col_remap='enhanced_species', col_src='biological group', dest_grps='BIOTA'),    
                            EnhanceSpeciesCB(),
                            AddBodypartTempCB(),
                            RemapCB(fn_lut=lut_bodyparts, col_remap='BODY_PART', col_src='body_part_temp' , dest_grps='BIOTA'),
                            AddSampleIdCB(),
                            AddDepthCB(),    
                            ConvertLonLatCB(),
                            SanitizeLonLatCB(),
                            ])

# Transform
tfm()
# Check transformation logs
tfm.logs

["Convert 'nuclide' column values to lowercase, strip spaces, and store in 'nuclide' column.",
 'Remap data provider nuclide names to standardized MARIS nuclide names.',
 'Parse the time format in the dataframe.',
 'Encode time as seconds since epoch.',
 'Sanitize value by removing blank entries and populating `value` column.',
 'Normalize uncertainty values in DataFrames.',
 "Callback to update DataFrame 'UNIT' columns based on a lookup table.",
 'Remap value type to MARIS format.',
 "Remap values from 'species' to 'SPECIES' for groups: BIOTA.",
 "Remap values from 'biological group' to 'enhanced_species' for groups: BIOTA.",
 "Enhance the 'SPECIES' column using the 'enhanced_species' column if conditions are met.",
 'Add a temporary column with the body part and biological group combined.',
 "Remap values from 'body_part_temp' to 'BODY_PART' for groups: BIOTA.",
 'Create a SMP_ID column from the ID column',
 "Ensure depth values are floats and add 'SMP_DEPTH' columns.",
 'Convert Coo

## Feed global attributes

In [None]:
#| export
# Default keywords; `get_attrs` joins them with ', ' into the `keywords` global attribute,
# so these strings end up verbatim in the NetCDF metadata.
# NOTE(review): two entries below already hold two comma-separated keywords each, and
# 'Ocean Chemistry> Radionuclides' lacks a space before '>' -- confirm against the
# keyword vocabulary in use before normalizing them.
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']


In [None]:
#| exports
def get_attrs(
    tfm: Transformer, # Transformer whose `dfs` and `logs` feed the attributes
    zotero_key: str, # Zotero dataset record key
    kw: list = kw # Keywords, joined with ', ' into the `keywords` attribute
    ) -> dict: # Global attributes
    """Retrieve all global attributes.

    Runs a `GlobAttrsFeeder` over `tfm.dfs` with the bounding-box, depth-range,
    time-range and Zotero callbacks, plus two key/value pairs: the keywords and
    the transformation logs (`tfm.logs`), each joined with ', '.
    """
    frames = tfm.dfs
    attr_cbs = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    feeder = GlobAttrsFeeder(frames, cbs=attr_cbs)
    return feeder()

In [None]:
#|eval: false
get_attrs(tfm, zotero_key=zotero_key, kw=kw)

{'geospatial_lat_min': '49.43222222222222',
 'geospatial_lat_max': '81.26805555555555',
 'geospatial_lon_min': '-58.23166666666667',
 'geospatial_lon_max': '36.181666666666665',
 'geospatial_bounds': 'POLYGON ((-58.23166666666667 36.181666666666665, 49.43222222222222 36.181666666666665, 49.43222222222222 81.26805555555555, -58.23166666666667 81.26805555555555, -58.23166666666667 36.181666666666665))',
 'geospatial_vertical_max': '1850.0',
 'geospatial_vertical_min': '0.0',
 'time_coverage_start': '1995-01-01T00:00:00',
 'time_coverage_end': '2022-12-31T00:00:00',
 'title': 'OSPAR Environmental Monitoring of Radioactive Substances',
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "", "lastName": "OSPAR Comission\'s Radioactive Substances Committee (RSC)"}]',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Che

### Encoding NETCDF

In [None]:
#| export
def encode(
    fname_in: str, # Input file name
    fname_out_nc: str, # Output file name
    **kwargs # Additional arguments; only `verbose` is read (default: False)
    ) -> None:
    """Encode data to NetCDF.

    Loads `fname_in`, runs the cleaning and remapping callbacks below, and
    writes the transformed groups to `fname_out_nc` together with the global
    attributes returned by `get_attrs`. Relies on the module-level
    `zotero_key` and `kw`.
    """
    dfs = load_data(fname_in)
    pipeline = [
        LowerStripNameCB(col_src='nuclide', col_dst='nuclide'),
        RemapNuclideNameCB(lut_nuclides, col_name='nuclide'),
        ParseTimeCB(),
        EncodeTimeCB(),
        SanitizeValueCB(),
        NormalizeUncCB(),
        RemapUnitCB(renaming_unit_rules),
        RemapDetectionLimitCB(coi_dl, lut_dl),
        RemapCB(fn_lut=lut_biota, col_remap='SPECIES', col_src='species', dest_grps='BIOTA'),
        RemapCB(fn_lut=lut_biota_enhanced, col_remap='enhanced_species', col_src='biological group', dest_grps='BIOTA'),
        EnhanceSpeciesCB(),
        RemapCB(fn_lut=lut_biogroup_from_biota, col_remap='BIO_GROUP', col_src='SPECIES', dest_grps='BIOTA'),
        AddBodypartTempCB(),
        RemapCB(fn_lut=lut_bodyparts, col_remap='BODY_PART', col_src='body_part_temp' , dest_grps='BIOTA'),
        AddSampleIdCB(),
        AddDepthCB(),
        ConvertLonLatCB(),
        SanitizeLonLatCB(),
    ]
    tfm = Transformer(dfs, cbs=pipeline)
    tfm()

    # Attributes are computed after the transform so they can include `tfm.logs`.
    encoder = NetCDFEncoder(
        tfm.dfs,
        dest_fname=fname_out_nc,
        global_attrs=get_attrs(tfm, zotero_key=zotero_key, kw=kw),
        verbose=kwargs.get('verbose', False),
    )
    encoder.encode()

In [None]:
#|eval: false
encode(fname_in, fname_out_nc, verbose=True)

--------------------------------------------------------------------------------
Creating enums for the following columns:
['UNIT', 'DL', 'SPECIES', 'BIO_GROUP', 'BODY_PART', 'NUCLIDE']
Creating enum for unit_t with values {'Not applicable': -1, 'NOT AVAILABLE': 0, 'Bq per m3': 1, 'Bq per m2': 2, 'Bq per kg': 3, 'Bq per kgd': 4, 'Bq per kgw': 5, 'kg per kg': 6, 'TU': 7, 'DELTA per mill': 8, 'atom per kg': 9, 'atom per kgd': 10, 'atom per kgw': 11, 'atom per l': 12, 'Bq per kgC': 13}.
Creating enum for dl_t with values {'Not applicable': -1, 'Not available': 0, 'Detected value': 1, 'Detection limit': 2, 'Not detected': 3, 'Derived': 4}.
Creating enum for species_t with values {'NOT AVAILABLE': 0, 'Aristeus antennatus': 1, 'Apostichopus': 2, 'Saccharina japonica var religiosa': 3, 'Siganus fuscescens': 4, 'Alpheus dentipes': 5, 'Hexagrammos agrammus': 6, 'Ditrema temminckii': 7, 'Parapristipoma trilineatum': 8, 'Scombrops boops': 9, 'Pseudopleuronectes schrenki': 10, 'Desmarestia ligulat

## NetCDF Review

First, let's review the general properties of the NetCDF file:

In [None]:
#| eval: false
properties = get_netcdf_properties(fname_out_nc)
# Print scalar properties on one line; expand dict-valued ones as indented sub-items.
for key, val in properties.items():
    if not isinstance(val, dict):
        print(f"{key}: {val}")
        continue
    print(f"{key}:")
    for sub_key, sub_val in val.items():
        print(f"  {sub_key}: {sub_val}")

file_size_bytes: 607503
file_format: NETCDF4
groups: ['seawater', 'biota']
global_attributes:
  id: TBD
  title: OSPAR Environmental Monitoring of Radioactive Substances
  summary: 
  keywords: oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments, Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes, Earth Science > Oceans > Water Quality > Ocean Contaminants, Earth Science > Biological Classification > Animals/Vertebrates > Fish, Earth Science > Biosphere > Ecosystems > Marine Ecosystems, Earth Science > Biological Classification > Animals/Invertebrates > Mollusks, Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans, Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)
  history: TBD
  key

Review the `publisher_postprocess_logs` global attribute.

In [None]:
#| eval: false
print(properties['global_attributes']['publisher_postprocess_logs'])

Convert 'nuclide' column values to lowercase, strip spaces, and store in 'nuclide' column., Remap data provider nuclide names to standardized MARIS nuclide names., Parse the time format in the dataframe., Encode time as seconds since epoch., Sanitize value by removing blank entries and populating `value` column., Normalize uncertainty values in DataFrames., Callback to update DataFrame 'UNIT' columns based on a lookup table., Remap value type to MARIS format., Remap values from 'species' to 'SPECIES' for groups: BIOTA., Remap values from 'biological group' to 'enhanced_species' for groups: BIOTA., Enhance the 'SPECIES' column using the 'enhanced_species' column if conditions are met., Remap values from 'SPECIES' to 'BIO_GROUP' for groups: BIOTA., Add a temporary column with the body part and biological group combined., Remap values from 'body_part_temp' to 'BODY_PART' for groups: BIOTA., Create a SMP_ID column from the ID column, Ensure depth values are floats and add 'SMP_DEPTH' colum

Now let's review the properties of the groups in the NetCDF file:

In [None]:
#| eval: false
# NOTE(review): this rebinds `properties`, so re-running the earlier
# `publisher_postprocess_logs` cell afterwards would see group properties instead.
properties = get_netcdf_group_properties(fname_out_nc)

# Print scalar properties on one line; expand dict-valued ones as indented sub-items.
for key, val in properties.items():
    if not isinstance(val, dict):
        print(f"{key}: {val}")
        continue
    print(f"{key}:")
    for sub_key, sub_val in val.items():
        print(f"  {sub_key}: {sub_val}")

seawater:
  variables: ['lon', 'lat', 'smp_depth', 'time', 'smp_id', 'nuclide', 'value', 'unit', 'unc', 'dl']
  dimensions: {'id': 19183}
  attributes: {}
biota:
  variables: ['lon', 'lat', 'time', 'smp_id', 'nuclide', 'value', 'unit', 'unc', 'dl', 'bio_group', 'species', 'body_part']
  dimensions: {'id': 15951}
  attributes: {}


Let's review all variable attributes for the groups of the NetCDF file:

In [None]:
#| eval: false
df_var_prop=get_netcdf_variable_properties(fname_out_nc, as_df=True).T
df_var_prop

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
group,seawater,seawater,seawater,seawater,seawater,seawater,seawater,seawater,seawater,seawater,...,biota,biota,biota,biota,biota,biota,biota,biota,biota,biota
variable,lon,lat,smp_depth,time,smp_id,nuclide,value,unit,unc,dl,...,time,smp_id,nuclide,value,unit,unc,dl,bio_group,species,body_part
dimensions_id,"('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)",...,"('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)"
dimensions_size,"(19183,)","(19183,)","(19183,)","(19183,)","(19183,)","(19183,)","(19183,)","(19183,)","(19183,)","(19183,)",...,"(15951,)","(15951,)","(15951,)","(15951,)","(15951,)","(15951,)","(15951,)","(15951,)","(15951,)","(15951,)"
data_type,<f4,<f4,<f4,<u8,<u8,<i8,<f4,<i8,<f4,<i8,...,<u8,<u8,<i8,<f4,<i8,<f4,<i8,<i8,<i8,<i8
attr_long_name,Measurement longitude,Measurement latitude,Sample depth below seal level,Time of measurement,Data provider sample ID,Nuclide,Activity,Unit,Uncertainty,Detection limit,...,Time of measurement,Data provider sample ID,Nuclide,Activity,Unit,Uncertainty,Detection limit,Biota group,Species,Body part
attr_standard_name,longitude,latitude,sample_depth_below_sea_floor,time,sample_id,nuclide,activity,unit,uncertainty,detection_limit,...,time,sample_id,nuclide,activity,unit,uncertainty,detection_limit,biota_group_tbd,species,body_part_tbd
attr_units,degrees_east,degrees_north,m,seconds since 1970-01-01 00:00:00.0,,,,,,,...,seconds since 1970-01-01 00:00:00.0,,,,,,,,,
attr_axis,,,Z,T,,,,,,,...,T,,,,,,,,,
attr_time_origin,,,,1970-01-01 00:00:00,,,,,,,...,1970-01-01 00:00:00,,,,,,,,,


Let's convert the NetCDF file to a dictionary of DataFrames:

In [None]:
#| eval: false
dfs=nc_to_dfs(fname_out_nc)

Let's review the biota data:

In [None]:
#| eval: false
nc_dfs_biota=dfs['BIOTA']
nc_dfs_biota

Unnamed: 0,lon,lat,time,smp_id,nuclide,value,unit,unc,dl,bio_group,species,body_part
0,4.031111,51.393333,2010-03-03,1,33,0.326416,5,,2,14,377,1
1,4.031111,51.393333,2010-06-14,2,33,0.442704,5,,2,14,377,1
2,4.031111,51.393333,2010-09-27,3,33,0.412989,5,,2,14,377,1
3,4.031111,51.393333,2010-12-08,4,33,0.202768,5,,2,14,377,1
4,4.031111,51.393333,2010-03-03,5,53,0.652833,5,,2,14,377,1
...,...,...,...,...,...,...,...,...,...,...,...,...
15946,12.087778,57.252499,2022-08-09,98058,33,0.384000,5,0.012096,1,4,272,52
15947,12.107500,57.306389,2022-09-23,98059,33,0.456000,5,0.012084,1,4,272,52
15948,11.245000,58.603333,2022-11-07,98060,33,0.122000,5,0.031000,1,14,129,19
15949,11.905278,57.302502,2022-09-20,98061,33,0.310000,5,,2,14,129,19


Let's review the seawater data:

In [None]:
#| eval: false
nc_dfs_seawater=dfs['SEAWATER']
nc_dfs_seawater

Unnamed: 0,lon,lat,smp_depth,time,smp_id,nuclide,value,unit,unc,dl
0,3.188056,51.375278,3.0,2010-01-27 00:00:00,1,33,0.200000,1,,2
1,2.859444,51.223610,3.0,2010-01-27 00:00:00,2,33,0.270000,1,,2
2,2.713611,51.184444,3.0,2010-01-27 00:00:00,3,33,0.260000,1,,2
3,3.262222,51.420277,3.0,2010-01-27 00:00:00,4,33,0.250000,1,,2
4,2.809722,51.416111,3.0,2010-01-26 00:00:00,5,33,0.200000,1,,2
...,...,...,...,...,...,...,...,...,...,...
19178,4.615278,52.831944,1.0,2019-11-13 12:54:00,97102,77,0.000005,1,2.600000e-07,1
19179,3.565556,51.411945,1.0,2019-12-10 11:37:00,96936,1,6.152000,1,3.076000e-01,1
19180,3.565556,51.411945,1.0,2019-12-10 11:37:00,96949,53,0.005390,1,1.078000e-03,1
19181,3.565556,51.411945,1.0,2019-12-10 11:37:00,96962,54,0.001420,1,2.840000e-04,1


## Data Format Conversion 

The MARIS data processing workflow involves two key steps:

1. **NetCDF to Standardized CSV Compatible with OpenRefine Pipeline**
   - Convert standardized NetCDF files to CSV formats compatible with OpenRefine using the `NetCDFDecoder`.
   - Preserve data integrity and variable relationships.
   - Maintain standardized nomenclature and units.

2. **Database Integration**
   - Process the converted CSV files using OpenRefine.
   - Apply data cleaning and standardization rules.
   - Export validated data to the MARIS master database.

This section focuses on the first step: converting NetCDF files to a format suitable for OpenRefine processing using the `NetCDFDecoder` class.

In [None]:
#|eval: false
decode(fname_in=fname_out_nc, verbose=True)

{'SEAWATER':             LON        LAT  SMP_DEPTH        TIME  SMP_ID  NUCLIDE     VALUE  \
0      3.188056  51.375278        3.0  1264550400       1       33  0.200000   
1      2.859444  51.223610        3.0  1264550400       2       33  0.270000   
2      2.713611  51.184444        3.0  1264550400       3       33  0.260000   
3      3.262222  51.420277        3.0  1264550400       4       33  0.250000   
4      2.809722  51.416111        3.0  1264464000       5       33  0.200000   
...         ...        ...        ...         ...     ...      ...       ...   
19178  4.615278  52.831944        1.0  1573649640   97102       77  0.000005   
19179  3.565556  51.411945        1.0  1575977820   96936        1  6.152000   
19180  3.565556  51.411945        1.0  1575977820   96949       53  0.005390   
19181  3.565556  51.411945        1.0  1575977820   96962       54  0.001420   
19182  3.493889  51.719444        1.0  1576680180   96982        1  6.078000   

       UNIT           UNC 