In [None]:
#| default_exp handlers.ospar

# OSPAR 

> This data pipeline, known as a "handler" in Marisco terminology, is designed to clean, standardize, and encode [OSPAR data](https://odims.ospar.org/en/) into `NetCDF` format. The handler processes raw OSPAR data, applying various transformations and lookups to align it with `MARIS` data standards.

Key functions of this handler:

- **Cleans** and **normalizes** raw OSPAR data
- **Applies standardized nomenclature** and units
- **Encodes the processed data** into `NetCDF` format compatible with MARIS requirements

This handler is a crucial component in the Marisco data processing workflow, ensuring OSPAR data is properly integrated into the MARIS database.



Note: *Additionally, an optional encoder (pipeline) is provided below to process data into a `.csv` format compatible with the MARIS master database. This feature is maintained for legacy purposes, as data ingestion was previously performed using OpenRefine.*

:::{.callout-tip}

For new MARIS users, please refer to [Understanding MARIS Data Formats (NetCDF and Open Refine)](https://github.com/franckalbinet/marisco/tree/main/install_configure_guide) for detailed information.

:::

The present notebook aims to be an instance of [Literate Programming](https://www.wikiwand.com/en/articles/Literate_programming) in the sense that it is a narrative in which code snippets are interspersed with explanations. When a function or a class needs to be exported to a dedicated Python module (in our case `marisco/handlers/ospar.py`), the code snippet is added to the module using `#| exports` as provided by the wonderful [nbdev](https://nbdev.readthedocs.io/en/latest/) library.

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
import pandas as pd 
import numpy as np
from functools import partial 
import fastcore.all as fc 
from pathlib import Path 
from dataclasses import asdict
from typing import List, Dict, Callable, Tuple, Any 
from collections import OrderedDict, defaultdict
import re

from marisco.utils import (
    has_valid_varname, 
    match_worms, 
    Remapper, 
    ddmm_to_dd,
    match_maris_lut, 
    Match, 
    get_unique_across_dfs
    )

from marisco.callbacks import (
    Callback, 
    Transformer, 
    RemoveAllNAValuesCB,
    EncodeTimeCB, 
    AddSampleTypeIdColumnCB,
    AddNuclideIdColumnCB, 
    LowerStripNameCB, 
    SanitizeLonLatCB, 
    ReshapeLongToWide, 
    CompareDfsAndTfmCB,
    RemoveAllNAValuesCB
    )

from marisco.metadata import (
    GlobAttrsFeeder, 
    BboxCB, 
    DepthRangeCB, 
    TimeRangeCB, 
    ZoteroCB, 
    KeyValuePairCB
    )

from marisco.configs import (
    nuc_lut_path, 
    nc_tpl_path, 
    cfg, 
    cache_path, 
    cdl_cfg, 
    Enums, 
    lut_path, 
    species_lut_path, 
    sediments_lut_path, 
    bodyparts_lut_path, 
    detection_limit_lut_path, 
    filtered_lut_path, 
    area_lut_path,
    get_lut,
    unit_lut_path
    )

from marisco.utils import NA
from marisco.serializers import NetCDFEncoder,  OpenRefineCsvEncoder

import warnings
warnings.filterwarnings('ignore')

In [None]:
#| hide
pd.set_option('display.max_rows', 100)

In [None]:
warnings.filterwarnings('ignore')

## Configuration and file paths

1. **fname_in** - is the path to the folder containing the OSPAR data in CSV format. The path can be defined as a relative path. 

2. **fname_out_nc** - is the path and filename for the NetCDF output. The path can be defined as a relative path. 

3. **fname_out_csv** - is the path and filename for the Open Refine csv output. The path can be defined as a relative path.

4. **Zotero key** - is used to retrieve attributes related to the dataset from [Zotero](https://www.zotero.org/). The MARIS datasets include a [library](https://maris.iaea.org/datasets) available on [Zotero](https://www.zotero.org/groups/2432820/maris/library). 

5. **ref_id** - refers to the location in archive of the Zotero library.


In [None]:
# | exports
fname_in = '../../_data/accdb/ospar/csv'
fname_out_nc = '../../_data/output/191-OSPAR-2024.nc'
fname_out_csv = '../../_data/output/191-OSPAR-2024.csv'
zotero_key ='LQRA4MMK' # OSPAR MORS zotero key
ref_id = 191 # OSPAR reference id as defined by MARIS

## Load data

[OSPAR Environmental Monitoring Data](https://odims.ospar.org/en/) is provided as a Microsoft Access database. [`Mdbtools`](https://github.com/mdbtools/mdbtools) can be used to convert the tables of the Microsoft Access database to `.csv` files on Unix-like OS.

**Example steps**:

1. [Download data](https://odims.ospar.org/en/)
2. Install `mdbtools` via `VScode` Terminal (for instance):

    ```
    sudo apt-get -y install mdbtools
    ````

3. Install unzip via VScode Terminal 

    ```
    sudo apt-get -y install unzip
    ````

4. In `VS code` terminal (for instance), navigate to the marisco data folder

    ```
    cd /home/marisco/downloads/marisco/_data/accdb/ospar
    ```

5. Unzip `OSPAR_Env_Concentrations_20240206.zip`

    ```
    unzip OSPAR_Env_Concentrations_20240206.zip
    ```

6. Run `preprocess.sh` to generate the required data files

    ```
    ./preprocess.sh OSPAR_Env_Concentrations_20240206.zip
    ````

7. Content of `preprocess.sh` script:
    ```
    #!/bin/bash

    # Example of use: ./preprocess.sh OSPAR_Env_Concentrations_20240206.zip
    unzip $1
    dbname=$(ls *.accdb *.mdb)
    mkdir csv
    for table in $(mdb-tables -1 "$dbname"); do
        echo "Export table $table"
        mdb-export "$dbname" "$table" > "csv/$table.csv"
    done
    ```

Once converted to `.csv` files, the data is ready to be loaded into a dictionary of dataframes.
    

Load OSPAR data and return the data in a Python dictionary of dataframes with the dictionary key as the sample type.

In [None]:
#| exports
# Maps each source CSV file stem (file name without `.csv`) to the sample type used as dictionary key by `load_data`.
default_smp_types = {'Seawater data': 'seawater', 'Biota data': 'biota'}

In [None]:
#| exports
def load_data(src_dir:str, # Directory where the source CSV files are located
              lut:dict=default_smp_types # A dictionary with the file name as key and the sample type as value
              ) -> dict: # A dictionary with sample types as keys and their corresponding dataframes as values
    "Read every `OSPAR` CSV file listed in `lut` from `src_dir` and return the dataframes keyed by sample type."
    csv_dir = Path(src_dir)
    dfs_by_type = {}
    for file_name, sample_type in lut.items():
        # `lut` keys are file stems; `lut` values name the entries of the returned dictionary.
        csv_path = csv_dir / f'{file_name}.csv'
        dfs_by_type[sample_type] = pd.read_csv(csv_path, encoding='unicode_escape')
    return dfs_by_type

`dfs` is a dictionary of dataframes created from the OSPAR dataset located at `fname_in`. Each dataframe holds the data of one sample type and is stored under a key equal to that sample type. 

In [None]:
#|eval: false
dfs = load_data(fname_in)
print('keys/sample types: ', dfs.keys())

for key in dfs.keys():
    print(f'{key} columns: ', dfs[key].columns)

keys/sample types:  dict_keys(['seawater', 'biota'])
seawater columns:  Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',
       'Sample ID', 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM',
       'LongS', 'LongDir', 'Sample type', 'Sampling depth', 'Sampling date',
       'Nuclide', 'Value type', 'Activity or MDA', 'Uncertainty', 'Unit',
       'Data provider', 'Measurement Comment', 'Sample Comment',
       'Reference Comment'],
      dtype='object')
biota columns:  Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',
       'Sample ID', 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM',
       'LongS', 'LongDir', 'Sample type', 'Biological group', 'Species',
       'Body Part', 'Sampling date', 'Nuclide', 'Value type',
       'Activity or MDA', 'Uncertainty', 'Unit', 'Data provider',
       'Measurement Comment', 'Sample Comment', 'Reference Comment'],
      dtype='object')


## Remove missing data

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The `Seawater` dataset contains 538 rows with all NA values as shown below.

:::

In [None]:
#| eval: false
dfs = load_data(fname_in)
for key in dfs.keys():
    cols_to_check = dfs[key].columns[1:]
    mask = dfs[key][cols_to_check].isnull().all(axis=1)
    print(f'{key}: {mask.sum()} rows with all NA values')

seawater: 538 rows with all NA values
biota: 0 rows with all NA values


In [None]:
#| exports
# Columns present in both the seawater and the biota tables (the `ID` column is deliberately left out).
common_cols = [
    'Contracting Party', 'RSC Sub-division', 'Station ID', 'Sample ID',
    'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM', 'LongS', 'LongDir',
    'Sample type', 'Sampling date', 'Nuclide', 'Value type', 'Activity or MDA',
    'Uncertainty', 'Unit', 'Data provider', 'Measurement Comment',
    'Sample Comment', 'Reference Comment'
]

# Per sample type, the columns passed to `RemoveAllNAValuesCB`:
# the shared columns plus the columns specific to that sample type.
cols_to_check = {
    'seawater': common_cols + ['Sampling depth'],
    'biota': common_cols + ['Biological group', 'Species', 'Body Part']
}

Let's use the `RemoveAllNAValuesCB` callback to remove all rows with all NA values.

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemoveAllNAValuesCB(cols_to_check)])

# Test that all NA values have been removed
fc.test_eq(tfm()['seawater'][cols_to_check['seawater']].isnull().all(axis=1).sum(), 0)

## Add sample type column

The sample types (`seawater`, `biota`), as defined in `configs.ipynb`, are encoded as group names in the NetCDF file produced. The sample type ids are added to the individual dataframes using the `AddSampleTypeIdColumnCB` callback for legacy purposes (i.e. Open Refine output).

For instance:

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[AddSampleTypeIdColumnCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

print(tfm()['seawater'][['ID', 'Station ID', 'samptype_id']].head())
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')

   ID   Station ID  samptype_id
0   1  Belgica-W01            1
1   2  Belgica-W02            1
2   3  Belgica-W03            1
3   4  Belgica-W04            1
4   5  Belgica-W05            1
                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 



## Normalize nuclide names

### Remap nuclide names to MARIS data formats

Below, we map the nuclide names used by OSPAR to the MARIS standard nuclide names. 

Remapping data provider nomenclatures into MARIS standards is one recurrent operation and is done in a semi-automated manner according to the following pattern:

1. **Inspect** data provider nomenclature:
2. **Match** automatically against MARIS nomenclature (using a fuzzy matching algorithm); 
3. **Fix** potential mismatches; 
4. **Apply** the lookup table to the dataframe.

From now on, we will use this pattern to remap the OSPAR data provider nomenclatures into MARIS standards and, for the sake of brevity, name it **IMFA** (**I**nspect, **M**atch, **F**ix, **A**pply).

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The `Nuclide` column has inconsistent naming. E.g:

- `Cs-137`,  `137Cs` or `CS-137`
- `239, 240 pu` or `239,240 pu`
- `ra-226` and `226ra` 

See below:

:::

In [None]:
#| eval: false
dfs = load_data(fname_in)
get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True)

Unnamed: 0,index,value
0,0,"239, 240 Pu"
1,1,137Cs
2,2,CS-137
3,3,Cs-137
4,4,Cs-134
5,5,210Po
6,6,"239,240Pu"
7,7,228Ra
8,8,210Pb
9,9,RA-228


Let's now create an instance of a fuzzy matching algorithm `Remapper`:

In [None]:
#| eval: false
remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Nuclide', as_df=True),
                    maris_lut_fn=nuc_lut_path,
                    maris_col_id='nuclide_id',
                    maris_col_name='nc_name',
                    provider_col_to_match='value',
                    provider_col_key='value',
                    fname_cache='nuclides_ospar.pkl')

And try to match OSPAR to MARIS nuclide names as automatically as possible. The `match_score` column allows us to assess the results:

In [None]:
#| eval: false
remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=1)

Processing: 100%|██████████| 18/18 [00:00<00:00, 41.59it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"239, 240 Pu",pu240,"239, 240 Pu",8
"239,240Pu",pu240,"239,240Pu",6
241Am,pu241,241Am,4
137Cs,h3,137Cs,4
228Ra,u238,228Ra,4
210Pb,ag106m,210Pb,4
210Po,ag106m,210Po,4
226Ra,u235,226Ra,4
238Pu,u238,238Pu,3
99Tc,tu,99Tc,3


We then manually inspect the remaining unmatched names and create a fixes table to map them to the correct MARIS standards:

In [None]:
#| exports
# Manual corrections for provider nuclide names that the automatic matching gets wrong:
# OSPAR `Nuclide` value -> MARIS nuclide name. Passed as `fixes` to `Remapper.generate_lookup_table`.
fixes_nuclide_names = {
    '99Tc': 'tc99',
    '238Pu': 'pu238',
    '226Ra': 'ra226',
    '210Pb': 'pb210',
    '241Am': 'am241',
    '228Ra': 'ra228',
    '137Cs': 'cs137',
    '210Po': 'po210',
    '239,240Pu': 'pu239_240_tot',
    '239, 240 Pu': 'pu239_240_tot'
    }

Let's try to match again but this time we use the `fixes_nuclide_names` to map the nuclide names to the MARIS standards:


In [None]:
#| eval: false
remapper.generate_lookup_table(as_df=True, fixes=fixes_nuclide_names)
remapper.select_match(match_score_threshold=1)

Processing: 100%|██████████| 18/18 [00:00<00:00, 50.88it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3H,h3,3H,2
CS-137,cs137,CS-137,1
Cs-137,cs137,Cs-137,1
Cs-134,cs134,Cs-134,1
RA-228,ra228,RA-228,1
RA-226,ra226,RA-226,1
CS-134,cs134,CS-134,1


Values are remapped correctly! We can now create a callback `RemapNuclideNameCB` to remap the nuclide names. Note that we pass `overwrite=False` to the `generate_lookup_table` method of the `Remapper` so that the cached version is now used.

In [None]:
#| exports
# Build the OSPAR -> MARIS nuclide lookup table from a dataframe of unique provider nuclide names.
# The manual corrections in `fixes_nuclide_names` are applied and the result is returned as a dictionary.
lut_nuclides = lambda df: Remapper(
    provider_lut_df=df,
    maris_lut_fn=nuc_lut_path,
    maris_col_id='nuclide_id',
    maris_col_name='nc_name',
    provider_col_to_match='value',
    provider_col_key='value',
    fname_cache='nuclides_ospar.pkl',
).generate_lookup_table(fixes=fixes_nuclide_names, as_df=False, overwrite=False)

In [None]:
#| exports
class RemapNuclideNameCB(Callback):
    def __init__(self, 
                 fn_lut:Callable # Function that returns the lookup table dictionary
                ):
        "Remap data provider nuclide names to MARIS nuclide names."
        fc.store_attr()

    def __call__(self, tfm):
        # Collect the distinct provider nuclide names over all sample types.
        provider_names = get_unique_across_dfs(tfm.dfs, col_name='Nuclide', as_df=True)
        matches = self.fn_lut(provider_names)
        # Keep only the MARIS name of each match: provider name -> MARIS name.
        name_map = {src_name: match.matched_maris_name for src_name, match in matches.items()}
        for grp_df in tfm.dfs.values():
            # Names absent from the mapping are left unchanged by `replace`.
            grp_df['NUCLIDE'] = grp_df['Nuclide'].replace(name_map)

Let's see the `RemapNuclideNameCB` callback in action:

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapNuclideNameCB(lut_nuclides)])
dfs_out = tfm()

# For instance
for key in dfs_out.keys():
    print(f'{key} NUCLIDE unique: ', dfs_out[key]['NUCLIDE'].unique())

seawater NUCLIDE unique:  ['cs137' 'pu239_240_tot' 'ra226' 'ra228' 'tc99' 'h3' 'po210' 'pb210'
 'Unknown']
biota NUCLIDE unique:  ['pu239_240_tot' 'tc99' 'cs137' 'ra226' 'ra228' 'pu238' 'am241' 'cs134'
 'h3' 'pb210' 'po210']


### Add Nuclide Id column

The `nuclide_id` column is added to the dataframe for legacy reasons (again Open Refine output).

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapNuclideNameCB(lut_nuclides),
                            AddNuclideIdColumnCB(col_value='NUCLIDE')
                            ])
dfs_out = tfm()

# For instance
dfs_out['biota'][['NUCLIDE', 'nuclide_id']]

Unnamed: 0,NUCLIDE,nuclide_id
0,pu239_240_tot,77
1,tc99,15
2,pu239_240_tot,77
3,pu239_240_tot,77
4,tc99,15
...,...,...
15309,tc99,15
15310,pu239_240_tot,77
15311,cs137,33
15312,cs137,33


## Standardize Time

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The `Seawater` dataset still contains 10 rows with `NaN` values in the `Sampling date` column, as shown below.

:::

In [None]:
#| eval: false
dfs = load_data(fname_in)
dfs_test = Transformer(dfs, cbs=[RemoveAllNAValuesCB(cols_to_check)])()
dfs_test['seawater']['Sampling date'].isnull().sum()


10

Create a callback that parses the `Sampling date` column of each dataframe (format `%d/%m/%Y`) into `time` and `begperiod`, and drops the rows whose date is missing or cannot be parsed:

In [None]:
#| exports
class ParseTimeCB(Callback):
    "Parse `Sampling date` into `time` and `begperiod`, then drop the rows without a valid date."
    def __call__(self, tfm):
        for grp in tfm.dfs:
            grp_df = tfm.dfs[grp]
            # Dates not matching day/month/year become NaT (`errors='coerce'`).
            parsed = pd.to_datetime(grp_df['Sampling date'], format='%d/%m/%Y', errors='coerce')
            grp_df['time'] = parsed
            grp_df['begperiod'] = grp_df['time']
            # In-place so that the dataframe held by `tfm.dfs` itself loses the undated rows.
            grp_df.dropna(subset=['time'], inplace=True)

Apply the transformer for callbacks `ParseTimeCB`. Then, print the ``begperiod`` and `time` data for `seawater`.

In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemoveAllNAValuesCB(cols_to_check),
    ParseTimeCB(),
    CompareDfsAndTfmCB(dfs)])

tfm()

print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['seawater'][['begperiod','time']])

                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18318  15314
Number of dropped rows                                   538      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 

       begperiod       time
0     2010-01-27 2010-01-27
1     2010-01-27 2010-01-27
2     2010-01-27 2010-01-27
3     2010-01-27 2010-01-27
4     2010-01-26 2010-01-26
...          ...        ...
18851 2021-04-29 2021-04-29
18852 2021-12-10 2021-12-10
18853 2021-04-07 2021-04-07
18854 2021-04-07 2021-04-07
18855 2021-04-07 2021-04-07

[18318 rows x 2 columns]


NetCDF time format requires the time to be encoded as the number of seconds since a time of origin. In our case the time of origin is `1970-01-01`, as indicated in the `CONFIGS['units']['time']` dictionary of `configs.ipynb`.

`EncodeTimeCB` converts the OSPAR `time` format to the MARIS NetCDF `time` format.

In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemoveAllNAValuesCB(cols_to_check),
    ParseTimeCB(),
    EncodeTimeCB(cfg(), verbose = True)])

tfm()
tfm.dfs['seawater'][['time']].head()

Unnamed: 0,time
0,1264550400
1,1264550400
2,1264550400
3,1264550400
4,1264464000


## Sanitize value

We allocate each column containing measurement values into a single column `value` and remove `NA` where needed.

In [None]:
# | exports
class SanitizeValue(Callback):
    "Drop the rows with no measurement in `value_col` and copy the remaining measurements to `value`."
    def __init__(self, 
                 value_col: str='Activity or MDA' # Column name to sanitize
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        src_col = self.value_col
        for grp_df in tfm.dfs.values():
            # In-place so that the dataframe held by `tfm.dfs` itself loses the blank rows.
            grp_df.dropna(subset=[src_col], inplace=True)
            grp_df['value'] = grp_df[src_col]

In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[SanitizeValue()])

tfm()['seawater'][['value']].head()

Unnamed: 0,value
0,0.2
1,0.27
2,0.26
3,0.25
4,0.2


## Normalize uncertainty

For each sample type in the OSPAR dataset, the reported uncertainty is given as an expanded uncertainty with a coverage factor `𝑘=2`. For further details, refer to the [OSPAR reporting guidelines](https://mcc.jrc.ec.europa.eu/documents/OSPAR/Guidelines_forestimationof_a_%20measurefor_uncertainty_in_OSPARmonitoring.pdf).

**Note**: The OSPAR uncertainty values are normalized to standard uncertainty with a coverage factor 
𝑘=1.

`NormalizeUncCB` callback normalizes the uncertainty using the following `lambda` function:

In [None]:
#| exports
# Halve the values of column `unc_col` (expanded uncertainty, k=2 -> standard uncertainty, k=1).
unc_exp2stan = lambda df, unc_col: df[unc_col].div(2)

In [None]:
#| exports
class NormalizeUncCB(Callback):
    """Convert the reported uncertainty of every dataframe and store it in the `uncertainty` column."""
    def __init__(self, 
                 col_unc: str='Uncertainty', # Column name to normalize
                 fn_convert_unc: Callable=unc_exp2stan, # Function correcting coverage factor
                 ): 
        fc.store_attr()

    def __call__(self, tfm):
        convert, src_col = self.fn_convert_unc, self.col_unc
        for grp_df in tfm.dfs.values():
            # The converter receives the whole dataframe and the name of the column to convert.
            grp_df['uncertainty'] = convert(grp_df, src_col)

In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[       
                            SanitizeValue(),               
                            NormalizeUncCB()
                            ])
tfm()

for grp in ['seawater', 'biota']:
    print(f'\n{grp}:')
    print(tfm.dfs[grp][['value', 'uncertainty']].head())


seawater:
   value  uncertainty
0   0.20          NaN
1   0.27          NaN
2   0.26          NaN
3   0.25          NaN
4   0.20          NaN

biota:
     value  uncertainty
0   0.3510        0.033
1  39.0000        7.500
2   0.0938        0.009
3   1.5400        0.155
4  16.0000        3.000


## Remap Biota species

Biota species information is contained in the `Species` column of the OSPAR biota data. In the following processing steps, we follow the same approach as for the remapping of nuclide names above.

We first inspect unique `Species` values used by OSPAR:

In [None]:
#| eval: false
dfs = load_data(fname_in)
get_unique_across_dfs(dfs, col_name='Species', as_df=True)

Unnamed: 0,index,value
0,0,Unknown
1,1,Homarus gammarus
2,2,SPRATTUS SPRATTUS
3,3,Anarhichas denticulatus
4,4,MOLVA MOLVA
...,...,...
151,151,MELANOGRAMMUS AEGLEFINUS
152,152,MERLUCCIUS MERLUCCIUS
153,153,PECTEN MAXIMUS
154,154,LITTORINA LITTOREA


We try to remap the `Species` column to the `species` column of the MARIS nomenclature, again using a `Remapper` object:

In [None]:
#| eval: false
remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Species', as_df=True),
                    maris_lut_fn=species_lut_path,
                    maris_col_id='species_id',
                    maris_col_name='species',
                    provider_col_to_match='value',
                    provider_col_key='value',
                    fname_cache='species_ospar.pkl')

In [None]:
#| eval: false
remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=1)

Processing: 100%|██████████| 156/156 [00:23<00:00,  6.65it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA,Lomentaria catenata,RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA,31
"Mixture of green, red and brown algae",Mercenaria mercenaria,"Mixture of green, red and brown algae",26
Solea solea (S.vulgaris),Loligo vulgaris,Solea solea (S.vulgaris),12
SOLEA SOLEA (S.VULGARIS),Loligo vulgaris,SOLEA SOLEA (S.VULGARIS),12
CERASTODERMA (CARDIUM) EDULE,Cerastoderma edule,CERASTODERMA (CARDIUM) EDULE,10
Cerastoderma (Cardium) Edule,Cerastoderma edule,Cerastoderma (Cardium) Edule,10
NUCELLA LAPILLUS,Mugil cephalus,NUCELLA LAPILLUS,9
DICENTRARCHUS (MORONE) LABRAX,Dicentrarchus labrax,DICENTRARCHUS (MORONE) LABRAX,9
MONODONTA LINEATA,Ophiothrix lineata,MONODONTA LINEATA,9
Pleuronectiformes [order],Pleuronectiformes,Pleuronectiformes [order],8


We fix below some of the entries that are not properly matched by the `Remapper` object:

In [None]:
#|exports
# Manual corrections for provider species names that the automatic matching gets wrong:
# OSPAR `Species` value -> MARIS species name. Entries mapped to `NA` are deliberately
# left without a MARIS species (see the reason given next to each entry).
fixes_biota_species = {
    'PECTINIDAE': NA, # Dropped. In Worms as PECTINIDAE is a family.
    'Unknown': NA,
    'unknown': NA,
    'PALMARIA PALMATA': NA, # Dropped. In Worms 'Palmaria palmata (Linnaeus) F.Weber & D.Mohr, 1805',
    'RAJIDAE/BATOIDEA': NA, # Mix 
    'MONODONTA LINEATA': 'Phorcus lineatus',
    'NUCELLA LAPILLUS': NA, # Dropped. In Worms 'Nucella lapillus (Linnaeus, 1758)', 
    'SOLEA SOLEA (S.VULGARIS)': 'Solea solea',
    'Solea solea (S.vulgaris)': 'Solea solea',
    'Mixture of green, red and brown algae': NA, # Mix 
    'RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA': NA, # Mix
    }

And give it another try:

In [None]:
#| eval: false
remapper.generate_lookup_table(fixes=fixes_biota_species)
remapper.select_match(match_score_threshold=1)

Processing: 100%|██████████| 156/156 [00:23<00:00,  6.66it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CERASTODERMA (CARDIUM) EDULE,Cerastoderma edule,CERASTODERMA (CARDIUM) EDULE,10
Cerastoderma (Cardium) Edule,Cerastoderma edule,Cerastoderma (Cardium) Edule,10
DICENTRARCHUS (MORONE) LABRAX,Dicentrarchus labrax,DICENTRARCHUS (MORONE) LABRAX,9
Pleuronectiformes [order],Pleuronectiformes,Pleuronectiformes [order],8
FUCUS SPP.,Fucus,FUCUS SPP.,5
Flatfish,Lambia,Flatfish,5
Sepia spp.,Sepia,Sepia spp.,5
Rhodymenia spp.,Rhodymenia,Rhodymenia spp.,5
RAJA DIPTURUS BATIS,Dipturus batis,RAJA DIPTURUS BATIS,5
RHODYMENIA spp,Rhodymenia,RHODYMENIA spp,4


Visual inspection of the remaining imperfectly matched entries suggests that they are acceptable, so we can proceed. 

We now define a callback to apply the lookup table to the `biota` dataframe.

In [None]:
#| exports
class RemapBiotaSpeciesCB(Callback):
    "Biota species standardized to MARIS format."
    def __init__(self, 
                 fn_lut: Callable, # Function that returns the lookup table dictionary
                 verbose: bool=False # Print unmatched values
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        biota = tfm.dfs['biota']
        # Replace each provider species name by the id of its match in the lookup table.
        biota['species'] = biota['Species'].apply(lambda name: self._get_species(name, lut))

    def _get_species(self, 
                     value_to_match:str, # The value to match
                     lut:dict # The lookup table dictionary
                    ):
        # Values absent from the lookup table get the sentinel id -1.
        unmatched = Match(-1, None, None, None)
        match = lut.get(value_to_match, unmatched)
        if self.verbose:
            if match.matched_id == -1:
                print(f'Unmatched species: {value_to_match}')
        return match.matched_id

Let's see it in action, along with the `RemapBiotaSpeciesCB` callback:

In [None]:
#| exports
# Build the OSPAR -> MARIS species lookup table (automatic matching plus `fixes_biota_species`).
# The unique `Species` values are read from `dfs_in` when it is passed. When it is omitted, the
# lambda falls back, as before, to the global `dfs`, which must therefore exist at call time.
lut_biota = lambda dfs_in=None: Remapper(
    provider_lut_df=get_unique_across_dfs(dfs if dfs_in is None else dfs_in,
                                          col_name='Species', as_df=True),
    maris_lut_fn=species_lut_path,
    maris_col_id='species_id',
    maris_col_name='species',
    provider_col_to_match='value',
    provider_col_key='value',
    fname_cache='species_ospar.pkl',
).generate_lookup_table(fixes=fixes_biota_species, as_df=False, overwrite=False)

In [None]:
#| eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    RemoveAllNAValuesCB(cols_to_check),
    RemapBiotaSpeciesCB(lut_biota)
    ])

tfm()['biota']['species'].unique()

array([ 394,   96,  129,   50,  139,  270,  395,   -1,   99,  377,  414,
       1608,  244,  192,   23,    0,  402,  407,  401,  274,  378, 1609,
        384,  386,  191,  382,  404,  405,  385,  388,  383,  379,  432,
        243,  392,  393,  413,  400,  425,  419,  399,  556,  272,  391,
        234,  431,  442,  396, 1606,  403,  412,  435, 1610,  381,  437,
        434,  444,  443,  389,  440,  441,  439,  427,  438, 1605,  436,
        426,  433,  390,  420,  417,  397,  421,  294, 1221,  422,  423,
        428,  424,  415, 1607,  387,  380,  406,  398,  416,  408,  409,
        418,  430,  429,  411,  410,  148])

## Remap Biota body Part

The OSPAR data includes entries with the variable `Body Part` labelled as `whole`. The MARIS `body_part` variable, however, must distinguish between `Whole animal` and `Whole plant`. The OSPAR `Biological group` column makes it possible to assign a `Body Part` labelled as whole to either `Whole animal` or `Whole plant`. 

To account for this, we create a temporary column `body_part_temp` that combines `Body Part` and `Biological group` and use it to perform the lookup using the `Remapper` object.

In [None]:
#| exports
class AddBodypartTempCB(Callback):
    "Add the temporary `body_part_temp` column joining `Body Part` and `Biological group` with a space."
    def __call__(self, tfm):
        biota = tfm.dfs['biota']
        # e.g. 'SOFT PARTS' and 'Molluscs' give 'SOFT PARTS Molluscs'
        body_part, bio_group = biota['Body Part'], biota['Biological group']
        biota['body_part_temp'] = body_part + ' ' + bio_group

In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[  
                            RemoveAllNAValuesCB(cols_to_check),     
                            AddBodypartTempCB(),
                            ])
dfs_test = tfm()
dfs_test['biota']['body_part_temp']

0         SOFT PARTS Molluscs
1        GROWING TIPS Seaweed
2         SOFT PARTS Molluscs
3         SOFT PARTS Molluscs
4        GROWING TIPS Seaweed
                 ...         
15309     SOFT PARTS Molluscs
15310     SOFT PARTS Molluscs
15311     WHOLE PLANT Seaweed
15312     WHOLE PLANT Seaweed
15313     WHOLE PLANT Seaweed
Name: body_part_temp, Length: 15314, dtype: object

In [None]:
#| eval: false
get_unique_across_dfs(dfs_test, col_name='body_part_temp', as_df=True)

Unnamed: 0,index,value
0,0,WHOLE PLANT SEAWEED
1,1,Soft parts Fish
2,2,WHOLE ANIMAL Fish
3,3,SOFT PARTS Molluscs
4,4,WHOLE PLANT seaweed
5,5,UNKNOWN FISH
6,6,FLESH WITHOUT BONE Fish
7,7,WHOLE ANIMAL Molluscs
8,8,FLESH WITHOUT BONES Molluscs
9,9,Muscle FISH


In [None]:
#| eval: false
remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs_test, col_name='body_part_temp', as_df=True),
                    maris_lut_fn=bodyparts_lut_path,
                    maris_col_id='bodypar_id',
                    maris_col_name='bodypar',
                    provider_col_to_match='value',
                    provider_col_key='value',
                    fname_cache='bodyparts_ospar.pkl'
                    )

remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=0)

Processing: 100%|██████████| 46/46 [00:00<00:00, 123.03it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mix of muscle and whole fish without liver FISH,Flesh without bones,Mix of muscle and whole fish without liver FISH,31
Cod medallion FISH,Old leaf,Cod medallion FISH,13
Whole without head FISH,Flesh without bones,Whole without head FISH,13
Soft parts Molluscs,Soft parts,Soft parts Molluscs,9
Whole fisk FISH,Whole animal,Whole fisk FISH,9
SOFT PARTS molluscs,Soft parts,SOFT PARTS molluscs,9
WHOLE FISH Fish,Whole animal,WHOLE FISH Fish,9
SOFT PARTS MOLLUSCS,Soft parts,SOFT PARTS MOLLUSCS,9
Soft Parts Molluscs,Soft parts,Soft Parts Molluscs,9
WHOLE FISH FISH,Whole animal,WHOLE FISH FISH,9


In [None]:
#|exports
# Manual corrections for combined "body part + biological group" values (`body_part_temp`)
# that the automatic matching gets wrong: provider value -> MARIS body part name.
fixes_biota_bodyparts = {
    'WHOLE Seaweed' : 'Whole plant',
    'Flesh Fish': 'Flesh with bones', # We assume it as the category 'Flesh with bones' also exists
    'FLESH Fish' : 'Flesh with bones',
    'UNKNOWN Fish' : 'Not available',
    'UNKNOWN FISH': 'Not available',
    'Cod medallion FISH' : 'Not available', # TO BE DETERMINED
    'Mix of muscle and whole fish without liver FISH' : 'Not available', # TO BE DETERMINED
}

In [None]:
#| eval: false
remapper.generate_lookup_table(fixes=fixes_biota_bodyparts)
remapper.select_match(match_score_threshold=1)

Processing: 100%|██████████| 46/46 [00:00<00:00, 126.36it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Whole without head FISH,Flesh without bones,Whole without head FISH,13
Whole animal Molluscs,Whole animal,Whole animal Molluscs,9
SOFT PARTS MOLLUSCS,Soft parts,SOFT PARTS MOLLUSCS,9
Whole fish Fish,Whole animal,Whole fish Fish,9
WHOLE FISH FISH,Whole animal,WHOLE FISH FISH,9
Soft Parts Molluscs,Soft parts,Soft Parts Molluscs,9
Whole fisk FISH,Whole animal,Whole fisk FISH,9
SOFT PARTS molluscs,Soft parts,SOFT PARTS molluscs,9
WHOLE FISH Fish,Whole animal,WHOLE FISH Fish,9
Soft parts Molluscs,Soft parts,Soft parts Molluscs,9


In [None]:
#| exports
class RemapBiotaBodyPartCB(Callback):
    """Map values of the biota `Species` column to matched ids using a lookup table.

    NOTE(review): the class name refers to body parts, yet the code reads the
    `Species` column and writes `species` -- confirm which column is intended.
    """
    def __init__(self,
                 fn_lut: Callable, # Function that returns the lookup table dictionary
                 verbose: bool=False # Print unmatched values
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        "Build the lookup table once, then store the matched id of every `Species` value in `species`."
        lookup = self.fn_lut()
        biota = tfm.dfs['biota']

        def to_id(raw):
            return self._get_species(raw, lookup)

        biota['species'] = biota['Species'].apply(to_id)

    def _get_species(self,
                     value_to_match:str, # The value to match
                     lut:dict # The lookup table dictionary
                    ):
        "Return the `matched_id` found for `value_to_match`, or -1 when it is absent from `lut`."
        no_match = Match(-1, None, None, None)  # sentinel entry carrying matched_id == -1
        found = lut.get(value_to_match, no_match)
        if self.verbose and found.matched_id == -1:
            print(f'Unmatched species: {value_to_match}')
        return found.matched_id

##### Correct OSPAR `Body Part` labelled as `Whole`

In [None]:
dfs['biota'].columns

Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',
       'Sample ID', 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM',
       'LongS', 'LongDir', 'Sample type', 'Biological group', 'Species',
       'Body Part', 'Sampling date', 'Nuclide', 'Value type',
       'Activity or MDA', 'Uncertainty', 'Unit', 'Data provider',
       'Measurement Comment', 'Sample Comment', 'Reference Comment'],
      dtype='object')

In [None]:
get_unique_bodyparts_group(dfs)

array(['SOFT PARTS Molluscs', 'GROWING TIPS Seaweed',
       'Whole plant Seaweed', 'WHOLE Fish', 'WHOLE ANIMAL Fish',
       'FLESH WITHOUT BONES Fish', 'WHOLE ANIMAL Molluscs',
       'WHOLE PLANT Seaweed', 'Soft Parts Molluscs',
       'FLESH WITHOUT BONES Molluscs', 'WHOLE Seaweed',
       'Whole without head FISH', 'Cod medallion FISH', 'Muscle FISH',
       'Whole animal Fish', 'Whole fisk FISH', 'Whole FISH',
       'Mix of muscle and whole fish without liver FISH', 'Flesh Fish',
       'WHOLE FISH Fish', 'Whole animal Molluscs', 'Muscle Fish',
       'Whole fish Fish', 'FLESH WITHOUT BONE Fish', 'UNKNOWN Fish',
       'WHOLE PLANT seaweed', 'WHOLE PLANT SEAWEED',
       'SOFT PARTS molluscs', 'FLESH WITHOUT BONES FISH',
       'WHOLE ANIMAL FISH', 'FLESH WITHOUT BONES fish', 'FLESH Fish',
       'FLESH WITHOUT BONES SEAWEED', 'FLESH WITH SCALES Fish',
       'FLESH WITHOUT BONE FISH', 'HEAD FISH', 'WHOLE FISH FISH',
       'Flesh without bones Fish', 'UNKNOWN FISH', 'Soft parts

In [None]:
dfs = load_data(fname_in)
get_unique_across_dfs(dfs, col_name='Body Part', as_df=True)

Unnamed: 0,index,value
0,0,Flesh without bones
1,1,WHOLE
2,2,Whole
3,3,GROWING TIPS
4,4,Soft Parts
5,5,Mix of muscle and whole fish without liver
6,6,Whole fisk
7,7,MUSCLE
8,8,Muscle
9,9,SOFT PARTS


In [None]:
#| eval: false
remapper = Remapper(provider_lut_df=get_unique_across_dfs(dfs, col_name='Body Part', as_df=True),
                    maris_lut_fn=bodyparts_lut_path,
                    maris_col_id='bodypar_id',
                    maris_col_name='bodypar',
                    provider_col_to_match='value',
                    provider_col_key='value',
                    fname_cache='bodyparts_ospar.pkl'
                    )

remapper.generate_lookup_table(as_df=True)
remapper.select_match(match_score_threshold=0)

Processing: 100%|██████████| 28/28 [00:00<00:00, 131.90it/s]


Unnamed: 0_level_0,matched_maris_name,source_name,match_score
source_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mix of muscle and whole fish without liver,Flesh without bones,Mix of muscle and whole fish without liver,27
Whole without head,Flesh without bones,Whole without head,10
Cod medallion,Exoskeleton,Cod medallion,9
WHOLE FISH,Whole animal,WHOLE FISH,5
UNKNOWN,Skin,UNKNOWN,5
Whole fisk,Whole animal,Whole fisk,5
Whole fish,Whole animal,Whole fish,5
Flesh,Leaf,Flesh,3
Whole,Molt,Whole,3
FLESH,Leaf,FLESH,3


In [None]:
#|exports
# fixes_biota_bodyparts = {}

The OSPAR data includes entries whose `Body Part` is labelled simply as `Whole` (in several spellings). MARIS requires `body_part` to distinguish between `Whole animal` and `Whole plant`. The OSPAR `Biological group` column allows each `Whole` entry to be resolved to either `Whole animal` or `Whole plant`.

In [None]:
#| export
# Labels used by `CorrectWholeBodyPartCB` to resolve a bare "whole" body part:
#   'whole'        -> `Body Part` spellings that say "whole" without naming animal or plant
#   'Whole animal' -> `Biological group` spellings resolved to 'Whole animal'
#   'Whole plant'  -> `Biological group` spellings resolved to 'Whole plant'
whole_animal_plant = {
    'whole': ['Whole', 'WHOLE', 'WHOLE FISH', 'Whole fisk', 'Whole fish'],
    'Whole animal': ['Molluscs', 'Fish', 'FISH', 'molluscs', 'fish', 'MOLLUSCS'],
    'Whole plant': ['Seaweed', 'seaweed', 'SEAWEED'],
}

In [None]:
#| export
class CorrectWholeBodyPartCB(Callback):
    """Resolve body parts labelled as a bare 'whole' into 'Whole animal' or 'Whole plant'.

    The decision is taken from the `Biological group` column, using the label
    lists held in `wap`.
    """

    def __init__(self, wap: Dict[str, List[str]] = whole_animal_plant):
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        "Apply the correction to the 'biota' DataFrame in place."
        self.correct_whole_body_part(tfm.dfs['biota'])

    def correct_whole_body_part(self, df: pd.DataFrame):
        "Copy `Body Part` into `body_part`, then rewrite its 'whole' entries per biological group."
        df['body_part'] = df['Body Part']
        whole_labels = self.wap['whole']
        # Animals first, then plants -- each target name is also its key in `wap`.
        for target in ('Whole animal', 'Whole plant'):
            self.update_body_part(df, whole_labels, self.wap[target], target)

    def update_body_part(self, df: pd.DataFrame, whole_list: List[str], group_list: List[str], new_value: str):
        "Set `body_part` to `new_value` where it is in `whole_list` and `Biological group` is in `group_list`."
        is_whole = df['body_part'].isin(whole_list)
        in_group = df['Biological group'].isin(group_list)
        df.loc[is_whole & in_group, 'body_part'] = new_value


In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPartCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['biota']['body_part'].unique())

                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 

['SOFT PARTS' 'GROWING TIPS' 'Whole plant' 'Whole animal' 'WHOLE ANIMAL'
 'FLESH WITHOUT BONES' 'WHOLE PLANT' 'Soft Parts' 'Whole without head'
 'Cod medallion' 'Muscle' 'Mix of muscle and whole fish without liver'
 'Flesh' 'FLESH WITHOUT BONE' 'UNKNOWN' 'FLESH' 'FLESH WITH SCALES' 'HEAD'
 'Flesh without bones' 'Soft parts' 'whole plant' 'LIVER' 'MUSCLE']


Get a dataframe of matched OSPAR biota tissues with Maris Bodyparts

In [None]:
#|export
# Start without manual fixes so the lookup table generated next shows the raw matches.
# NOTE(review): this exported name is assigned again further down the notebook with the actual fixes.
unmatched_fixes_biota_tissues = {}

In [None]:
#|eval: false
tissues_lut_df = get_maris_lut(df_biota=tfm.dfs['biota'], 
                                fname_cache='tissues_ospar.pkl', 
                                data_provider_name_col='body_part',
                                maris_lut=bodyparts_lut_path,
                                maris_id='bodypar_id',
                                maris_name='bodypar',
                                unmatched_fixes=unmatched_fixes_biota_tissues,
                                as_dataframe=True,
                                overwrite=True)
tissues_lut_df

Generating lookup table:   0%|          | 0/23 [00:00<?, ?it/s]

Generating lookup table: 100%|██████████| 23/23 [00:00<00:00, 101.67it/s]


Unnamed: 0_level_0,matched_id,matched_maris_name,source_name,match_score
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mix of muscle and whole fish without liver,52,Flesh without bones,Mix of muscle and whole fish without liver,27
Whole without head,52,Flesh without bones,Whole without head,10
Cod medallion,8,Exoskeleton,Cod medallion,9
UNKNOWN,12,Skin,UNKNOWN,5
FLESH,42,Leaf,FLESH,3
Flesh,42,Leaf,Flesh,3
FLESH WITHOUT BONE,52,Flesh without bones,FLESH WITHOUT BONE,1
LIVER,25,Liver,LIVER,0
whole plant,40,Whole plant,whole plant,0
Soft parts,19,Soft parts,Soft parts,0


List unmatched OSPAR tissues:

In [None]:
#|eval: false
tissues_lut_df[tissues_lut_df['match_score'] >= 1]['source_name'].tolist()

['Mix of muscle and whole fish without liver',
 'Whole without head',
 'Cod medallion',
 'UNKNOWN',
 'FLESH',
 'Flesh',
 'FLESH WITHOUT BONE']

Read Maris tissue lut to correct unmatched tissues:

In [None]:
#|eval: false
marisco_lut_df = pd.read_excel(bodyparts_lut_path())
marisco_lut_df

Unnamed: 0,bodypar_id,bodypar,bodycode,groupcode
0,-1,Not applicable,,
1,0,(Not available),0,0
2,1,Whole animal,WHOA,WHO
3,2,Whole animal eviscerated,WHOEV,WHO
4,3,Whole animal eviscerated without head,WHOHE,WHO
...,...,...,...,...
57,56,Growing tips,GTIP,PHAN
58,57,Upper parts of plants,UPPL,PHAN
59,58,Lower parts of plants,LWPL,PHAN
60,59,Shells/carapace,SHCA,SKEL


Create a dictionary of unmatched tissues to allow for correction:

In [None]:
#|export
# Manual corrections for OSPAR body parts that the fuzzy matching did not resolve exactly.
# Keys are OSPAR `body_part` labels, values are the names they should be matched to.
# Entries mapped to 'Not available' are dropped by `LookupBiotaBodyPartCB.drop_unmatched`.
unmatched_fixes_biota_tissues = {
    'Mix of muscle and whole fish without liver': 'Not available',  # Drop
    'Whole without head': 'Whole animal eviscerated without head',  # Drop? eviscerated?
    'Cod medallion': 'Whole animal eviscerated without head',
    'FLESH': 'Flesh without bones',  # Drop? with or without bones?
    'Flesh': 'Flesh without bones',  # Drop? with or without bones?
    'UNKNOWN': 'Not available',
    'FLESH WITHOUT BONE': 'Flesh without bones',
}

In [None]:
#|eval: false
# tissues_lut_df = get_maris_lut(df_biota=tfm.dfs['biota'], 
#                                 fname_cache='tissues_ospar.pkl', 
#                                 data_provider_name_col='body_part',
#                                 maris_lut=bodyparts_lut_path,
#                                 maris_id='bodypar_id',
#                                 maris_name='bodypar',
#                                 unmatched_fixes=unmatched_fixes_biota_tissues,
#                                 as_dataframe=True,
#                                 overwrite=True)
# tissues_lut_df

NameError: name 'get_maris_lut' is not defined

List unmatched OSPAR tissues:

In [None]:
#|eval: false
tissues_lut_df[tissues_lut_df['match_score'] >= 1]['source_name'].tolist()

['Mix of muscle and whole fish without liver', 'UNKNOWN']

In [None]:
#| export
class LookupBiotaBodyPartCB(Callback):
    """Replace biota `body_part` labels by the ids matched in MARIS dbo_bodypar.xlsx."""

    def __init__(self, fn_lut: Callable, unmatched_fixes_biota_tissues: Dict[str, str]):
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        "Build the lookup table, drop unusable rows, then convert `body_part` to ids."
        biota = tfm.dfs['biota']
        # The lookup table is built from the DataFrame *before* any row is dropped.
        lut = self.fn_lut(df_biota=biota)
        for drop_step in (self.drop_nan_species, self.drop_unmatched):
            drop_step(biota)
        self.perform_lookup(biota, lut)

    def drop_nan_species(self, df: pd.DataFrame):
        """
        Drop, in place, the rows whose 'body_part' is NaN.

        Note: despite the method name, the column inspected is 'body_part'.

        Args:
            df (pd.DataFrame): The DataFrame to process.
        """
        df.dropna(subset=['body_part'], inplace=True)

    def drop_unmatched(self, df: pd.DataFrame):
        """
        Drop, in place, the rows whose 'body_part' was fixed to 'Not available'.

        The labels to drop are the keys of `unmatched_fixes_biota_tissues`
        whose value is 'Not available'.

        Args:
            df (pd.DataFrame): The DataFrame to process.
        """
        fixes = self.unmatched_fixes_biota_tissues
        labels_to_drop = [label for label, fix in fixes.items() if fix == 'Not available']
        rows_to_drop = df.index[df['body_part'].isin(labels_to_drop)]
        df.drop(rows_to_drop, inplace=True)

    def perform_lookup(self, df: pd.DataFrame, lut: Dict[str, 'Match']):
        """
        Replace each 'body_part' label by its matched id; labels absent from `lut` are kept as is.

        Args:
            df (pd.DataFrame): The DataFrame to process.
            lut (Dict[str, Match]): The lookup table.
        """
        def to_id(label):
            if label in lut:
                return lut[label].matched_id
            return label

        df['body_part'] = df['body_part'].apply(to_id)


In [None]:
#|eval: false
get_maris_bodypart=partial(get_maris_lut, 
                            fname_cache='tissues_ospar.pkl', 
                            data_provider_name_col='body_part',
                            maris_lut=bodyparts_lut_path,
                            maris_id='bodypar_id',
                            maris_name='bodypar',
                            unmatched_fixes=unmatched_fixes_biota_tissues,
                            as_dataframe=False,
                            overwrite=False)


In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPartCB(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['biota'][['Body Part', 'body_part']][:5])

                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15308
Number of dropped rows                                     0      6
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 

      Body Part  body_part
0    SOFT PARTS         19
1  GROWING TIPS         56
2    SOFT PARTS         19
3    SOFT PARTS         19
4  GROWING TIPS         56


#### Lookup : Biogroup

In [None]:
# TO BE DONE
# Species column contains nan
# Niall replace with Biological group where missing
# In our case, we will use remapped species, once 
# done we can use internal MARIS lookup to remap to biota group. 
# If species is missing, we can use the biological group to perform the lookup. 

In [None]:
# unmatched_fixes_biota_species.update({
# # Biological group corrections
# 'Molluscs' : 'Mollusca',
# 'Seaweed' : 'Seaweed',
# 'Fish' : 'Pisces',
# 'FISH' : 'Pisces',
# 'seaweed' : 'Seaweed',
# 'SEAWEED' : 'Seaweed',
# 'molluscs' : 'Mollusca',
# 'fish' : 'Pisces',
# 'MOLLUSCS' : 'Mollusca' })

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: ``bio_group``.*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: Biogroup is not included.*

`get_biogroup_lut` reads the file at `species_lut_path()` and from the contents of this file creates a dictionary linking `species_id` to `biogroup_id`.

In [None]:
#| export
def get_biogroup_lut(maris_lut: str) -> dict:
    """
    Build the species -> biogroup lookup table from the MARIS species lookup table.

    Args:
        maris_lut (str): Path to the MARIS lookup table (Excel file).

    Returns:
        dict: A dictionary mapping species_id to biogroup_id.
    """
    species_df = pd.read_excel(maris_lut)
    biogroup_by_species = species_df.set_index('species_id')['biogroup_id']
    return biogroup_by_species.to_dict()


`LookupBiogroupCB` adds a `bio_group` column to the `biota` dataframe in the dictionary of dataframes, `dfs`, by mapping each `species` id through the lookup table returned by `get_biogroup_lut`.

In [None]:
#| export
class LookupBiogroupCB(Callback):
    """Add a `bio_group` column to the biota data, derived from `species` via MARIS dbo_species.xlsx."""

    def __init__(self, fn_lut: Callable):
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        "Fetch the lookup table and apply it to the 'biota' DataFrame."
        species_to_biogroup = self.fn_lut()
        self.update_bio_group(tfm.dfs['biota'], species_to_biogroup)

    def update_bio_group(self, df: pd.DataFrame, lut: dict):
        """
        Set 'bio_group' from 'species' using the lookup table; unknown species get -1.

        Args:
            df (pd.DataFrame): The DataFrame to process (modified in place).
            lut (dict): Lookup table keyed by the values found in 'species'.
        """
        def biogroup_of(species_id):
            return lut.get(species_id, -1)

        df['bio_group'] = df['species'].apply(biogroup_of)


Apply the transformer for callbacks ``LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species)``,``CorrectWholeBodyPartCB()``, ``LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues)``,             ``LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path()))``,   ``CompareDfsAndTfmCB(dfs)`` . Then, print the ``Body Part``, ``body_part``, ``species``,``bio_group`` for the `biota` dataframe.

In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPartCB(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['biota'][['Body Part', 'body_part', 'species','bio_group']][:5])

                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15308
Number of dropped rows                                     0      6
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 

      Body Part  body_part  species  bio_group
0    SOFT PARTS         19      394         13
1  GROWING TIPS         56       96         11
2    SOFT PARTS         19      394         13
3    SOFT PARTS         19      394         13
4  GROWING TIPS         56       96         11


Biota data dropped due to 'unknown' Body Part.

In [None]:
print(tfm.dfs_dropped['biota']['Body Part'].unique())

['Mix of muscle and whole fish without liver' 'UNKNOWN']


***

#### Lookup : Taxon Information

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: Not included.*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: ``Taxonname`` , ``TaxonRepName``, ``Taxonrank``*

`get_taxon_info_lut` reads the file at `species_lut_path()` and from the contents of this file creates a dictionary that, for each of `Taxonname`, `Taxonrank`, `TaxonDB`, `TaxonDBID` and `TaxonDBURL`, links `species_id` to the corresponding value.

In [None]:
#| export
def get_taxon_info_lut(maris_lut: str) -> dict:
    """
    Build per-column taxon lookup tables from the MARIS species lookup table.

    Args:
        maris_lut (str): Path to the MARIS lookup table (Excel file).

    Returns:
        dict: One entry per taxon column ('Taxonname', 'Taxonrank', 'TaxonDB',
            'TaxonDBID', 'TaxonDBURL'), each being a dictionary that maps
            species_id to that column's value.
    """
    taxon_cols = ['Taxonname', 'Taxonrank', 'TaxonDB', 'TaxonDBID', 'TaxonDBURL']
    species_df = pd.read_excel(maris_lut)
    indexed_by_species = species_df.set_index('species_id')
    return indexed_by_species[taxon_cols].to_dict()

# TODO include Commonname field after next MARIS data reconciling process.

In [None]:

# | export
class LookupTaxonInformationCB(Callback):
    """Update taxon information based on MARIS species LUT (dbo_species.xlsx)."""

    # Columns copied from the lookup table onto the biota DataFrame.
    # TODO include 'Commonname' after the next MARIS data reconciling process.
    _taxon_cols = ('Taxonname', 'Taxonrank', 'TaxonDB', 'TaxonDBID', 'TaxonDBURL')

    def __init__(self, fn_lut: Callable[[], dict]):
        """
        Initialize the LookupTaxonInformationCB with a function to generate the lookup table.

        Args:
            fn_lut (Callable[[], dict]): Function that returns the lookup table dictionary,
                keyed by column name, each value mapping a species id to that column's value.
        """
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        """
        Fill the taxon columns of the 'biota' DataFrame from the lookup table.

        'TaxonRepName' is copied from 'Species'; every column listed in `_taxon_cols`
        is looked up from the 'species' id. Unmatched species ids are printed.

        Args:
            tfm (Transformer): The transformer object containing DataFrames.
        """
        lut = self.fn_lut()
        df = tfm.dfs['biota']

        self._set_taxon_rep_name(df)
        for col in self._taxon_cols:
            col_lut = lut[col]
            # Bind the loop variables as defaults so each lambda keeps its own column.
            df[col] = df['species'].apply(
                lambda x, col_lut=col_lut, col=col: self._get_name_by_species_id(x, col_lut, col))

    def _set_taxon_rep_name(self, df: pd.DataFrame):
        """
        Copy the 'Species' column into 'TaxonRepName'; print a warning if 'Species' is missing.

        Args:
            df (pd.DataFrame): The DataFrame to modify.
        """
        if 'Species' in df.columns:
            df['TaxonRepName'] = df['Species']
        else:
            print("Warning: 'Species' column not found in DataFrame.")

    def _get_name_by_species_id(self, species_id: str, lut: dict, field: str = 'taxon information') -> str:
        """
        Get the value for a species id from one column's lookup table.

        Prints the species id when no value is found. The previous implementation
        built that message with `lut.keys()[0]`, which raises TypeError because
        dict views are not subscriptable; the field name is now passed explicitly.

        Args:
            species_id (str): The species ID from the DataFrame.
            lut (dict): Lookup table for a single column (species id -> value).
            field (str): Name of the column being looked up, used in the message only.

        Returns:
            str: The value from the lookup table, or 'Unknown' if not found.
        """
        name = lut.get(species_id, 'Unknown')  # Default to 'Unknown' if not found
        if name == 'Unknown':
            print(f"Unmatched species ID: {species_id} for {field}")
        return name


In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPartCB(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupTaxonInformationCB(partial(get_taxon_info_lut, species_lut_path())),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['biota'][['Taxonname', 'Taxonrank','TaxonDB','TaxonDBID','TaxonDBURL']].drop_duplicates().head())

                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15308
Number of dropped rows                                     0      6
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 

               Taxonname Taxonrank   TaxonDB TaxonDBID  \
0     Littorina littorea   species  Wikidata    Q27935   
1      Fucus vesiculosus   species  Wikidata   Q754755   
15        Mytilus edulis   species  Wikidata    Q27855   
24       Clupea harengus   species  Wikidata  Q2396858   
28  Merlangius merlangus   species  Wikidata   Q273083   

                                TaxonDBURL  
0     https://www.wikidata.org/wiki/Q27935  
1    https://www.wikidata.org/wiki/Q754755  
15    https://www.wikidata.org/wiki/Q27855  
24  https://www.wikidata.org/wiki/Q2396858  
28   https://www.wikidata.org/wiki/Q273083  


***

#### Lookup : Units

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: ``unit``.*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: ``Unit``.*

Create `renaming_unit_rules` to rename the units. 

In [None]:
#| export
# Unit renaming rules: OSPAR unit spelling -> unit id written to the `unit` column.
# NOTE(review): the original author annotated id 1 as 'Bq/m3' and id 5 as 'Bq/kgw';
# confirm against the MARIS unit lookup table.
renaming_unit_rules = {
    # Volume-based activity (id 1)
    'Bq/l': 1,
    'Bq/L': 1,
    'BQ/L': 1,
    # Fresh-weight mass-based activity (id 5)
    'Bq/kg f.w.': 5,
    'Bq/kg.fw': 5,
    'Bq/kg fw': 5,
    'Bq/kg f.w': 5,
}

In [None]:
#| export
class LookupUnitCB(Callback):
    """Add a 'unit' column holding the lookup-table value of each 'Unit' entry.

    For every DataFrame in the transformer:
    - 'seawater' only: missing 'Unit' values are first set to 'Bq/l'.
    - Rows whose 'Unit' is still missing are dropped.
    - 'unit' is filled from the lookup table; values absent from it become 'Unknown'.
    """

    def __init__(self, lut: dict = renaming_unit_rules):
        """
        Store the lookup table.

        Args:
            lut (dict): Maps the spellings found in 'Unit' to the value written to 'unit'.
        """
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        """
        Process each DataFrame of the transformer in place.

        Args:
            tfm (Transformer): The transformer containing DataFrames to process.
        """
        for grp, df in tfm.dfs.items():
            # Only seawater gets a default unit; other groups lose rows without a unit.
            if grp == 'seawater':
                self._apply_units(df)
            self._drop_na_units(df)
            self._perform_lookup(df)

    def _apply_units(self, df: pd.DataFrame):
        """
        Set 'Unit' to 'Bq/l' wherever it is missing.

        Args:
            df (pd.DataFrame): The DataFrame to process.
        """
        missing_unit = df['Unit'].isnull()
        df.loc[missing_unit, 'Unit'] = 'Bq/l'

    def _drop_na_units(self, df: pd.DataFrame):
        """
        Remove, in place, the rows whose 'Unit' is missing.

        Args:
            df (pd.DataFrame): The DataFrame to process.
        """
        df.dropna(subset=['Unit'], inplace=True)

    def _perform_lookup(self, df: pd.DataFrame):
        """
        Write the looked-up value of 'Unit' into 'unit' ('Unknown' when not in the table).

        Args:
            df (pd.DataFrame): The DataFrame to process.
        """
        def to_unit(raw_unit):
            return self.lut.get(raw_unit, 'Unknown')

        df['unit'] = df['Unit'].apply(to_unit)


In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            LookupUnitCB(renaming_unit_rules),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['biota']['unit'].dtypes)

                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 

int64


***

#### Lookup : Detection limit or Value type

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: ``detection_limit``.*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: ``Value type``.*

In [None]:
#|eval: false
grp='biota'
tfm.dfs[grp]['Value type'].unique()

array(['=', '<', '>', nan], dtype=object)

In [None]:
# | export
class LookupDetectionLimitCB(Callback):
    """Remap the 'Value type' column to MARIS detection limit ids.

    For every DataFrame in the transformer:
    - Reads a lookup table (name -> id) from an Excel file.
    - Copies 'Value type' into 'detection_limit', filling NaN with 'Not Available'.
    - Replaces '>' by '<'.
    - Drops rows whose 'detection_limit' is not a name in the lookup table.
    - Replaces each remaining name by its id.
    """

    def __init__(self, lut_path: str):
        """
        Initialize the LookupDetectionLimitCB with a path to the lookup table.

        Args:
            lut_path (str): The path to the Excel file containing the lookup table.
        """
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        """
        Apply the callback to each DataFrame in the transformer.

        Args:
            tfm (Transformer): The transformer containing DataFrames to process.
        """
        lut = self._load_lookup_table()
        # Snapshot the keys: the values of `tfm.dfs` are replaced inside the loop.
        for grp in list(tfm.dfs):
            df = tfm.dfs[grp]
            df = self._copy_and_fill_na(df)
            df = self._correct_greater_than(df)
            df = self._drop_na_rows(df, lut)
            self._perform_lookup(df, lut)
            tfm.dfs[grp] = df  # Store the filtered DataFrame back in the transformer

    def _load_lookup_table(self) -> dict:
        """
        Load the lookup table from the Excel file and create a mapping dictionary.

        Returns:
            dict: A dictionary mapping each 'name' of the table to its integer 'id'.
        """
        df = pd.read_excel(self.lut_path)
        df = df.astype({'id': 'int'})
        id_to_name = df.set_index('id')['name'].to_dict()
        return {name: id_ for id_, name in id_to_name.items()}

    def _copy_and_fill_na(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Copy 'Value type' into 'detection_limit', filling NaN values with 'Not Available'.

        The fill is done on the copied Series before assignment rather than with a
        chained `df[col].fillna(..., inplace=True)`, which is deprecated and has no
        effect when pandas Copy-on-Write is enabled.

        Args:
            df (pd.DataFrame): The DataFrame to process (modified in place).

        Returns:
            pd.DataFrame: The same DataFrame with the 'detection_limit' column set.
        """
        df['detection_limit'] = df['Value type'].fillna('Not Available')
        return df

    def _correct_greater_than(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Replace '>' by '<' in the 'detection_limit' column.

        Args:
            df (pd.DataFrame): The DataFrame to process (modified in place).

        Returns:
            pd.DataFrame: The same DataFrame with corrected 'detection_limit'.
        """
        df.loc[df['detection_limit'] == '>', 'detection_limit'] = '<'
        return df

    def _drop_na_rows(self, df: pd.DataFrame, lut: dict) -> pd.DataFrame:
        """
        Keep only the rows whose 'detection_limit' is a key of the lookup table.

        Args:
            df (pd.DataFrame): The DataFrame to process.
            lut (dict): The lookup table dictionary.

        Returns:
            pd.DataFrame: A new, independent DataFrame holding the retained rows, so
                that later column assignments do not write to a slice of `df`.
        """
        keep = df['detection_limit'].isin(list(lut))
        return df[keep].copy()

    def _perform_lookup(self, df: pd.DataFrame, lut: dict):
        """
        Replace each 'detection_limit' name by its id (0 when the name is not in the table).

        Args:
            df (pd.DataFrame): The DataFrame to process (modified in place).
            lut (dict): The lookup table dictionary.
        """
        df['detection_limit'] = df['detection_limit'].apply(lambda x: lut.get(x, 0))


In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            LookupUnitCB(renaming_unit_rules),
                            LookupDetectionLimitCB(detection_limit_lut_path()),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
tfm.dfs['seawater'][['detection_limit','Value type']]

                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 



Unnamed: 0,detection_limit,Value type
0,2,<
1,2,<
2,2,<
3,2,<
4,2,<
...,...,...
18851,2,<
18852,2,<
18853,1,=
18854,1,=


***

### Include Sample Laboratory code. 

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: Sample Laboratory code is not included.*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: ``samplabcode``*

>  MARIS NetCDF format does not include Sample Laboratory code.

In [None]:
# | export
class RemapDataProviderSampleIdCB(Callback):
    """Copy the 'Sample ID' column into a new 'samplabcode' column of each DataFrame."""

    def __init__(self):
        """Initialize the callback; it takes no parameters."""
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        """
        Add the 'samplabcode' column to every DataFrame held by the transformer.

        Args:
            tfm (Transformer): The transformer object containing DataFrames.
        """
        for df in tfm.dfs.values():
            self._remap_sample_id(df)

    def _remap_sample_id(self, df: pd.DataFrame):
        """
        Write the values of 'Sample ID' to 'samplabcode', modifying the DataFrame in place.

        Args:
            df (pd.DataFrame): The DataFrame to modify; it must contain a 'Sample ID' column.
        """
        sample_ids = df['Sample ID']
        df['samplabcode'] = sample_ids


In [None]:
#|eval: false
# Apply RemapDataProviderSampleIdCB to freshly loaded data, list the distinct
# 'samplabcode' values of the seawater group, then print the row-count comparison.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            RemapDataProviderSampleIdCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

print(tfm()['seawater']['samplabcode'].unique())
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')

['WNZ 01' 'WNZ 02' 'WNZ 03' ... '21-656' '21-657' '21-654']
                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 



***

### Include Station

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variable: Station ID is not included.*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: ``Station``*

>  MARIS NetCDF format does not include Station ID.

Station information for the OSPAR dataset can be looked up in the `Seawater_Station_Dictionary.xlsx` file.

In [None]:
# Display the path passed to `load_data` in the cells of this notebook.
Path(fname_in)

Path('../../_data/accdb/ospar/csv')

In [None]:
# List the columns available in the seawater DataFrame.
dfs['seawater'].columns

Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',
       'Sample ID', 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM',
       'LongS', 'LongDir', 'Sample type', 'Sampling depth', 'Sampling date',
       'Nuclide', 'Value type', 'Activity or MDA', 'Uncertainty', 'Unit',
       'Data provider', 'Measurement Comment', 'Sample Comment',
       'Reference Comment'],
      dtype='object')

In [None]:
# | export
class RemapStationIdCB(Callback):
    """Build a 'station' column from 'Station ID' and 'Contracting Party' in each DataFrame."""

    def __init__(self):
        """Initialize the callback; it takes no parameters."""
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        """
        Add the 'station' column to every DataFrame held by the transformer.

        Args:
            tfm (Transformer): The transformer object containing DataFrames.
        """
        for df in tfm.dfs.values():
            self._remap_station_id(df)

    def _remap_station_id(self, df: pd.DataFrame):
        """
        Set 'station' to '<Station ID>, <Contracting Party>', modifying the DataFrame in place.

        Args:
            df (pd.DataFrame): The DataFrame to modify; it must contain the
                'Station ID' and 'Contracting Party' columns.
        """
        station_id, contracting_party = df['Station ID'], df['Contracting Party']
        df['station'] = station_id + ', ' + contracting_party

In [None]:

#|eval: false
# Apply RemapStationIdCB to freshly loaded data and print the row-count comparison.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            RemapStationIdCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])
tfm()
# Uncomment to list the distinct 'station' values of the seawater group.
#print(tfm.dfs['seawater']['station'].unique())
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')


                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 



***

### Measurement note

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variables: Not included in NetCDF*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: ``measurenote``*

In [None]:
# | export
class RecordMeasurementNoteCB(Callback):
    """Copy 'Measurement Comment' into a new 'measurenote' column of each DataFrame."""

    def __init__(self):
        """Initialize the callback; it takes no parameters."""
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        """
        Add the 'measurenote' column to every DataFrame that has a 'Measurement Comment' column.

        For a DataFrame lacking that column a warning naming its group is printed
        and the DataFrame is left unchanged.

        Args:
            tfm (Transformer): The transformer object containing DataFrames to process.
        """
        for grp, df in tfm.dfs.items():
            if 'Measurement Comment' not in df.columns:
                print(f"Warning: 'Measurement Comment' column not found in DataFrame for group '{grp}'")
                continue
            self._add_measurementnote(df)

    def _add_measurementnote(self, df: pd.DataFrame):
        """
        Copy 'Measurement Comment' to 'measurenote', modifying the DataFrame in place.

        Args:
            df (pd.DataFrame): DataFrame containing the 'Measurement Comment' column.
        """
        comments = df['Measurement Comment']
        df['measurenote'] = comments


In [None]:
#|eval: false
# Apply RecordMeasurementNoteCB to freshly loaded data and print the row-count comparison.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            RecordMeasurementNoteCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')


                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 



***

### Reference note

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variables: Not included in NetCDF*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: ``refnote``*

In [None]:
# | export
class RecordRefNoteCB(Callback):
    """Copy 'Reference Comment' into a new 'refnote' column of each DataFrame."""

    def __init__(self):
        """Initialize the callback; it takes no parameters."""
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        """
        Add the 'refnote' column to every DataFrame that has a 'Reference Comment' column.

        For a DataFrame lacking that column a warning naming its group is printed
        and the DataFrame is left unchanged.

        Args:
            tfm (Transformer): The transformer object containing DataFrames to process.
        """
        for grp, df in tfm.dfs.items():
            if 'Reference Comment' not in df.columns:
                print(f"Warning: 'Reference Comment' column not found in DataFrame for group '{grp}'")
                continue
            self._add_refnote(df)

    def _add_refnote(self, df: pd.DataFrame):
        """
        Copy 'Reference Comment' to 'refnote', modifying the DataFrame in place.

        Args:
            df (pd.DataFrame): DataFrame containing the 'Reference Comment' column.
        """
        comments = df['Reference Comment']
        df['refnote'] = comments


In [None]:
#|eval: false
# Apply RecordRefNoteCB to freshly loaded data and print the row-count comparison.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            RecordRefNoteCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')


                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 



***

### Sample note

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variables: Not included in NetCDF*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variable: ``sampnote``*

In [None]:
# List the columns available in the biota DataFrame.
dfs['biota'].columns

Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',
       'Sample ID', 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM',
       'LongS', 'LongDir', 'Sample type', 'Biological group', 'Species',
       'Body Part', 'Sampling date', 'Nuclide', 'Value type',
       'Activity or MDA', 'Uncertainty', 'Unit', 'Data provider',
       'Measurement Comment', 'Sample Comment', 'Reference Comment'],
      dtype='object')

In [None]:
# | export
class RecordSampleNoteCB(Callback):
    """Copy 'Sample Comment' into a new 'sampnote' column of each DataFrame."""

    def __init__(self):
        """Initialize the callback; it takes no parameters."""
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        """
        Add the 'sampnote' column to every DataFrame that has a 'Sample Comment' column.

        For a DataFrame lacking that column a warning naming its group is printed
        and the DataFrame is left unchanged.

        Args:
            tfm (Transformer): The transformer object containing DataFrames to process.
        """
        for grp, df in tfm.dfs.items():
            if 'Sample Comment' not in df.columns:
                print(f"Warning: 'Sample Comment' column not found in DataFrame for group '{grp}'")
                continue
            self._add_samplenote(df)

    def _add_samplenote(self, df: pd.DataFrame):
        """
        Copy 'Sample Comment' to 'sampnote', modifying the DataFrame in place.

        Args:
            df (pd.DataFrame): DataFrame containing the 'Sample Comment' column.
        """
        comments = df['Sample Comment']
        df['sampnote'] = comments


In [None]:
#|eval: false
# Apply RecordSampleNoteCB to freshly loaded data and print the row-count comparison.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            RecordSampleNoteCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')


                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 



***

### Standardize Coordinates

#### Capture Coordinates

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variables: ``lon``  and ``lat``*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variables: ``Longitude`` and ``Latitude``.*

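Use decimal degree coordinates if available; otherwise, convert from ``LatD``, ``LatM``, ``LatS``, ``LongD``, ``LongM`` and ``LongS``. The OSPAR data shown in this notebook only provides these degrees/minutes/seconds columns together with the direction indicators ``LatDir`` and ``LongDir``, so `ConvertLonLatCB` always performs the conversion, negating southern (``S``) latitudes and western (``W``) longitudes.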

In [None]:
# | export
class ConvertLonLatCB(Callback):
    """Derive decimal-degree 'lat' and 'lon' columns (DDD.DDDDD°) from coordinates given as
    degrees, minutes and seconds (DMS) plus a direction indicator."""
    def __init__(self):
        """Initialize the callback; it takes no parameters."""
        fc.store_attr()

    def __call__(self, tfm: 'Transformer'):
        """
        Add the 'lat' and 'lon' columns to every DataFrame held by the transformer.

        Args:
            tfm (Transformer): The transformer object containing DataFrames to process.
        """
        for df in tfm.dfs.values():
            df['lat'] = self._convert_latitude(df)
            df['lon'] = self._convert_longitude(df)

    def _convert_latitude(self, df: pd.DataFrame) -> np.ndarray:
        """
        Convert DMS latitudes to signed decimal degrees.

        Args:
            df (pd.DataFrame): DataFrame with the 'LatD', 'LatM', 'LatS' and 'LatDir' columns.

        Returns:
            np.ndarray: Decimal degrees, negated on rows whose 'LatDir' is 'S'.
        """
        is_south = df['LatDir'].isin(['S'])
        magnitude = self._dms_to_decimal(df['LatD'], df['LatM'], df['LatS'])
        return np.where(is_south, magnitude * -1, magnitude)

    def _convert_longitude(self, df: pd.DataFrame) -> np.ndarray:
        """
        Convert DMS longitudes to signed decimal degrees.

        Args:
            df (pd.DataFrame): DataFrame with the 'LongD', 'LongM', 'LongS' and 'LongDir' columns.

        Returns:
            np.ndarray: Decimal degrees, negated on rows whose 'LongDir' is 'W'.
        """
        is_west = df['LongDir'].isin(['W'])
        magnitude = self._dms_to_decimal(df['LongD'], df['LongM'], df['LongS'])
        return np.where(is_west, magnitude * -1, magnitude)

    def _dms_to_decimal(self, degrees: pd.Series, minutes: pd.Series, seconds: pd.Series) -> pd.Series:
        """
        Combine degrees, minutes and seconds into unsigned decimal degrees.

        Args:
            degrees (pd.Series): Series containing degree values.
            minutes (pd.Series): Series containing minute values.
            seconds (pd.Series): Series containing second values.

        Returns:
            pd.Series: degrees + minutes / 60 + seconds / 3600.
        """
        from_minutes = minutes / 60
        from_seconds = seconds / 3600
        return degrees + from_minutes + from_seconds


In [None]:
#|eval: false
# Apply ConvertLonLatCB to freshly loaded data and show the computed 'lat'/'lon'
# next to the DMS and direction columns they were derived from.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            ConvertLonLatCB()
                            ])
tfm()
tfm.dfs['seawater'][['lat','LatD', 'LatM', 'LatS', 'lon', 'LatDir', 'LongD', 'LongM','LongS', 'LongDir']]

Unnamed: 0,lat,LatD,LatM,LatS,lon,LatDir,LongD,LongM,LongS,LongDir
0,51.375278,51.0,22.0,31.0,3.188056,N,3.0,11.0,17.0,E
1,51.223611,51.0,13.0,25.0,2.859444,N,2.0,51.0,34.0,E
2,51.184444,51.0,11.0,4.0,2.713611,N,2.0,42.0,49.0,E
3,51.420278,51.0,25.0,13.0,3.262222,N,3.0,15.0,44.0,E
4,51.416111,51.0,24.0,58.0,2.809722,N,2.0,48.0,35.0,E
...,...,...,...,...,...,...,...,...,...,...
18851,56.011111,56.0,0.0,40.0,-3.406667,N,3.0,24.0,24.0,W
18852,56.011111,56.0,0.0,40.0,-3.406667,N,3.0,24.0,24.0,W
18853,53.413333,53.0,24.0,48.0,-3.870278,N,3.0,52.0,13.0,W
18854,53.569722,53.0,34.0,11.0,-3.769722,N,3.0,46.0,11.0,W


***

#### Sanitize coordinates

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*NetCDF format variables: ``lon``  and ``lat``*

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Open Refine format variables: ``Longitude decimal`` and ``Latitude decimal``.*

Sanitizing coordinates drops a row when both longitude and latitude equal 0, or when the row contains unrealistic longitude or latitude values. It also converts a `,` decimal separator in longitude and latitude values to a `.` separator.

In [None]:
#|eval: false
# Convert then sanitize the coordinates of freshly loaded data; print the row-count
# comparison followed by the resulting biota 'lat'/'lon' columns.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            ConvertLonLatCB(),
                            SanitizeLonLatCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')
print(tfm.dfs['biota'][['lat','lon']])


                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18856  15314
Number of dropped rows                                     0      0
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 

             lat       lon
0      55.725278 -4.901944
1      54.968889 -3.240556
2      58.565833 -3.791389
3      58.618611 -3.647778
4      55.964722 -2.398056
...          ...       ...
15309  54.455000 -3.566111
15310  48.832778 -1.591389
15311  48.832778 -1.591389
15312  49.551667 -1.860000
15313  49.714444 -1.946111

[15314 rows x 2 columns]


***

### Combine Callbacks and review DFS and TFM data

In [None]:
#|eval: false
# Chain all the callbacks on freshly loaded data and print, per group, the
# row-count comparison held in `tfm.compare_stats`.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            GetSampleTypeCB(type_lut),
                            LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            EncodeTimeCB(cfg()),        
                            SanitizeValue(),                       
                            NormalizeUncCB(unc_exp2stan),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPartCB(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupTaxonInformationCB(partial(get_taxon_info_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules),
                            LookupDetectionLimitCB(detection_limit_lut_path()),
                            RemapDataProviderSampleIdCB(),
                            RemapStationIdCB(),
                            RecordMeasurementNoteCB(),
                            RecordRefNoteCB(),
                            RecordSampleNoteCB(),   
                            ConvertLonLatCB(),                    
                            SanitizeLonLatCB(),
                            CompareDfsAndTfmCB(dfs)
                            ])

tfm()

print(pd.DataFrame.from_dict(tfm.compare_stats) , '\n')


                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18308  15308
Number of dropped rows                                   548      6
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 



In [None]:
# Keep a handle on the per-group `tfm.dfs_dropped` frames for manual review
# (presumably the rows counted as dropped above — confirm against CompareDfsAndTfmCB).
seawater_dfs_dropped_review=tfm.dfs_dropped['seawater']
biota_dfs_dropped_review=tfm.dfs_dropped['biota']

***

### Rename columns of interest for NetCDF or Open Refine

In [None]:
# List the seawater columns available after running the combined callbacks.
tfm.dfs['seawater'].columns

Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',
       'Sample ID', 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM',
       'LongS', 'LongDir', 'Sample type', 'Sampling depth', 'Sampling date',
       'Nuclide', 'Value type', 'Activity or MDA', 'Uncertainty', 'Unit',
       'Data provider', 'Measurement Comment', 'Sample Comment',
       'Reference Comment', 'samptype_id', 'NUCLIDE', 'nuclide_id', 'time',
       'begperiod', 'value', 'uncertainty', 'unit', 'detection_limit',
       'samplabcode', 'station', 'measurenote', 'refnote', 'sampnote', 'lat',
       'lon'],
      dtype='object')

In [None]:
# List the biota columns available after running the combined callbacks.
tfm.dfs['biota'].columns

Index(['ID', 'Contracting Party', 'RSC Sub-division', 'Station ID',
       'Sample ID', 'LatD', 'LatM', 'LatS', 'LatDir', 'LongD', 'LongM',
       'LongS', 'LongDir', 'Sample type', 'Biological group', 'Species',
       'Body Part', 'Sampling date', 'Nuclide', 'Value type',
       'Activity or MDA', 'Uncertainty', 'Unit', 'Data provider',
       'Measurement Comment', 'Sample Comment', 'Reference Comment',
       'samptype_id', 'NUCLIDE', 'nuclide_id', 'time', 'begperiod', 'value',
       'uncertainty', 'species', 'body_part', 'bio_group', 'TaxonRepName',
       'Taxonname', 'Taxonrank', 'TaxonDB', 'TaxonDBID', 'TaxonDBURL', 'unit',
       'detection_limit', 'samplabcode', 'station', 'measurenote', 'refnote',
       'sampnote', 'lat', 'lon'],
      dtype='object')

In [None]:
#| export
# Define columns of interest (keys) and renaming rules (values).
def get_renaming_rules(encoding_type='netcdf'):
    """
    Return the columns of interest and their new names for the requested output format.

    Args:
        encoding_type (str): Either 'netcdf' or 'openrefine'.

    Returns:
        OrderedDict | None: Maps a tuple of group names to a dict
            `{current column name: new column name}` that applies to each of those groups.
            For any other `encoding_type` a message is printed and None is returned.
    """
    # Read up front for every encoding type, although only the 'netcdf' rules use it.
    cdl_vars = cdl_cfg()['vars']

    if encoding_type == 'netcdf':
        defaults = cdl_vars['defaults']
        suffixes = cdl_vars['suffixes']
        bio = cdl_vars['bio']

        common_rules = {
            # DEFAULT
            'lat': defaults['lat']['name'],
            'lon': defaults['lon']['name'],
            'time': defaults['time']['name'],
            'NUCLIDE': 'nuclide',
            'detection_limit': suffixes['detection_limit']['name'],
            'unit': suffixes['unit']['name'],
            'value': 'value',
            'uncertainty': suffixes['uncertainty']['name'],
            #'counting_method': suffixes['counting_method']['name'],
            #'sampling_method': suffixes['sampling_method']['name'],
            #'preparation_method': suffixes['preparation_method']['name']
        }
        # SEAWATER: no group-specific columns.
        seawater_rules = {}
        # BIOTA
        biota_rules = {
            'species': bio['species']['name'],
            'body_part': bio['body_part']['name'],
            'bio_group': bio['bio_group']['name'],
        }
        return OrderedDict([
            (('seawater', 'biota', 'sediment'), common_rules),
            (('seawater',), seawater_rules),
            (('biota',), biota_rules),
        ])

    if encoding_type == 'openrefine':
        common_rules = {
            # DEFAULT
            'samptype_id': 'samptype_id',
            'lat': 'latitude',
            'lon': 'longitude',
            'station': 'station',
            'begperiod': 'begperiod',
            'samplabcode': 'samplabcode',
            #'endperiod': 'endperiod',
            'nuclide_id': 'nuclide_id',
            'detection_limit': 'detection',
            'unit': 'unit_id',
            'value': 'activity',
            'uncertainty': 'uncertaint',
            'sampnote': 'sampnote',
            'measurenote': 'measurenote',
            'refnote': 'refnote',
        }
        # SEAWATER: no group-specific columns.
        seawater_rules = {
            #'volume': 'volume',
            #'filtpore': 'filtpore',
            #'acid': 'acid'
        }
        # BIOTA
        biota_rules = {
            'species': 'species_id',
            'Taxonname': 'Taxonname',
            'TaxonRepName': 'TaxonRepName',
            #'Commonname': 'Commonname',
            'Taxonrank': 'Taxonrank',
            'TaxonDB': 'TaxonDB',
            'TaxonDBID': 'TaxonDBID',
            'TaxonDBURL': 'TaxonDBURL',
            'body_part': 'bodypar_id',
        }
        return OrderedDict([
            (('seawater', 'biota', 'sediment'), common_rules),
            (('seawater',), seawater_rules),
            (('biota',), biota_rules),
        ])

    print("Invalid encoding_type provided. Please use 'netcdf' or 'openrefine'.")
    return None

In [None]:
#| export
class SelectAndRenameColumnCB(Callback):
    """Select the columns named in the renaming rules of an encoding type and rename them.

    Rules whose key is missing from a DataFrame are skipped; they are reported when
    `verbose` is set."""
    
    def __init__(self, fn_renaming_rules, encoding_type='netcdf', verbose=False):
        """
        Initialize the SelectAndRenameColumnCB callback.

        Args:
            fn_renaming_rules (function): Called with `encoding_type`; returns an OrderedDict mapping
                a tuple of group names to a dict `{current column name: new column name}`.
            encoding_type (str): The encoding type ('netcdf' or 'openrefine') passed to `fn_renaming_rules`.
            verbose (bool): Whether to print out renaming rules that were not applied.
        """
        fc.store_attr()

    def __call__(self, tfm):
        """
        Apply column selection and renaming to DataFrames in the transformer, and identify unused rules.

        The DataFrames are left untouched when the renaming rules cannot be obtained, and a
        group without any applicable rule is skipped.

        Args:
            tfm (Transformer): The transformer object containing DataFrames.
        """
        try:
            renaming_rules = self.fn_renaming_rules(self.encoding_type)
        except ValueError as e:
            print(f"Error fetching renaming rules: {e}")
            return

        # `get_renaming_rules` signals an unknown encoding type by printing a message and
        # returning None rather than raising; without this guard the loop below would fail
        # with an AttributeError on `None.items()`.
        if renaming_rules is None:
            return

        for group in tfm.dfs.keys():
            # Get relevant renaming rules for the current group
            group_rules = self._get_group_rules(renaming_rules, group)

            if not group_rules:
                continue

            # Apply renaming rules and track keys not found in the DataFrame
            df, not_found_keys = self._apply_renaming(tfm.dfs[group], group_rules)
            tfm.dfs[group] = df
            
            # Print any renaming rules that were not used
            if not_found_keys and self.verbose:
                print(f"\nGroup '{group}' has the following renaming rules not applied:")
                for old_col in not_found_keys:
                    print(f"Key '{old_col}' from renaming rules was not found in the DataFrame.")

    def _get_group_rules(self, renaming_rules, group):
        """
        Retrieve and merge the renaming rules that apply to the specified group.

        Args:
            renaming_rules (OrderedDict): All renaming rules, keyed by tuples of group names.
            group (str): Group name to filter rules.

        Returns:
            OrderedDict: The rules of every entry whose key contains `group`, merged in order
                (a later entry overrides an earlier one for the same column).
        """
        merged_rules = OrderedDict()
        for groups, rules in renaming_rules.items():
            if group in groups:
                merged_rules.update(rules)
        return merged_rules

    def _apply_renaming(self, df, rename_rules):
        """
        Select columns based on renaming rules and apply renaming, only for existing columns,
        while maintaining the order of the dictionary columns.

        The input DataFrame is not modified; a new DataFrame is returned.

        Args:
            df (pd.DataFrame): DataFrame to select from.
            rename_rules (OrderedDict): OrderedDict of column renaming rules.

        Returns:
            tuple: A tuple containing:
                - A new DataFrame with columns filtered and renamed.
                - A set of column names from renaming rules that were not found in the DataFrame.
        """
        existing_columns = set(df.columns)
        valid_rules = OrderedDict(
            (old_col, new_col) for old_col, new_col in rename_rules.items() if old_col in existing_columns
        )

        # Columns to select, in rule order: first the rule keys present in the DataFrame, then
        # any rule target that already exists as a column.
        # NOTE(review): when such a target is not itself a rule key, the renamed result carries
        # two columns with the same name — confirm whether that is intended.
        columns_to_keep = list(valid_rules.keys())
        columns_to_keep += [new_col for new_col in valid_rules.values() if new_col in existing_columns]
        columns_to_keep = list(OrderedDict.fromkeys(columns_to_keep))

        # Rename on the fresh selection and return the result instead of renaming in place, so the
        # returned frame is an independent object and not a modified slice of `df`.
        renamed = df[columns_to_keep].rename(columns=valid_rules)

        # Determine which keys were not found
        not_found_keys = set(rename_rules.keys()) - existing_columns
        return renamed, not_found_keys


In [None]:
#|eval: false
# Chain all the callbacks, ending with SelectAndRenameColumnCB for the 'netcdf'
# encoding, then print the resulting column names of each group.
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
                            GetSampleTypeCB(type_lut),
                            LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            EncodeTimeCB(cfg()),        
                            SanitizeValue(),                       
                            NormalizeUncCB(unc_exp2stan),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPartCB(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupTaxonInformationCB(partial(get_taxon_info_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules),
                            LookupDetectionLimitCB(detection_limit_lut_path()),
                            RemapDataProviderSampleIdCB(),
                            RemapStationIdCB(),
                            RecordMeasurementNoteCB(),
                            RecordRefNoteCB(),
                            RecordSampleNoteCB(),   
                            ConvertLonLatCB(),                    
                            SanitizeLonLatCB(),
                            SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),
                            ])

tfm()
print(tfm.dfs['seawater'].columns)
print(tfm.dfs['biota'].columns)

Index(['lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc'], dtype='object')
Index(['lat', 'lon', 'time', 'nuclide', '_dl', '_unit', 'value', '_unc',
       'species', 'body_part', 'bio_group'],
      dtype='object')


***

### Reshape: long to wide

Convert data from long to wide and rename columns to comply with NetCDF format.

In [None]:

#| export
class ReshapeLongToWide(Callback):
    "Convert data from long to wide with renamed columns."
    # columns: long-format column(s) whose values become the wide column labels.
    # values:  measurement column(s) spread over the new wide columns.
    def __init__(self, columns=['nuclide'], values=['value']):
        fc.store_attr()
        # Retrieve all possible derived vars (e.g 'unc', 'dl', ...) from configs
        self.derived_cols = [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]
    
    def renamed_cols(self, cols):
        """Flatten two-level `(outer, inner)` column labels into plain strings.

        - outer == "value"      -> inner             (e.g. ('value', 'cs137') -> 'cs137')
        - otherwise, inner set  -> f'{inner}{outer}' (e.g. ('_unc', 'cs137')  -> 'cs137_unc')
        - otherwise (no inner)  -> outer             (e.g. ('lon', '')        -> 'lon')
        """
        flat = []
        for outer, inner in cols:
            if outer == "value":
                flat.append(inner)
            elif inner:
                flat.append(f'{inner}{outer}')
            else:
                flat.append(outer)
        return flat

    def pivot(self, df):
        """Pivot `df` from long to wide and return the result indexed by `org_index`.

        Every column that is not in `self.columns`, `self.values` or a derived
        column is used as a pivot key, together with the original row index
        (exposed as the `org_index` column).
        """
        # Among all possible 'derived cols' select the ones present in df
        derived_coi = [col for col in self.derived_cols if col in df.columns]
        # Expose the original index as an 'org_index' column without renaming
        # the index of the caller's frame in place.
        df = df.rename_axis('org_index').reset_index()
        # Keep the frame's own column order for the pivot keys. Building the list
        # from a `set` made the key order (and so the output row/column order)
        # vary from one run to the next.
        excluded = set(self.columns + derived_coi + self.values)
        idx = [col for col in df.columns if col not in excluded]
        
        # Create a fill_value to replace NaN values in the columns used as the index in the pivot table.
        # Check if num_fill_value is already in the dataframe index values. If num_fill_value already exists
        # then increase num_fill_value by 1 until a value is found for num_fill_value that is not in the dataframe. 
        num_fill_value = -999
        while (df[idx] == num_fill_value).any().any():
            num_fill_value += 1
        # Fill in nan values for each col found in idx.
        for col in idx:
            if pd.api.types.is_numeric_dtype(df[col]):
                fill_value = num_fill_value
            elif pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
                # Object columns are tested explicitly: newer pandas versions may not
                # report an object column that holds NaN as a string dtype.
                fill_value = 'NOT AVAILABLE'
            else:
                # No suitable placeholder for this dtype: leave the column untouched
                # rather than reusing the placeholder picked for a previous column.
                # NOTE(review): rows with missing keys in such a column are presumably
                # dropped by the pivot's grouping -- confirm if such columns can occur.
                continue

            df[col] = df[col].fillna(fill_value)

        pivot_df = df.pivot_table(index=idx,
                                  columns=self.columns,
                                  values=self.values + derived_coi,
                                  fill_value=np.nan,
                                  aggfunc=lambda x: x
                                  ).reset_index()

        # Replace fill_value  with  np.nan
        pivot_df[idx]=pivot_df[idx].replace({'NOT AVAILABLE': np.nan,
                                             num_fill_value : np.nan})
        # Set the index to be the org_index
        pivot_df = pivot_df.set_index('org_index')

        return pivot_df

    def __call__(self, tfm):
        # Replace each group's frame by its wide version, then flatten the
        # two-level column labels produced by the pivot.
        for grp in tfm.dfs.keys():
            tfm.dfs[grp] = self.pivot(tfm.dfs[grp])
            tfm.dfs[grp].columns = self.renamed_cols(tfm.dfs[grp].columns)

In [None]:
#|eval: false
# Full NetCDF pipeline including the long-to-wide reshape; shows the first
# biota rows and the row-count comparison against the loaded data.
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        GetSampleTypeCB(type_lut),
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        EncodeTimeCB(cfg()),
        SanitizeValue(),
        NormalizeUncCB(unc_exp2stan),
        LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
        CorrectWholeBodyPartCB(),
        LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
        LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
        LookupTaxonInformationCB(partial(get_taxon_info_lut, species_lut_path())),
        LookupUnitCB(renaming_unit_rules),
        LookupDetectionLimitCB(detection_limit_lut_path()),
        RemapDataProviderSampleIdCB(),
        RemapStationIdCB(),
        RecordMeasurementNoteCB(),
        RecordRefNoteCB(),
        RecordSampleNoteCB(),
        ConvertLonLatCB(),
        SanitizeLonLatCB(),
        SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),
        ReshapeLongToWide(),
        CompareDfsAndTfmCB(dfs),
    ],
)
tfm()
print(tfm.dfs['biota'].head())
print(pd.DataFrame.from_dict(tfm.compare_stats), '\n')

                 lon  species        time  bio_group        lat  body_part  \
org_index                                                                    
11474     -39.634444       99  1017014400          4  66.784167         52   
4714      -39.150000      426  1375228800          4  62.116667         52   
6465      -35.920000       99  1287273600          4  64.289722         52   
6466      -35.100000      381  1287100800          4  64.720000         52   
6143      -34.000000       99  1306886400          4  64.000000         52   

           am241_dl  cs134_dl  cs137_dl  h3_dl  ...  cs134   cs137  h3  pb210  \
org_index                                       ...                             
11474           NaN       NaN       1.0    NaN  ...    NaN  0.1800 NaN    NaN   
4714            NaN       NaN       1.0    NaN  ...    NaN  0.2198 NaN    NaN   
6465            NaN       NaN       1.0    NaN  ...    NaN  0.2090 NaN    NaN   
6466            NaN       NaN       1.0    NaN  

In [None]:
#|eval: false
# Keep handles on the current transformer's frames for manual review.
# Flagged `eval: false` like the cells that build `tfm`, so this cell is
# skipped whenever those are.
seawater_dfs_review = tfm.dfs['seawater']
biota_dfs_review = tfm.dfs['biota']

***

## NetCDF encoder

### Example change logs

In [None]:
#|eval: false
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        GetSampleTypeCB(type_lut),
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        EncodeTimeCB(cfg()),
        SanitizeValue(),
        NormalizeUncCB(unc_exp2stan),
        LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
        CorrectWholeBodyPartCB(),
        LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
        LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
        LookupTaxonInformationCB(partial(get_taxon_info_lut, species_lut_path())),
        LookupUnitCB(renaming_unit_rules),
        LookupDetectionLimitCB(detection_limit_lut_path()),
        RemapDataProviderSampleIdCB(),
        RemapStationIdCB(),
        RecordMeasurementNoteCB(),
        RecordRefNoteCB(),
        RecordSampleNoteCB(),
        ConvertLonLatCB(),
        SanitizeLonLatCB(),
        SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),
        ReshapeLongToWide(),
        CompareDfsAndTfmCB(dfs),
    ],
)

# Apply every callback in order
tfm()
# Display the log entries collected while the callbacks ran
tfm.logs

["Set the 'Sample type' column in the DataFrames based on a lookup table.",
 'Convert nuclide names to lowercase and strip any trailing spaces.',
 'Remap and standardize radionuclide names to MARIS radionuclide names and define nuclide ids.',
 'Encode time as `int` representing seconds since xxx',
 'Sanitize value by removing blank entries.',
 'Callback to normalize uncertainty values in DataFrames. This callback applies a conversion function to standardize the uncertainty values in each DataFrame.',
 "Remap biota species to MARIS database format.This class updates the 'Species' column in the biota DataFrame by:\n    - Replacing 'NaN' or 'Not available' values with corresponding biological groups.\n    - Performing a lookup to remap species to MARIS format.",
 "Update body parts labeled as 'whole' to either 'Whole animal' or 'Whole plant'.",
 'Update body part id based on MARIS dbo_bodypar.xlsx',
 'Update biogroup id based on MARIS dbo_species.xlsx.',
 'Update taxon names based on MARI

***

### Feed global attributes

In [None]:
#| export
# Keywords written to the 'keywords' global attribute (joined with ', ').
# NOTE(review): the second entry lacks a space before the last '>' and two
# entries hold two comma-separated keywords each; strings are left untouched
# here -- confirm against the intended keyword vocabulary before normalizing.
kw = [
    'oceanography',
    'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
    'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
    'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
    'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
    'Earth Science > Oceans > Water Quality > Ocean Contaminants',
    'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
    'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
    'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
    'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
    'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)',
]


In [None]:
#| export
def get_attrs(tfm, zotero_key, kw=kw):
    """Build the global attributes from the transformed data.

    Args:
        tfm: transformer that has been run; its `dfs` and `logs` are read.
        zotero_key: key handed to `ZoteroCB`.
        kw: keywords, joined with ', ' into the 'keywords' attribute.

    Returns:
        Whatever calling the `GlobAttrsFeeder` instance returns.
    """
    feeder = GlobAttrsFeeder(
        tfm.dfs,
        cbs=[
            BboxCB(),
            DepthRangeCB(),
            TimeRangeCB(cfg()),
            ZoteroCB(zotero_key, cfg=cfg()),
            KeyValuePairCB('keywords', ', '.join(kw)),
            KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
        ],
    )
    return feeder()

In [None]:
#|eval: false
# Display the global attributes computed for the current transformer
get_attrs(tfm, zotero_key, kw)

{'geospatial_lat_min': '49.43222222222222',
 'geospatial_lat_max': '81.26805555555555',
 'geospatial_lon_min': '-58.23166666666667',
 'geospatial_lon_max': '36.181666666666665',
 'geospatial_bounds': 'POLYGON ((-58.23166666666667 36.181666666666665, 49.43222222222222 36.181666666666665, 49.43222222222222 81.26805555555555, -58.23166666666667 81.26805555555555, -58.23166666666667 36.181666666666665))',
 'time_coverage_start': '1995-01-01T00:00:00',
 'time_coverage_end': '2021-12-31T00:00:00',
 'title': 'OSPAR Environmental Monitoring of Radioactive Substances',
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "", "lastName": "OSPAR Comission\'s Radioactive Substances Committee (RSC)"}]',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments, Earth 

In [None]:
#| export
def enums_xtra(tfm, vars):
    "Retrieve a subset of the lengthy enum as 'species_t' for instance"
    enums = Enums(lut_src_dir=lut_path(), cdl_enums=cdl_cfg()['enums'])
    subset = {}
    for name in vars:
        present = tfm.unique(name)
        # Skip variables for which `.any()` is falsy on the unique values
        if not present.any():
            continue
        enum_type = f'{name}_t'
        subset[enum_type] = enums.filter(enum_type, present)
    return subset

### Encoding NETCDF

In [None]:
#| export
def encode(fname_in, fname_out_nc, nc_tpl_path, **kwargs):
    """Load `fname_in`, run the transformation callbacks and encode to NetCDF.

    Args:
        fname_in: source passed to `load_data`.
        fname_out_nc: destination file handed to `NetCDFEncoder` (`dest_fname`).
        nc_tpl_path: template file handed to `NetCDFEncoder` (`src_fname`).
        **kwargs: only `verbose` (default `False`) is read and forwarded to the encoder.
    """
    dfs = load_data(fname_in)
    callbacks = [
        GetSampleTypeCB(type_lut),
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        EncodeTimeCB(cfg()),
        SanitizeValue(),
        NormalizeUncCB(unc_exp2stan),
        LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
        CorrectWholeBodyPartCB(),
        LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
        LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
        LookupTaxonInformationCB(partial(get_taxon_info_lut, species_lut_path())),
        LookupUnitCB(renaming_unit_rules),
        LookupDetectionLimitCB(detection_limit_lut_path()),
        RemapDataProviderSampleIdCB(),
        RemapStationIdCB(),
        RecordMeasurementNoteCB(),
        RecordRefNoteCB(),
        RecordSampleNoteCB(),
        ConvertLonLatCB(),
        SanitizeLonLatCB(),
        SelectAndRenameColumnCB(get_renaming_rules, encoding_type='netcdf'),
        ReshapeLongToWide(),
    ]
    tfm = Transformer(dfs, cbs=callbacks)
    tfm()

    # `zotero_key` and `kw` are module-level names
    global_attrs = get_attrs(tfm, zotero_key=zotero_key, kw=kw)
    is_verbose = kwargs.get('verbose', False)
    xtras = enums_xtra(tfm, vars=['species', 'body_part'])
    NetCDFEncoder(
        tfm.dfs,
        src_fname=nc_tpl_path,
        dest_fname=fname_out_nc,
        global_attrs=global_attrs,
        verbose=is_verbose,
        enums_xtra=xtras,
    ).encode()

In [None]:
#|eval: false
# Encode to NetCDF, printing each group/variable as it is processed
encode(fname_in, fname_out_nc, nc_tpl_path=nc_tpl_path(), verbose=True)

--------------------------------------------------------------------------------
Group: seawater, Variable: lon
--------------------------------------------------------------------------------
Group: seawater, Variable: lat
--------------------------------------------------------------------------------
Group: seawater, Variable: time
--------------------------------------------------------------------------------
Group: seawater, Variable: h3
--------------------------------------------------------------------------------
Group: seawater, Variable: h3_unc
--------------------------------------------------------------------------------
Group: seawater, Variable: h3_dl
--------------------------------------------------------------------------------
Group: seawater, Variable: h3_unit
--------------------------------------------------------------------------------
Group: seawater, Variable: tc99
--------------------------------------------------------------------------------
Group: seawat

***

## Open Refine Pipeline

### Rename columns for Open Refine

In [None]:
#|eval: false
# Same pipeline as for NetCDF, but columns are selected/renamed with the
# 'openrefine' encoding type and no long-to-wide reshape is applied.
dfs = load_data(fname_in)
tfm = Transformer(
    dfs,
    cbs=[
        GetSampleTypeCB(type_lut),
        LowerStripRdnNameCB(),
        RemapRdnNameCB(),
        ParseTimeCB(),
        EncodeTimeCB(cfg()),
        SanitizeValue(),
        NormalizeUncCB(unc_exp2stan),
        LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
        CorrectWholeBodyPartCB(),
        LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
        LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
        LookupTaxonInformationCB(partial(get_taxon_info_lut, species_lut_path())),
        LookupUnitCB(renaming_unit_rules),
        LookupDetectionLimitCB(detection_limit_lut_path()),
        RemapDataProviderSampleIdCB(),
        RemapStationIdCB(),
        RecordMeasurementNoteCB(),
        RecordRefNoteCB(),
        RecordSampleNoteCB(),
        ConvertLonLatCB(),
        SanitizeLonLatCB(),
        SelectAndRenameColumnCB(get_renaming_rules, encoding_type='openrefine', verbose=True),
        CompareDfsAndTfmCB(dfs),
    ],
)

tfm()
print(pd.DataFrame.from_dict(tfm.compare_stats), '\n')

                                                    seawater  biota
Number of rows in dfs                                  18856  15314
Number of rows in tfm.dfs                              18308  15308
Number of dropped rows                                   548      6
Number of rows in tfm.dfs + Number of dropped rows     18856  15314 



**Example of data included in dfs_dropped.**

Main reason rows are dropped from `dfs`:
- No activity value is reported (i.e. the ``Activity or MDA`` column is empty).

Reason the 6 biota rows are dropped:
- The body part is not known (i.e. 'Mix of muscle and whole fish without liver' or 'UNKNOWN').

In [None]:
#|eval: false
# Flagged `eval: false` like the cells that build `tfm`, so this cell is
# skipped whenever those are.
# Sample type to inspect: 'seawater' or 'biota'
grp = 'seawater'
tfm.dfs_dropped[grp]

Unnamed: 0,ID,Contracting Party,RSC Sub-division,Station ID,Sample ID,LatD,LatM,LatS,LatDir,LongD,...,Sampling date,Nuclide,Value type,Activity or MDA,Uncertainty,Unit,Data provider,Measurement Comment,Sample Comment,Reference Comment
16799,97147,,,,,,,,,,...,,,,,,,,,,
16800,97148,,,,,,,,,,...,,,,,,,,,,
16801,97149,,,,,,,,,,...,,,,,,,,,,
16802,97150,,,,,,,,,,...,,,,,,,,,,
16803,97151,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18474,120366,Ireland,4.0,N8,,53.0,39.0,0.0,N,5.0,...,,,,,,,,2021 data,,
18475,120367,Ireland,4.0,N9,,53.0,53.0,0.0,N,5.0,...,,,,,,,,2021 data,,
18476,120368,Ireland,4.0,N10,,53.0,52.0,0.0,N,5.0,...,,,,,,,,2021 data,,
18477,120369,Ireland,1.0,Salthill,,53.0,15.0,40.0,N,9.0,...,,,,,,,,2021 data,Woodstown (County Waterford) and Salthill (Cou...,


## Open Refine encoder

In [None]:
#| export
def encode_or(fname_in, fname_out_csv, ref_id, **kwargs):
    """Load `fname_in`, run the transformation callbacks and encode to an Open Refine csv.

    Args:
        fname_in: source passed to `load_data`.
        fname_out_csv: destination handed to `OpenRefineCsvEncoder` (`dest_fname`).
        ref_id: reference id handed to `OpenRefineCsvEncoder`.
        **kwargs: only `verbose` is read. It defaults to `True`, which matches the
            previous hard-coded behavior, and is forwarded to both
            `SelectAndRenameColumnCB` and the encoder.
    """
    # Honor the caller's `verbose` flag instead of silently ignoring it.
    verbose = kwargs.get('verbose', True)
    dfs = load_data(fname_in)
    tfm = Transformer(dfs, cbs=[
                                GetSampleTypeCB(type_lut),
                                LowerStripRdnNameCB(),
                                RemapRdnNameCB(),
                                ParseTimeCB(),
                                EncodeTimeCB(cfg()),
                                SanitizeValue(),
                                NormalizeUncCB(unc_exp2stan),
                                LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                                CorrectWholeBodyPartCB(),
                                LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                                LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                                LookupTaxonInformationCB(partial(get_taxon_info_lut, species_lut_path())),
                                LookupUnitCB(renaming_unit_rules),
                                LookupDetectionLimitCB(detection_limit_lut_path()),
                                RemapDataProviderSampleIdCB(),
                                RemapStationIdCB(),
                                RecordMeasurementNoteCB(),
                                RecordRefNoteCB(),
                                RecordSampleNoteCB(),
                                ConvertLonLatCB(),
                                SanitizeLonLatCB(),
                                SelectAndRenameColumnCB(get_renaming_rules, encoding_type='openrefine', verbose=verbose),
                                CompareDfsAndTfmCB(dfs)
                                ])
    tfm()

    encoder = OpenRefineCsvEncoder(tfm.dfs, 
                                    dest_fname=fname_out_csv, 
                                    ref_id = ref_id,
                                    verbose = verbose
                                )
    encoder.encode()

In [None]:
#|eval: false
# Encode to an Open Refine-compatible csv
encode_or(fname_in, fname_out_csv, ref_id=ref_id, verbose=True)