In [1]:
#| default_exp handlers.ospar

# OSPAR (WIP)
Data pipeline (handler) to convert OSPAR data ([source](https://odims.ospar.org/en/)) to `NetCDF` format.


***

## OSPAR Environment database

OSPAR [data](https://odims.ospar.org/en/) is provided as a Microsoft Access database. 
`Mdbtools` (https://github.com/mdbtools/mdbtools) can be used to convert the tables of the Microsoft Access database to `.csv` files on Unix-like OS.

Example steps:
1. Download data.
2. Install mdbtools via VScode Terminal 

    ```
    sudo apt-get -y install mdbtools
    ````

3. Install unzip via VScode Terminal 

    ```
    sudo apt-get -y install unzip
    ````

4. In VS code terminal, navigate to the marisco data folder

    ```
    cd /home/marisco/downloads/marisco/_data/accdb/mors_19840101_20211231
    ```

5. Unzip MORS_ENVIRONMENT.zip 

    ```
    unzip MORS_ENVIRONMENT.zip 
    ```

6. Run preprocess.sh to generate the required data files

    ```
    ./preprocess.sh MORS_ENVIRONMENT.zip
    ````
7. Conetens of 'preprocess.sh' script.
    ```
    #!/bin/bash

    # Example of use: ./preprocess.sh MORS_ENVIRONMENT.zip
    unzip $1
    dbname=$(ls *.accdb *.mdb)
    mkdir csv
    for table in $(mdb-tables -1 "$dbname"); do
        echo "Export table $table"
        mdb-export "$dbname" "$table" > "csv/$table.csv"
    done
    ```


***

## Packages import

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
#| export
import pandas as pd # Python package that provides fast, flexible, and expressive data structures.
import numpy as np
from tqdm import tqdm # Python Progress Bar Library
from functools import partial # Function which Return a new partial object which when called will behave like func called with the positional arguments args and keyword arguments keywords
import fastcore.all as fc # package that brings fastcore functionality, see https://fastcore.fast.ai/.
from pathlib import Path # This module offers classes representing filesystem paths
from dataclasses import asdict
import re # provides regular expression matching operations

from marisco.utils import (has_valid_varname, match_worms, match_maris_lut, Match)
from marisco.callbacks import (Callback, Transformer, EncodeTimeCB, SanitizeLonLatCB)
from marisco.metadata import (GlobAttrsFeeder, BboxCB, DepthRangeCB, TimeRangeCB, ZoteroCB, KeyValuePairCB)
from marisco.configs import (base_path, nc_tpl_path, cfg, cache_path, cdl_cfg, Enums, lut_path,
                             species_lut_path, sediments_lut_path, bodyparts_lut_path, unit_lut_path)
from marisco.serializers import NetCDFEncoder



In [4]:
import warnings
warnings.filterwarnings('ignore')

Get the current working directory (cwd).  

In [5]:
Path.cwd()

Path('/home/marisco/downloads/marisco/nbs/handlers')

Here we define the fname_in and fname_out variables. These variables are paths which are defined as relative paths. These paths are relative to 
the current working directory. Note that fname_in refers to the csv folder that contains the  HELCOM data. fname_out defines the path and filename for the NetCDF output.

In [6]:
fname_in = '../../_data/accdb/ospar/csv'
fname_out = '../../_data/output/ospar_19950103_2021214.nc'

***

## Utils

In [7]:
#| export
def load_data(src_dir,
                smp_types=['Seawater data', 'Biota data']):
    "Load OSPAR data and return them as an individual dataframe by sample type"
    '''
    Load data from the measurement files and sample information files found 
    in the src_dir (i.e. fname_in).
    Returns a dictionary of pandas' dataframes. The key to the dictionary is 
    the sample type (i.e lut_smp_type)
    '''    
    dfs = {}
    lut_smp_type = {'Seawater data': 'seawater', 'Biota data': 'biota'}
    for smp_type in smp_types:
        fname_meas = smp_type + '.csv' # measurement (i.e. radioactivity) information and sample information     
        df = pd.read_csv(Path(src_dir)/fname_meas, encoding='unicode_escape')
        dfs[lut_smp_type[smp_type]] = df
    return dfs

In [8]:
dfs = load_data(fname_in)
dfs

{'seawater':            ID Contracting Party  RSC Sub-division   Station ID Sample ID  \
 0           1           Belgium               8.0  Belgica-W01    WNZ 01   
 1           2           Belgium               8.0  Belgica-W02    WNZ 02   
 2           3           Belgium               8.0  Belgica-W03    WNZ 03   
 3           4           Belgium               8.0  Belgica-W04    WNZ 04   
 4           5           Belgium               8.0  Belgica-W05    WNZ 05   
 ...       ...               ...               ...          ...       ...   
 18851  121646    United Kingdom              10.0       Rosyth   2100318   
 18852  121647    United Kingdom              10.0       Rosyth   2101399   
 18853  121648    United Kingdom               6.0        Wylfa    21-656   
 18854  121649    United Kingdom               6.0        Wylfa    21-657   
 18855  121650    United Kingdom               6.0        Wylfa    21-654   
 
        LatD  LatM  LatS LatDir  LongD  ...  Sampling date  Nu

In [9]:
#| export
def rename_cols(cols):
    "Flatten multiindex columns"
    new_cols = []
    for outer, inner in cols:
        if not inner:
            new_cols.append(outer)
        else:
            if outer == 'unit':
                new_cols.append(inner + '_' + outer)
            if outer == 'unc':
                new_cols.append(inner + '_' + outer)
            if outer == 'value':
                new_cols.append(inner)
    return new_cols

***

## Load tables (dataframes)

dfs includes a dictionary of tables (dataframes) that is created from the OSPAR dataset defined by fname_in. The data to be included in each dataframe is sorted by sample type. Each dictionary is defined with a key equal to the sample type. 

In [10]:
dfs = load_data(fname_in)
dfs

{'seawater':            ID Contracting Party  RSC Sub-division   Station ID Sample ID  \
 0           1           Belgium               8.0  Belgica-W01    WNZ 01   
 1           2           Belgium               8.0  Belgica-W02    WNZ 02   
 2           3           Belgium               8.0  Belgica-W03    WNZ 03   
 3           4           Belgium               8.0  Belgica-W04    WNZ 04   
 4           5           Belgium               8.0  Belgica-W05    WNZ 05   
 ...       ...               ...               ...          ...       ...   
 18851  121646    United Kingdom              10.0       Rosyth   2100318   
 18852  121647    United Kingdom              10.0       Rosyth   2101399   
 18853  121648    United Kingdom               6.0        Wylfa    21-656   
 18854  121649    United Kingdom               6.0        Wylfa    21-657   
 18855  121650    United Kingdom               6.0        Wylfa    21-654   
 
        LatD  LatM  LatS LatDir  LongD  ...  Sampling date  Nu

List the keys for the dictionary of dataframes.  

In [11]:
keys=dfs.keys()
keys

dict_keys(['seawater', 'biota'])

Show the structure of the 'seawater' dataframe. 

In [12]:
dfs['seawater'].head()


Unnamed: 0,ID,Contracting Party,RSC Sub-division,Station ID,Sample ID,LatD,LatM,LatS,LatDir,LongD,...,Sampling date,Nuclide,Value type,Activity or MDA,Uncertainty,Unit,Data provider,Measurement Comment,Sample Comment,Reference Comment
0,1,Belgium,8.0,Belgica-W01,WNZ 01,51.0,22.0,31.0,N,3.0,...,27/01/2010,137Cs,<,0.2,,Bq/l,SCKCEN,,,
1,2,Belgium,8.0,Belgica-W02,WNZ 02,51.0,13.0,25.0,N,2.0,...,27/01/2010,137Cs,<,0.27,,Bq/l,SCKCEN,,,
2,3,Belgium,8.0,Belgica-W03,WNZ 03,51.0,11.0,4.0,N,2.0,...,27/01/2010,137Cs,<,0.26,,Bq/l,SCKCEN,,,
3,4,Belgium,8.0,Belgica-W04,WNZ 04,51.0,25.0,13.0,N,3.0,...,27/01/2010,137Cs,<,0.25,,Bq/l,SCKCEN,,,
4,5,Belgium,8.0,Belgica-W05,WNZ 05,51.0,24.0,58.0,N,2.0,...,26/01/2010,137Cs,<,0.2,,Bq/l,SCKCEN,,,


Show the structure of the 'biota' dataframe. 

In [13]:
dfs['biota'].head()

Unnamed: 0,ID,Contracting Party,RSC Sub-division,Station ID,Sample ID,LatD,LatM,LatS,LatDir,LongD,...,Sampling date,Nuclide,Value type,Activity or MDA,Uncertainty,Unit,Data provider,Measurement Comment,Sample Comment,Reference Comment
0,96793,United Kingdom,5,Hunterston,2200086,55,43,31.0,N,4,...,31/12/2021,"239,240Pu",=,0.351,0.066,Bq/kg f.w.,SEPA-Scottish Environment Protection Agency,,"PLZ. Annual bulk of 2 samples, representative ...",
1,96822,United Kingdom,6,Chapelcross,2200081,54,58,8.0,N,3,...,31/12/2021,99Tc,=,39.0,15.0,Bq/kg f.w.,SEPA-Scottish Environment Protection Agency,,PLZ,
2,96823,United Kingdom,7,Dounreay,2200093,58,33,57.0,N,3,...,31/12/2021,"239,240Pu",=,0.0938,0.018,Bq/kg f.w.,SEPA-Scottish Environment Protection Agency,,"Sandside Bay. Annual bulk of 4 samples, repre...",
3,96824,United Kingdom,7,Dounreay,2200089,58,37,7.0,N,3,...,31/12/2021,"239,240Pu",=,1.54,0.31,Bq/kg f.w.,SEPA-Scottish Environment Protection Agency,,"Brims Ness. Annual bulk of 4 samples, represe...",
4,96857,United Kingdom,10,Torness,2100074,55,57,53.0,N,2,...,31/12/2021,99Tc,=,16.0,6.0,Bq/kg f.w.,SEPA-Scottish Environment Protection Agency,,"Thornton Loch. Annual bulk of 2 samples, repre...",


***

## Data transformation pipeline

### Normalize nuclide names

**Lower & strip** 

Creates a class ,LowerStripRdnNameCB, that receives a dictionary of dataframes. For each dataframe in the dictionary of dataframes it coverts the contents of the nuclide name column, 'Nuclides', to lowercase and strips any leading or trailing whitespace(s). 

In [14]:
def get_rdn_format(var):
    # lowercase, strip separators (e.g. `-`,`,`) and any white-space(s)
    separators="-,"
    var= var.lower().translate({ord(x): '' for x in separators}).replace(" ", "")
    # Format nuclide name with number then letters (e.g. 137cs) to 
    # letters and then numbers (e.g. cs137).
    reg_num_str=re.compile("([0-9]+)([a-zA-Z]+)")
    sol=reg_num_str.match(var)
    if sol is not None:
        reg_group=sol.groups()
        var=reg_group[1]+reg_group[0]
    return (var)  
    

In [15]:
#| export
class LowerStripRdnNameCB(Callback):
    "Drop NaN nuclide names, convert nuclide names to lowercase, strip separators (e.g. `-`,`,`) and any trailing space(s)"
    def __init__(self, fn_format_rdn): fc.store_attr()
    def __call__(self, tfm):        
        # Apply condition to Nuclide col. 
        for k in tfm.dfs.keys():
            # drop nan values
            tfm.dfs[k] = tfm.dfs[k][tfm.dfs[k]['Nuclide'].notna()]
            # Apply condition
            tfm.dfs[k]['nuclide'] = tfm.dfs[k]['Nuclide'].apply(lambda x: self.fn_format_rdn(x))
                
                
    

Here we apply the transformer LowerStripRdnNameCB. Print the nuclide name that is unique from the column, 'Nuclide', of each dataframe include in the dictionary of dataframes. 

In [16]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format)])
print('seawater nuclides: ')
print(tfm()['seawater']['nuclide'].unique())
print('biota nuclides: ')
print(tfm()['biota']['nuclide'].unique())

seawater nuclides: 
['cs137' 'pu239240' 'ra226' 'ra228' 'tc99' 'h3' 'po210' 'pb210']
biota nuclides: 
['pu239240' 'tc99' 'cs137' 'ra226' 'ra228' 'pu238' 'am241' 'cs134' 'h3'
 'pb210' 'po210']


***


#### Remap to MARIS nuclide names 

The marisco package includes a template that defines the permitted structure of the data. This template is located at `nc_tpl_path` and is available in a `NetCDF` format.
This template can be viewed in a human-readable form as CDL (Common Data Language).

Path to maris-template.nc

In [17]:
nc_tpl_path()

Path('/home/marisco/.marisco/maris-template.nc')

*View the 'maris-template.nc' with 'ncdump' via Terminal*
```
cd /home/marisco/.marisco/
ncdump -h maris-template.nc
```

The function, get_unique_nuclides, returns list of unique nuclides from each dataframe that is included in the dictionary of dataframes.

In [18]:
#| export
def get_unique_nuclides(dfs):
    "Get list of unique radionuclide types measured across samples."
    nuclides = []
    for k in dfs.keys():
        nuclides += dfs[k]['nuclide'].unique().tolist()
    return nuclides

Function, has_valid_varname, checks if a variable defined in the dataframes (i.e. Helcom dataset), in this case nuclide names, are consistent with the template defined by maris-template.nc. If the variable name is not valid it will print the variable name. 

In [19]:
has_valid_varname(get_unique_nuclides(tfm.dfs), nc_tpl_path())

"pu239240" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL


False

Create a look up table, varnames_lut_updates, which will be used to correct the nuclide names in the dictionary of dataframes (i.e. dfs) that are not compatible with the template at nc_tpl_path. 

Note : Known error in Helcom dataset. cs138, cs139, cs140, cs141, cs142, cs143, cs144, cs145, cs146 are all cs137. 

In [20]:
#| export
varnames_lut_updates = {
    'pu239240': 'pu239_240_tot'}

Create a function, get_varnames_lut, which returns a dictionary of nuclide names. This dictionary of nuclide names includes the 'Nuclide' names in the dictionary and the corrections included in varnames_lut_updates.

In [21]:
#| export
def get_varnames_lut(dfs, lut=varnames_lut_updates):
    lut = {n: n for n in set(get_unique_nuclides(dfs))}
    lut.update(varnames_lut_updates)
    return lut

Create the varnames_lut variable, a dictionary of nuclide names including updates defined by varnames_lut_updates.  

In [22]:
#|eval: false
varnames_lut = partial(get_varnames_lut, lut=varnames_lut_updates)(tfm.dfs)
varnames_lut

{'ra226': 'ra226',
 'pu239240': 'pu239_240_tot',
 'cs134': 'cs134',
 'tc99': 'tc99',
 'po210': 'po210',
 'h3': 'h3',
 'am241': 'am241',
 'pu238': 'pu238',
 'ra228': 'ra228',
 'pb210': 'pb210',
 'cs137': 'cs137'}

Create a class that remaps the nuclide names in the dfs to those in varnames_lut_updates.

In [23]:
# | export
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names."
    def __init__(self,
                 fn_lut=partial(get_varnames_lut, lut=varnames_lut_updates)):
        fc.store_attr()
        
    def __call__(self, tfm):       
        # Replace 'Nuclide' vars according to lut. 
        lut = self.fn_lut(tfm.dfs)
        for k in tfm.dfs.keys():
            tfm.dfs[k]['nuclide'].replace(lut, inplace=True)


Apply the transformers LowerStripRdnNameCB and RemapRdnNameCB. Print the unique nuclides for each dataframe included in the dictionary of dataframes. 

In [24]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB()])
print('seawater nuclides: ')
print(tfm()['seawater']['nuclide'].unique())
print('biota nuclides: ')
print(tfm()['biota']['nuclide'].unique())


seawater nuclides: 
['cs137' 'pu239_240_tot' 'ra226' 'ra228' 'tc99' 'h3' 'po210' 'pb210']
biota nuclides: 
['pu239_240_tot' 'tc99' 'cs137' 'ra226' 'ra228' 'pu238' 'am241' 'cs134'
 'h3' 'pb210' 'po210']


Check that all nuclide varnames are valid. Returns True if all are valid.

In [25]:
has_valid_varname(get_unique_nuclides(tfm.dfs), nc_tpl_path())

True

### Parse time

Create a class that remaps the time format in the dictionary of dataframes (i.e. '%d/%m/%Y')

In [26]:
#| export
class ParseTimeCB(Callback):
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # drop nan values
            tfm.dfs[k] = tfm.dfs[k][tfm.dfs[k]['Sampling date'].notna()]            
            tfm.dfs[k]['time'] = pd.to_datetime(tfm.dfs[k]['Sampling date'], 
                                                format='%d/%m/%Y')
                

In [27]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB()])

print(tfm()['seawater']['time'][:5])

0   2010-01-27
1   2010-01-27
2   2010-01-27
3   2010-01-27
4   2010-01-26
Name: time, dtype: datetime64[ns]


***

### Lookup

#### Biota species

Review unique Rubin names included in the biota dataframe. 

In [36]:
#|export
def get_maris_lut(df_biota,
                  fname_cache, # For instance 'species_ospar.pkl'
                  data_provider_name_col:str, # Data provider lookup column name of interest
                  maris_lut:str, # MARIS source lookup table name and path
                  maris_id: str, # Id of MARIS lookup table nomenclature item to match
                  maris_name: str, # Name of MARIS lookup table nomenclature item to match
                  unmatched_fixes={},
                  as_dataframe=False,
                  overwrite=False
                 ):
    fname_cache = cache_path() / fname_cache
    lut = {}

    if overwrite or (not fname_cache.exists()):        
        df = pd.DataFrame({data_provider_name_col : df_biota[data_provider_name_col].unique()})
        for _, row in tqdm(df.iterrows(), total=len(df)):
            
            # Fix if unmatched
            has_to_be_fixed = row[data_provider_name_col] in unmatched_fixes       
            name_to_match = unmatched_fixes[row[data_provider_name_col]] if has_to_be_fixed else row[data_provider_name_col]

            # Match
            result = match_maris_lut(maris_lut, name_to_match, maris_id, maris_name)
            match = Match(result.iloc[0][maris_id], result.iloc[0][maris_name], 
                        row[data_provider_name_col], result.iloc[0]['score'])
                    
            lut[row[data_provider_name_col]] = match
            
        fc.save_pickle(fname_cache, lut)
    else:
        lut = fc.load_pickle(fname_cache)

    if as_dataframe:
        df_lut = pd.DataFrame({k: asdict(v) for k, v in lut.items()}).transpose()
        df_lut.index.name = 'source_id'
        return df_lut.sort_values(by='match_score', ascending=False)
    else:
        return lut

In [37]:
#|export
# key equals name in dfs['biota']. 
# value equals replacement name to use in match_maris_lut (i.e. name_to_match)
unmatched_fixes_biota_species = {}

using dbo_species_expanded.xlsx which does not  includes 'Not available' like dbo_species.xlsx

TODO: investigate speeding this up. 

In [38]:
# drop nan values
tfm.dfs['biota']=tfm.dfs['biota'][tfm.dfs['biota']['Species'].notna()]

species_lut_df = get_maris_lut(df_biota=tfm.dfs['biota'], 
                                fname_cache='species_ospar.pkl', 
                                data_provider_name_col='Species',
                                maris_lut=species_lut_path(),
                                maris_id='species_id',
                                maris_name='species',
                                unmatched_fixes=unmatched_fixes_biota_species,
                                as_dataframe=True,
                                overwrite=True)

  0%|          | 0/155 [00:00<?, ?it/s]

100%|██████████| 155/155 [01:14<00:00,  2.08it/s]


TODO Mixed species ID (e.g.RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA ). Drop?

Show maris_species_lut where match_type is not a perfect match ( i.e. not equal 0).

In [40]:
species_lut_df[species_lut_df['match_score'] > 1]

Unnamed: 0_level_0,matched_id,matched_maris_name,source_name,match_score
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA,898,Rhombosolea leporina,RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA,31
"Mixture of green, red and brown algae",1041,Melongena melongena,"Mixture of green, red and brown algae",26
Solea solea (S.vulgaris),161,Loligo vulgaris,Solea solea (S.vulgaris),12
SOLEA SOLEA (S.VULGARIS),161,Loligo vulgaris,SOLEA SOLEA (S.VULGARIS),12
CERASTODERMA (CARDIUM) EDULE,274,Cerastoderma edule,CERASTODERMA (CARDIUM) EDULE,10
Cerastoderma (Cardium) Edule,274,Cerastoderma edule,Cerastoderma (Cardium) Edule,10
MONODONTA LINEATA,425,Osilinus lineatus,MONODONTA LINEATA,9
NUCELLA LAPILLUS,1074,Nacella concinna,NUCELLA LAPILLUS,9
DICENTRARCHUS (MORONE) LABRAX,424,Dicentrarchus labrax,DICENTRARCHUS (MORONE) LABRAX,9
Pleuronectiformes [order],411,Pleuronectiformes,Pleuronectiformes [order],8


Match unmatched biota_species

In [46]:
#|export
# LookupBiotaSpeciesCB filters 'Not available'. 
unmatched_fixes_biota_species = {'RHODYMENIA PSEUDOPALAMATA & PALMARIA PALMATA': 'Not available', # mix
 'Mixture of green, red and brown algae': 'Not available', #mix 
 'Solea solea (S.vulgaris)': 'Solea solea',
 'SOLEA SOLEA (S.VULGARIS)': 'Solea solea',
 'CERASTODERMA (CARDIUM) EDULE': 'Cerastoderma edule',
 'Cerastoderma (Cardium) Edule': 'Cerastoderma edule',
 'MONODONTA LINEATA': 'Phorcus lineatus',
 'NUCELLA LAPILLUS': 'Not available', # Droped. In worms 'Nucella lapillus (Linnaeus, 1758)', 
 'DICENTRARCHUS (MORONE) LABRAX': 'Dicentrarchus labrax',
 'Pleuronectiformes [order]': 'Pleuronectiformes',
 'RAJIDAE/BATOIDEA': 'Not available', #mix 
 'PALMARIA PALMATA': 'Not available', # Droped. In worms 'Palmaria palmata (Linnaeus) F.Weber & D.Mohr, 1805',
 'Sepia spp.': 'Sepia',
 'Rhodymenia spp.': 'Rhodymenia',
 'unknown': 'Not available',
 'RAJA DIPTURUS BATIS': 'Dipturus batis',
 'Unknown': 'Not available',
 'Flatfish': 'Not available',
 'FUCUS SPP.': 'FUCUS',
 'Patella sp.': 'Patella',
 'Gadus sp.': 'Gadus',
 'FUCUS spp': 'FUCUS',
 'Tapes sp.': 'Tapes',
 'Thunnus sp.': 'Thunnus',
 'RHODYMENIA spp': 'RHODYMENIA',
 'Fucus sp.': 'Fucus',
 'PECTINIDAE': 'Not available', # Droped. In worms as PECTINIDAE is a family.
 'PLUERONECTES PLATESSA': 'Pleuronectes platessa',
 'Gaidropsarus argenteus': 'Gaidropsarus argentatus'}

In [48]:
# Drop row in the dfs['biota] where the unmatched_fixes_biota_species value is 'Not available'. 
na_list = ['Not available']     
na_biota_species = [k for k,v in unmatched_fixes_biota_species.items() if v in na_list]
tfm.dfs['biota'] = tfm.dfs['biota'][~tfm.dfs['biota']['Species'].isin(na_biota_species)]
# drop nan values
tfm.dfs['biota']=tfm.dfs['biota'][tfm.dfs['biota']['Species'].notna()]
species_lut_df = get_maris_lut(df_biota=tfm.dfs['biota'], 
                                fname_cache='species_ospar.pkl', 
                                data_provider_name_col='Species',
                                maris_lut=species_lut_path(),
                                maris_id='species_id',
                                maris_name='species',
                                unmatched_fixes=unmatched_fixes_biota_species,
                                as_dataframe=True,
                                overwrite=True)

  0%|          | 0/146 [00:00<?, ?it/s]

100%|██████████| 146/146 [01:10<00:00,  2.08it/s]


In [49]:
species_lut_df[species_lut_df['match_score'] >= 1]

Unnamed: 0_level_0,matched_id,matched_maris_name,source_name,match_score
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [50]:
#| export
class LookupBiotaSpeciesCB(Callback):
    """
    Biota species remapped to MARIS db:

    """
    def __init__(self, fn_lut, unmatched_fixes_biota_species): fc.store_attr()
    def __call__(self, tfm):
        lut = self.fn_lut(df_biota=tfm.dfs['biota'])      
        # Drop rows where 'Species' are 'nan'
        tfm.dfs['biota'] = tfm.dfs['biota'][tfm.dfs['biota']['Species'].notna()]
        # Drop row in the dfs['biota] where the unmatched_fixes_biota_species value is 'Not available'. 
        na_list = ['Not available']     
        na_biota_species = [k for k,v in self.unmatched_fixes_biota_species.items() if v in na_list]
        tfm.dfs['biota'] = tfm.dfs['biota'][~tfm.dfs['biota']['Species'].isin(na_biota_species)]
        # Perform lookup 
        tfm.dfs['biota']['species'] = tfm.dfs['biota']['Species'].apply(lambda x: lut[x].matched_id)
        

In [51]:
#| export
get_maris_species = partial(get_maris_lut, 
                fname_cache='species_ospar.pkl', 
                data_provider_name_col='SCIENTIFIC NAME',
                maris_lut=species_lut_path(),
                maris_id='species_id',
                maris_name='species',
                unmatched_fixes=unmatched_fixes_biota_species,
                as_dataframe=False,
                overwrite=False)

In [52]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species)
                            ])
print(tfm()['biota'])


          ID Contracting Party  RSC Sub-division   Station ID   Sample ID  \
0      96793    United Kingdom                 5   Hunterston     2200086   
1      96822    United Kingdom                 6  Chapelcross     2200081   
2      96823    United Kingdom                 7     Dounreay     2200093   
3      96824    United Kingdom                 7     Dounreay     2200089   
4      96857    United Kingdom                10      Torness     2100074   
...      ...               ...               ...          ...         ...   
15309  54203    United Kingdom                 6   Sellafield  1995001077   
15310  48606            France                 2    Granville         NaN   
15311  48634            France                 2    Granville         NaN   
15312  48650            France                 2     Dielette         NaN   
15313  48610            France                 2        Goury         NaN   

       LatD  LatM  LatS LatDir  LongD  ...  Activity or MDA  Uncertainty  \

***

#### Biota tissues

##### Correct OSPAR 'Body Part' labelled as Whole

The OSPAR data includes entries with the variable Body Part labelled as whole. The Maris data requires that the body 'body_part' distinguishes between 'Whole animal' and 'Whole plant'. The OSPAR data defines the 'Biological group' which allows for the Body Part labelled as whole to be defined as 'Whole animal' and 'Whole plant'. 

In [53]:
#| export
whole_animal_plant = {'whole' : ['Whole','WHOLE', 'WHOLE FISH', 'Whole fisk', 'Whole fish'],
                      'Whole animal' : ['Molluscs','Fish','FISH','molluscs','fish','MOLLUSCS'],
                      'Whole plant' : ['Seaweed','seaweed','SEAWEED'] }

In [54]:

#| export
class CorrectWholeBodyPart(Callback):
    """
    Update bodypart labeled as 'whole' to either 'Whole animal' or 'Whole plant'.
    """
    
    def __init__(self, wap=whole_animal_plant): fc.store_attr()
    
    def __call__(self, tfm):        
        tfm.dfs['biota'] = self.correct_whole_body_part(tfm.dfs['biota'],self.wap)

    def correct_whole_body_part(self, df, wap):
        whole_list= wap['whole']
        animal_list = wap['Whole animal']
        plant_lst = wap['Whole plant']
        df['body_part']=df['Body Part']   
        df['body_part'].loc[(df['body_part'].isin(whole_list)) & (df['Biological group'].isin(animal_list))] = 'Whole animal'
        df['body_part'].loc[(df['body_part'].isin(whole_list)) & (df['Biological group'].isin(plant_lst))] = 'Whole plant'
        
        return df

In [55]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart()
                            ])
print(tfm()['biota']['body_part'].unique())

['SOFT PARTS' 'GROWING TIPS' 'Whole plant' 'Whole animal'
 'FLESH WITHOUT BONES' 'WHOLE ANIMAL' 'WHOLE PLANT' 'Soft Parts'
 'Whole without head' 'Cod medallion' 'Muscle'
 'Mix of muscle and whole fish without liver' 'Flesh' 'FLESH WITHOUT BONE'
 'UNKNOWN' 'FLESH' 'FLESH WITH SCALES' 'HEAD' 'Flesh without bones'
 'Soft parts' 'whole plant' 'LIVER' 'MUSCLE']


Get a dataframe of matched OSPAR biota tissues with Maris Bodyparts

In [64]:
#|export
unmatched_fixes_biota_tissues = {}

In [65]:
tissues_lut_df = get_maris_lut(df_biota=tfm.dfs['biota'], 
                                fname_cache='tissues_ospar.pkl', 
                                data_provider_name_col='body_part',
                                maris_lut=bodyparts_lut_path(),
                                maris_id='bodypar_id',
                                maris_name='bodypar',
                                unmatched_fixes=unmatched_fixes_biota_tissues,
                                as_dataframe=True,
                                overwrite=True)
tissues_lut_df

100%|██████████| 23/23 [00:00<00:00, 100.91it/s]


Unnamed: 0_level_0,matched_id,matched_maris_name,source_name,match_score
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mix of muscle and whole fish without liver,52,Flesh without bones,Mix of muscle and whole fish without liver,27
Whole without head,52,Flesh without bones,Whole without head,10
Cod medallion,9,Endoskeleton,Cod medallion,9
UNKNOWN,12,Skin,UNKNOWN,5
FLESH,42,Leaf,FLESH,3
Flesh,42,Leaf,Flesh,3
FLESH WITHOUT BONE,52,Flesh without bones,FLESH WITHOUT BONE,1
LIVER,25,Liver,LIVER,0
whole plant,40,Whole plant,whole plant,0
Soft parts,19,Soft parts,Soft parts,0


List unmatched OSPAR tissues

In [66]:
tissues_lut_df[tissues_lut_df['match_score'] >= 1]['source_name'].tolist()

['Mix of muscle and whole fish without liver',
 'Whole without head',
 'Cod medallion',
 'UNKNOWN',
 'FLESH',
 'Flesh',
 'FLESH WITHOUT BONE']

Read Maris tissue lut to correct unmatched tissues

In [59]:
marisco_lut_df=pd.read_excel(bodyparts_lut_path())
marisco_lut_df

Unnamed: 0,bodypar_id,bodypar,bodycode,groupcode
0,0,(Not available),0,0
1,1,Whole animal,WHOA,WHO
2,2,Whole animal eviscerated,WHOEV,WHO
3,3,Whole animal eviscerated without head,WHOHE,WHO
4,4,Flesh with bones,FLEB,FLEB
...,...,...,...,...
56,56,Growing tips,GTIP,PHAN
57,57,Upper parts of plants,UPPL,PHAN
58,58,Lower parts of plants,LWPL,PHAN
59,59,Shells/carapace,SHCA,SKEL


Create a dictionary of unmatched tissues to allow for  correctection

In [72]:
unmatched_fixes_biota_tissues = {
'Mix of muscle and whole fish without liver' : 'Not available', # Drop
 'Whole without head' : 'Whole animal eviscerated without head', # Drop? eviscerated? ,
 'Cod medallion' : 'Whole animal eviscerated without head',
 'FLESH' : 'Flesh without bones', # Drop? with or without bones?
 'Flesh' : 'Flesh without bones', # Drop? with or without bones?
 'UNKNOWN' : 'Not available',
 'FLESH WITHOUT BONE' : 'Flesh without bones'
}

In [76]:
# Drop row in the dfs['biota] where the unmatched_fixes_biota_species value is 'Not available'. 
na_list = ['Not available']     
na_biota_tissues = [k for k,v in unmatched_fixes_biota_tissues.items() if v in na_list]
tfm.dfs['biota'] = tfm.dfs['biota'][~tfm.dfs['biota']['body_part'].isin(na_biota_tissues)]

tissues_lut_df = get_maris_lut(df_biota=tfm.dfs['biota'], 
                                fname_cache='tissues_ospar.pkl', 
                                data_provider_name_col='body_part',
                                maris_lut=bodyparts_lut_path(),
                                maris_id='bodypar_id',
                                maris_name='bodypar',
                                unmatched_fixes=unmatched_fixes_biota_tissues,
                                as_dataframe=True,
                                overwrite=True)
tissues_lut_df

100%|██████████| 21/21 [00:00<00:00, 93.89it/s]


Unnamed: 0_level_0,matched_id,matched_maris_name,source_name,match_score
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SOFT PARTS,19,Soft parts,SOFT PARTS,0
Flesh,52,Flesh without bones,Flesh,0
LIVER,25,Liver,LIVER,0
whole plant,40,Whole plant,whole plant,0
Soft parts,19,Soft parts,Soft parts,0
Flesh without bones,52,Flesh without bones,Flesh without bones,0
HEAD,13,Head,HEAD,0
FLESH WITH SCALES,60,Flesh with scales,FLESH WITH SCALES,0
FLESH,52,Flesh without bones,FLESH,0
FLESH WITHOUT BONE,52,Flesh without bones,FLESH WITHOUT BONE,0


List unmatched OSPAR tissues

In [77]:
tissues_lut_df[tissues_lut_df['match_score'] >= 1]['source_name'].tolist()

[]

In [88]:
#| export
class LookupBiotaBodyPartCB(Callback):
    """
    Update bodypart id based on MARIS dbo_bodypar.xlsx:
        - 3: 'Whole animal eviscerated without head',
        - 12: 'Viscera',
        - 8: 'Skin'
    """
    def __init__(self, fn_lut, unmatched_fixes_biota_tissues): fc.store_attr()
    def __call__(self, tfm):
        lut = self.fn_lut(df_biota=tfm.dfs['biota'])      
        # Drop rows where 'Species' are 'nan'
        tfm.dfs['biota']=tfm.dfs['biota'][tfm.dfs['biota']['body_part'].notna()]
        # Drop row in the dfs['biota] where the unmatched_fixes_biota_species value is 'Not available'. 
        na_list = ['Not available']     
        na_biota_tissues = [k for k,v in self.unmatched_fixes_biota_tissues.items() if v in na_list]
        tfm.dfs['biota'] = tfm.dfs['biota'][~tfm.dfs['biota']['body_part'].isin(na_biota_tissues)]
        # Perform lookup         
        tfm.dfs['biota']['body_part'] = tfm.dfs['biota']['body_part'].apply(lambda x: lut[x].matched_id)

In [89]:
get_maris_bodypart=partial(get_maris_lut, 
                            fname_cache='tissues_ospar.pkl', 
                            data_provider_name_col='body_part',
                            maris_lut=bodyparts_lut_path(),
                            maris_id='bodypar_id',
                            maris_name='bodypar',
                            unmatched_fixes=unmatched_fixes_biota_tissues,
                            as_dataframe=False,
                            overwrite=False)
tissues_lut_df.head()

Unnamed: 0_level_0,matched_id,matched_maris_name,source_name,match_score
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SOFT PARTS,19,Soft parts,SOFT PARTS,0
Flesh,52,Flesh without bones,Flesh,0
LIVER,25,Liver,LIVER,0
whole plant,40,Whole plant,whole plant,0
Soft parts,19,Soft parts,Soft parts,0


In [91]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species,unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart,unmatched_fixes_biota_tissues)
                            ])
print(tfm()['biota'][['Body Part', 'body_part']][:5])

      Body Part  body_part
0    SOFT PARTS         19
1  GROWING TIPS         56
2    SOFT PARTS         19
3    SOFT PARTS         19
4  GROWING TIPS         56


***

#### Biogroup

Define bio_group

In [92]:
#| export
def get_biogroup_lut(maris_lut):
    species = pd.read_excel(maris_lut)
    return species[['species_id', 'biogroup_id']].set_index('species_id').to_dict()['biogroup_id']

In [93]:
#| export
class LookupBiogroupCB(Callback):
    """
    Update biogroup id  based on MARIS dbo_species.xlsx
    """
    def __init__(self, fn_lut): fc.store_attr()
    def __call__(self, tfm):
        lut = self.fn_lut()        
        tfm.dfs['biota']['bio_group'] = tfm.dfs['biota']['species'].apply(lambda x: lut[x])

In [94]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path()))
                            ])
print(tfm()['biota']['bio_group'].unique())

[ 6. 11.  4.  2.  5. 13.]


***

### Capture Units

View units

In [95]:
tfm.dfs['seawater'][ 'Unit'].unique()

array(['Bq/l', nan, 'Bq/L', 'BQ/L'], dtype=object)

In [96]:
tfm.dfs['biota'][ 'Unit'].unique()

array(['Bq/kg f.w.', 'Bq/kg.fw', 'Bq/kg fw', 'Bq/kg f.w'], dtype=object)

TODO : Review units!
see units_df. 
We have Bq/m3 or atom/l	 but no Bq/l. 

Is unit_id 5 Bq/kg wet?

In [97]:
units_df=pd.read_excel(unit_lut_path())
units_df

Unnamed: 0,unit_id,unit,unit_sanitized,ordlist,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,0,NOT AVAILABLE,NOT AVAILABLE,0,,,
1,1,Bq/m3,Bq per m3,1,Bq/m3,,Bq/m<sup>3</sup>
2,2,Bq/m2,Bq per m2,2,,,
3,3,Bq/kg,Bq per kg,3,,,
4,4,Bq/kgd,Bq per kgd,4,,,
5,5,Bq/kgw,Bq per kgw,5,,,
6,6,kg/kg,kg per kg,6,,,
7,7,TU,TU,7,,,
8,8,DELTA/mill,DELTA per mill,8,,,
9,9,atom/kg,atom per kg,9,,,


In [98]:
#| export
# Define unit names renaming rules
renaming_unit_rules = {'Bq/l': 1, #'Bq/m3'
                       'Bq/L': 1,
                       'BQ/L': 1,
                       'Bq/kg f.w.': 5, # Bq/kgw
                       'Bq/kg.fw' : 5,
                       'Bq/kg fw' : 5,
                       'Bq/kg f.w' : 5 
                       } 

In [99]:
#| export
class LookupUnitCB(Callback):
    def __init__(self,
                 lut=renaming_unit_rules):
        fc.store_attr()
    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            # Drop rows where 'Species' are 'nan'
            tfm.dfs[grp]=tfm.dfs[grp][tfm.dfs[grp]['Unit'].notna()]
            # Perform lookup         
            tfm.dfs[grp]['unit'] = tfm.dfs[grp]['Unit'].apply(lambda x: self.lut[x])

In [101]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules)
                            ])
print(tfm()['biota']['unit'].unique())

[5]


***

### Value, Uncertainty and Detection Limit

TODO: Review OSPAR '>'? See tfm.dfs[grp]['Value type'].unique()

In [102]:
grp='biota'
tfm.dfs[grp]['Value type'].unique()

array(['=', '<', '>', nan], dtype=object)

In [103]:
# | export
class RemapValueUncertaintyDetectionLimit(Callback):
    "Remamp activity value, activity uncertainty and detection limit to MARIS format."
    def __init__(self):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp]['detection_limit'] = np.where(tfm.dfs[grp]['Value type'] == '<', tfm.dfs[grp]['Activity or MDA'] , np.NaN)
            tfm.dfs[grp]['value'] = np.where(tfm.dfs[grp]['Value type'] == '=', tfm.dfs[grp]['Activity or MDA'] , np.NaN)
            tfm.dfs[grp]['uncertainty'] = np.where(tfm.dfs[grp]['Value type'] == '=', tfm.dfs[grp]['Uncertainty'] , np.NaN)


In [104]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules),
                            RemapValueUncertaintyDetectionLimit()
                            ])
tfm()['seawater'].head()

Unnamed: 0,ID,Contracting Party,RSC Sub-division,Station ID,Sample ID,LatD,LatM,LatS,LatDir,LongD,...,Data provider,Measurement Comment,Sample Comment,Reference Comment,nuclide,time,unit,detection_limit,value,uncertainty
0,1,Belgium,8.0,Belgica-W01,WNZ 01,51.0,22.0,31.0,N,3.0,...,SCKCEN,,,,cs137,2010-01-27,1,0.2,,
1,2,Belgium,8.0,Belgica-W02,WNZ 02,51.0,13.0,25.0,N,2.0,...,SCKCEN,,,,cs137,2010-01-27,1,0.27,,
2,3,Belgium,8.0,Belgica-W03,WNZ 03,51.0,11.0,4.0,N,2.0,...,SCKCEN,,,,cs137,2010-01-27,1,0.26,,
3,4,Belgium,8.0,Belgica-W04,WNZ 04,51.0,25.0,13.0,N,3.0,...,SCKCEN,,,,cs137,2010-01-27,1,0.25,,
4,5,Belgium,8.0,Belgica-W05,WNZ 05,51.0,24.0,58.0,N,2.0,...,SCKCEN,,,,cs137,2010-01-26,1,0.2,,


***

# Long Lat 

TODO Review Latitude/Longitude units 

Continuing with Decimal Degrees (DDD.DDDDD°)


In [105]:
# | export
class ConvertLonLat(Callback):
    "Convert Longitude and Latitude values to DDD.DDDDD°"
    def __init__(self):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp]['latitude'] = np.where(tfm.dfs[grp]['LatDir'].isin(['S']), ((tfm.dfs[grp]['LatD'] + tfm.dfs[grp]['LatM']/60 + tfm.dfs[grp]['LatS'] /(60*60))* (-1)), (tfm.dfs[grp]['LatD'] + tfm.dfs[grp]['LatM']/60 + tfm.dfs[grp]['LatS'] /(60*60)))
            tfm.dfs[grp]['longitude'] = np.where(tfm.dfs[grp]['LongDir'].isin(['W']), ((tfm.dfs[grp]['LongD'] + tfm.dfs[grp]['LongM']/60 + tfm.dfs[grp]['LongS'] /(60*60))* (-1)), (tfm.dfs[grp]['LongD'] + tfm.dfs[grp]['LongM']/60 + tfm.dfs[grp]['LongS'] /(60*60)))

In [106]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules),
                            RemapValueUncertaintyDetectionLimit(),
                            ConvertLonLat()
                            ])
tfm()['seawater'][['latitude','LatD', 'LatM', 'LatS', 'longitude', 'LatDir', 'LongD', 'LongM','LongS', 'LongDir']]

Unnamed: 0,latitude,LatD,LatM,LatS,longitude,LatDir,LongD,LongM,LongS,LongDir
0,51.375278,51.0,22.0,31.0,3.188056,N,3.0,11.0,17.0,E
1,51.223611,51.0,13.0,25.0,2.859444,N,2.0,51.0,34.0,E
2,51.184444,51.0,11.0,4.0,2.713611,N,2.0,42.0,49.0,E
3,51.420278,51.0,25.0,13.0,3.262222,N,3.0,15.0,44.0,E
4,51.416111,51.0,24.0,58.0,2.809722,N,2.0,48.0,35.0,E
...,...,...,...,...,...,...,...,...,...,...
18851,56.011111,56.0,0.0,40.0,-3.406667,N,3.0,24.0,24.0,W
18852,56.011111,56.0,0.0,40.0,-3.406667,N,3.0,24.0,24.0,W
18853,53.413333,53.0,24.0,48.0,-3.870278,N,3.0,52.0,13.0,W
18854,53.569722,53.0,34.0,11.0,-3.769722,N,3.0,46.0,11.0,W


***

### Encode time (seconds since ...)

In [107]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules),
                            RemapValueUncertaintyDetectionLimit(),
                            ConvertLonLat(),
                            EncodeTimeCB(cfg())
                            ])
tfm()['seawater']['time']

0        1264550400
1        1264550400
2        1264550400
3        1264550400
4        1264464000
            ...    
18851    1619654400
18852    1639094400
18853    1617753600
18854    1617753600
18855    1617753600
Name: time, Length: 15652, dtype: int64

******

# Compare DFS and TFM data


In [111]:
# | export
class CompareDfsAndTfm(Callback):
    "Create a dfs of dropped data. Data included in the DFS not in the TFM"
    def __init__(self, dfs_compare):
        fc.store_attr()

    def __call__(self, tfm):
        tfm.dfs_dropped={}
        tfm.compare_stats={}
        for grp in tfm.dfs.keys():
            dfs_all = self.dfs_compare[grp].merge(tfm.dfs[grp], on=self.dfs_compare[grp].columns.to_list(), how='left', indicator=True)
            tfm.dfs_dropped[grp]=dfs_all[dfs_all['_merge'] == 'left_only']  
            tfm.compare_stats[grp]= {'Number of rows dfs:' : len(self.dfs_compare[grp].index),
                                     'Number of rows tfm.dfs:' : len(tfm.dfs[grp].index),
                                     'Number of dropped rows:' : len(tfm.dfs_dropped[grp].index),
                                     'Number of rows tfm.dfs + Number of dropped rows:' : len(tfm.dfs[grp].index) + len(tfm.dfs_dropped[grp].index)
                                    }
            
            
                        

In [112]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules),
                            RemapValueUncertaintyDetectionLimit(),
                            ConvertLonLat(),
                            EncodeTimeCB(cfg()),
                            CompareDfsAndTfm(dfs)
                            ])
tfm()
tfm.compare_stats

{'seawater': {'Number of rows dfs:': 18856,
  'Number of rows tfm.dfs:': 15652,
  'Number of dropped rows:': 3204,
  'Number of rows tfm.dfs + Number of dropped rows:': 18856},
 'biota': {'Number of rows dfs:': 15314,
  'Number of rows tfm.dfs:': 13188,
  'Number of dropped rows:': 2126,
  'Number of rows tfm.dfs + Number of dropped rows:': 15314}}

In [113]:
dfs_dropped_biota=tfm.dfs_dropped['biota']
dfs_dropped_seawater=tfm.dfs_dropped['seawater']

TODO: In the dfs_dropped_biota many of the 'species' is NaN.

TODO: In the dfs_dropped_seawater many of the 'Unit' is NaN.

***

### Rename columns

TODO: Ask about SALIN and TTEMP. Some comment include this data. 
TODO: TDEPTH . We could approx TDEPTH based on location. 

In [114]:
#| export
# Define columns of interest by sample type
coi_grp = {'seawater': ['Nuclide', 'value', 'uncertainty','detection_limit','unit', 'time', 'Sampling depth',
                        'latitude', 'longitude'],
           'biota': ['Nuclide', 'value', 'uncertainty','detection_limit','unit', 'time', 'latitude', 'longitude',
                     'species', 'body_part', 'bio_group']}

In [115]:
vars = cdl_cfg()['vars']
vars['defaults'].keys()

dict_keys(['lon', 'lat', 'depth', 'time'])

TODO: Review the cdl_cfg as latest Helcom uses "vars['defaults']['smp_depth']['name']". 

In [116]:
#| export
def get_renaming_rules():
    vars = cdl_cfg()['vars']
    # Define column names renaming rules
    return {
        'Nuclide': 'nuclide',
        'uncertainty': vars['suffixes']['uncertainty']['name'],
        'Sampling depth': vars['defaults']['depth']['name'],
        #'Sampling depth': vars['defaults']['smp_depth']['name'],
        'latitude': vars['defaults']['lat']['name'],
        'longitude': vars['defaults']['lon']['name'],
        'unit': vars['suffixes']['unit']['name'],
        'detection_limit': vars['suffixes']['detection_limit']['name']
    }

In [117]:
#| export
class RenameColumnCB(Callback):
    def __init__(self,
                 coi,
                 fn_renaming_rules,
                ):
        fc.store_attr()

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # Select cols of interest
            tfm.dfs[k] = tfm.dfs[k].loc[:, self.coi[k]]

            # Rename cols
            tfm.dfs[k].rename(columns=self.fn_renaming_rules(), inplace=True)

In [118]:

dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules),
                            RemapValueUncertaintyDetectionLimit(),
                            ConvertLonLat(),
                            EncodeTimeCB(cfg()),
                            #CompareDfsAndTfm(dfs),
                            RenameColumnCB(coi_grp, get_renaming_rules)
                            ])                            
                            
tfm()['seawater']

Unnamed: 0,nuclide,value,_unc,_dl,_unit,time,depth,lat,lon
0,137Cs,,,0.2000,1,1264550400,3.0,51.375278,3.188056
1,137Cs,,,0.2700,1,1264550400,3.0,51.223611,2.859444
2,137Cs,,,0.2600,1,1264550400,3.0,51.184444,2.713611
3,137Cs,,,0.2500,1,1264550400,3.0,51.420278,3.262222
4,137Cs,,,0.2000,1,1264464000,3.0,51.416111,2.809722
...,...,...,...,...,...,...,...,...,...
18851,3H,,,1.0000,1,1619654400,0.0,56.011111,-3.406667
18852,3H,,,1.0500,1,1639094400,0.0,56.011111,-3.406667
18853,137Cs,0.00431,0.000543,,1,1617753600,0.0,53.413333,-3.870278
18854,137Cs,0.00946,0.000253,,1,1617753600,0.0,53.569722,-3.769722


***

# ReshapeLongToWide

In [119]:
#| export
class ReshapeLongToWide(Callback):
    "Convert data from long to wide with renamed columns."
    def __init__(self, columns='nuclide', values=['value']):
        fc.store_attr()
        # Retrieve all possible derived vars (e.g 'unc', 'dl', ...) from configs
        self.derived_cols = [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]
    
    def renamed_cols(self, cols):
        "Flatten columns name"
        return [inner if outer == "value" else f'{inner}{outer}'
                if inner else outer
                for outer, inner in cols]

    def pivot(self, df):
        # Among all possible 'derived cols' select the ones present in df
        derived_coi = [col for col in self.derived_cols if col in df.columns]
        
        df.reset_index(names='sample', inplace=True)
        
        idx = list(set(df.columns) - set([self.columns] + derived_coi + self.values))
        return df.pivot_table(index=idx,
                              columns=self.columns,
                              values=self.values + derived_coi,
                              fill_value=np.nan,
                              aggfunc=lambda x: x
                              ).reset_index()

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k] = self.pivot(tfm.dfs[k])
            tfm.dfs[k].columns = self.renamed_cols(tfm.dfs[k].columns)

In [120]:

dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules),
                            RemapValueUncertaintyDetectionLimit(),
                            ConvertLonLat(),
                            EncodeTimeCB(cfg()),
                            #CompareDfsAndTfm(dfs),
                            RenameColumnCB(coi_grp, get_renaming_rules),
                            ReshapeLongToWide()
                            ])                            
                            
tfm()['seawater']

Unnamed: 0,depth,lat,time,lon,sample,137Cs_dl,210Pb_dl,226Ra_dl,228Ra_dl,"239,240Pu_dl",...,RA-226_unit,RA-228_unit,137Cs,210Pb,210Po,226Ra,228Ra,"239,240Pu",3H,99Tc
0,0.0,36.181667,862012800,-6.751667,5278,,,,,,...,,,,,,,,0.000003,,
1,0.0,36.566667,1550534400,-6.318056,17811,0.5925,,,,,...,,,,,,,,,,
2,0.0,36.566667,1550534400,-6.316667,17835,,,,,,...,,,,,,,,,0.07582,
3,0.0,36.566667,1559865600,-6.318056,17812,0.1554,,,,,...,,,,,,,,,,
4,0.0,36.566667,1559865600,-6.316667,17836,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15647,1693.0,73.730000,1219276800,13.270000,1620,0.0024,,,,,...,,,,,,,,,,
15648,1693.0,73.730000,1219276800,13.270000,1622,,,,,,...,,,,,,,,0.000012,,
15649,1694.0,73.717222,1187136000,13.251667,2154,,,,,,...,,,0.001200,,,,,,,
15650,1694.0,73.717222,1187136000,13.251667,2156,,,,,,...,,,,,,,,,,


### Sanitize coordinates

In [121]:

dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                            CorrectWholeBodyPart(),
                            LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                            LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                            LookupUnitCB(renaming_unit_rules),
                            RemapValueUncertaintyDetectionLimit(),
                            ConvertLonLat(),
                            EncodeTimeCB(cfg()),
                            #CompareDfsAndTfm(dfs),
                            RenameColumnCB(coi_grp, get_renaming_rules),
                            ReshapeLongToWide(),
                            SanitizeLonLatCB()
                            ])                            
                            
tfm()['seawater'].head()

Unnamed: 0,depth,lat,time,lon,sample,137Cs_dl,210Pb_dl,226Ra_dl,228Ra_dl,"239,240Pu_dl",...,RA-226_unit,RA-228_unit,137Cs,210Pb,210Po,226Ra,228Ra,"239,240Pu",3H,99Tc
0,0.0,36.181667,862012800,-6.751667,5278,,,,,,...,,,,,,,,3e-06,,
1,0.0,36.566667,1550534400,-6.318056,17811,0.5925,,,,,...,,,,,,,,,,
2,0.0,36.566667,1550534400,-6.316667,17835,,,,,,...,,,,,,,,,0.07582,
3,0.0,36.566667,1559865600,-6.318056,17812,0.1554,,,,,...,,,,,,,,,,
4,0.0,36.566667,1559865600,-6.316667,17836,,,,,,...,,,,,,,,,,


## NetCDF encoder

### Example change logs

In [122]:
tfm.logs

['Drop NaN nuclide names, convert nuclide names to lowercase, strip separators (e.g. `-`,`,`) and any trailing space(s)',
 'Remap to MARIS radionuclide names.',
 '\n    Biota species remapped to MARIS db:\n\n    ',
 "\n    Update bodypart labeled as 'whole' to either 'Whole animal' or 'Whole plant'.\n    ",
 "\n    Update bodypart id based on MARIS dbo_bodypar.xlsx:\n        - 3: 'Whole animal eviscerated without head',\n        - 12: 'Viscera',\n        - 8: 'Skin'\n    ",
 '\n    Update biogroup id  based on MARIS dbo_species.xlsx\n    ',
 'Remamp activity value, activity uncertainty and detection limit to MARIS format.',
 'Convert Longitude and Latitude values to DDD.DDDDD°',
 'Encode time as `int` representing seconds since xxx',
 'Convert data from long to wide with renamed columns.',
 'Drop row when both longitude & latitude equal 0. Drop unrealistic longitude & latitude values. Convert longitude & latitude `,` separator to `.` separator.']

### Feed global attributes

In [123]:
#| export
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']


In [124]:
#| export
def get_attrs(tfm, zotero_key, kw=kw):
    return GlobAttrsFeeder(tfm.dfs, cbs=[
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(cfg()),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
        ])()

TODO: update zotero_key

Attributes related to the dataset are retrieved from [zotero](https://www.zotero.org/) using a zotero_key. The [MARIS datasets](https://maris.iaea.org/datasets) include a library on [zotero](https://www.zotero.org/groups/2432820/maris/library)

In [126]:
get_attrs(tfm, zotero_key='LQRA4MMK', kw=kw)

{'geospatial_lat_min': '49.43222222222222',
 'geospatial_lat_max': '81.26805555555555',
 'geospatial_lon_min': '-58.23166666666667',
 'geospatial_lon_max': '36.181666666666665',
 'geospatial_bounds': 'POLYGON ((-58.23166666666667 36.181666666666665, 49.43222222222222 36.181666666666665, 49.43222222222222 81.26805555555555, -58.23166666666667 81.26805555555555, -58.23166666666667 36.181666666666665))',
 'geospatial_vertical_max': '0',
 'geospatial_vertical_min': '-1850.0',
 'time_coverage_start': '1995-01-01T00:00:00',
 'time_coverage_end': '2021-12-31T00:00:00',
 'title': 'OSPAR Environmental Monitoring of Radioactive Substances',
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "", "lastName": "OSPAR Comission\'s Radioactive Substances Committee (RSC)"}]',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chem

In [127]:
#| export
def enums_xtra(tfm, vars):
    "Retrieve a subset of the lengthy enum as 'species_t' for instance"
    enums = Enums(lut_src_dir=lut_path(), cdl_enums=cdl_cfg()['enums'])
    xtras = {}
    for var in vars:
        unique_vals = tfm.unique(var)
        if unique_vals.any():
            xtras[f'{var}_t'] = enums.filter(f'{var}_t', unique_vals)
    return xtras

### Encoding

In [129]:
#| export
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
    dfs = load_data(fname_in)
    tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(get_rdn_format),
                                RemapRdnNameCB(),
                                ParseTimeCB(),
                                LookupBiotaSpeciesCB(get_maris_species, unmatched_fixes_biota_species),
                                CorrectWholeBodyPart(),
                                LookupBiotaBodyPartCB(get_maris_bodypart, unmatched_fixes_biota_tissues),
                                LookupBiogroupCB(partial(get_biogroup_lut, species_lut_path())),
                                LookupUnitCB(renaming_unit_rules),
                                RemapValueUncertaintyDetectionLimit(),
                                ConvertLonLat(),
                                EncodeTimeCB(cfg()),
                                #CompareDfsAndTfm(dfs),
                                RenameColumnCB(coi_grp, get_renaming_rules),
                                ReshapeLongToWide(),
                                SanitizeLonLatCB()
                                ])
    tfm()
    encoder = NetCDFEncoder(tfm.dfs, 
                            src_fname=nc_tpl_path,
                            dest_fname=fname_out, 
                            global_attrs=get_attrs(tfm, zotero_key='LQRA4MMK', kw=kw),
                            verbose=kwargs.get('verbose', False),
                            enums_xtra=enums_xtra(tfm, vars=['species', 'body_part'])
                           )
    encoder.encode()

In [130]:
encode(fname_in, fname_out, nc_tpl_path(), verbose=False)

In [131]:
fname_out

'../../_data/output/ospar_19950103_2021214.nc'