In [None]:
#| default_exp handlers.maris_dump

# MARIS dump
> Data pipeline (handler) to convert the global MARIS database dump into `NetCDF` format. It encodes all legacy datasets as NetCDF in one batch.

The input data is a dump from already imported MARIS datasets.

**Dev. board**: https://trello.com/b/IszgV1bj/marisco

Questions:

* filtering status? 

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| exports
from tqdm import tqdm
from pathlib import Path
import fastcore.all as fc
import pandas as pd
import numpy as np

from marisco.callbacks import (Callback, Transformer, SanitizeLonLatCB, EncodeTimeCB, ReshapeLongToWide)
from marisco.metadata import (GlobAttrsFeeder, BboxCB,
                              DepthRangeCB, TimeRangeCB,
                              ZoteroCB, KeyValuePairCB)
from marisco.configs import lut_path, cdl_cfg, cfg, nc_tpl_path, Enums, get_lut
from marisco.serializers import NetCDFEncoder

In [None]:
pd.set_option('display.max_rows', 100)

In [None]:
import warnings
warnings.filterwarnings('ignore')

## Configuration and file paths

In [None]:
# | exports
# Input: MARIS global dump (tab-separated text file), located under the user's home directory.
fname_in = Path.home() / 'pro/data/maris/MARIS_exportSample_20240313.txt'
# Output: directory receiving the encoded NetCDF files (relative path).
dir_dest = '../../_data/output/dump'

## Utils

In [None]:
#| exports
class DataLoader:
    "Read the MARIS global dump on first use and serve one reference (`ref_id`) at a time, split by sample type."
    # `samptype` labels of the dump -> keys of the dictionary returned by `__call__`.
    LUT = {
        'Sediment': 'sediment',
        'Seawater': 'seawater',
        'Suspended matter': 'suspended-matter',
        'Biota': 'biota',
    }

    def __init__(self,
                 fname: str # Path to the MARIS global dump file
                 ):
        "Load specific MARIS dataset through its ref_id."
        self.fname = fname
        self.df = None  # Filled by `_load_data` the first time the loader is called

    def _load_data(self):
        "Read the tab-separated, ISO-8859-1 encoded dump into `self.df` unless it is already there."
        if self.df is not None: return
        self.df = pd.read_csv(self.fname, sep='\t', encoding='ISO-8859-1')

    def __call__(self,
                 ref_id: int # Reference ID of interest
                 ) -> dict: # Dictionary of dataframes
        "Return the rows of `ref_id` grouped by sample type; sample types missing from `LUT` are left out."
        self._load_data()
        selection = self.df[self.df.ref_id == ref_id]
        dfs = {}
        for samptype, grp in selection.groupby('samptype'):
            if samptype in self.LUT:
                dfs[self.LUT[samptype]] = grp
        return dfs

In [None]:
#| exports
def get_zotero_key(dfs):
    "Return the last `/`-separated segment of the first `zoterourl` value found in the first dataframe of `dfs`."
    first_df = next(iter(dfs.values()))
    url = first_df['zoterourl'].iloc[0]
    return url.rsplit('/', 1)[-1]

def get_fname(dfs):
    """Build the NetCDF file name `<ref_id>-<display text tokens>.nc`.

    `ref_id` and `displaytext` are read from the first row of the first dataframe
    of `dfs`. Commas and dots are removed from the display text, hyphens are
    treated as word separators and the remaining words are joined with `-`.
    """
    first_df = next(iter(dfs.values()))
    ref_id, display_text = first_df[['ref_id', 'displaytext']].iloc[0]
    cleaned = display_text.replace(',', '').replace('.', '').replace('-', ' ')
    # `split()` without argument collapses runs of whitespace, so consecutive
    # separators (e.g. "A - B") no longer produce empty tokens and "---" in the name.
    tokens = cleaned.split()
    return '-'.join([str(ref_id)] + tokens) + '.nc'

## Load data

Let's get a quick look at the input MARIS dump:

In [None]:
#|eval: false
# Read the whole tab-separated dump (same options as `DataLoader._load_data`).
df = pd.read_csv(fname_in, sep='\t', encoding='ISO-8859-1')

# Number of distinct references and list of available columns.
print('# of unique refs: ', len(df.ref_id.unique()))
print('columns: ', df.columns)
df.head()

# of unique refs:  526
columns:  Index(['ref_id', 'displaytext', 'samptype', 'nuclide_id', 'latitude',
       'longitude', 'begperiod', 'endperiod', 'sampdepth', 'totdepth',
       'uncertaint', 'unit_id', 'detection', 'area_id', 'species_id',
       'biogroup_id', 'bodypar_id', 'sedtype_id', 'volume', 'salinity',
       'temperatur', 'sampmet_id', 'prepmet_id', 'counmet_id', 'activity',
       'zoterourl'],
      dtype='object')


Unnamed: 0,ref_id,displaytext,samptype,nuclide_id,latitude,longitude,begperiod,endperiod,sampdepth,totdepth,...,bodypar_id,sedtype_id,volume,salinity,temperatur,sampmet_id,prepmet_id,counmet_id,activity,zoterourl
0,182,"Urban et al., 2015",Biota,33,-35.140833,117.604444,2014-05-06 00:00:00,,-1.0,,...,52,0,,,,0,6,20,0.387,https://www.zotero.org/groups/2432820/maris/it...
1,182,"Urban et al., 2015",Biota,47,-35.140833,117.604444,2014-05-06 00:00:00,,-1.0,,...,52,0,,,,0,6,5,1.44,https://www.zotero.org/groups/2432820/maris/it...
2,182,"Urban et al., 2015",Biota,31,-16.466944,123.535833,2014-02-27 00:00:00,,-1.0,,...,52,0,,,,0,6,20,0.042,https://www.zotero.org/groups/2432820/maris/it...
3,182,"Urban et al., 2015",Biota,33,-16.466944,123.535833,2014-02-27 00:00:00,,-1.0,,...,52,0,,,,0,6,20,0.075,https://www.zotero.org/groups/2432820/maris/it...
4,182,"Urban et al., 2015",Biota,47,-16.466944,123.535833,2014-02-27 00:00:00,,-1.0,,...,52,0,,,,0,6,5,0.069,https://www.zotero.org/groups/2432820/maris/it...


Let's check that we retrieve the expected `keys` (sample types) and associated dataframes:

In [None]:
#|eval: false
# The dump is only read on the first call of `dataloader`.
dataloader = DataLoader(fname_in)
ref_id = 100 # Some other ref_id examples: OSPAR: 191, HELCOM: 100, 717 (only seawater)

# One dataframe per sample type present for this reference.
dfs = dataloader(ref_id=ref_id)
print(f'keys: {dfs.keys()}')
dfs['sediment'].head()

keys: dict_keys(['biota', 'seawater', 'sediment'])


Unnamed: 0,ref_id,displaytext,samptype,nuclide_id,latitude,longitude,begperiod,endperiod,sampdepth,totdepth,...,bodypar_id,sedtype_id,volume,salinity,temperatur,sampmet_id,prepmet_id,counmet_id,activity,zoterourl
549778,100,"HELCOM MORS, 2018",Sediment,17,54.838333,9.9,1989-06-14 00:00:00,,-1.0,24.0,...,0,59,,,,0,0,0,26.6,https://www.zotero.org/groups/2432820/maris/it...
549779,100,"HELCOM MORS, 2018",Sediment,24,54.838333,9.9,1989-06-14 00:00:00,,-1.0,24.0,...,0,59,,,,0,0,0,134.0,https://www.zotero.org/groups/2432820/maris/it...
549780,100,"HELCOM MORS, 2018",Sediment,24,54.838333,9.9,1989-06-14 00:00:00,,-1.0,24.0,...,0,59,,,,0,0,0,18.6,https://www.zotero.org/groups/2432820/maris/it...
549781,100,"HELCOM MORS, 2018",Sediment,31,54.838333,9.9,1989-06-14 00:00:00,,-1.0,24.0,...,0,59,,,,0,0,0,42.5,https://www.zotero.org/groups/2432820/maris/it...
549782,100,"HELCOM MORS, 2018",Sediment,31,54.838333,9.9,1989-06-14 00:00:00,,-1.0,24.0,...,0,59,,,,0,0,0,5.9,https://www.zotero.org/groups/2432820/maris/it...


## Data transformation pipeline

### Normalize nuclide names

Remap `nuclide_id` to MARIS radionuclide standard names:

In [None]:
#| exports
def nuclide_id_to_name():
    "Lookup table built by `get_lut` from `dbo_nuclide.xlsx`; default mapping used by `RemapRdnNameCB` on `nuclide_id`."
    return get_lut(lut_path(), 'dbo_nuclide.xlsx',
                   key='nc_name', value='nuclide_id', reverse=True)

In [None]:
# | exports
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names."
    # `fn_lut`: zero-argument callable returning the mapping handed to `Series.replace`.
    def __init__(self, fn_lut=nuclide_id_to_name): fc.store_attr()
    def __call__(self, tfm):
        # The lookup table is built once and reused for every sample-type group.
        id_to_name = self.fn_lut()
        for df in tfm.dfs.values():
            df['nuclide_id'] = df['nuclide_id'].replace(id_to_name)

In [None]:
#|eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB()])

print(tfm()['sediment']['nuclide_id'].unique())

['ru106' 'sb125' 'cs134' 'cs137' 'k40' 'co60' 'ag110m' 'ra226' 'th232'
 'pb212' 'pb214' 'pu238' 'am241' 'pu239_240_tot' 'zr95' 'mn54' 'ac228'
 'u235' 'tl208' 'be7' 'bi214' 'ra223' 'ru103' 'sr90' 'eu155' 'ba140'
 'co58' 'ra224' 'po210' 'ra228' 'th228' 'ce144' 'cs134_137_tot' 'pb210'
 'pu239' 'cd109' 'bi212' 'pu238_240_tot' 'nb95' 'ir192' 'sb124' 'zn65'
 'th234' 'pu241']


### Rename columns

Rename MARIS dump columns to MARIS netCDF standard names:

In [None]:
#|eval: false
dfs['sediment'].columns

Index(['ref_id', 'displaytext', 'samptype', 'nuclide_id', 'latitude',
       'longitude', 'begperiod', 'endperiod', 'sampdepth', 'totdepth',
       'uncertaint', 'unit_id', 'detection', 'area_id', 'species_id',
       'biogroup_id', 'bodypar_id', 'sedtype_id', 'volume', 'salinity',
       'temperatur', 'sampmet_id', 'prepmet_id', 'counmet_id', 'activity',
       'zoterourl'],
      dtype='object')

In [None]:
#| exports
def renaming_rules():
    "Map MARIS dump column names to the names defined in the CDL configuration (`cdl_cfg()['vars']`)."
    cdl_vars = cdl_cfg()['vars']
    defaults, suffixes = cdl_vars['defaults'], cdl_vars['suffixes']
    bio, sed = cdl_vars['bio'], cdl_vars['sed']
    # Key order matters: `RenameColumnCB` selects the columns in this order.
    return {
        'latitude': defaults['lat']['name'],
        'longitude': defaults['lon']['name'],
        'begperiod': defaults['time']['name'],
        'sampdepth': defaults['smp_depth']['name'],
        'totdepth': defaults['tot_depth']['name'],
        'uncertaint': suffixes['uncertainty']['name'],
        'unit_id': suffixes['unit']['name'],
        'detection': suffixes['detection_limit']['name'],
        'area_id': defaults['area']['name'],
        'species_id': bio['species']['name'],
        'biogroup_id': bio['bio_group']['name'],
        'bodypar_id': bio['body_part']['name'],
        'sedtype_id': sed['sed_type']['name'],
        'volume': suffixes['volume']['name'],
        'salinity': suffixes['salinity']['name'],
        'temperatur': suffixes['temperature']['name'],
        'sampmet_id': suffixes['sampling_method']['name'],
        'prepmet_id': suffixes['preparation_method']['name'],
        'counmet_id': suffixes['counting_method']['name'],
        # Names below are fixed here rather than read from the configuration.
        'activity': 'value',
        'nuclide_id': 'nuclide'
    }

In [None]:
#| exports
class RenameColumnCB(Callback):
    "Renaming variables to MARIS standard names."
    # `renaming_rules`: zero-argument callable returning {dump column name: standard name}.
    def __init__(self, renaming_rules=renaming_rules): fc.store_attr()
    def __call__(self, tfm):
        # Call the rules stored on the instance: the previous code called the
        # module-level `renaming_rules`, silently ignoring the constructor argument.
        lut = self.renaming_rules()
        coi = list(lut)  # columns of interest, in the order given by the rules
        for k in tfm.dfs.keys():
            # Select then rename in one expression; `rename` returns a new frame,
            # which avoids an in-place rename on a `.loc` selection.
            tfm.dfs[k] = tfm.dfs[k].loc[:, coi].rename(columns=lut)

In [None]:
#|eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB()
    ])

print(tfm()['sediment'])

               lat        lon                 time smp_depth tot_depth  \
549778   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549779   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549780   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549781   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549782   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
...            ...        ...                  ...       ...       ...   
1532415  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532416  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532417  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532418  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532419  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   

             _unc _unit _dl  area species  ... body_part sed_type _vol _sal  \
549778       3.99     4   =  237

### Drop NaN only columns

In [None]:
#| exports
class DropNAColumnsCB(Callback):
    "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables)."
    def __init__(self, na_value=0): fc.store_attr()

    def isMarisNA(self, col):
        "Tell whether `col` holds a single distinct value that equals `na_value`."
        if len(col.unique()) != 1: return False
        return col.iloc[0] == self.na_value

    def dropMarisNA(self, df):
        "Return `df` without the columns flagged by `isMarisNA`."
        # Every column is tested, whatever it represents.
        flagged = [name for name in df.columns if self.isMarisNA(df[name])]
        return df.drop(labels=flagged, axis=1)

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # First remove all-NaN columns, then the columns constant at `na_value`.
            without_nan = tfm.dfs[k].dropna(axis=1, how='all')
            tfm.dfs[k] = self.dropMarisNA(without_nan)

In [None]:
#|eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB()
    ])

print(tfm()['sediment'])

               lat        lon                 time smp_depth tot_depth  \
549778   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549779   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549780   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549781   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
549782   54.838333        9.9  1989-06-14 00:00:00      -1.0      24.0   
...            ...        ...                  ...       ...       ...   
1532415  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532416  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532417  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532418  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   
1532419  57.619722  23.621389  2005-12-02 00:00:00      -1.0      55.0   

             _unc _unit _dl  area sed_type _sampmet _prepmet    value nuclide  
549778       3.99     4   =  23

### Remap detection limit values

In [None]:
#| exports
def dl_name_to_id():
    "Lookup table built by `get_lut` from `dbo_detectlimit.xlsx` (`key='name'`, `value='id'`)."
    return get_lut(lut_path(), 'dbo_detectlimit.xlsx', key='name', value='id')

In [None]:
#| eval: false
dl_name_to_id()

{'Not applicable': -1, 'Not Available': 0, '=': 1, '<': 2, 'ND': 3, 'DE': 4}

In [None]:
#| exports
class SanitizeDetectionLimitCB(Callback):
    "Assign Detection Limit name to its id based on MARIS nomenclature."
    def __init__(self,
                 fn_lut=dl_name_to_id):
        # `fn_lut`: zero-argument callable returning the mapping handed to `Series.replace`.
        fc.store_attr()
        # Name of the detection limit column, as defined in the CDL configuration.
        suffixes = cdl_cfg()['vars']['suffixes']
        self.var_name = suffixes['detection_limit']['name']

    def __call__(self, tfm):
        name_to_id = self.fn_lut()
        for df in tfm.dfs.values():
            df[self.var_name] = df[self.var_name].replace(name_to_id)

In [None]:
#|eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB()
    ])

print(tfm()['sediment']['_dl'])

549778     1
549779     1
549780     1
549781     1
549782     1
          ..
1532415    1
1532416    1
1532417    1
1532418    1
1532419    1
Name: _dl, Length: 123196, dtype: int64


### Parse and encode time

As a reminder, in `netCDF` format time needs to be encoded as an `integer` representing the number of seconds since a reference time. In our case we chose `1970-01-01 00:00:00.0`, as defined in `configs.ipynb`.



In [None]:
#| exports
class ParseTimeCB(Callback):
    # Converts the `time` column of every group to pandas datetimes (ISO 8601 strings expected).
    # NOTE(review): the docstrings of the sibling callbacks show up verbatim in the
    # `tfm.logs` output displayed in this notebook; this class has none, so adding
    # one would presumably add a log entry -- confirm against `Transformer` first.
    def __call__(self, tfm):
        for df in tfm.dfs.values():
            df['time'] = pd.to_datetime(df['time'], format='ISO8601')

In [None]:
#|eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(cfg())
    ])

print(tfm()['sediment'])

               lat        lon        time smp_depth tot_depth      _unc _unit  \
549778   54.838333        9.9   613785600      -1.0      24.0      3.99     4   
549779   54.838333        9.9   613785600      -1.0      24.0       NaN     2   
549780   54.838333        9.9   613785600      -1.0      24.0     1.674     4   
549781   54.838333        9.9   613785600      -1.0      24.0       NaN     2   
549782   54.838333        9.9   613785600      -1.0      24.0     1.829     4   
...            ...        ...         ...       ...       ...       ...   ...   
1532415  57.619722  23.621389  1133481600      -1.0      55.0   86.2836     4   
1532416  57.619722  23.621389  1133481600      -1.0      55.0       NaN     2   
1532417  57.619722  23.621389  1133481600      -1.0      55.0  24.45552     4   
1532418  57.619722  23.621389  1133481600      -1.0      55.0       NaN     2   
1532419  57.619722  23.621389  1133481600      -1.0      55.0  123.2568     4   

         _dl  area sed_type

### Reshape: long to wide

In [None]:
#|eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(cfg()),
    ReshapeLongToWide()
    ])

print(tfm()['sediment'])

                 lon        time  area        lat  smp_depth  tot_depth  \
org_index                                                                 
549834      9.633333   544838400  2374  54.850000       -1.0       16.0   
549835      9.633333   544838400  2374  54.850000       -1.0       16.0   
549836      9.633333   544838400  2374  54.850000       -1.0       16.0   
549837      9.633333   544838400  2374  54.850000       -1.0       16.0   
549838      9.633333   544838400  2374  54.850000       -1.0       16.0   
...              ...         ...   ...        ...        ...        ...   
1518808    29.833333  1128211200  2407  59.983333       -1.0        0.0   
1518809    29.833333  1128211200  2407  59.983333       -1.0        0.0   
1518810    29.833333  1128211200  2407  59.983333       -1.0        0.0   
1528756    29.833333  1128211200  2407  59.983333       -1.0        0.0   
1528757    29.833333  1128211200  2407  59.983333       -1.0        0.0   

           sed_type  ac2

### Sanitize coordinates

In [None]:
#|eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(cfg()),
    ReshapeLongToWide(),
    SanitizeLonLatCB()
    ])

tfm()['sediment']

Unnamed: 0_level_0,lon,time,area,lat,smp_depth,tot_depth,sed_type,ac228_dl,ag110m_dl,am241_dl,...,sb124,sb125,sr90,th228,th232,th234,tl208,u235,zn65,zr95
org_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
549834,9.633333,544838400,2374,54.850000,-1.0,16.0,58,,,,...,,,,,,,,,,
549835,9.633333,544838400,2374,54.850000,-1.0,16.0,58,,,,...,,,,,,,,,,
549836,9.633333,544838400,2374,54.850000,-1.0,16.0,58,,,,...,,,,,,,,,,
549837,9.633333,544838400,2374,54.850000,-1.0,16.0,58,,,,...,,,,,,,,,,
549838,9.633333,544838400,2374,54.850000,-1.0,16.0,58,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518808,29.833333,1128211200,2407,59.983333,-1.0,0.0,2,,,,...,,,,,,,,,,
1518809,29.833333,1128211200,2407,59.983333,-1.0,0.0,2,,,,...,,,,,,,,,,
1518810,29.833333,1128211200,2407,59.983333,-1.0,0.0,2,,,,...,,,,,,,,,,
1528756,29.833333,1128211200,2407,59.983333,-1.0,0.0,2,,,,...,,,,,,,,,,


## Encode to NetCDF

In [None]:
#|eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(cfg()),
    ReshapeLongToWide(),
    SanitizeLonLatCB()
    ])

dfs_tfm = tfm()
tfm.logs

['Remap to MARIS radionuclide names.',
 'Renaming variables to MARIS standard names.',
 "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables).",
 'Assign Detection Limit name to its id based on MARIS nomenclature.',
 'Encode time as `int` representing seconds since xxx',
 'Drop row when both longitude & latitude equal 0. Drop unrealistic longitude & latitude values. Convert longitude & latitude `,` separator to `.` separator.']

In [None]:
#| exports
# Default keywords written to the `keywords` global attribute (joined with ', ' in `get_attrs`).
# Some items already hold two comma-separated keywords; they are kept as is since
# the joined string is identical either way.
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| exports
def get_attrs(tfm, zotero_key, kw=kw):
    "Build the global attributes by running `GlobAttrsFeeder` on `tfm.dfs`, adding `kw` and `tfm.logs` as comma-joined strings."
    feeder_cbs = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(cfg()),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=feeder_cbs)
    return feeder()

In [None]:
#|eval: false
get_attrs(tfm, zotero_key='3W354SQG', kw=kw)

{'geospatial_lat_min': '30.435833333333335',
 'geospatial_lat_max': '65.75',
 'geospatial_lon_min': '9.633333333333333',
 'geospatial_lon_max': '53.5',
 'geospatial_bounds': 'POLYGON ((9.633333333333333 53.5, 30.435833333333335 53.5, 30.435833333333335 65.75, 9.633333333333333 65.75, 9.633333333333333 53.5))',
 'time_coverage_start': '1984-01-10T00:00:00',
 'time_coverage_end': '2018-12-14T00:00:00',
 'title': 'Radioactivity Monitoring of the Irish Marine Environment 1991 and 1992',
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "A.", "lastName": "McGarry"}, {"creatorType": "author", "firstName": "S.", "lastName": "Lyons"}, {"creatorType": "author", "firstName": "C.", "lastName": "McEnri"}, {"creatorType": "author", "firstName": "T.", "lastName": "Ryan"}, {"creatorType": "author", "firstName": "M.", "lastName": "O\'Colmain"}, {"creatorType": "author", "firstName": "J.D.", "lastName": "Cunningham"}]',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean 

In [None]:
#| exports
def enums_xtra(tfm, vars):
    "Return `{'<var>_t': filtered enum}` for each name in `vars`, e.g. a subset of the lengthy `species_t` enum."
    enums = Enums(lut_src_dir=lut_path(), cdl_enums=cdl_cfg()['enums'])
    xtras = {}
    for var in vars:
        unique_vals = tfm.unique(var)
        # Variables for which `tfm.unique` yields no truthy value get no entry.
        if not unique_vals.any(): continue
        enum_name = f'{var}_t'
        xtras[enum_name] = enums.filter(enum_name, unique_vals)
    return xtras

In [None]:
#| exports
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
    """Encode MARIS datasets found in the dump `fname_in` as NetCDF, one file per reference.

    Parameters:
        fname_in: path of the MARIS dump, handed to `DataLoader`.
        fname_out: destination directory; each file is named by `get_fname`.
        nc_tpl_path: NetCDF template, passed to `NetCDFEncoder` as `src_fname`
            (inside this body the parameter shadows the imported `nc_tpl_path` function).
        **kwargs: `ref_ids` (iterable of reference ids; default: every `ref_id`
            present in the dump) and `verbose` (forwarded to `NetCDFEncoder`,
            default `False`).
    """
    dataloader = DataLoader(fname_in)
    ref_ids = kwargs.get('ref_ids')
    if ref_ids is None:
        # The default used to read a notebook-only global `df`, evaluated even when
        # `ref_ids` was supplied, so the exported function raised `NameError`.
        # Take the references from the loader's own copy of the dump instead.
        dataloader._load_data()
        ref_ids = dataloader.df.ref_id.unique()
    print('Encoding ...')
    for ref_id in tqdm(ref_ids, leave=False):
        dfs = dataloader(ref_id=ref_id)
        if not dfs:
            # No sample type known to `DataLoader.LUT`: nothing to encode, and
            # `get_fname` / `get_zotero_key` would fail on an empty dictionary.
            print(f'Skipping ref_id {ref_id}: no supported sample type.')
            continue
        # Read the file name and Zotero key from the untransformed dataframes.
        fname = get_fname(dfs)
        zotero_key = get_zotero_key(dfs)
        print(fname)
        tfm = Transformer(dfs, cbs=[
            RemapRdnNameCB(),
            RenameColumnCB(),
            DropNAColumnsCB(),
            SanitizeDetectionLimitCB(),
            ParseTimeCB(),
            EncodeTimeCB(cfg()),
            ReshapeLongToWide(),
            SanitizeLonLatCB(verbose=True)
            ])

        tfm()
        encoder = NetCDFEncoder(tfm.dfs,
                                src_fname=nc_tpl_path,
                                dest_fname=Path(fname_out) / fname,
                                global_attrs=get_attrs(tfm, zotero_key=zotero_key, kw=kw),
                                verbose=kwargs.get('verbose', False),
                                enums_xtra=enums_xtra(tfm, vars=['species', 'body_part'])
                                )
        encoder.encode()

### Single dataset

In [None]:
#|eval: false
# Reuses the `dfs` variable assigned in a previous cell; same callback chain as in `encode`.
tfm = Transformer(dfs, cbs=[
            RemapRdnNameCB(),
            RenameColumnCB(),
            DropNAColumnsCB(),
            SanitizeDetectionLimitCB(),
            ParseTimeCB(),
            EncodeTimeCB(cfg()),
            ReshapeLongToWide(),
            SanitizeLonLatCB(verbose=True)
            ])

dfs_test = tfm()

In [None]:
#|eval: false
# Restrict the batch to a single reference through `ref_ids`.
ref_id = 100
encode(fname_in, dir_dest, nc_tpl_path(), verbose=True, ref_ids=[ref_id])

Encoding ...


  0%|          | 0/1 [00:00<?, ?it/s]

100-HELCOM-MORS-2018.nc
--------------------------------------------------------------------------------
Group: biota, Variable: lon
--------------------------------------------------------------------------------
Group: biota, Variable: lat
--------------------------------------------------------------------------------
Group: biota, Variable: smp_depth
--------------------------------------------------------------------------------
Group: biota, Variable: time
--------------------------------------------------------------------------------
Group: biota, Variable: area
--------------------------------------------------------------------------------
Group: biota, Variable: bio_group
--------------------------------------------------------------------------------
Group: biota, Variable: species
--------------------------------------------------------------------------------
Group: biota, Variable: body_part
--------------------------------------------------------------------------------

                                              

### All datasets

In [None]:
#|eval: false
# No `ref_ids` given: every reference found in the dump is encoded.
encode(fname_in, dir_dest, nc_tpl_path(), verbose=False)