In [None]:
#| default_exp handlers.maris_legacy

# MARIS Legacy

> This data pipeline, known as a "handler" in Marisco terminology, converts the master MARIS database dump into `NetCDF` format. It enables batch encoding of all legacy datasets into NetCDF.

Key functions of this handler:

- **Load data** from a MARIS dump file
- **Transform data** by applying various transformations to clean and normalize the data
- **Encode data** into NetCDF files


The **result** is a set of NetCDF files, one for each unique reference ID in the input data.

::: {.callout-tip}
## Getting Started

For new MARIS users, please refer to [Understanding MARIS Data Formats (NetCDF and Open Refine)](https://github.com/franckalbinet/marisco/tree/main/install_configure_guide) for detailed information.

:::

The present notebook aims to be an instance of [Literate Programming](https://www.wikiwand.com/en/articles/Literate_programming) in the sense that it is a narrative in which code snippets are interspersed with explanations. When a function or a class needs to be exported to a dedicated Python module (in our case `marisco/handlers/maris_legacy.py`), the code snippet is added to the module using `#| exports`, as provided by the wonderful [nbdev](https://nbdev.readthedocs.io/en/latest/) library.

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from tqdm import tqdm
from pathlib import Path
import fastcore.all as fc
import pandas as pd
import numpy as np
from typing import Optional, List

from marisco.callbacks import (
    Callback, 
    Transformer, 
    SanitizeLonLatCB, 
    EncodeTimeCB, 
    RenameColumnsCB, 
    SelectColumnsCB,
    UniqueIndexCB
)

from marisco.metadata import (
    GlobAttrsFeeder, 
    BboxCB, 
    DepthRangeCB,
    TimeRangeCB,
    ZoteroCB,
    KeyValuePairCB
    )

from marisco.configs import (
    NC_GROUPS,
    lut_path,
    cfg,
    nc_tpl_path,
    Enums, 
    get_lut
    )

from marisco.encoders import NetCDFEncoder

import warnings
warnings.filterwarnings('ignore')

In [None]:
#| hide 
# Notebook-only display setting: show up to 100 rows when rendering a DataFrame.
pd.set_option('display.max_rows', 100)

## Configuration & file paths

- **fname_in**: path to the MARIS dump file (tab-separated text).

- **dir_dest**: path to the folder where the NetCDF output will be saved.


In [None]:
#| exports
# Location of the MARIS dump (tab-separated text file), resolved under the user's home directory.
fname_in = Path.home() / 'pro' / 'data' / 'maris' / '2024-11-20 MARIS_QA_shapetype_id=1.txt'
# Folder receiving the encoded NetCDF files.
dir_dest = '../../_data/output/dump'

## Utils

Below is a utility class that loads the MARIS dump and returns the data of a given `ref_id`, grouped by sample type.


In [None]:
#| exports
class DataLoader:
    "Load specific MARIS dataset through its ref_id."
    # Maps the `samptype` labels of the dump to the group names used as keys of the returned dict.
    # Sample types missing from this table are ignored by `__call__`.
    LUT = {
        'Biota': 'BIOTA', 
        'Seawater': 'SEAWATER', 
        'Sediment': 'SEDIMENT', 
        'Suspended matter': 'SUSPENDED_MATTER'
    }

    def __init__(self, 
                 fname: str, # Path to the MARIS global dump file
                 exclude_ref_id: Optional[List[int]]=[9999] # ref_id values dropped at load time (None or empty: keep everything)
                 ):
        fc.store_attr()
        # The whole dump is read once; `__call__` only filters this in-memory frame.
        self.df = self._load_data()

    def _load_data(self):
        "Read the tab-separated dump and drop the rows whose `ref_id` is excluded."
        df = pd.read_csv(self.fname, sep='\t', encoding='utf-8', low_memory=False)
        return df[~df.ref_id.isin(self.exclude_ref_id)] if self.exclude_ref_id else df

    def __call__(self, 
                 ref_id: Optional[int]=None # Reference ID of interest (None: keep all references)
                 ) -> dict: # Dictionary of dataframes
        # Test against None explicitly: a truthiness test would treat ref_id 0
        # (including numpy integer zeros) as "no filter" and return the whole dump.
        selected = self.df if ref_id is None else self.df[self.df.ref_id == ref_id]
        df = selected.copy()
        return {self.LUT[name]: grp for name, grp in df.groupby('samptype') if name in self.LUT}

In [None]:
#| exports
def get_zotero_key(dfs):
    "Return the Zotero key: last `/`-separated part of the first `zoterourl` in the first group of `dfs`."
    first_group = next(iter(dfs.values()))
    url = first_group['zoterourl'].iloc[0]
    # Only the trailing path segment of the URL is kept.
    return url.rsplit('/', 1)[-1]

In [None]:
#| exports
def get_fname(dfs):
    "Build the `.nc` file name from the `ref_id` and `displaytext` of the first row of the first group."
    first_group = next(iter(dfs.values()))
    ref_id, label = first_group[['ref_id', 'displaytext']].iloc[0]
    # Commas and dots are removed; hyphens become spaces so that they split like word separators.
    for old, new in ((',', ''), ('.', ''), ('-', ' ')):
        label = label.replace(old, new)
    parts = [str(ref_id), *label.split(' ')]
    return '-'.join(parts) + '.nc'

## Load data

Below is a quick overview of the MARIS dump data structure.

In [None]:
#| eval: false
# Read the dump once; the loader instance is reused by the cells below.
dataloader = DataLoader(fname_in)
ref_id = 100 # Some other ref_id examples: OSPAR: 191, HELCOM: 100, 717 (only seawater)

# One DataFrame per sample type present for this reference.
dfs = dataloader(ref_id=ref_id)
print(f'keys: {dfs.keys()}')
dfs['SEDIMENT'].head()

keys: dict_keys(['BIOTA', 'SEAWATER', 'SEDIMENT'])


Unnamed: 0,sample_id,area_id,areaname,samptype_id,samptype,ref_id,displaytext,zoterourl,ref_note,datbase,...,profile_id,sampnote,ref_fulltext,ref_yearpub,ref_sampleTypes,LongLat,shiftedcoordinates,shiftedlong,shiftedlat,id
574705,397306,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,GERMANY,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"9.75,54.833",0xE6100000010CDFE00B93A96A4B400000000000802340,9.75,54.833333,576001
574706,397306,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,GERMANY,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"9.75,54.833",0xE6100000010CDFE00B93A96A4B400000000000802340,9.75,54.833333,576002
574707,397306,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,GERMANY,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"9.75,54.833",0xE6100000010CDFE00B93A96A4B400000000000802340,9.75,54.833333,576003
574708,397306,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,GERMANY,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"9.75,54.833",0xE6100000010CDFE00B93A96A4B400000000000802340,9.75,54.833333,576004
574709,397307,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,GERMANY,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"9.75,54.833",0xE6100000010CDFE00B93A96A4B400000000000802340,9.75,54.833333,576005


## Transform data

### Select columns

In [None]:
#| exports
# Columns of interest and their new names: keys are column names of the MARIS dump,
# values are the names they are renamed to. The same dict is passed to both
# `SelectColumnsCB` (column selection) and `RenameColumnsCB` (renaming).
cois_renaming_rules = {
    'sample_id': 'SMP_ID',
    'latitude': 'LAT',
    'longitude': 'LON',
    'begperiod': 'TIME',
    'sampdepth': 'SMP_DEPTH',
    'totdepth': 'TOT_DEPTH',
    'uncertaint': 'UNC',
    'unit_id': 'UNIT',
    'detection': 'DL',
    'area_id': 'AREA',
    'species_id': 'SPECIES',
    'biogroup_id': 'BIO_GROUP',
    'bodypar_id': 'BODY_PART',
    'sedtype_id': 'SED_TYPE',
    'volume': 'VOL',
    'salinity': 'SAL',
    'temperatur': 'TEMP',
    'sampmet_id': 'SAMP_MET',
    'prepmet_id': 'PREP_MET',
    'counmet_id': 'COUNT_MET',
    'activity': 'VALUE',
    'nuclide_id': 'NUCLIDE',
    'sliceup': 'TOP',
    'slicedown': 'BOTTOM'
}

In [None]:
#| eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules)
    ])

# Invoke the transformer once and reuse its result instead of calling `tfm()` for each print.
dfs_tfm = tfm()
print('Keys:', dfs_tfm.keys())
print('Columns:', dfs_tfm['BIOTA'].columns)

Keys: dict_keys(['BIOTA', 'SEAWATER', 'SEDIMENT'])
Columns: Index(['sample_id', 'latitude', 'longitude', 'begperiod', 'sampdepth',
       'totdepth', 'uncertaint', 'unit_id', 'detection', 'area_id',
       'species_id', 'biogroup_id', 'bodypar_id', 'sedtype_id', 'volume',
       'salinity', 'temperatur', 'sampmet_id', 'prepmet_id', 'counmet_id',
       'activity', 'nuclide_id', 'sliceup', 'slicedown'],
      dtype='object')


### Rename columns

In [None]:
#| eval: false
# Reload the raw groups, then select the columns of interest and rename them.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules)
    ])

dfs_tfm = tfm()
print('Keys:', dfs_tfm.keys())
print('Columns:', dfs_tfm['BIOTA'].columns)

Keys: dict_keys(['BIOTA', 'SEAWATER', 'SEDIMENT'])
Columns: Index(['SMP_ID', 'LAT', 'LON', 'TIME', 'SMP_DEPTH', 'TOT_DEPTH', 'UNC', 'UNIT',
       'DL', 'AREA', 'SPECIES', 'BIO_GROUP', 'BODY_PART', 'SED_TYPE', 'VOL',
       'SAL', 'TEMP', 'SAMP_MET', 'PREP_MET', 'COUNT_MET', 'VALUE', 'NUCLIDE',
       'TOP', 'BOTTOM'],
      dtype='object')


### Drop NaN only columns

We then remove columns containing only NaN values or 'Not available' (id=0 in MARIS lookup tables).

In [None]:
#| exports
class DropNAColumnsCB(Callback):
    "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables)."
    # The class docstring is left verbatim: the same text shows up in `tfm.logs`.
    def __init__(self, na_value=0): fc.store_attr()

    def isMarisNA(self, col):
        "Tell whether `col` holds a single distinct value and that value equals `na_value`."
        distinct = col.unique()
        return len(distinct) == 1 and col.iloc[0] == self.na_value

    def dropMarisNA(self, df):
        "Return `df` without the columns flagged by `isMarisNA`."
        flagged = []
        for name in df.columns:
            if self.isMarisNA(df[name]):
                flagged.append(name)
        return df.drop(columns=flagged)

    def __call__(self, tfm):
        # For every group: first remove all-NaN columns, then the 'not available' ones.
        for grp in list(tfm.dfs):
            tfm.dfs[grp] = tfm.dfs[grp].dropna(axis=1, how='all')
            tfm.dfs[grp] = self.dropMarisNA(tfm.dfs[grp])

In [None]:
#| eval: false
# Same pipeline as before, now also dropping columns that carry no information.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB()
    ])

dfs_tfm = tfm()
print('Keys:', dfs_tfm.keys())
print('Columns:', dfs_tfm['BIOTA'].columns)

Keys: dict_keys(['BIOTA', 'SEAWATER', 'SEDIMENT'])
Columns: Index(['SMP_ID', 'LAT', 'LON', 'TIME', 'SMP_DEPTH', 'UNC', 'UNIT', 'DL',
       'AREA', 'SPECIES', 'BIO_GROUP', 'BODY_PART', 'PREP_MET', 'COUNT_MET',
       'VALUE', 'NUCLIDE'],
      dtype='object')


### Remap detection limit values

Category-based `NetCDF` variables are encoded as integer values based on the MARIS lookup table `dbo_detectlimit.xlsx`. We recall that these lookup tables are included in the `NetCDF` file as custom enumeration types.


In [None]:
# Folder passed to `get_lut` when reading the lookup tables.
lut_path()

Path('/Users/franckalbinet/.marisco/lut')

In [None]:
#| exports
def dl_name_to_id():
    "Return the detection limit lookup read from `dbo_detectlimit.xlsx`, keyed by `name` with `id` as value."
    # Defined with `def` rather than a lambda bound to a name, so that the function
    # carries a proper `__name__` and docstring.
    return get_lut(lut_path(), 
                   'dbo_detectlimit.xlsx', 
                   key='name', 
                   value='id')

In [None]:
#| eval: false
# Inspect the detection limit name -> id mapping.
dl_name_to_id()

{'Not applicable': -1, 'Not Available': 0, '=': 1, '<': 2, 'ND': 3, 'DE': 4}

In [None]:
#| exports
class SanitizeDetectionLimitCB(Callback):
    "Assign Detection Limit name to its id based on MARIS nomenclature."
    # The class docstring is left verbatim: the same text shows up in `tfm.logs`.
    def __init__(self,
                 fn_lut=dl_name_to_id, # Callable returning the name -> id lookup
                 dl_name='DL' # Name of the detection limit column
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        # The lookup is loaded once and shared by all groups.
        lut = self.fn_lut()
        for grp, df in tfm.dfs.items():
            # `DropNAColumnsCB` removes all-NaN columns per group, so the detection limit
            # column may be missing here; skip the group instead of raising a KeyError.
            if self.dl_name not in df.columns: continue
            df[self.dl_name] = df[self.dl_name].replace(lut)

In [None]:
#| eval: false
# Same pipeline as before, now also converting detection limit names to their ids.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB()
    ])

dfs_tfm = tfm()
print('Keys:', dfs_tfm.keys())
print('Columns:', dfs_tfm['BIOTA'].columns)
# Distinct detection limit values after remapping.
print(f'{dfs_tfm["BIOTA"]["DL"].unique()}')
print(f'{dfs_tfm["BIOTA"].head()}')

Keys: dict_keys(['BIOTA', 'SEAWATER', 'SEDIMENT'])
Columns: Index(['SMP_ID', 'LAT', 'LON', 'TIME', 'SMP_DEPTH', 'UNC', 'UNIT', 'DL',
       'AREA', 'SPECIES', 'BIO_GROUP', 'BODY_PART', 'PREP_MET', 'COUNT_MET',
       'VALUE', 'NUCLIDE'],
      dtype='object')
[1 2]
        SMP_ID        LAT        LON                     TIME  SMP_DEPTH  \
575432  638278  57.335278  12.074167  2008-11-03 00:00:00.000        0.0   
575433  638278  57.335278  12.074167  2008-11-03 00:00:00.000        0.0   
575434  638278  57.335278  12.074167  2008-11-03 00:00:00.000        0.0   
575435  638278  57.335278  12.074167  2008-11-03 00:00:00.000        0.0   
575436  638279  57.335278  12.074167  2009-09-17 00:00:00.000        0.0   

           UNC  UNIT  DL  AREA  SPECIES  BIO_GROUP  BODY_PART  PREP_MET  \
575432  0.0684     5   1  2374       96         11         40         6   
575433  0.7040     5   1  2374       96         11         40         6   
575434  0.0747     5   1  2374       96         11  

### Parse and encode time

Recall that in `NetCDF` format, time needs to be encoded as an `integer` representing the number of seconds elapsed since a reference time. In our case we chose `1970-01-01 00:00:00.0`, as defined in `configs.ipynb`.



In [None]:
#| exports
class ParseTimeCB(Callback):
    "Parse time column from MARIS dump."
    # The class docstring is left verbatim: the same text shows up in `tfm.logs`.
    def __init__(self,
                 time_name='TIME'):
        fc.store_attr()
        
    def __call__(self, tfm):
        # Convert the time column of every group to datetimes, parsing ISO 8601 strings.
        col = self.time_name
        for df in tfm.dfs.values():
            df[col] = pd.to_datetime(df[col], format='ISO8601')

In [None]:
#| eval: false
# Same pipeline as before, now parsing the time column and encoding it as seconds since epoch.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB()
    ])

print(tfm()['BIOTA'])

        SMP_ID        LAT        LON        TIME  SMP_DEPTH      UNC  UNIT  \
575432  638278  57.335278  12.074167  1225670400        0.0  0.06840     5   
575433  638278  57.335278  12.074167  1225670400        0.0  0.70400     5   
575434  638278  57.335278  12.074167  1225670400        0.0  0.07470     5   
575435  638278  57.335278  12.074167  1225670400        0.0  0.00000     5   
575436  638279  57.335278  12.074167  1253145600        0.0  0.35100     5   
...        ...        ...        ...         ...        ...      ...   ...   
932837  639100  63.050000  21.616667   518572800        0.0  0.01440     5   
932838  639100  63.050000  21.616667   518572800        0.0      NaN     5   
932839  639137  63.066667  21.400000  1114732800        0.0  1.46500     5   
932840  639137  63.066667  21.400000  1114732800        0.0  0.00204     5   
932841  639137  63.066667  21.400000  1114732800        0.0  5.00000     5   

        DL  AREA  SPECIES  BIO_GROUP  BODY_PART  PREP_MET  COUN

### Sanitize coordinates

We ensure that coordinates are within the valid range.

In [None]:
#| eval: false
# Same pipeline as before, now also sanitizing longitude and latitude values.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

dfs_test = tfm()
dfs_test['BIOTA']

Unnamed: 0,SMP_ID,LAT,LON,TIME,SMP_DEPTH,UNC,UNIT,DL,AREA,SPECIES,BIO_GROUP,BODY_PART,PREP_MET,COUNT_MET,VALUE,NUCLIDE
575432,638278,57.335278,12.074167,1225670400,0.0,0.06840,5,1,2374,96,11,40,6,20,0.360,6
575433,638278,57.335278,12.074167,1225670400,0.0,0.70400,5,1,2374,96,11,40,6,20,17.600,2
575434,638278,57.335278,12.074167,1225670400,0.0,0.07470,5,1,2374,96,11,40,6,20,2.490,33
575435,638278,57.335278,12.074167,1225670400,0.0,0.00000,5,1,2374,96,11,40,6,20,1040.000,4
575436,638279,57.335278,12.074167,1253145600,0.0,0.35100,5,1,2374,96,11,40,6,20,11.700,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
932837,639100,63.050000,21.616667,518572800,0.0,0.01440,5,1,9999,269,4,52,12,9,0.072,12
932838,639100,63.050000,21.616667,518572800,0.0,,5,1,9999,269,4,52,12,9,0.015,11
932839,639137,63.066667,21.400000,1114732800,0.0,1.46500,5,1,9999,269,4,52,0,20,29.300,33
932840,639137,63.066667,21.400000,1114732800,0.0,0.00204,5,1,9999,269,4,52,0,8,0.017,12


### Set unique index

In [None]:
#| eval: false
# Same pipeline as before, now also setting a unique index for each group.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB(),
    UniqueIndexCB()
    ])

dfs_test = tfm()    
dfs_test['BIOTA']

Unnamed: 0,ID,SMP_ID,LAT,LON,TIME,SMP_DEPTH,UNC,UNIT,DL,AREA,SPECIES,BIO_GROUP,BODY_PART,PREP_MET,COUNT_MET,VALUE,NUCLIDE
0,0,638278,57.335278,12.074167,1225670400,0.0,0.06840,5,1,2374,96,11,40,6,20,0.360,6
1,1,638278,57.335278,12.074167,1225670400,0.0,0.70400,5,1,2374,96,11,40,6,20,17.600,2
2,2,638278,57.335278,12.074167,1225670400,0.0,0.07470,5,1,2374,96,11,40,6,20,2.490,33
3,3,638278,57.335278,12.074167,1225670400,0.0,0.00000,5,1,2374,96,11,40,6,20,1040.000,4
4,4,638279,57.335278,12.074167,1253145600,0.0,0.35100,5,1,2374,96,11,40,6,20,11.700,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14867,14867,639100,63.050000,21.616667,518572800,0.0,0.01440,5,1,9999,269,4,52,12,9,0.072,12
14868,14868,639100,63.050000,21.616667,518572800,0.0,,5,1,9999,269,4,52,12,9,0.015,11
14869,14869,639137,63.066667,21.400000,1114732800,0.0,1.46500,5,1,9999,269,4,52,0,20,29.300,33
14870,14870,639137,63.066667,21.400000,1114732800,0.0,0.00204,5,1,9999,269,4,52,0,8,0.017,12


## Encode to NetCDF

In [None]:
#| eval: false
# Full transformation pipeline, identical to the callback list used by `encode`.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB(),
    UniqueIndexCB()
    ])

dfs_tfm = tfm()
# One log entry per callback applied.
tfm.logs

['Select columns of interest.',
 'Renaming variables to MARIS standard names.',
 "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables).",
 'Assign Detection Limit name to its id based on MARIS nomenclature.',
 'Parse time column from MARIS dump.',
 'Encode time as seconds since epoch.',
 'Drop rows with invalid longitude & latitude values. Convert `,` separator to `.` separator.',
 'Set unique index for each group.']

In [None]:
#| exports
# Default keywords for the `keywords` global attribute (`get_attrs` joins the entries with ', ').
# Hierarchy levels are separated by ' > '. Two entries below bundle two keywords in one
# string; since everything is joined with ', ' anyway, the resulting attribute is unaffected.
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| exports
def get_attrs(tfm, zotero_key, kw=kw):
    "Build the global attributes of a transformed dataset by running the metadata callbacks on `tfm.dfs`."
    attr_cbs = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(),
        ZoteroCB(zotero_key, cfg=cfg()),
        # Free-form attributes: the keyword list and the transformation logs, each joined with ', '.
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=attr_cbs)
    return feeder()

In [None]:
#| eval: false
# NOTE(review): the Zotero key is hard-coded here, and the recorded title below
# ("... Irish Marine Environment ...") does not look like the HELCOM dataset (ref_id 100)
# loaded above. Consider `zotero_key=get_zotero_key(dfs)`, as `encode` does; to be confirmed.
get_attrs(tfm, zotero_key='3W354SQG', kw=kw)

{'geospatial_lat_min': '30.4358333333333',
 'geospatial_lat_max': '65.75',
 'geospatial_lon_min': '9.63333333333333',
 'geospatial_lon_max': '53.5',
 'geospatial_bounds': 'POLYGON ((9.63333333333333 53.5, 30.4358333333333 53.5, 30.4358333333333 65.75, 9.63333333333333 65.75, 9.63333333333333 53.5))',
 'geospatial_vertical_max': '437.0',
 'geospatial_vertical_min': '-1.0',
 'time_coverage_start': '1984-01-10T00:00:00',
 'time_coverage_end': '2018-12-14T00:00:00',
 'id': '3W354SQG',
 'title': 'Radioactivity Monitoring of the Irish Marine Environment 1991 and 1992',
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "A.", "lastName": "McGarry"}, {"creatorType": "author", "firstName": "S.", "lastName": "Lyons"}, {"creatorType": "author", "firstName": "C.", "lastName": "McEnri"}, {"creatorType": "author", "firstName": "T.", "lastName": "Ryan"}, {"creatorType": "author", "firstName": "M.", "lastName": "O\'Colmain"}, {"creatorType": "author", "firstName": "J.D.", "lastN

In [None]:
#| exports
def encode(
    fname_in: str, # Path to the MARIS dump data in CSV format
    dir_dest: str, # Path to the folder where the NetCDF output will be saved
    **kwargs # Additional keyword arguments
    ):
    "Encode MARIS dump to NetCDF."
    # Recognized keyword arguments:
    #   ref_ids: iterable of reference ids to encode (missing or None: every ref_id of the dump)
    #   verbose: forwarded to `NetCDFEncoder` (default: False)
    dataloader = DataLoader(fname_in)
    ref_ids = kwargs.get('ref_ids')
    if ref_ids is None:
        ref_ids = dataloader.df.ref_id.unique()
    verbose = kwargs.get('verbose', False)
    print('Encoding ...')
    for ref_id in tqdm(ref_ids, leave=False):
        dfs = dataloader(ref_id=ref_id)
        if not dfs:
            # Nothing to encode (unknown ref_id or no supported sample type): skip it so that
            # one reference cannot abort the whole batch in `get_fname`.
            print(f'Skipping ref_id {ref_id}: no data for the supported sample types.')
            continue
        # File name and Zotero key come from the raw frames: read them once, before transforming.
        fname = get_fname(dfs)
        zotero_key = get_zotero_key(dfs)
        print(fname)
        tfm = Transformer(dfs, cbs=[
            SelectColumnsCB(cois_renaming_rules),
            RenameColumnsCB(cois_renaming_rules),
            DropNAColumnsCB(),
            SanitizeDetectionLimitCB(),
            ParseTimeCB(),
            EncodeTimeCB(),
            SanitizeLonLatCB(),
            UniqueIndexCB()
            ])
        
        tfm()
        encoder = NetCDFEncoder(tfm.dfs, 
                                dest_fname=Path(dir_dest) / fname, 
                                global_attrs=get_attrs(tfm, zotero_key=zotero_key, kw=kw),
                                verbose=verbose
                                )
        encoder.encode()

### Single dataset

In [None]:
#| eval: false
# Encode a single reference by passing a one-element `ref_ids` list.
ref_id = 100
encode(
    fname_in,
    dir_dest,
    verbose=False, 
    ref_ids=[ref_id])

Encoding ...


  0%|          | 0/1 [00:00<?, ?it/s]

100-HELCOM-MORS-2018.nc


                                             

### All datasets

In [None]:
#| eval: false
# `ref_ids=None` makes `encode` process every ref_id found in the loaded dump.
encode(
    fname_in, 
    dir_dest, 
    ref_ids=None,
    verbose=False)