In [None]:
#| default_exp handlers.maris_legacy_long

# MARIS Legacy

> This notebook contains a data pipeline (handler) that converts the master MARIS database dump into `NetCDF` format. It enables batch encoding of all legacy datasets into NetCDF.


Key functions of this handler:

- **Load data** from a MARIS dump file
- **Transform data** by applying various transformations to clean and normalize the data
- **Keep data in long format** (the `ReshapeLongToWide` callback is not applied in this handler)
- **Encode data** into NetCDF files


The **result** is a set of NetCDF files, one for each unique reference ID in the input data.

## Packages import

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from tqdm import tqdm
from pathlib import Path
import fastcore.all as fc
import pandas as pd
import numpy as np

from marisco.callbacks import (
    Callback, 
    Transformer, 
    SanitizeLonLatCB, 
    EncodeTimeCB, 
    # ReshapeLongToWide, 
    RenameColumnsCB, 
    SelectColumnsCB,
    UniqueIndexCB
)

from marisco.metadata import (
    GlobAttrsFeeder, 
    BboxCB, 
    DepthRangeCB,
    TimeRangeCB,
    ZoteroCB,KeyValuePairCB
    )

from marisco.configs import (
    NC_GROUPS,
    lut_path,
    cdl_cfg,
    cfg,
    nc_tpl_path,
    Enums, get_lut
    )

from marisco.serializers import NetCDFEncoder

  warn("Couldn't import ipywidgets properly, progress bar will use console behavior")


In [None]:
#| hide
import warnings
# Show up to 100 rows when displaying dataframes.
pd.set_option('display.max_rows', 100)
# NOTE(review): this silences every warning for the whole session; consider narrowing the filter.
warnings.filterwarnings('ignore')

## Configuration and file paths

In [None]:
# | exports
# Earlier configuration (text export), kept for reference:
# fname_in = Path().home() / 'pro/data/maris/MARIS_exportSample_20240313.txt'
# dir_dest = '../../_data/output/dump'


# Input: MARIS dump (Excel file), resolved relative to the user's home directory.
fname_in = Path().home() / 'pro/data/maris/2024-11-08 MARIS_QA_shapetype_id=1.xlsx'
# Output: directory passed to `encode` as destination of the NetCDF files.
dir_dest = '../../_data/output/dump'

## Utils

In [None]:
#| exports
class DataLoader:
    "Load the sample-type groups of a single MARIS reference (`ref_id`) from the global dump."
    # MARIS sample type label (`samptype` column) -> group name used as key of the returned dictionary
    LUT = {
        'Biota': 'BIOTA',
        'Seawater': 'SEAWATER',
        'Sediment': 'SEDIMENT',
        'Suspended matter': 'SUSPENDED_MATTER'
    }

    def __init__(self,
                 fname: str # Path to the MARIS global dump file
                 ):
        self.fname, self.df = fname, None  # `df` is only populated on first use

    def _load_data(self):
        "Read the dump with `pd.read_excel` once; later calls reuse the cached dataframe."
        if self.df is not None: return
        self.df = pd.read_excel(self.fname)

    def __call__(self,
                 ref_id: int # Reference ID of interest
                 ) -> dict: # Dictionary of dataframes
        "Return one dataframe per known sample type for `ref_id`, keyed by group name."
        self._load_data()
        selection = self.df[self.df.ref_id == ref_id]
        groups = {}
        for samptype, grp in selection.groupby('samptype'):
            # Sample types without an entry in `LUT` are ignored
            if samptype in self.LUT:
                groups[self.LUT[samptype]] = grp
        return groups


In [None]:
#| exports
def get_zotero_key(dfs):
    "Return the Zotero key: the last `/`-separated segment of the first row's `zoterourl`."
    # Only the first group (in dictionary order) is inspected
    first_group = dfs[next(iter(dfs))]
    url = first_group[['zoterourl']].iloc[0].values[0]
    return url.split('/')[-1]

In [None]:
#| exports
def get_fname(dfs):
    "Build the file name `<ref_id>-<displaytext tokens joined by '-'>.nc` from the first row of the first group."
    first_group = dfs[next(iter(dfs))]
    ref, label = first_group[['ref_id', 'displaytext']].iloc[0]
    # Strip commas and dots, turn hyphens into spaces, then tokenize on single spaces
    for char, repl in ((',', ''), ('.', ''), ('-', ' ')):
        label = label.replace(char, repl)
    tokens = [str(ref), *label.split(' ')]
    return f"{'-'.join(tokens)}.nc"

## Load data

In [None]:
#|eval: false
# Instantiate the loader; the dump itself is only read on the first call.
dataloader = DataLoader(fname_in)
ref_id = 100 # Some other ref_id examples: OSPAR: 191, HELCOM: 100, 717 (only seawater)

# One dataframe per sample type available for this reference
dfs = dataloader(ref_id=ref_id)
print(f'keys: {dfs.keys()}')
dfs['SEDIMENT'].head()

keys: dict_keys(['BIOTA', 'SEAWATER', 'SEDIMENT'])


Unnamed: 0,sample_id,area_id,areaname,samptype_id,samptype,ref_id,displaytext,zoterourl,ref_note,datbase,...,profile_id,sampnote,ref_fulltext,ref_yearpub,ref_sampleTypes,LongLat,shiftedcoordinates,shiftedlong,shiftedlat,id
576000,398412,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,DENMARK,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"10.998,55.375",0xE6100000010C0000000000B04B401F85EB51B8FE2540,10.9975,55.375,576001
576001,398412,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,DENMARK,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"10.998,55.375",0xE6100000010C0000000000B04B401F85EB51B8FE2540,10.9975,55.375,576002
576002,398412,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,DENMARK,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"10.998,55.375",0xE6100000010C0000000000B04B401F85EB51B8FE2540,10.9975,55.375,576003
576003,398412,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,DENMARK,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"10.998,55.375",0xE6100000010C0000000000B04B401F85EB51B8FE2540,10.9975,55.375,576004
576004,398412,2374,Kattegat,3,Sediment,100,"HELCOM MORS, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Assumed Cs137, originally reported as 138Cs.",HELCOM MORS 2018 Environmental database,...,,DENMARK,"HELCOM MORS, 2018. Environmental database - He...",2018,123,"10.998,55.375",0xE6100000010C0000000000B04B401F85EB51B8FE2540,10.9975,55.375,576005


In [73]:
# Reference used by the transformation examples below (only a BIOTA group in the recorded outputs).
ref_id = 724

## Data transformation pipeline

### Select columns

In [74]:
#| exports
# Columns of interest in the MARIS dump (keys) and the names they are renamed to (values).
# The same mapping is passed to both `SelectColumnsCB` and `RenameColumnsCB`.
cois_renaming_rules = {
    'sample_id': 'SMP_ID',
    'latitude': 'LAT',
    'longitude': 'LON',
    'begperiod': 'TIME',
    'sampdepth': 'SMP_DEPTH',
    'totdepth': 'TOT_DEPTH',
    'uncertaint': 'UNC',
    'unit_id': 'UNIT',
    'detection': 'DL',
    'area_id': 'AREA',
    'species_id': 'SPECIES',
    'biogroup_id': 'BIO_GROUP',
    'bodypar_id': 'BODY_PART',
    'sedtype_id': 'SED_TYPE',
    'volume': 'VOL',
    'salinity': 'SAL',
    'temperatur': 'TEMP',
    'sampmet_id': 'SAMP_MET',
    'prepmet_id': 'PREP_MET',
    'counmet_id': 'COUNT_MET',
    'activity': 'VALUE',
    'nuclide_id': 'NUCLIDE',
    'sliceup': 'TOP',
    'slicedown': 'BOTTOM'
}

In [76]:
#|eval: false
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules)
    ])

# Run the pipeline once and reuse the result for both prints.
dfs_tfm = tfm()
print('Keys:', dfs_tfm.keys())
print('Columns:', dfs_tfm['BIOTA'].columns)

Keys: dict_keys(['BIOTA'])
Columns: Index(['sample_id', 'latitude', 'longitude', 'begperiod', 'sampdepth',
       'totdepth', 'uncertaint', 'unit_id', 'detection', 'area_id',
       'species_id', 'biogroup_id', 'bodypar_id', 'sedtype_id', 'volume',
       'salinity', 'temperatur', 'sampmet_id', 'prepmet_id', 'counmet_id',
       'activity', 'nuclide_id', 'sliceup', 'slicedown'],
      dtype='object')


### Rename columns

In [77]:
#|eval: false
# Select the columns of interest, then rename them using the same mapping.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules)
    ])

dfs_tfm = tfm()
print('Keys:', dfs_tfm.keys())
print('Columns:', dfs_tfm['BIOTA'].columns)

Keys: dict_keys(['BIOTA'])
Columns: Index(['SMP_ID', 'LAT', 'LON', 'TIME', 'SMP_DEPTH', 'TOT_DEPTH', 'UNC', 'UNIT',
       'DL', 'AREA', 'SPECIES', 'BIO_GROUP', 'BODY_PART', 'SED_TYPE', 'VOL',
       'SAL', 'TEMP', 'SAMP_MET', 'PREP_MET', 'COUNT_MET', 'VALUE', 'NUCLIDE',
       'TOP', 'BOTTOM'],
      dtype='object')


### Drop NaN-only columns

In [78]:
#| exports
class DropNAColumnsCB(Callback):
    "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables)."
    # NOTE: the class docstring above appears verbatim in `tfm.logs`; keep it unchanged.
    def __init__(self, na_value=0): fc.store_attr()

    def isMarisNA(self, col):
        "True when `col` holds a single distinct value and that value equals `na_value`."
        if len(col.unique()) != 1: return False
        return col.iloc[0] == self.na_value

    def dropMarisNA(self, df):
        "Return `df` without the columns flagged by `isMarisNA`."
        to_drop = list(filter(lambda name: self.isMarisNA(df[name]), df.columns))
        return df.drop(labels=to_drop, axis=1)

    def __call__(self, tfm):
        "For every group, remove all-NaN columns first, then the columns flagged by `isMarisNA`."
        for grp in tfm.dfs.keys():
            without_nan = tfm.dfs[grp].dropna(axis=1, how='all')
            tfm.dfs[grp] = self.dropMarisNA(without_nan)

In [79]:
#|eval: false
# Same pipeline as before, with `DropNAColumnsCB` appended.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB()
    ])

dfs_tfm = tfm()
print('Keys:', dfs_tfm.keys())
print('Columns:', dfs_tfm['BIOTA'].columns)


Keys: dict_keys(['BIOTA'])
Columns: Index(['SMP_ID', 'LAT', 'LON', 'TIME', 'UNC', 'UNIT', 'DL', 'AREA', 'SPECIES',
       'BIO_GROUP', 'BODY_PART', 'COUNT_MET', 'VALUE', 'NUCLIDE'],
      dtype='object')


### Remap detection limit values

In [80]:
#| exports
# Build the detection limit lookup (name -> id) from `dbo_detectlimit.xlsx` each time it is called.
dl_name_to_id = lambda: get_lut(lut_path(), 'dbo_detectlimit.xlsx', key='name', value='id')

In [81]:
#| eval: false
# Inspect the detection limit lookup table.
dl_name_to_id()

{'Not applicable': -1, 'Not Available': 0, '=': 1, '<': 2, 'ND': 3, 'DE': 4}

In [82]:
#| exports
class SanitizeDetectionLimitCB(Callback):
    "Assign Detection Limit name to its id based on MARIS nomenclature."
    # NOTE: the class docstring above appears verbatim in `tfm.logs`; keep it unchanged.
    def __init__(self,
                 fn_lut=dl_name_to_id, # Function returning the lookup table {name: id}
                 dl_name='DL' # Name of the detection limit column
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        "Replace detection limit names by their ids in every group that has a `dl_name` column."
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            # The column can be absent (e.g. dropped upstream when entirely NaN): nothing to remap.
            if self.dl_name not in df.columns: continue
            df[self.dl_name] = df[self.dl_name].replace(lut)

In [83]:
#|eval: false
# Same pipeline as before, with `SanitizeDetectionLimitCB` appended.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB()
    ])

dfs_tfm = tfm()
print('Keys:', dfs_tfm.keys())
print('Columns:', dfs_tfm['BIOTA'].columns)
# Distinct detection limit values found in the BIOTA group after the pipeline
print(f'{dfs_tfm["BIOTA"]["DL"].unique()}')
print(f'{dfs_tfm["BIOTA"].head()}')

Keys: dict_keys(['BIOTA'])
Columns: Index(['SMP_ID', 'LAT', 'LON', 'TIME', 'UNC', 'UNIT', 'DL', 'AREA', 'SPECIES',
       'BIO_GROUP', 'BODY_PART', 'COUNT_MET', 'VALUE', 'NUCLIDE'],
      dtype='object')
[1 2]
       SMP_ID       LAT         LON       TIME  UNC  UNIT  DL  AREA  SPECIES  \
20796  681755  36.35828  140.666057 1978-11-01  NaN     5   1  1908      445   
20797  681755  36.35828  140.666057 1978-11-01  NaN     5   1  1908      445   
20798  681821  36.35828  140.666057 1979-07-06  NaN     5   1  1908      445   
20799  681821  36.35828  140.666057 1979-07-06  NaN     5   1  1908      445   
20800  681821  36.35828  140.666057 1979-07-06  NaN     5   1  1908      445   

       BIO_GROUP  BODY_PART  COUNT_MET    VALUE  NUCLIDE  
20796          4          1         20    0.300       33  
20797          4          1         22    0.026       37  
20798          4          1          9  148.000      103  
20799          4          1         22    0.019       12  
20800         

### Parse and encode time

As a reminder, in `netCDF` format time needs to be encoded as an `integer` representing the number of seconds since a reference time. In our case, we chose `1970-01-01 00:00:00.0`, as defined in `configs.ipynb`.



In [84]:
#| exports
class ParseTimeCB(Callback):
    "Parse time column from MARIS dump."
    # NOTE: the class docstring above appears verbatim in `tfm.logs`; keep it unchanged.
    def __init__(self, time_name='TIME'): fc.store_attr()

    def __call__(self, tfm):
        "Convert the `time_name` column of every group with `pd.to_datetime(..., format='ISO8601')`."
        for df in tfm.dfs.values():
            raw_time = df[self.time_name]
            df[self.time_name] = pd.to_datetime(raw_time, format='ISO8601')

In [85]:
#|eval: false
# Same pipeline as before, with `ParseTimeCB` followed by `EncodeTimeCB` appended.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB()
    ])

print(tfm()['BIOTA'])

        SMP_ID        LAT         LON        TIME    UNC  UNIT  DL  AREA  \
20796   681755  36.358280  140.666057   278726400    NaN     5   1  1908   
20797   681755  36.358280  140.666057   278726400    NaN     5   1  1908   
20798   681821  36.358280  140.666057   300067200    NaN     5   1  1908   
20799   681821  36.358280  140.666057   300067200    NaN     5   1  1908   
20800   681821  36.358280  140.666057   300067200    NaN     5   1  1908   
...        ...        ...         ...         ...    ...   ...  ..   ...   
944501  681221  36.188018  138.023379  1195084800  0.011     5   1  9999   
944502  681221  36.188018  138.023379  1195084800  0.021     5   1  9999   
944503  681222  36.188018  138.023379  1226880000  0.910     5   1  9999   
944504  681222  36.188018  138.023379  1226880000  0.010     5   1  9999   
944505  681222  36.188018  138.023379  1226880000  0.019     5   1  9999   

        SPECIES  BIO_GROUP  BODY_PART  COUNT_MET    VALUE  NUCLIDE  
20796       445   

### Sanitize coordinates

In [86]:
#|eval: false
# Same pipeline as before, with `SanitizeLonLatCB` appended.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

dfs_test = tfm()
dfs_test['BIOTA']

Unnamed: 0,SMP_ID,LAT,LON,TIME,UNC,UNIT,DL,AREA,SPECIES,BIO_GROUP,BODY_PART,COUNT_MET,VALUE,NUCLIDE
20796,681755,36.358280,140.666057,278726400,,5,1,1908,445,4,1,20,0.300,33
20797,681755,36.358280,140.666057,278726400,,5,1,1908,445,4,1,22,0.026,37
20798,681821,36.358280,140.666057,300067200,,5,1,1908,445,4,1,9,148.000,103
20799,681821,36.358280,140.666057,300067200,,5,1,1908,445,4,1,22,0.019,12
20800,681821,36.358280,140.666057,300067200,,5,1,1908,445,4,1,20,0.240,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944501,681221,36.188018,138.023379,1195084800,0.011,5,1,9999,262,4,1,22,0.120,33
944502,681221,36.188018,138.023379,1195084800,0.021,5,1,9999,262,4,1,20,0.110,33
944503,681222,36.188018,138.023379,1226880000,0.910,5,1,9999,262,4,1,20,85.000,4
944504,681222,36.188018,138.023379,1226880000,0.010,5,1,9999,262,4,1,22,0.110,33


### Set unique index

In [87]:
#|eval: false
# Same pipeline as before, with `UniqueIndexCB` appended (an `ID` column shows up in the output).
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB(),
    UniqueIndexCB()
    ])

dfs_test = tfm()
dfs_test['BIOTA']

Unnamed: 0,ID,SMP_ID,LAT,LON,TIME,UNC,UNIT,DL,AREA,SPECIES,BIO_GROUP,BODY_PART,COUNT_MET,VALUE,NUCLIDE
0,0,681755,36.358280,140.666057,278726400,,5,1,1908,445,4,1,20,0.300,33
1,1,681755,36.358280,140.666057,278726400,,5,1,1908,445,4,1,22,0.026,37
2,2,681821,36.358280,140.666057,300067200,,5,1,1908,445,4,1,9,148.000,103
3,3,681821,36.358280,140.666057,300067200,,5,1,1908,445,4,1,22,0.019,12
4,4,681821,36.358280,140.666057,300067200,,5,1,1908,445,4,1,20,0.240,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86646,86646,681221,36.188018,138.023379,1195084800,0.011,5,1,9999,262,4,1,22,0.120,33
86647,86647,681221,36.188018,138.023379,1195084800,0.021,5,1,9999,262,4,1,20,0.110,33
86648,86648,681222,36.188018,138.023379,1226880000,0.910,5,1,9999,262,4,1,20,85.000,4
86649,86649,681222,36.188018,138.023379,1226880000,0.010,5,1,9999,262,4,1,22,0.110,33


In [98]:
# Sorted list of the species ids present in the BIOTA group.
np.sort(dfs_test['BIOTA'].SPECIES.unique())

array([   0,    7,    8,    9,   12,   15,   20,   79,  100,  109,  163,
        166,  236,  249,  251,  262,  265,  266,  291,  295,  296,  298,
        299,  301,  302,  303,  304,  305,  306,  308,  309,  315,  319,
        320,  321,  322,  328,  332,  334,  335,  339,  340,  344,  347,
        348,  349,  350,  351,  356,  358,  361,  363,  365,  367,  369,
        373,  375,  376,  398,  414,  442,  445,  446,  447,  449,  450,
        451,  454,  458,  462,  465,  466,  471,  474,  475,  476,  477,
        479,  481,  485,  490,  494,  502,  504,  505,  507,  510,  514,
        515,  516,  518,  524,  531,  533,  536,  551,  555,  564,  565,
        567,  568,  569,  576,  577,  579,  580,  583,  584,  588,  589,
        591,  594,  600,  604,  611,  612,  614,  615,  622,  628,  648,
        655,  659,  660,  661,  672,  687,  712,  723,  727,  735,  763,
        790,  793,  816,  818,  821,  825,  826,  827,  844,  859,  873,
        929,  972,  992, 1029, 1059, 1060, 1078, 10

In [90]:
# Load the enumeration types from the lookup tables directory, to compare them against the data.
lut_src_dir_test = '../files/lut'
enums = Enums(lut_src_dir=lut_src_dir_test)

In [97]:
# Spot check: is species id 1629 among the SPECIES enum values?
1629 in enums.types['SPECIES'].values()

False

In [99]:
# Print every species id found in the data that is not a value of the SPECIES enum.
# NOTE(review): presumably these ids cause the `ValueError: trying to assign illegal value to Enum variable`
# raised when encoding ref_id 724 — confirm.
for s in np.sort(dfs_test['BIOTA'].SPECIES.unique()):
    if s not in enums.types['SPECIES'].values():
        print(s)

1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1633
1634
1635
1637
1638
1639
1640
1641
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1654
1655
1656
1657
1658
1659
1660
1661
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1676
1677
1678
1679
1681
1682
1683
1684
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1729
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1747


In [104]:
# SPECIES enum values in reverse order (the recorded output starts at 1611).
list(enums.types['SPECIES'].values())[::-1]

[1611,
 1610,
 1609,
 1608,
 1607,
 1606,
 1605,
 1604,
 1603,
 1602,
 1601,
 1600,
 1599,
 1598,
 1597,
 1596,
 1595,
 1594,
 1593,
 1592,
 1591,
 1590,
 1589,
 1588,
 1587,
 1586,
 1585,
 1584,
 1583,
 1582,
 1581,
 1580,
 1579,
 1578,
 1577,
 1576,
 1575,
 1574,
 1573,
 1572,
 1571,
 1570,
 1569,
 1567,
 1566,
 1565,
 1564,
 1563,
 1562,
 1560,
 1559,
 1558,
 1557,
 1556,
 1555,
 1554,
 1553,
 1552,
 1551,
 1550,
 1549,
 1548,
 1547,
 1546,
 1545,
 1544,
 1543,
 1542,
 1541,
 1540,
 1539,
 1538,
 1537,
 1536,
 1535,
 1534,
 1533,
 1532,
 1531,
 1530,
 1529,
 1528,
 1527,
 1526,
 1525,
 1524,
 1523,
 1522,
 1521,
 1520,
 1519,
 1518,
 1517,
 1516,
 1515,
 1514,
 1513,
 1512,
 1511,
 1510,
 1509,
 1508,
 1507,
 1506,
 1505,
 1504,
 1503,
 1502,
 1501,
 1500,
 1499,
 1498,
 1497,
 1496,
 1495,
 1494,
 1493,
 1492,
 1491,
 1490,
 1489,
 1488,
 1487,
 1486,
 1485,
 1484,
 1483,
 1482,
 1481,
 1480,
 1479,
 1478,
 1477,
 1476,
 1475,
 1474,
 1473,
 1472,
 1471,
 1470,
 1469,
 1468,
 1467,

## Encode to NetCDF

In [None]:
#|eval: false
# Full transformation pipeline, identical to the one used in `encode`.
dfs = dataloader(ref_id=ref_id)
tfm = Transformer(dfs, cbs=[
    SelectColumnsCB(cois_renaming_rules),
    RenameColumnsCB(cois_renaming_rules),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB(),
    UniqueIndexCB()
    ])

dfs_tfm = tfm()
# One description per applied callback (see the recorded output)
tfm.logs

['Select columns of interest.',
 'Renaming variables to MARIS standard names.',
 "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables).",
 'Assign Detection Limit name to its id based on MARIS nomenclature.',
 'Parse time column from MARIS dump.',
 'Encode time as seconds since epoch.',
 'Drop rows with invalid longitude & latitude values. Convert `,` separator to `.` separator.',
 'Set unique index for each group.']

In [None]:
#| exports
# Keywords written to the `keywords` global attribute (joined with ', ' in `get_attrs`).
# One keyword per entry; hierarchy levels are separated by ' > '.
kw = ['oceanography',
      'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers',
      'Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry',
      'Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| exports
def get_attrs(tfm, zotero_key, kw=kw):
    "Run `GlobAttrsFeeder` over `tfm.dfs` and return the resulting global attributes."
    cbs = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(),
        ZoteroCB(zotero_key, cfg=cfg()),
        # Keywords and processing logs are each flattened into one comma-separated string
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=cbs)
    return feeder()

In [None]:
#|eval: false
# Example call with a hard-coded Zotero key.
get_attrs(tfm, zotero_key='3W354SQG', kw=kw)

[(12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (12.71, 55.8891666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 55.9666666666667), (11.5833333333333, 

{'geospatial_lat_min': '30.4358333333333',
 'geospatial_lat_max': '65.75',
 'geospatial_lon_min': '9.63333333333333',
 'geospatial_lon_max': '53.5',
 'geospatial_bounds': 'POLYGON ((9.63333333333333 53.5, 30.4358333333333 53.5, 30.4358333333333 65.75, 9.63333333333333 65.75, 9.63333333333333 53.5))',
 'geospatial_vertical_max': '437.0',
 'geospatial_vertical_min': '-1.0',
 'time_coverage_start': '1984-01-10T00:00:00',
 'time_coverage_end': '2018-12-14T00:00:00',
 'title': 'Radioactivity Monitoring of the Irish Marine Environment 1991 and 1992',
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "A.", "lastName": "McGarry"}, {"creatorType": "author", "firstName": "S.", "lastName": "Lyons"}, {"creatorType": "author", "firstName": "C.", "lastName": "McEnri"}, {"creatorType": "author", "firstName": "T.", "lastName": "Ryan"}, {"creatorType": "author", "firstName": "M.", "lastName": "O\'Colmain"}, {"creatorType": "author", "firstName": "J.D.", "lastName": "Cunningham"}

In [67]:
#| exports
def encode(fname_in, # Path to the MARIS global dump file
           fname_out, # Destination directory; one NetCDF file is written per reference
           dataloader=None, # Optional `DataLoader` to reuse (avoids re-reading the dump)
           **kwargs # `ref_ids`: reference ids to encode (default: all in the dump); `verbose`: passed to `NetCDFEncoder`
           ):
    "Encode MARIS references to NetCDF, one file per `ref_id`."
    # `_load_data` returns None, so it must not be chained onto the constructor.
    if dataloader is None: dataloader = DataLoader(fname_in)
    ref_ids = kwargs.get('ref_ids')
    if ref_ids is None:
        dataloader._load_data()  # make sure `df` is populated before listing the references
        ref_ids = dataloader.df.ref_id.unique()
    print('Encoding ...')
    for ref_id in tqdm(ref_ids, leave=False):
        dfs = dataloader(ref_id=ref_id)
        if not dfs:
            # No group for this reference: neither a file name nor a Zotero key can be derived.
            print(f'No supported sample type for ref_id {ref_id}, skipping.')
            continue
        # Read file name and Zotero key from the raw dump columns before running the pipeline.
        fname = get_fname(dfs)
        zotero_key = get_zotero_key(dfs)
        print(fname)
        tfm = Transformer(dfs, cbs=[
            SelectColumnsCB(cois_renaming_rules),
            RenameColumnsCB(cois_renaming_rules),
            DropNAColumnsCB(),
            SanitizeDetectionLimitCB(),
            ParseTimeCB(),
            EncodeTimeCB(),
            SanitizeLonLatCB(),
            UniqueIndexCB()
            ])

        tfm()
        encoder = NetCDFEncoder(tfm.dfs,
                                dest_fname=Path(fname_out) / fname,
                                global_attrs=get_attrs(tfm, zotero_key=zotero_key, kw=kw),
                                verbose=kwargs.get('verbose', False)
                                )
        encoder.encode()

### Single dataset

In [72]:
#|eval: false
# Encode a single reference, reusing the already instantiated `dataloader`.
# NOTE(review): the recorded run fails with `ValueError: trying to assign illegal value to Enum variable`
# for this ref_id; presumably due to SPECIES ids missing from the enum — confirm.
ref_id = 724
encode(
    fname_in,
    dir_dest,
    dataloader=dataloader,
    verbose=True, 
    ref_ids=[ref_id])

Encoding ...


  0%|          | 0/1 [00:00<?, ?it/s]

724-Environmental-Radioactivity-and-Radiation-in-Japan---Environmental-Radiation-Database-2024.nc
--------------------------------------------------------------------------------
Group: biota, Variable: id
--------------------------------------------------------------------------------
Group: biota, Variable: lon
--------------------------------------------------------------------------------
Group: biota, Variable: lat
--------------------------------------------------------------------------------
Group: biota, Variable: time
--------------------------------------------------------------------------------
Group: biota, Variable: area
--------------------------------------------------------------------------------
Group: biota, Variable: smp_id
--------------------------------------------------------------------------------
Group: biota, Variable: nuclide
--------------------------------------------------------------------------------
Group: biota, Variable: value
--------------------

                                     

ValueError: trying to assign illegal value to Enum variable

### All datasets

In [70]:
#|eval: false
# Encode every reference of the dump (no `ref_ids` given).
# NOTE(review): the recorded run stops at ref_id 724 with the same ValueError as the single-dataset run.
encode(
    fname_in, 
    dir_dest, 
    dataloader=dataloader,
    verbose=False)

Encoding ...


  0%|          | 0/475 [00:00<?, ?it/s]

402-CCHDO-2018.nc


  0%|          | 1/475 [00:02<15:56,  2.02s/it]

374-Ostlund-et-al-1987.nc


  0%|          | 2/475 [00:03<13:57,  1.77s/it]

376-IRSN---Institut-de-Radioprotection-et-de-Sûreté-Nucléaire-2019.nc


  1%|          | 3/475 [00:05<16:07,  2.05s/it]

401-Olsen-et-al-2016.nc


  1%|          | 4/475 [00:06<12:35,  1.60s/it]

16-Cherry-and-Heyraud-1981.nc


  1%|          | 5/475 [00:07<10:33,  1.35s/it]

18-Cherry-and-Heyraud-1982.nc


  1%|▏         | 6/475 [00:08<08:58,  1.15s/it]

30-Östlund-and-Grall-1991.nc


  1%|▏         | 7/475 [00:09<07:57,  1.02s/it]

182-Urban-et-al-2015.nc


  2%|▏         | 8/475 [00:10<07:31,  1.04it/s]

183-Bokor-et-al-2016.nc


  2%|▏         | 9/475 [00:10<06:57,  1.12it/s]

226-Sdraulig-2018.nc


  2%|▏         | 10/475 [00:11<06:39,  1.17it/s]

323-Johansen-et-al-2019.nc


  2%|▏         | 11/475 [00:12<06:40,  1.16it/s]

99-Aoyama-and-Hirose-2004.nc


  3%|▎         | 12/475 [00:14<08:57,  1.16s/it]

358-Kall-et-al-2014.nc


  3%|▎         | 13/475 [00:15<08:01,  1.04s/it]

395-Bailly-du-Bois-et-al-2020.nc


  3%|▎         | 14/475 [00:18<12:51,  1.67s/it]

106-Yamada-et-al-2006.nc


  3%|▎         | 15/475 [00:19<10:47,  1.41s/it]

400-Boyer-et-al-2013.nc


  3%|▎         | 16/475 [00:19<09:27,  1.24s/it]

509-Johansen-et-al-2015.nc


  4%|▎         | 17/475 [00:20<08:45,  1.15s/it]

568-Johansen-2020.nc


  4%|▍         | 18/475 [00:21<08:24,  1.10s/it]

97-ASPAMARD-2004.nc


  4%|▍         | 19/475 [00:22<07:43,  1.02s/it]

443-Heyraud-et-al-1994.nc


  4%|▍         | 20/475 [00:23<07:15,  1.04it/s]

508-Lee-et-al-2018.nc


  4%|▍         | 21/475 [00:24<07:11,  1.05it/s]

103-RADNOR-2010.nc


  5%|▍         | 22/475 [00:25<07:00,  1.08it/s]

122-Casacuberta-et-al-2018.nc


  5%|▍         | 23/475 [00:26<06:35,  1.14it/s]

190-Schlitzer-et-al-2018.nc


  5%|▌         | 24/475 [00:27<08:44,  1.16s/it]

191-OSPAR-Comission’s-Radioactive-Substances-Committee-(RSC)-2018.nc


  5%|▌         | 25/475 [00:29<10:35,  1.41s/it]

199-Skjerdal-et-al-2020.nc


  5%|▌         | 26/475 [00:30<09:13,  1.23s/it]

200-Zaborska-et-al-2010.nc


  6%|▌         | 27/475 [00:31<08:08,  1.09s/it]

380-Smith-et-al-2020.nc


  6%|▌         | 28/475 [00:32<07:33,  1.01s/it]

685-Chamizo-et-al-2021.nc


  6%|▌         | 29/475 [00:33<06:58,  1.07it/s]

720-Payne-et-al-2024.nc


  6%|▋         | 30/475 [00:34<07:08,  1.04it/s]

381-Smith-2020.nc


  7%|▋         | 31/475 [00:35<07:02,  1.05it/s]

432-Efurd-et-al-1997.nc


  7%|▋         | 32/475 [00:36<07:05,  1.04it/s]

477-Valette-Silver-et-al-1999.nc


  7%|▋         | 33/475 [00:36<06:54,  1.07it/s]

718-Smith-2024.nc


  7%|▋         | 34/475 [00:37<06:40,  1.10it/s]

222-Huang-et-al-2019.nc


  7%|▋         | 35/475 [00:38<06:22,  1.15it/s]

201-Mietelski-et-al-2008.nc


  8%|▊         | 36/475 [00:39<06:25,  1.14it/s]

409-Cherry-et-al-1987.nc


  8%|▊         | 37/475 [00:40<06:40,  1.09it/s]

712-Fávaro-et-al-2012.nc


  8%|▊         | 38/475 [00:41<06:18,  1.15it/s]

570-Szufa-2020.nc


  8%|▊         | 39/475 [00:41<06:06,  1.19it/s]

109-Gulin-and-Stokozov-2005.nc


  8%|▊         | 40/475 [00:42<05:57,  1.22it/s]

571-Szufa-2018.nc


  9%|▊         | 41/475 [00:43<05:59,  1.21it/s]

119-MEXT---Ministry-of-Education-Culture-Sports-Science-and-Technology-2011.nc


  9%|▉         | 42/475 [00:44<06:41,  1.08it/s]

130-Wada-et-al-2016.nc


  9%|▉         | 43/475 [00:50<17:16,  2.40s/it]

129-Wada-et-al-2016.nc


  9%|▉         | 44/475 [00:51<14:48,  2.06s/it]

121-TEPCO---Tokyo-Electric-Power-Company-2011.nc


  9%|▉         | 45/475 [00:57<21:34,  3.01s/it]

132-Fukushima-Prefectural-Federation-of-Fisheries-Co-operative-Associations-2012.nc


 10%|▉         | 46/475 [01:00<22:07,  3.09s/it]

120-NRA---Nuclear-Regulation-Authority-2013.nc


 10%|▉         | 47/475 [01:01<17:31,  2.46s/it]

427-Yamamoto-et-al-1994.nc


 10%|█         | 48/475 [01:02<14:00,  1.97s/it]

428-Yu-et-al-2015.nc


 10%|█         | 49/475 [01:02<11:33,  1.63s/it]

142-NRA---Nuclear-Regulation-Authority-2013.nc


 11%|█         | 50/475 [01:03<09:53,  1.40s/it]

143-Fukushima-Prefecture-2011.nc


 11%|█         | 51/475 [01:04<09:01,  1.28s/it]

445-Hoffman-et-al-1974.nc


 11%|█         | 52/475 [01:05<08:00,  1.14s/it]

446-Valette-Silver-and-Lauenstein-1995.nc


 11%|█         | 53/475 [01:06<07:14,  1.03s/it]

513-Madigan-2012.nc


 11%|█▏        | 54/475 [01:07<07:05,  1.01s/it]

514-Miki-et-al-2016.nc


 12%|█▏        | 55/475 [01:08<06:35,  1.06it/s]

520-Baumann-et-al-2013.nc


 12%|█▏        | 56/475 [01:09<06:27,  1.08it/s]

542-Ruelas-Inzunza-et-al-2012.nc


 12%|█▏        | 57/475 [01:09<06:08,  1.13it/s]

546-Takagi-et-al-2015.nc


 12%|█▏        | 58/475 [01:10<05:59,  1.16it/s]

547-Ruelas-Inzunza-2014.nc


 12%|█▏        | 59/475 [01:11<05:42,  1.22it/s]

548-Suchanek-et-al-1996.nc


 13%|█▎        | 60/475 [01:12<05:36,  1.23it/s]

549-Shigeoka-et-al-2019.nc


 13%|█▎        | 61/475 [01:13<05:38,  1.22it/s]

724-Environmental-Radioactivity-and-Radiation-in-Japan---Environmental-Radiation-Database-2024.nc


                                                

ValueError: trying to assign illegal value to Enum variable