In [None]:
#| default_exp handlers.maris_dump

# MARIS dump
> Data pipeline (handler) to convert global MARIS db dump into `NetCDF` format

The input data is a dump from already imported MARIS datasets.


**Questions**:
1. Is `decayedto` used, and in what context? (The same question needs to be answered for every column.)
2. What are the units of the uncertainty values?
3. How do `ref_id`, `zoterourl` and `displaytext` relate to each other?
   1.  e.g. do `ref_id=129` and `ref_id=130` point to the same dataset?

**Dev. board**: https://trello.com/b/IszgV1bj/marisco

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
import pandas as pd
from tqdm import tqdm
from functools import partial
import fastcore.all as fc

from pathlib import Path

from marisco.callbacks import (Callback, Transformer, SanitizeLonLatCB)
from marisco.metadata import (GlobAttrsFeeder, BboxCB,
                              DepthRangeCB, TimeRangeCB,
                              ZoteroCB, KeyValuePairCB)
from marisco.configs import lut_path, cdl_cfg, cfg, nc_tpl_path

In [None]:
pd.set_option('display.max_rows', 100)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Input: MARIS global dump (tab-separated text file) located under the user's home directory
fname_in = Path().home() / 'pro/data/maris/all-maris.txt'
# Destination directory, passed to `encode` as its `fname_out` argument
dir_dest = '../../_data/output/dump'
# fname_out = '../../_data/output/helcom.nc'

## Utils

In [None]:
# | export
def load_dump(fname):
    "Read the tab-separated, ISO-8859-1 encoded MARIS dump at `fname` into a DataFrame."
    read_opts = dict(sep='\t', encoding='ISO-8859-1')
    return pd.read_csv(fname, **read_opts)

def load_data(df:pd.DataFrame, # MARIS global dump
              ref_id:int, # Reference id of interest
              ):
    "Return the rows of reference `ref_id` as a dict of DataFrames keyed by sample type."
    # Sample type labels found in the `samptype` column -> keys of the returned dict
    samptype_keys = {
        'Sediment': 'sediment',
        'Seawater': 'seawater',
        'Suspended matter': 'suspended-matter',
        'Biota': 'biota'}
    selection = df[df.ref_id == ref_id]
    # A `samptype` label missing from the mapping raises `KeyError`
    return {samptype_keys[label]: rows
            for label, rows in selection.groupby('samptype')}

def get_zotero_key(dfs):
    "Return the last `/`-separated segment of the `zoterourl` in the first row of the first group of `dfs`."
    first_group = dfs[next(iter(dfs))]
    url = first_group['zoterourl'].iloc[0]
    return url.split('/')[-1]

def get_fname(dfs):
    """Build the NetCDF file name of a dataset: `<ref_id>-<words of displaytext>.nc`.

    `ref_id` and `displaytext` are read from the first row of the first group of `dfs`.
    Commas and periods are removed from `displaytext`, hyphens are treated as word
    separators, and the resulting words are joined with `-`.
    """
    first_group = dfs[next(iter(dfs))]
    # `ref_id` rather than `id`, to avoid shadowing the builtin
    ref_id, name = first_group[['ref_id', 'displaytext']].iloc[0]
    cleaned = name.replace(',', '').replace('.', '').replace('-', ' ')
    # `split()` without argument collapses runs of whitespace, so repeated spaces
    # (or " - ") no longer produce empty words, i.e. "--" in the file name
    return '-'.join([str(ref_id)] + cleaned.split()) + '.nc'

## Load data

In [None]:
df = load_dump(fname_in)

In [None]:
# Reference id (`ref_id` column of the dump) of the dataset used in the example cells below
ref_id = 52

## Data transformation pipeline

### Normalize nuclide names

In [None]:
#| export
def get_varnames_lut():
    "Return a dict mapping `nuclide_id` to `nc_name`, read from the `dbo_nuclide.xlsx` lookup table."
    nuclides = pd.read_excel(lut_path() / 'dbo_nuclide.xlsx',
                             usecols=['nuclide_id', 'nc_name'])
    return nuclides.set_index('nuclide_id')['nc_name'].to_dict()

In [None]:
# | export
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names."
    # NOTE: the class docstring appears to be reported verbatim in `Transformer.logs`; keep it stable.
    def __init__(self,
                 fn_lut=get_varnames_lut):
        # `fn_lut`: callable returning the lookup dict; stored as `self.fn_lut`
        fc.store_attr()

    def __call__(self, tfm):
        "Replace the `nuclide_id` values of every DataFrame in `tfm.dfs` using the lookup."
        nuclide_names = self.fn_lut()
        for frame in tfm.dfs.values():
            frame['nuclide_id'] = frame['nuclide_id'].replace(nuclide_names)

In [None]:
# Check: remap nuclide ids for the selected reference and list the distinct names of the biota group
dfs = load_data(df, ref_id)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB()])

print(tfm()['biota']['nuclide_id'].unique())

['sr90' 'cs137' 'pu238' 'pu239_240_tot']


### Rename columns

In [None]:
#| export
# To be added: endperiod, totdepth, lab
def renaming_rules():
    "Return the dict mapping column names of the dump to the variable names configured in `cdl_cfg()['vars']`."
    cdl_vars = cdl_cfg()['vars']
    defaults, suffixes = cdl_vars['defaults'], cdl_vars['suffixes']
    bio, sed = cdl_vars['bio'], cdl_vars['sed']
    # Key order matters to callers that select columns from `.keys()`
    return {
        'latitude': defaults['lat']['name'],
        'longitude': defaults['lon']['name'],
        'begperiod': defaults['time']['name'],
        'sampdepth': defaults['depth']['name'],
        'uncertaint': suffixes['uncertainty']['name'],
        'unit_id': suffixes['unit']['name'],
        'detection': suffixes['detection_limit']['name'],
        'area_id': defaults['area']['name'],
        'species_id': bio['species']['name'],
        'biogroup_id': bio['bio_group']['name'],
        'bodypar_id': bio['body_part']['name'],
        'sedtype_id': sed['sed_type']['name'],
        'volume': suffixes['volume']['name'],
        'salinity': suffixes['salinity']['name'],
        'temperatur': suffixes['temperature']['name'],
        'sampmet_id': suffixes['sampling_method']['name'],
        'prepmet_id': suffixes['preparation_method']['name'],
        'counmet_id': suffixes['counting_method']['name'],
        # The two names below are hard-coded rather than read from the configuration
        'activity': 'value',
        'nuclide_id': 'nuclide'
    }

In [None]:
#| export
class RenameColumnCB(Callback):
    "Renaming variables to MARIS standard names."
    # NOTE: the class docstring appears to be reported verbatim in `Transformer.logs`; keep it stable.
    def __init__(self,
                 renaming_rules=renaming_rules):
        # `renaming_rules`: callable returning {dump column name: new name};
        # stored as `self.renaming_rules`
        fc.store_attr()

    def __call__(self, tfm):
        "Keep only the columns listed in the renaming rules and rename them, for every DataFrame in `tfm.dfs`."
        # Use the rules injected at construction time, not the module-level function
        lut = self.renaming_rules()
        # Columns of interest, in the order defined by the rules
        coi = list(lut)
        for k in tfm.dfs.keys():
            # Select then rename, returning a new frame instead of renaming a selection in place
            tfm.dfs[k] = tfm.dfs[k].loc[:, coi].rename(columns=lut)

In [None]:
# Check: nuclide remapping followed by column selection/renaming, shown for the biota group
dfs = load_data(df, ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB()
    ])

print(tfm()['biota'])

              lat         lon     time  depth  unc  unit  dl  area  species  \
757776  42.316667  132.316667  00:00.0    NaN  NaN     5   <  4307      712   
757777  42.316667  132.316667  00:00.0    NaN  NaN     5  DE  4307      712   
757778  42.316667  132.316667  00:00.0    NaN  NaN     5  DE  4307      712   
757779  42.316667  132.316667  00:00.0    NaN  NaN     5  DE  4307      712   
757780  42.316667  132.316667  00:00.0    NaN  NaN     5   <  4307      695   
757781  42.316667  132.316667  00:00.0    NaN  NaN     4  DE  4307      695   

        bio_group  body_part  sed_type  vol  sal  temp  sampmet  prepmet  \
757776          4          0         0  NaN  NaN   NaN        0        0   
757777          4          0         0  NaN  NaN   NaN        0        0   
757778          4          0         0  NaN  NaN   NaN        0        0   
757779          4          0         0  NaN  NaN   NaN        0        0   
757780          8          0         0  NaN  NaN   NaN        0   

### Drop NaN-only columns

In [None]:
#| export
class DropNAColumnsCB(Callback):
    "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables)."
    # NOTE: the class docstring appears to be reported verbatim in `Transformer.logs`; keep it stable.
    def __init__(self, na_value=0):
        fc.store_attr()

    def isMarisNA(self, col):
        "Tell whether `col` holds a single distinct value that equals `self.na_value`."
        if len(col.unique()) != 1:
            return False
        return col.iloc[0] == self.na_value

    def dropMarisNA(self, df):
        "Return `df` without the columns flagged by `isMarisNA`."
        flagged = [name for name in df.columns if self.isMarisNA(df[name])]
        return df.drop(columns=flagged)

    def __call__(self, tfm):
        "For every DataFrame in `tfm.dfs`: drop all-NaN columns, then columns holding only `na_value`."
        for key in list(tfm.dfs):
            without_nan = tfm.dfs[key].dropna(axis=1, how='all')
            tfm.dfs[key] = self.dropMarisNA(without_nan)

In [None]:
# Check: same pipeline with all-NaN / "not available" columns dropped, shown for the biota group
dfs = load_data(df, ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB()
    ])

print(tfm()['biota'])

              lat         lon     time  unit  dl  area  species  bio_group  \
757776  42.316667  132.316667  00:00.0     5   <  4307      712          4   
757777  42.316667  132.316667  00:00.0     5  DE  4307      712          4   
757778  42.316667  132.316667  00:00.0     5  DE  4307      712          4   
757779  42.316667  132.316667  00:00.0     5  DE  4307      712          4   
757780  42.316667  132.316667  00:00.0     5   <  4307      695          8   
757781  42.316667  132.316667  00:00.0     4  DE  4307      695          8   

           value        nuclide  
757776  0.170000           sr90  
757777  0.142478          cs137  
757778  0.000548          pu238  
757779  0.002389  pu239_240_tot  
757780  0.440000          cs137  
757781  0.176635  pu239_240_tot  


### Sanitize detection limit values

In [None]:
#| export
def get_dl_lut():
    "Return a dict mapping `previous_name` to `detection_id`, read from the `dbo_detection.xlsx` lookup table."
    detection = pd.read_excel(lut_path() / 'dbo_detection.xlsx',
                              usecols=['previous_name', 'detection_id'])
    return detection.set_index('previous_name')['detection_id'].to_dict()

In [None]:
get_dl_lut()

{nan: 0, 'ND': 1, '=': 2, 'D': 3, '<': 4}

In [None]:
#| export
class SanitizeDetectionLimitCB(Callback):
    "Assign Detection Limit name to its id based on MARIS nomenclature."
    # NOTE: the class docstring appears to be reported verbatim in `Transformer.logs`; keep it stable.
    def __init__(self,
                 fn_lut=get_dl_lut):
        # `fn_lut`: callable returning the lookup dict; stored as `self.fn_lut`
        fc.store_attr()

    def __call__(self, tfm):
        "Replace the values of the `dl` column of every DataFrame in `tfm.dfs` using the lookup."
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            # `dl` can be absent for a group (e.g. removed upstream as an all-NaN column):
            # nothing to sanitize then, instead of failing with a KeyError
            if 'dl' not in tfm.dfs[k].columns:
                continue
            # Values absent from the lookup are left unchanged by `replace`
            tfm.dfs[k]['dl'] = tfm.dfs[k]['dl'].replace(lut)

In [None]:
# Check: detection limit labels replaced through the lookup, shown for the sediment group
dfs = load_data(df, ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB()
    ])

print(tfm()['sediment'])

              lat         lon     time   depth  unit  dl  area        value  \
755699  41.916667  134.333333  00:00.0  3570.0     4   2  4307     0.120000   
755702  42.316667  132.316667  00:00.0  1512.0     4   4  4307     0.300000   
755703  42.316667  132.316667  00:00.0  1512.0     4   2  4307     2.300000   
755704  42.316667  132.316667  00:00.0  1512.0     4   2  4307     0.013000   
755705  42.316667  132.316667  00:00.0  1512.0     4   2  4307     0.760000   
757500  38.001944  134.951389  00:00.0  2991.0     4  DE  4307     0.149666   
757501  38.001944  134.951389  00:00.0  2991.0     4   2  4307     1.000000   
757502  38.001944  134.951389  00:00.0  2991.0     4   4  4307     0.018000   
757503  38.001944  134.951389  00:00.0  2991.0     4   2  4307     0.140000   
757754  40.795000  134.866667  00:00.0  3245.0     4   4  4307     0.300000   
757755  40.795000  134.866667  00:00.0  3245.0     4  DE  4307     1.400000   
757756  40.795000  134.866667  00:00.0  3245.0     4

### Reshape: long to wide

In [None]:
#| export
class ReshapeLongToWide(Callback):
    "Convert data from long to wide with renamed columns."
    # NOTE: the class docstring appears to be reported verbatim in `Transformer.logs`; keep it stable.
    def __init__(self, value_col='nuclide'):
        fc.store_attr()
        # Names of all the "suffix" variables declared in the configuration
        self.derived_cols = [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]

    def renamed_cols(self, cols):
        "Flatten `(outer, inner)` column tuples into `inner_outer`, or `outer` alone when `inner` is empty."
        return [f'{inner}_{outer}' if inner else outer for outer, inner in cols]

    def pivot(self, df):
        "Pivot the suffix variables present in `df` by `self.value_col`; all other columns form the index."
        # Among all possible suffix variables, keep the ones present in `df`
        derived_coi = [col for col in self.derived_cols if col in df.columns]
        pivoted = {self.value_col, *derived_coi}
        # Keep the DataFrame's own column order: building the index from a `set`
        # made the order of the output columns vary from one run to the next
        idx = [col for col in df.columns if col not in pivoted]
        # NOTE(review): `value` seems to end up among the index columns rather than
        # being pivoted -- confirm this is intended.
        # `pivot_table` aggregates with its default function (mean), so the pivoted
        # columns must be numeric; non-numeric values raise a TypeError.
        return df.pivot_table(index=idx,
                              columns=self.value_col,
                              values=derived_coi).reset_index()

    def __call__(self, tfm):
        "Pivot every DataFrame in `tfm.dfs`, flatten its column names and name its index `sample`."
        for k in tfm.dfs.keys():
            tfm.dfs[k] = self.pivot(tfm.dfs[k])
            tfm.dfs[k].columns = self.renamed_cols(tfm.dfs[k].columns)
            tfm.dfs[k].index.name = 'sample'

In [None]:
# Check: long-to-wide reshaping, shown for the sediment group
# NOTE: the recorded output of this cell is
# `TypeError: agg function failed [how->mean,dtype->object]`
dfs = load_data(df, ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ReshapeLongToWide()
    ])

print(tfm()['sediment'])

TypeError: agg function failed [how->mean,dtype->object]

### Encode time (seconds since ...)

In [None]:
#| export
class EncodeTimeCB(Callback):
    "Encode time as `int` representing seconds since xxx (TBD)."
    # NOTE: the class docstring appears to be reported verbatim in `Transformer.logs`; keep it stable.
    def __call__(self, tfm):
        # Placeholder: every group gets the same constant until the actual encoding is defined
        placeholder = 9999
        for frame in tfm.dfs.values():
            frame.time = placeholder

In [None]:
# Check: pipeline including the (placeholder) time encoding, shown for the sediment group
# NOTE(review): the recorded output looks like it was produced with another `ref_id`
# (coordinates differ from the earlier cells) -- re-run to confirm.
dfs = load_data(df, ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ReshapeLongToWide(),
    EncodeTimeCB()
    ])

print(tfm()['sediment'])

        area  value  time        lat       lon  am241_dl  cs134_dl  cs137_dl  \
sample                                                                         
0       2357    0.1  9999  54.083056 -6.215000       NaN       1.0       NaN   
1       2357    0.2  9999  53.333333 -6.000000       NaN       3.0       NaN   
2       2357    0.2  9999  53.333333 -5.366667       NaN       3.0       NaN   
3       2357    0.2  9999  53.416667 -6.016667       NaN       3.0       NaN   
4       2357    0.2  9999  53.600000 -5.933333       NaN       3.0       NaN   
...      ...    ...   ...        ...       ...       ...       ...       ...   
145     9999  346.0  9999  53.283889 -9.048889       NaN       NaN       NaN   
146     9999  346.0  9999  54.634722 -8.454722       NaN       NaN       NaN   
147     9999  412.0  9999  53.283889 -9.048889       NaN       NaN       NaN   
148     9999  491.0  9999  54.042778 -6.186944       NaN       NaN       NaN   
149     9999  493.0  9999  54.042778 -6.

### Sanitize coordinates

In [None]:
# Check: full pipeline ending with `SanitizeLonLatCB`, shown for the sediment group
dfs = load_data(df, ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ReshapeLongToWide(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

print(tfm()['sediment'])

        area  value  time        lat       lon  am241_dl  cs134_dl  cs137_dl  \
sample                                                                         
0       2357    0.1  9999  54.083056 -6.215000       NaN       1.0       NaN   
1       2357    0.2  9999  53.333333 -6.000000       NaN       3.0       NaN   
2       2357    0.2  9999  53.333333 -5.366667       NaN       3.0       NaN   
3       2357    0.2  9999  53.416667 -6.016667       NaN       3.0       NaN   
4       2357    0.2  9999  53.600000 -5.933333       NaN       3.0       NaN   
...      ...    ...   ...        ...       ...       ...       ...       ...   
145     9999  346.0  9999  53.283889 -9.048889       NaN       NaN       NaN   
146     9999  346.0  9999  54.634722 -8.454722       NaN       NaN       NaN   
147     9999  412.0  9999  53.283889 -9.048889       NaN       NaN       NaN   
148     9999  491.0  9999  54.042778 -6.186944       NaN       NaN       NaN   
149     9999  493.0  9999  54.042778 -6.

## Encode to NetCDF

In [None]:
# Run the full pipeline; `tfm` is reused by the following cells (`tfm.logs`, `get_attrs`)
# NOTE: the recorded output of this cell is
# `TypeError: agg function failed [how->mean,dtype->object]`
dfs = load_data(df, ref_id)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ReshapeLongToWide(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

dfs_tfm = tfm()

TypeError: agg function failed [how->mean,dtype->object]

In [None]:
tfm.logs

['Remap to MARIS radionuclide names.',
 'Renaming variables to MARIS standard names.',
 "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables).",
 'Assign Detection Limit name to its id based on MARIS nomenclature.',
 'Convert data from long to wide with renamed columns.',
 'Encode time as `int` representing seconds since xxx (TBD).',
 'Drop row when both longitude & latitude equal 0.']

In [None]:
#| export
# Keywords joined with ', ' into the `keywords` global attribute by `get_attrs`.
# Hierarchy levels are separated by ' > ' (with a space on both sides).
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| export
def get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw):
    "Return the global attributes produced by running `GlobAttrsFeeder` over `tfm.dfs`."
    callbacks = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(cfg()),
        ZoteroCB(zotero_key, cfg=cfg()),
        # Keywords and transformation logs are each flattened to one comma-separated string
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=callbacks)
    return feeder()

In [None]:
get_attrs(tfm, zotero_key='3W354SQG', kw=kw)

{'geospatial_lat_min': '-5.233333333',
 'geospatial_lat_max': '55.12111111',
 'geospatial_lon_min': '-9.910277778',
 'geospatial_lon_max': '51.65138889',
 'geospatial_bounds': 'POLYGON ((-9.910277778 51.65138889, -5.233333333 51.65138889, -5.233333333 55.12111111, -9.910277778 55.12111111, -9.910277778 51.65138889))',
 'time_coverage_start': '1970-01-01T02:46:39',
 'time_coverage_end': '1970-01-01T02:46:39',
 'title': 'Radioactivity Monitoring of the Irish Marine Environment 1991 and 1992',
 'summary': '',
 'creator_name': [{'creatorType': 'author',
   'firstName': 'A.',
   'lastName': 'McGarry'},
  {'creatorType': 'author', 'firstName': 'S.', 'lastName': 'Lyons'},
  {'creatorType': 'author', 'firstName': 'C.', 'lastName': 'McEnri'},
  {'creatorType': 'author', 'firstName': 'T.', 'lastName': 'Ryan'},
  {'creatorType': 'author', 'firstName': 'M.', 'lastName': "O'Colmain"},
  {'creatorType': 'author', 'firstName': 'J.D.', 'lastName': 'Cunningham'}],
 'keywords': 'oceanography, Earth Scie

In [None]:
#| export
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
    """Work in progress: print the output file name and Zotero key of each selected reference.

    `fname_in` is the path of the MARIS dump. The optional `ref_ids` keyword restricts
    the references processed (default: every `ref_id` found in the dump).
    `fname_out`, `nc_tpl_path` and the other keyword arguments are not used by the
    active code; they are meant for the NetCDF encoding step commented out below.
    """
    dump = load_dump(fname_in)
    selected_ids = kwargs.get('ref_ids', dump.ref_id.unique())
    # for ref_id in tqdm(selected_ids):
    for ref_id in selected_ids:
        dfs = load_data(dump, ref_id)
        print(get_fname(dfs), ': ', get_zotero_key(dfs))
        # TODO: enable the transformation and NetCDF encoding steps below
        # print(get_fname(dfs))
        # tfm = Transformer(dfs, cbs=[
        #     RemapRdnNameCB(),
        #     RenameColumnCB(),
        #     DropNAColumnsCB(),
        #     SanitizeDetectionLimitCB(),
        #     ReshapeLongToWide(),
        #     EncodeTimeCB(),
        #     SanitizeLonLatCB()
        #     ])

        # species_lut = get_maris_species(fname_in, 'species_helcom.pkl')
        # enums_xtra = {
        #     'species_t': {info['name']: info['id']
        #                   for info in species_lut.values() if info['name'] != ''}
        # }

        # encoder = NetCDFEncoder(tfm(),
        #                         src_fname=nc_tpl_path,
        #                         dest_fname=Path(fname_out) / get_fname(dfs),
        #                         global_attrs=get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw),
        #                         # enums_xtra=enums_xtra,
        #                         **kwargs)
        # encoder.encode()

In [None]:
# Run `encode` for the single reference selected above
encode(fname_in, dir_dest, nc_tpl_path(), verbose=False, ref_ids=[ref_id])

100%|██████████| 1/1 [00:00<00:00, 169.58it/s]

9FPK75TG





In [None]:
# Run `encode` for every `ref_id` found in the dump (no `ref_ids` filter)
encode(fname_in, dir_dest, nc_tpl_path(), verbose=False)

402-CCHDO-2018.nc :  GSALIT9M
374-Ístlund-et-al-1987.nc :  5YSDHQRR
401-Olsen-et-al-2016.nc :  IPCB2F7F
16-Cherry-and-Heyraud-1981.nc :  VWH88IG7
30-Ístlund-and-Grall-1991.nc :  P2HYNJQI
323-Johansen-et-al-2019.nc :  WNLYCP6T
18-Cherry-and-Heyraud-1982.nc :  P8VYA6DI
226-Sdraulig-2018.nc :  WVN54MX3
395-Bailly-du-Bois-et-al-2020.nc :  VTG8KVPZ
400-Boyer-et-al-2013.nc :  J7A8WAST
97-ASPAMARD-2004.nc :  YYRZDYRA
99-Aoyama-and-Hirose-2004.nc :  8ANGNEN5
358-Kall-et-al-2014.nc :  MVQ9MFW8
568-Johansen-2020.nc :  ZKIQ3LBF
443-Heyraud-et-al-1994.nc :  7BUFP2FI
508-Lee-et-al-2018.nc :  SUB864TS
509-Johansen-et-al-2015.nc :  GVG92UPF
106-Yamada-et-al-2006.nc :  UWQGT8LC
182-Urban-et-al-2015.nc :  JEVKJFL2
183-Bokor-et-al-2016.nc :  YJWVTSS6
685-Chamizo-et-al-2021.nc :  RVTBEKUH
477-Valette-Silver-et-al-1999.nc :  42H8QMWH
380-Smith-et-al-2020.nc :  GYCNEQV8
432-Efurd-et-al-1997.nc :  6MLDX3XC
122-Casacuberta-et-al-2018.nc :  P8QW47VH
190-Schlitzer-et-al-2018.nc :  97UIMEXN
103-Norwegian-Radiat