In [None]:
#| default_exp handlers.maris_dump

# MARIS dump
> Data pipeline (handler) to convert global MARIS db dump into `NetCDF` format

The input data is a dump from already imported MARIS datasets.


**Questions**:
1. Is `decayedto` used, and in what context? (The same question needs answering for every column.)
2. What are the units of uncertainty?
3. `ref_id` vs. `zoterourl` vs. `displaytext`
   1.  e.g. do `ref_id=129` and `ref_id=130` point to the same dataset?

**Dev. board**: https://trello.com/b/IszgV1bj/marisco

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import pandas as pd
from tqdm import tqdm
from functools import partial
import fastcore.all as fc

from pathlib import Path

from marisco.callbacks import (Callback, Transformer, SanitizeLonLatCB)
from marisco.metadata import (GlobAttrsFeeder, BboxCB,
                              DepthRangeCB, TimeRangeCB,
                              ZoteroCB, KeyValuePairCB)
from marisco.configs import lut_path, cdl_cfg, cfg


In [None]:
# Let pandas print up to 100 rows before truncating a frame
pd.set_option('display.max_rows', 100)

In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings raised by the callbacks below.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# MARIS dump to read, resolved relative to the current user's home directory
fname_in = Path().home() / 'pro/data/maris/all-maris.txt'
# presumably the folder receiving the generated files (not used in the cells shown here) - TODO confirm
dir_dest = '../../_data/output/dump'
# fname_out = '../../_data/output/helcom.nc'

## Utils

In [None]:
# | export
def load_dump(fname):
    "Read the MARIS dump at `fname` (tab-separated, decoded as ISO-8859-1) into a `DataFrame`."
    read_opts = {'sep': '\t', 'encoding': 'ISO-8859-1'}
    return pd.read_csv(fname, **read_opts)

def load_data(df:pd.DataFrame, # MARIS global dump 
              ref_id:int, # Reference id of interest
              )->dict: # One `DataFrame` per sample type found for `ref_id`, keyed by group name
    "Load specific MARIS dataset through its ref_id."
    # `samptype` label used in the dump -> key of the returned group.
    # A `samptype` absent from this table raises `KeyError`.
    lut = {
        'Sediment': 'sediment',
        'Seawater': 'seawater',
        'Suspended matter': 'suspended-matter',
        'Biota': 'biota'}
    # Keep only the rows of the requested reference; an unknown `ref_id` yields {}
    selected = df[df.ref_id == ref_id]
    return {lut[name]: grp for name, grp in selected.groupby('samptype')}

def get_zotero_key(df, # MARIS global dump
                   ref_id, # Reference id of interest
                   ):
    "Return the Zotero item key (last path segment of `zoterourl`) recorded for `ref_id`."
    # Look up the URLs of the requested reference only
    result = df[df.ref_id == ref_id].zoterourl.unique()
    if len(result) == 0:
        # Same exception type as indexing the empty array, with an actionable message
        raise IndexError(f'No Zotero record found for ref_id={ref_id}')
    if len(result) > 1: print('Several Zotero records have been found, please check!')
    # When several URLs exist the first one wins; the key is what follows the last '/'
    return result[0].split('/')[-1]

## Load data

In [None]:
# Read the whole dump once; the cells below all start from this frame
df = load_dump(fname_in)

## Data transformation pipeline

### Normalize nuclide names

In [None]:
#| export
def get_varnames_lut():
    "Return the `nuclide_id` -> `nc_name` lookup read from `dbo_nuclide.xlsx`."
    wanted = ['nuclide_id', 'nc_name']
    table = pd.read_excel(lut_path() / 'dbo_nuclide.xlsx', usecols=wanted)
    by_column = table.set_index('nuclide_id').to_dict()
    return by_column['nc_name']

In [None]:
# | export
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names."
    # NB: the class docstring is what appears in `tfm.logs`; reword it with care.
    def __init__(self,
                 fn_lut=get_varnames_lut): # Zero-argument callable returning the replacement mapping
        fc.store_attr()

    def __call__(self, tfm):
        # Build the lookup once, then apply it to the `nuclide_id` column of every group
        mapping = self.fn_lut()
        for grp in tfm.dfs.values():
            grp['nuclide_id'] = grp['nuclide_id'].replace(mapping)

In [None]:
# Run the remapping step alone and list the distinct `nuclide_id` values of the 'biota' group
dfs = load_data(df, 52)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB()])

print(tfm()['biota']['nuclide_id'].unique())

['i131' 'cs134' 'cs137' 'k40' 'am241' 'pu239_240_tot' 'pu238']


### Rename columns

In [None]:
#| export
# To be added: endperiod, totdepth, lab
def renaming_rules():
    "Return the {dump column: new column name} mapping; most names come from the CDL config."
    cdl_vars = cdl_cfg()['vars']
    defaults, suffixes = cdl_vars['defaults'], cdl_vars['suffixes']
    bio, sed = cdl_vars['bio'], cdl_vars['sed']
    # Dump column -> config entry whose 'name' field is the target column name
    from_cfg = {
        'latitude': defaults['lat'],
        'longitude': defaults['lon'],
        'begperiod': defaults['time'],
        'sampdepth': defaults['depth'],
        'uncertaint': suffixes['uncertainty'],
        'unit_id': suffixes['unit'],
        'detection': suffixes['detection_limit'],
        'area_id': defaults['area'],
        'species_id': bio['species'],
        'biogroup_id': bio['bio_group'],
        'bodypar_id': bio['body_part'],
        'sedtype_id': sed['sed_type'],
        'volume': suffixes['volume'],
        'salinity': suffixes['salinity'],
        'temperatur': suffixes['temperature'],
        'sampmet_id': suffixes['sampling_method'],
        'prepmet_id': suffixes['preparation_method'],
        'counmet_id': suffixes['counting_method'],
    }
    rules = {col: entry['name'] for col, entry in from_cfg.items()}
    # These two targets are fixed here rather than read from the config
    rules.update({'activity': 'value', 'nuclide_id': 'nuclide'})
    return rules

In [None]:
#| export
class RenameColumnCB(Callback):
    "Renaming variables to MARIS standard names."
    # NB: the class docstring is what appears in `tfm.logs`; reword it with care.
    def __init__(self,
                 renaming_rules=renaming_rules): # Zero-argument callable returning {dump column: new name}
        fc.store_attr()

    def __call__(self, tfm):
        # Use the rules injected at construction time, so a custom `renaming_rules` is honoured
        lut = self.renaming_rules()
        coi = list(lut)
        for k in tfm.dfs.keys():
            # Select cols of interest (a missing one raises `KeyError`), then rename.
            # `rename` returns a new frame, so no slice is modified in place.
            tfm.dfs[k] = tfm.dfs[k].loc[:, coi].rename(columns=lut)

In [None]:
# Chain column selection/renaming after the nuclide remapping and show the 'biota' group
dfs = load_data(df, 52)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB()
    ])

print(tfm()['biota'])

              lat       lon     time  depth  unc  unit dl  area  species  \
533970  53.367778 -6.144167  00:00.0    NaN  NaN     5  =  2357       96   
533971  53.367778 -6.144167  00:00.0    NaN  NaN     5  <  2357       96   
533972  53.367778 -6.144167  00:00.0    NaN  NaN     5  =  2357       96   
533973  53.367778 -6.144167  00:00.0    NaN  NaN     5  =  2357       96   
533974  53.367778 -6.144167  00:00.0    NaN  NaN     5  =  2357       96   
...           ...       ...      ...    ...  ...   ... ..   ...      ...   
810691  54.100833 -6.200278  00:00.0    NaN  NaN     5  <  9999      129   
810692  54.100833 -6.200278  00:00.0    NaN  NaN     5  =  9999      129   
810693  54.100833 -6.200278  00:00.0    NaN  NaN     5  =  9999      129   
810694  54.100833 -6.200278  00:00.0    NaN  NaN     5  =  9999      129   
810695  54.100833 -6.200278  00:00.0    NaN  NaN     5  =  9999      129   

        bio_group  body_part  sed_type  vol  sal  temp  sampmet  prepmet  \
533970     

### Drop NaN only columns

In [None]:
#| export
class DropNAColumnsCB(Callback):
    "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables)."
    # NB: the class docstring is what appears in `tfm.logs`; reword it with care.
    def __init__(self, na_value=0):
        fc.store_attr()

    def isMarisNA(self, col):
        # True when the column holds one distinct value and that value is `na_value`
        if len(col.unique()) != 1: return False
        return col.iloc[0] == self.na_value

    def dropMarisNA(self, df):
        # Collect the columns flagged by `isMarisNA`, then drop them in one go
        flagged = []
        for name in df.columns:
            if self.isMarisNA(df[name]): flagged.append(name)
        return df.drop(columns=flagged)

    def __call__(self, tfm):
        for k in list(tfm.dfs):
            # First remove all-NaN columns, then the columns made only of `na_value`
            without_nan = tfm.dfs[k].dropna(axis=1, how='all')
            tfm.dfs[k] = self.dropMarisNA(without_nan)

In [None]:
# Add the NA-column pruning step and show which columns remain for the 'biota' group
dfs = load_data(df, 52)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB()
    ])

print(tfm()['biota'])

              lat       lon     time  unit dl  area  species  bio_group  \
533970  53.367778 -6.144167  00:00.0     5  =  2357       96         11   
533971  53.367778 -6.144167  00:00.0     5  <  2357       96         11   
533972  53.367778 -6.144167  00:00.0     5  =  2357       96         11   
533973  53.367778 -6.144167  00:00.0     5  =  2357       96         11   
533974  53.367778 -6.144167  00:00.0     5  =  2357       96         11   
...           ...       ...      ...   ... ..   ...      ...        ...   
810691  54.100833 -6.200278  00:00.0     5  <  9999      129         14   
810692  54.100833 -6.200278  00:00.0     5  =  9999      129         14   
810693  54.100833 -6.200278  00:00.0     5  =  9999      129         14   
810694  54.100833 -6.200278  00:00.0     5  =  9999      129         14   
810695  54.100833 -6.200278  00:00.0     5  =  9999      129         14   

         value        nuclide  
533970   53.00           i131  
533971    0.20          cs134  
533

### Sanitize detection limit values

In [None]:
#| export
def get_dl_lut():
    "Return the `previous_name` -> `detection_id` lookup read from `dbo_detection.xlsx`."
    wanted = ['previous_name', 'detection_id']
    table = pd.read_excel(lut_path() / 'dbo_detection.xlsx', usecols=wanted)
    by_column = table.set_index('previous_name').to_dict()
    return by_column['detection_id']

In [None]:
# Display the detection-limit lookup table
get_dl_lut()

{'ND': 0, '=': 1, 'D': 2, '<': 3}

In [None]:
#| export
class SanitizeDetectionLimitCB(Callback):
    "Assign Detection Limit name to its id based on MARIS nomenclature."
    # NB: the class docstring is what appears in `tfm.logs`; reword it with care.
    def __init__(self,
                 fn_lut=get_dl_lut): # Zero-argument callable returning the replacement mapping
        fc.store_attr()

    def __call__(self, tfm):
        # Build the lookup once, then replace the values of the `dl` column in every group
        codes = self.fn_lut()
        for grp in tfm.dfs.values():
            grp['dl'] = grp['dl'].replace(codes)

In [None]:
# Add the detection-limit encoding step and show the 'sediment' group
dfs = load_data(df, 52)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB()
    ])

print(tfm()['sediment'])

              lat       lon     time  unit  dl  area   value        nuclide
533931  53.866667 -5.550000  00:00.0     4   1  2357  170.40          cs137
533932  53.866667 -5.550000  00:00.0     4   1  2357  663.00            k40
533933  53.866667 -5.233333  00:00.0     4   3  2357    0.70          cs134
533934  53.866667 -5.233333  00:00.0     4   1  2357   78.90          cs137
533935  53.866667 -5.233333  00:00.0     4   1  2357  457.00            k40
...           ...       ...      ...   ...  ..   ...     ...            ...
810696  54.100833 -6.200278  00:00.0     4   1  9999    2.50          cs134
810697  54.100833 -6.200278  00:00.0     4   1  9999  173.00          cs137
810698  54.100833 -6.200278  00:00.0     4   1  9999    2.13          pu238
810699  54.100833 -6.200278  00:00.0     4   1  9999    5.77          am241
810700  54.100833 -6.200278  00:00.0     4   1  9999   11.99  pu239_240_tot

[153 rows x 8 columns]


### Reshape: long to wide

In [None]:
#| export
class ReshapeLongToWide(Callback):
    "Convert data from long to wide with renamed columns."
    # NB: the class docstring is what appears in `tfm.logs`; reword it with care.
    def __init__(self, value_col='nuclide'):
        fc.store_attr()
        # Names of all the 'suffixes' variables declared in the CDL config
        self.derived_cols = [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]

    def renamed_cols(self, cols):
        "Flatten `(outer, inner)` column pairs: `inner_outer` when `inner` is non-empty, else `outer`."
        return [f'{inner}_{outer}' if inner else outer for outer, inner in cols]

    def pivot(self, df):
        "Pivot the derived columns present in `df` by `value_col`, keyed on all remaining columns."
        # Among all possible derived columns, keep the ones present in `df`
        derived_coi = [col for col in self.derived_cols if col in df.columns]
        excluded = {self.value_col, *derived_coi}
        # Follow the frame's own column order: iterating a set of strings gives an order
        # that can change from one Python process to the next, and with it the row
        # sorting and the leading columns of the result.
        # NOTE(review): every remaining column is used as an index key, including `value`
        # unless it is listed among the config suffixes - confirm this is intended.
        idx = [col for col in df.columns if col not in excluded]
        return df.pivot_table(index=idx,
                              columns=self.value_col,
                              values=derived_coi).reset_index()

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k] = self.pivot(tfm.dfs[k])
            # Flatten the two-level column labels produced by the pivot
            tfm.dfs[k].columns = self.renamed_cols(tfm.dfs[k].columns)
            tfm.dfs[k].index.name = 'sample'

In [None]:
# Add the long-to-wide reshaping step and show the 'sediment' group
dfs = load_data(df, 52)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ReshapeLongToWide()
    ])

print(tfm()['sediment'])

        area  value     time        lat       lon  am241_dl  cs134_dl  \
sample                                                                  
0       2357    0.1  00:00.0  54.083056 -6.215000       NaN       1.0   
1       2357    0.2  00:00.0  53.333333 -6.000000       NaN       3.0   
2       2357    0.2  00:00.0  53.333333 -5.366667       NaN       3.0   
3       2357    0.2  00:00.0  53.416667 -6.016667       NaN       3.0   
4       2357    0.2  00:00.0  53.600000 -5.933333       NaN       3.0   
...      ...    ...      ...        ...       ...       ...       ...   
145     9999  346.0  00:00.0  53.283889 -9.048889       NaN       NaN   
146     9999  346.0  00:00.0  54.634722 -8.454722       NaN       NaN   
147     9999  412.0  00:00.0  53.283889 -9.048889       NaN       NaN   
148     9999  491.0  00:00.0  54.042778 -6.186944       NaN       NaN   
149     9999  493.0  00:00.0  54.042778 -6.186944       NaN       NaN   

        cs137_dl  k40_dl  pu238_dl  pu239_240_tot_

### Encode time (seconds since ...)

In [None]:
#| export
class EncodeTimeCB(Callback):
    "Encode time as `int` representing seconds since xxx (TBD)."  
    # NB: the class docstring is what appears in `tfm.logs`; reword it with care.
    def __call__(self, tfm):
        # Placeholder: every group gets the constant 9999 as `time`; no real encoding yet
        for grp in tfm.dfs.values():
            grp.time = 9999

In [None]:
# Add the (placeholder) time encoding step and show the 'sediment' group
dfs = load_data(df, 52)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ReshapeLongToWide(),
    EncodeTimeCB()
    ])

print(tfm()['sediment'])

        area  value  time        lat       lon  am241_dl  cs134_dl  cs137_dl  \
sample                                                                         
0       2357    0.1  9999  54.083056 -6.215000       NaN       1.0       NaN   
1       2357    0.2  9999  53.333333 -6.000000       NaN       3.0       NaN   
2       2357    0.2  9999  53.333333 -5.366667       NaN       3.0       NaN   
3       2357    0.2  9999  53.416667 -6.016667       NaN       3.0       NaN   
4       2357    0.2  9999  53.600000 -5.933333       NaN       3.0       NaN   
...      ...    ...   ...        ...       ...       ...       ...       ...   
145     9999  346.0  9999  53.283889 -9.048889       NaN       NaN       NaN   
146     9999  346.0  9999  54.634722 -8.454722       NaN       NaN       NaN   
147     9999  412.0  9999  53.283889 -9.048889       NaN       NaN       NaN   
148     9999  491.0  9999  54.042778 -6.186944       NaN       NaN       NaN   
149     9999  493.0  9999  54.042778 -6.

### Sanitize coordinates

In [None]:
# Add the coordinate sanitizing step (imported from `marisco.callbacks`) and show the 'sediment' group
dfs = load_data(df, 52)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ReshapeLongToWide(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

print(tfm()['sediment'])

        area  value  time        lat       lon  am241_dl  cs134_dl  cs137_dl  \
sample                                                                         
0       2357    0.1  9999  54.083056 -6.215000       NaN       1.0       NaN   
1       2357    0.2  9999  53.333333 -6.000000       NaN       3.0       NaN   
2       2357    0.2  9999  53.333333 -5.366667       NaN       3.0       NaN   
3       2357    0.2  9999  53.416667 -6.016667       NaN       3.0       NaN   
4       2357    0.2  9999  53.600000 -5.933333       NaN       3.0       NaN   
...      ...    ...   ...        ...       ...       ...       ...       ...   
145     9999  346.0  9999  53.283889 -9.048889       NaN       NaN       NaN   
146     9999  346.0  9999  54.634722 -8.454722       NaN       NaN       NaN   
147     9999  412.0  9999  53.283889 -9.048889       NaN       NaN       NaN   
148     9999  491.0  9999  54.042778 -6.186944       NaN       NaN       NaN   
149     9999  493.0  9999  54.042778 -6.

## Encode to NetCDF

In [None]:
# Run the full callback chain; `tfm` and `dfs_tfm` are kept for the cells that follow
dfs = load_data(df, 52)
tfm = Transformer(dfs, cbs=[
    RemapRdnNameCB(),
    RenameColumnCB(),
    DropNAColumnsCB(),
    SanitizeDetectionLimitCB(),
    ReshapeLongToWide(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

dfs_tfm = tfm()

In [None]:
# Show what the transformer logged while running its callbacks
tfm.logs

['Remap to MARIS radionuclide names.',
 'Renaming variables to MARIS standard names.',
 "Drop variable containing only NaN or 'Not available' (id=0 in MARIS lookup tables).",
 'Assign Detection Limit name to its id based on MARIS nomenclature.',
 'Convert data from long to wide with renamed columns.',
 'Encode time as `int` representing seconds since xxx (TBD).',
 'Drop row when both longitude & latitude equal 0.']

In [None]:
#| export
# Keywords written to the `keywords` global attribute (joined with ', ' by `get_attrs`).
# One keyword per list item; hierarchy levels are separated by ' > '.
kw = ['oceanography',
      'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers',
      'Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry',
      'Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| export
def get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw):
    "Run `GlobAttrsFeeder` over `tfm.dfs` and return the global attributes it produces."
    # `zotero_key` is accepted but unused while the Zotero callback stays commented out
    feeder_cbs = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(cfg()),
        # ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=feeder_cbs)
    return feeder()

In [None]:
# Build and display the global attributes for the pipeline run above
get_attrs(tfm, zotero_key=get_zotero_key(df, 52), kw=kw)

{'geospatial_lat_min': '-5.233333333',
 'geospatial_lat_max': '55.12111111',
 'geospatial_lon_min': '-9.910277778',
 'geospatial_lon_max': '51.65138889',
 'geospatial_bounds': 'POLYGON ((-9.910277778 51.65138889, -5.233333333 51.65138889, -5.233333333 55.12111111, -9.910277778 55.12111111, -9.910277778 51.65138889))',
 'time_coverage_start': '1970-01-01T02:46:39',
 'time_coverage_end': '1970-01-01T02:46:39',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments, Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes, Earth Science > Oceans > Water Quality > Ocean Contaminants, Earth Science > Biological Classification > Animals/Vertebrates > Fish, Earth Science > Biosphere > Ecosystems > Marine Ecosystems, Earth Science > Biological Classifi