In [None]:
#| default_exp handlers.geotraces

# Geotraces
> Data pipeline (handler) to convert the BODC Geotraces seawater dataset to the MARIS NetCDF format.

Questions:

- salinity

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
from tqdm import tqdm
from pathlib import Path
import fastcore.all as fc
import pandas as pd
import numpy as np
import re

from marisco.callbacks import (Callback, Transformer, SanitizeLonLatCB, EncodeTimeCB)
from marisco.metadata import (GlobAttrsFeeder, BboxCB,
                              DepthRangeCB, TimeRangeCB,
                              ZoteroCB, KeyValuePairCB)
from marisco.configs import lut_path, cdl_cfg, cfg, nc_tpl_path, Enums
from marisco.serializers import NetCDFEncoder

In [None]:
# Let pandas display up to 100 rows before truncating the output.
pd.set_option('display.max_rows', 100)

In [None]:
# Blanket filter: hides every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# | export
# Relative path to the GEOTRACES IDP2021 v2 discrete-sample seawater CSV read by the cells below.
fname_in = '../../_data/geotraces/GEOTRACES_IDP2021_v2/seawater/ascii/GEOTRACES_IDP2021_Seawater_Discrete_Sample_Data_v2.csv'
# Output directory (not referenced by any other code visible in this notebook).
dir_dest = '../../_data/output/'

## Load data

In [None]:
#|eval: false
# Load the raw Geotraces CSV and preview its first rows.
df = pd.read_csv(fname_in)
df.head()

Unnamed: 0,Cruise,Station:METAVAR:INDEXED_TEXT,Type,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],Operator's Cruise Name:METAVAR:INDEXED_TEXT,Ship Name:METAVAR:INDEXED_TEXT,Period:METAVAR:INDEXED_TEXT,...,QV:SEADATANET.581,Co_CELL_CONC_BOTTLE [amol/cell],QV:SEADATANET.582,Ni_CELL_CONC_BOTTLE [amol/cell],QV:SEADATANET.583,Cu_CELL_CONC_BOTTLE [amol/cell],QV:SEADATANET.584,Zn_CELL_CONC_BOTTLE [amol/cell],QV:SEADATANET.585,QV:ODV:SAMPLE
0,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1
1,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1
2,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1
3,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1
4,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1


## Data transformation pipeline

### Select columns of interest

In [None]:
# I_129, U_236_238, Th_232, Ac_227

In [None]:
#| export
# Metadata columns (time, position, bottom and sample depth) kept alongside every measurement.
common_coi = ['yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',
              'Latitude [degrees_north]', 'Bot. Depth [m]', 'DEPTH [m]']

# Regular expressions selecting the nuclide columns of interest by the start of their name.
# Every pattern is explicitly anchored with `^` so that the selection does not depend on
# the caller using `re.match` rather than `re.search`.
nuclides_pattern = ['^TRITI', '^Th_228', '^Th_230', '^Th_234', '^Pa_231',
                    '^U_236_[DT]', '^Be_', '^Cs_137', '^Pb_210', '^Po_210',
                    '^Ra_22[3468]', '^Np_237', '^Pu_239_[D]', '^Pu_240', '^Pu_239_Pu_240']

class SelectColsOfInterestCB(Callback):
    "Select columns of interest."
    def __init__(self, common_coi, nuclides_pattern): fc.store_attr()

    def __call__(self, tfm):
        "Keep `common_coi` plus every column whose name starts with one of `nuclides_pattern`."
        def is_nuclide(col):
            # A single matching pattern is enough to retain the column.
            for pattern in self.nuclides_pattern:
                if re.match(pattern, col): return True
            return False

        selected = list(filter(is_nuclide, tfm.dfs.columns))
        tfm.dfs = tfm.dfs[self.common_coi + selected]

In [None]:
#|eval: false
# Reload the raw data and set up a transformer with the column-selection step only.
df = pd.read_csv(fname_in)
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern)
])

In [None]:
#|eval: false
# Run the transformer defined above and preview the selected columns.
df_test = tfm()
df_test.head()

Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],TRITIUM_D_CONC_BOTTLE [TU],Cs_137_D_CONC_BOTTLE [uBq/kg],Np_237_D_CONC_BOTTLE [uBq/kg],Pu_239_D_CONC_BOTTLE [uBq/kg],Pu_239_Pu_240_D_CONC_BOTTLE [uBq/kg],...,Pa_231_LPT_CONC_PUMP [uBq/kg],Th_228_SPT_CONC_PUMP [uBq/kg],Th_228_LPT_CONC_PUMP [uBq/kg],Th_230_TP_CONC_PUMP [uBq/kg],Th_230_SPT_CONC_PUMP [uBq/kg],Th_230_LPT_CONC_PUMP [uBq/kg],Th_234_SPT_CONC_PUMP [mBq/kg],Th_234_LPT_CONC_PUMP [mBq/kg],Po_210_TP_CONC_UWAY [mBq/kg],Pb_210_TP_CONC_UWAY [mBq/kg]
0,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.1,,,,,,...,,,,,,,,,,
1,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.2,,,,,,...,,,,,,,,,,
2,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.2,,,,,,...,,,,,,,,,,
3,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.2,,,,,,...,,,,,,,,,,
4,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.2,,,,,,...,,,,,,,,,,


### Reshape: wide to long

We reshape the data from wide to long format so that the information embedded in Geotraces nuclide column names (sampling method, filtering status, unit) can be extracted.

In [None]:
#|export
class WideToLongCB(Callback):
    """
    Get Geotraces nuclide names as values not column names 
    to extract contained information (unit, sampling method, ...).
    """
    def __init__(self, common_coi, nuclides_pattern, 
                 var_name='nuclide', value_name='value'): 
        fc.store_attr()
        
    def __call__(self, tfm):
        "Melt nuclide columns into (`var_name`, `value_name`) rows and drop rows without a value."
        nuc_of_interest = [c for c in tfm.dfs.columns if 
                           any(re.match(pattern, c) for pattern in self.nuclides_pattern)]
        melted = pd.melt(tfm.dfs, id_vars=self.common_coi, value_vars=nuc_of_interest,
                         var_name=self.var_name, value_name=self.value_name)
        # Drop on the configured value column: the previous hard-coded 'value' raised a
        # KeyError as soon as a non-default `value_name` was passed.
        # The melted index is kept as-is (it is reused later as the sample identifier).
        tfm.dfs = melted.dropna(subset=[self.value_name])

In [None]:
#|eval: false
# Run the pipeline up to the wide-to-long reshape and check the resulting shape.
df = pd.read_csv(fname_in)
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern)
])

df_test = tfm()
df_test.shape

(22882, 7)

### Extract

#### Unit

In [None]:
#|export
class ExtractUnitCB(Callback):
    """
    Extract units from nuclide names.
    """
    def __init__(self, var_name='nuclide'): 
        fc.store_attr()
        # Name of the unit column as declared under `vars.suffixes.unit` in the CDL config.
        self.unit_col_name = cdl_cfg()['vars']['suffixes']['unit']['name']

    def extract_unit(self, s):
        "Return the text inside the first `[...]` of `s` (e.g. 'TU'), or `None` if there is none."
        found = re.search(r'\[(.*?)\]', s)
        if found is None: return None
        return found.group(1)

    def __call__(self, tfm):
        "Store the unit parsed from each `var_name` entry in the unit column."
        names = tfm.dfs[self.var_name]
        tfm.dfs[self.unit_col_name] = names.apply(self.extract_unit)

In [None]:
#|eval: false
# Run the pipeline up to unit extraction and preview the new unit column.
df = pd.read_csv(fname_in)
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB()
])

df_test = tfm()
df_test.head()

Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],nuclide,value,_unit
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,TRITIUM_D_CONC_BOTTLE [TU],0.733,TU
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,TRITIUM_D_CONC_BOTTLE [TU],0.696,TU
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,TRITIUM_D_CONC_BOTTLE [TU],0.718,TU
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,TRITIUM_D_CONC_BOTTLE [TU],0.709,TU
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,TRITIUM_D_CONC_BOTTLE [TU],0.692,TU


#### Filtering status

In [None]:
#|export
# Phase token found in Geotraces column names (e.g. `TP` in `Th_230_TP_CONC_PUMP`) mapped to
# the filtering-status code (`filt`) and the output group (`group`) of the measurement.
# NOTE(review): the meaning of filt codes 1 and 2 is not visible here — confirm against the MARIS lookup table.
phase = dict(
    D=dict(filt=1, group='seawater'),
    T=dict(filt=2, group='seawater'),
    TP=dict(filt=1, group='suspended-matter'),
    LPT=dict(filt=1, group='suspended-matter'),
    SPT=dict(filt=1, group='suspended-matter'),
)

In [None]:
#|export
class ExtractFilteringStatusCB(Callback):
    """
    Extract filtering status from nuclide names.
    """
    def __init__(self, phase, var_name='nuclide'): 
        fc.store_attr()
        # Name of the filtering-status column as declared under `vars.suffixes.filtered` in the CDL config.
        self.filt_col_name = cdl_cfg()['vars']['suffixes']['filtered']['name']

    def match(self, s):
        "Search `s` for one of the `phase` keys enclosed by underscores (e.g. `_TP_`)."
        tokens = '|'.join(self.phase.keys())
        return re.search(r'_(' + tokens + ')_', s)

    def extract_filt_status(self, s):
        "Return the `filt` code of the phase token found in `s`, or `None` when no token is found."
        found = self.match(s)
        if not found: return None
        return self.phase[found.group(1)]['filt']

    def extract_group(self, s):
        "Return the `group` of the phase token found in `s`, or `None` when no token is found."
        found = self.match(s)
        if not found: return None
        return self.phase[found.group(1)]['group']

    def __call__(self, tfm):
        "Add the filtering-status column and a `group` column, both derived from `var_name`."
        filt_status = tfm.dfs[self.var_name].apply(self.extract_filt_status)
        tfm.dfs[self.filt_col_name] = filt_status
        group = tfm.dfs[self.var_name].apply(self.extract_group)
        tfm.dfs['group'] = group

In [None]:
#|eval: false
# Run the pipeline up to filtering-status extraction and preview the `_filt` and `group` columns.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase)
])

df_test = tfm()
df_test.head()

Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],nuclide,value,_unit,_filt,group
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,TRITIUM_D_CONC_BOTTLE [TU],0.733,TU,1,seawater
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,TRITIUM_D_CONC_BOTTLE [TU],0.696,TU,1,seawater
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,TRITIUM_D_CONC_BOTTLE [TU],0.718,TU,1,seawater
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,TRITIUM_D_CONC_BOTTLE [TU],0.709,TU,1,seawater
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,TRITIUM_D_CONC_BOTTLE [TU],0.692,TU,1,seawater


#### Sampling method

In [None]:
#|export
# Sampling-device token ending a Geotraces column name (e.g. `..._CONC_BOTTLE [TU]`)
# mapped to a sampling method id.
# To be validated
smp_method = dict(BOTTLE=1, FISH=18, PUMP=14, UWAY=24)

In [None]:
#|export
class ExtractSamplingMethodCB(Callback):
    """
    Extract sampling method from nuclide names.
    """
    def __init__(self, smp_method, var_name='nuclide'): 
        fc.store_attr()
        # Name of the sampling-method column as declared under `vars.suffixes.sampling_method` in the CDL config.
        self.smp_method_col_name = cdl_cfg()['vars']['suffixes']['sampling_method']['name']

    def extract_smp_method(self, s):
        "Return the id of the `smp_method` key found in `s` as `_<KEY> ` (trailing space), else `None`."
        devices = '|'.join(self.smp_method.keys())
        found = re.search(r'_(' + devices + ') ', s)
        if found is None: return None
        return self.smp_method[found.group(1)]

    def __call__(self, tfm):
        "Store the sampling method id parsed from each `var_name` entry."
        names = tfm.dfs[self.var_name]
        tfm.dfs[self.smp_method_col_name] = names.apply(self.extract_smp_method)

In [None]:
#|eval: false
# Run the pipeline up to sampling-method extraction and preview the `_sampmet` column.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method)
])

df_test = tfm()
df_test.head()

Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],nuclide,value,_unit,_filt,group,_sampmet
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,TRITIUM_D_CONC_BOTTLE [TU],0.733,TU,1,seawater,1
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,TRITIUM_D_CONC_BOTTLE [TU],0.696,TU,1,seawater,1
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,TRITIUM_D_CONC_BOTTLE [TU],0.718,TU,1,seawater,1
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,TRITIUM_D_CONC_BOTTLE [TU],0.709,TU,1,seawater,1
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,TRITIUM_D_CONC_BOTTLE [TU],0.692,TU,1,seawater,1


### Remap to MARIS nuclide names 

In [None]:
#|export
# Geotraces nuclide prefixes whose target name cannot be obtained by simply
# lower-casing the prefix and removing its underscores.
nuclides_name = {
    'TRITIUM': 'h3',
    'Pu_239_Pu_240': 'pu239_240_tot',
}

In [None]:
#|export
class RenameNuclideCB(Callback):
    """
    Remap nuclides name to MARIS standard.
    """
    def __init__(self, nuclides_name, var_name='nuclide'): 
        fc.store_attr()
        # Phase markers that terminate the nuclide part of a Geotraces column name.
        self.patterns = ['_D', '_T', '_TP', '_LPT', '_SPT']

    def extract_nuclide_name(self, s):
        "Return the shortest prefix of `s` followed by one of `self.patterns`, or `None` if none occurs."
        markers = '|'.join(self.patterns)
        found = re.search(r'(.*?)(' + markers + ')', s)
        return found.group(1) if found else None

    def standardize_name(self, s):
        """Return the `nuclides_name` entry for the extracted prefix, else the prefix
        lower-cased with underscores removed. Raises `AttributeError` when no prefix
        could be extracted (`None` has no `lower`)."""
        prefix = self.extract_nuclide_name(s)
        if prefix in self.nuclides_name: return self.nuclides_name[prefix]
        return prefix.lower().replace('_', '')

    def __call__(self, tfm):
        "Replace each `var_name` entry by its standardized nuclide name."
        names = tfm.dfs[self.var_name]
        tfm.dfs[self.var_name] = names.apply(self.standardize_name)

In [None]:
#|eval: false
# Run the pipeline up to nuclide renaming and preview the remapped `nuclide` column.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name)
])

df_test = tfm()
df_test.head()

Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],nuclide,value,_unit,_filt,group,_sampmet
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,h3,0.733,TU,1,seawater,1
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,h3,0.696,TU,1,seawater,1
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,h3,0.718,TU,1,seawater,1
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,h3,0.709,TU,1,seawater,1
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,h3,0.692,TU,1,seawater,1


In [None]:
#|eval: false
# List the distinct nuclide names produced by the renaming step.
df_test.nuclide.unique()

array(['h3', 'cs137', 'np237', 'pu239', 'pu239_240_tot', 'pu240', 'u236',
       'pa231', 'pb210', 'po210', 'ra224', 'ra226', 'ra228', 'th230',
       'th234', 'be7', 'ra223', 'th228'], dtype=object)

### Standardize unit

In [None]:
#|export
# Geotraces unit label -> unit `id` and multiplicative `factor` applied to the values.
# `uBq/kg` and `mBq/kg` share id 3 with factors 1e-6 and 1e-3
# (presumably id 3 is Bq/kg — confirm against the MARIS unit lookup table).
units_lut = {
    'TU': dict(id=7, factor=1),
    'uBq/kg': dict(id=3, factor=1e-6),
    'atoms/kg': dict(id=9, factor=1),
    'mBq/kg': dict(id=3, factor=1e-3),
}

In [None]:
#|export
class StandardizeUnitCB(Callback):
    """
    Remap unit to MARIS standard ones and apply conversion where needed.
    """
    def __init__(self, units_lut, var_name='value'): 
        fc.store_attr()
        # Name of the unit column as declared under `vars.suffixes.unit` in the CDL config.
        self.unit_col_name = cdl_cfg()['vars']['suffixes']['unit']['name']

    def __call__(self, tfm):
        "Rescale `var_name` by each row's unit factor, then replace the unit label by its id."
        factor_by_unit = {unit: spec['factor'] for unit, spec in self.units_lut.items()}
        id_by_unit = {unit: spec['id'] for unit, spec in self.units_lut.items()}

        # Values must be rescaled first: the lookup relies on the original unit labels.
        # Labels absent from `units_lut` map to NaN, for both the factor and the id.
        tfm.dfs[self.var_name] *= tfm.dfs[self.unit_col_name].map(factor_by_unit)
        tfm.dfs[self.unit_col_name] = tfm.dfs[self.unit_col_name].map(id_by_unit)
        

In [None]:
#|eval: false
# Run the pipeline up to unit standardization; print the first rows and the column index.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut)
])

df_test = tfm()
print(df_test.head())
print(df_test.columns)

     yyyy-mm-ddThh:mm:ss.sss  Longitude [degrees_east]  \
9223     2010-10-17T00:13:29                 350.33792   
9231     2010-10-17T00:13:29                 350.33792   
9237     2010-10-17T00:13:29                 350.33792   
9244     2010-10-17T00:13:29                 350.33792   
9256     2010-10-17T00:13:29                 350.33792   

      Latitude [degrees_north]  Bot. Depth [m]  DEPTH [m] nuclide  value  \
9223                   38.3271          2827.0       17.8      h3  0.733   
9231                   38.3271          2827.0       34.7      h3  0.696   
9237                   38.3271          2827.0       67.5      h3  0.718   
9244                   38.3271          2827.0       91.9      h3  0.709   
9256                   38.3271          2827.0      136.6      h3  0.692   

      _unit  _filt     group  _sampmet  
9223      7      1  seawater         1  
9231      7      1  seawater         1  
9237      7      1  seawater         1  
9244      7      1  seawater  

### Rename common columns

In [None]:
#| export
def renaming_rules():
    "Map Geotraces source column names to the names declared under `vars.defaults` in the CDL config."
    defaults = cdl_cfg()['vars']['defaults']
    # (source column, key of the corresponding default variable in the config)
    src_to_key = (
        ('yyyy-mm-ddThh:mm:ss.sss', 'time'),
        ('Longitude [degrees_east]', 'lon'),
        ('Latitude [degrees_north]', 'lat'),
        ('DEPTH [m]', 'smp_depth'),
        ('Bot. Depth [m]', 'tot_depth'),
    )
    return {src: defaults[key]['name'] for src, key in src_to_key}

In [None]:
#| export
class RenameColumnCB(Callback):
    "Renaming variables to MARIS standard names."
    def __init__(self, renaming_rules=renaming_rules): fc.store_attr()

    def __call__(self, tfm):
        "Rename the columns listed by `renaming_rules()`; any other column keeps its name."
        lut = self.renaming_rules()
        tfm.dfs.columns = [lut.get(col, col) for col in tfm.dfs.columns]

In [None]:
#|eval: false
# Run the pipeline up to column renaming and preview the renamed columns.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules)
])

df_test = tfm()
df_test.head()

Unnamed: 0,time,lon,lat,tot_depth,smp_depth,nuclide,value,_unit,_filt,group,_sampmet
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,h3,0.733,7,1,seawater,1
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,h3,0.696,7,1,seawater,1
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,h3,0.718,7,1,seawater,1
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,h3,0.709,7,1,seawater,1
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,h3,0.692,7,1,seawater,1


### Unshift longitudes

In [None]:
#| export
class UnshiftLongitudeCB(Callback):
    "Longitudes are coded between 0 and 360 in Geotraces. We rescale it between -180 and 180 instead."
    def __init__(self): 
        # Name of the longitude column as declared under `vars.defaults.lon` in the CDL config.
        self.lon_col_name = cdl_cfg()['vars']['defaults']['lon']['name']
    
    def __call__(self, tfm):
        "Wrap longitudes from [0, 360] degrees east to [-180, 180]."
        lon = tfm.dfs[self.lon_col_name]
        # Wrap, do not shift: subtracting 180 from every value moved each station to the
        # opposite side of the globe (350.34 became 170.34 instead of -9.66).
        # Values up to 180 are already valid and left untouched; larger ones lose a full turn.
        tfm.dfs[self.lon_col_name] = lon.where(lon <= 180, lon - 360)

In [None]:
#|eval: false
# Run the pipeline up to the longitude conversion and preview the `lon` column.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB()
])

df_test = tfm()
df_test.head()

Unnamed: 0,time,lon,lat,tot_depth,smp_depth,nuclide,value,_unit,_filt,group,_sampmet
9223,2010-10-17T00:13:29,170.33792,38.3271,2827.0,17.8,h3,0.733,7,1,seawater,1
9231,2010-10-17T00:13:29,170.33792,38.3271,2827.0,34.7,h3,0.696,7,1,seawater,1
9237,2010-10-17T00:13:29,170.33792,38.3271,2827.0,67.5,h3,0.718,7,1,seawater,1
9244,2010-10-17T00:13:29,170.33792,38.3271,2827.0,91.9,h3,0.709,7,1,seawater,1
9256,2010-10-17T00:13:29,170.33792,38.3271,2827.0,136.6,h3,0.692,7,1,seawater,1


### Dispatch to groups

In [None]:
#| export
class DispatchToGroupCB(Callback):
    "Convert to a dictionary of dataframe with sample type (seawater,...) as keys."
    def __init__(self, group_name='group'): 
        fc.store_attr()

    def __call__(self, tfm):
        "Split `tfm.dfs` by `group_name` into one frame per group, without the grouping column."
        tfm.dfs = {name: grp.drop(self.group_name, axis=1)
                   for name, grp in tfm.dfs.groupby(self.group_name)}
        

In [None]:
#|eval: false
# Run the pipeline up to the group dispatch; the result is a dict of dataframes keyed by group.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB()
])

tfm()

{'seawater':                         time        lon      lat  tot_depth  smp_depth  \
 9223     2010-10-17T00:13:29  170.33792  38.3271     2827.0       17.8   
 9231     2010-10-17T00:13:29  170.33792  38.3271     2827.0       34.7   
 9237     2010-10-17T00:13:29  170.33792  38.3271     2827.0       67.5   
 9244     2010-10-17T00:13:29  170.33792  38.3271     2827.0       91.9   
 9256     2010-10-17T00:13:29  170.33792  38.3271     2827.0      136.6   
 ...                      ...        ...      ...        ...        ...   
 5330519  2015-09-04T09:15:18    3.25999  88.4058     3960.0        5.0   
 5330522  2015-09-04T09:15:18    3.25999  88.4058     3960.0       20.0   
 5330699  2015-09-07T14:20:39  -90.74920  89.9809     4229.0        0.5   
 5330702  2015-09-07T14:20:39  -90.74920  89.9809     4229.0        1.5   
 5330705  2015-09-07T14:20:39  -90.74920  89.9809     4229.0        5.0   
 
         nuclide   value  _unit  _filt  _sampmet  
 9223         h3  0.7330      7    

### Reshape: long to wide

In [None]:
#| export
class ReshapeLongToWide(Callback):
    "Convert data from long to wide with renamed columns."
    def __init__(self, columns='nuclide', values=['value']):
        fc.store_attr()
        # Retrieve all possible derived vars (e.g 'unc', 'dl', ...) from configs
        self.derived_cols = [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]
    
    def renamed_cols(self, cols):
        "Flatten the `(outer, inner)` column pairs produced by `pivot` into plain names."
        flat = []
        for outer, inner in cols:
            if outer in self.values:
                # Measurement itself: named after the pivoted entry (e.g. 'h3').
                # Uses the configured `values` rather than a hard-coded 'value'.
                flat.append(inner)
            elif inner:
                # Derived variable: pivoted entry followed by its suffix (e.g. 'h3_unit').
                flat.append(f'{inner}{outer}')
            else:
                # Index column: no inner level, keep its own name.
                flat.append(outer)
        return flat

    def pivot(self, df):
        "Pivot `df` so that each entry of `self.columns` gets its own value and derived columns."
        # Among all possible 'derived cols' select the ones present in df
        derived_coi = [col for col in self.derived_cols if col in df.columns]

        # Expose the row index as a 'sample' column. Not done in place, so the frame
        # passed by the caller is left untouched and the step can be re-run on it.
        df = df.reset_index(names='sample')

        # Index columns in their original order: going through a `set` made the order of
        # the output columns (and the row sort order of the pivot) vary between runs.
        pivoted = set([self.columns] + derived_coi + self.values)
        idx = [col for col in df.columns if col not in pivoted]
        # NOTE(review): pivot_table may drop rows having NaN in one of the index columns — confirm.
        return df.pivot_table(index=idx,
                              columns=self.columns,
                              values=self.values + derived_coi,
                              fill_value=np.nan,
                              aggfunc=lambda x: x
                              ).reset_index()

    def __call__(self, tfm):
        "Pivot every group of `tfm.dfs` and flatten the resulting column names."
        for k in tfm.dfs.keys():
            tfm.dfs[k] = self.pivot(tfm.dfs[k])
            tfm.dfs[k].columns = self.renamed_cols(tfm.dfs[k].columns)

In [None]:
#|eval: false
# Run the pipeline up to the long-to-wide reshape and inspect the 'seawater' group.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide()
])

dfs_test = tfm()['seawater']
print('shape: ', dfs_test.shape)
print('columns: ', dfs_test.columns)
dfs_test.head()

shape:  (16459, 78)
columns:  Index(['lat', 'lon', 'time', 'sample', 'tot_depth', 'smp_depth', 'be7_filt',
       'cs137_filt', 'h3_filt', 'np237_filt', 'pa231_filt', 'pb210_filt',
       'po210_filt', 'pu239_filt', 'pu239_240_tot_filt', 'pu240_filt',
       'ra223_filt', 'ra224_filt', 'ra226_filt', 'ra228_filt', 'th228_filt',
       'th230_filt', 'th234_filt', 'u236_filt', 'be7_sampmet', 'cs137_sampmet',
       'h3_sampmet', 'np237_sampmet', 'pa231_sampmet', 'pb210_sampmet',
       'po210_sampmet', 'pu239_sampmet', 'pu239_240_tot_sampmet',
       'pu240_sampmet', 'ra223_sampmet', 'ra224_sampmet', 'ra226_sampmet',
       'ra228_sampmet', 'th228_sampmet', 'th230_sampmet', 'th234_sampmet',
       'u236_sampmet', 'be7_unit', 'cs137_unit', 'h3_unit', 'np237_unit',
       'pa231_unit', 'pb210_unit', 'po210_unit', 'pu239_unit',
       'pu239_240_tot_unit', 'pu240_unit', 'ra223_unit', 'ra224_unit',
       'ra226_unit', 'ra228_unit', 'th228_unit', 'th230_unit', 'th234_unit',
       'u236_unit'

Unnamed: 0,lat,lon,time,sample,tot_depth,smp_depth,be7_filt,cs137_filt,h3_filt,np237_filt,...,pu239_240_tot,pu240,ra223,ra224,ra226,ra228,th228,th230,th234,u236
0,-70.5744,171.8772,2008-03-05T13:57:45,1201316,136.0,9.7,,,,,...,,,,,,,,,,
1,-70.5744,171.8772,2008-03-05T13:57:45,1201330,136.0,135.6,,,,,...,,,,,,,,,,
2,-70.5744,171.8772,2008-03-05T13:57:45,2044652,136.0,9.7,,,,,...,,,,,,,,2e-06,,
3,-70.5744,171.8772,2008-03-05T13:57:45,2044666,136.0,135.6,,,,,...,,,,,,,,3e-06,,
4,-70.5744,171.8772,2008-03-05T13:57:45,3942156,136.0,7.0,,,,,...,,,,,,,,,,


### Parse time

In [None]:
#| export
class ParseTimeCB(Callback):
    "Parse ISO 8601 time strings into datetimes."
    def __init__(self):
        # Same config entry that `renaming_rules` uses to name the time column.
        self.time_col_name = cdl_cfg()['vars']['defaults']['time']['name']

    def __call__(self, tfm):
        "Convert the time column of every group in `tfm.dfs`."
        for k in tfm.dfs.keys():
            tfm.dfs[k][self.time_col_name] = pd.to_datetime(tfm.dfs[k][self.time_col_name],
                                                            format='ISO8601')

In [None]:
#|eval: false
# Run the pipeline up to time parsing and check the dtype of the `time` column.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide(),
    ParseTimeCB()
])

print('time data type: ', tfm()['seawater'].time.dtype)

time data type:  datetime64[ns]


### Encode time (seconds since ...)

In [None]:
#|eval: false
# Run the pipeline up to time encoding and preview the 'seawater' group.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide(),
    ParseTimeCB(),
    EncodeTimeCB(cfg())
])

dfs_test = tfm()['seawater']
dfs_test.head()

Unnamed: 0,lat,lon,time,sample,tot_depth,smp_depth,be7_filt,cs137_filt,h3_filt,np237_filt,...,pu239_240_tot,pu240,ra223,ra224,ra226,ra228,th228,th230,th234,u236
0,-70.5744,171.8772,1204725465,1201316,136.0,9.7,,,,,...,,,,,,,,,,
1,-70.5744,171.8772,1204725465,1201330,136.0,135.6,,,,,...,,,,,,,,,,
2,-70.5744,171.8772,1204725465,2044652,136.0,9.7,,,,,...,,,,,,,,2e-06,,
3,-70.5744,171.8772,1204725465,2044666,136.0,135.6,,,,,...,,,,,,,,3e-06,,
4,-70.5744,171.8772,1204725465,3942156,136.0,7.0,,,,,...,,,,,,,,,,


### Sanitize coordinates

In [None]:
#|eval: false
# Run the full pipeline, including coordinate sanitization, and preview the 'seawater' group.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide(),
    ParseTimeCB(),
    EncodeTimeCB(cfg()),
    SanitizeLonLatCB()
])
dfs_test = tfm()['seawater']
dfs_test.head()

Unnamed: 0,lat,lon,time,sample,tot_depth,smp_depth,be7_filt,cs137_filt,h3_filt,np237_filt,...,pu239_240_tot,pu240,ra223,ra224,ra226,ra228,th228,th230,th234,u236
0,-70.5744,171.8772,1204725465,1201316,136.0,9.7,,,,,...,,,,,,,,,,
1,-70.5744,171.8772,1204725465,1201330,136.0,135.6,,,,,...,,,,,,,,,,
2,-70.5744,171.8772,1204725465,2044652,136.0,9.7,,,,,...,,,,,,,,2e-06,,
3,-70.5744,171.8772,1204725465,2044666,136.0,135.6,,,,,...,,,,,,,,3e-06,,
4,-70.5744,171.8772,1204725465,3942156,136.0,7.0,,,,,...,,,,,,,,,,


## Encode to NetCDF

In [None]:
#|eval: false
# Run the full pipeline once more; `tfm` is reused by the `tfm.logs` and `get_attrs` cells below.
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide(),
    ParseTimeCB(),
    EncodeTimeCB(cfg()),
    SanitizeLonLatCB()
])

# Trailing `;` suppresses the display of the returned dataframes.
tfm();

In [None]:
#|eval: false
# Show the descriptions of the processing steps recorded by the transformer.
tfm.logs

['Select columns of interest.',
 '\n    Get Geotraces nuclide names as values not column names \n    to extract contained information (unit, sampling method, ...).\n    ',
 '\n    Extract units from nuclide names.\n    ',
 '\n    Extract filtering status from nuclide names.\n    ',
 '\n    Extract sampling method from nuclide names.\n    ',
 '\n    Remap nuclides name to MARIS standard.\n    ',
 '\n    Remap unit to MARIS standard ones and apply conversion where needed.\n    ',
 'Renaming variables to MARIS standard names.',
 'Longitudes are coded between 0 and 360 in Geotraces. We rescale it between -180 and 180 instead.',
 'Convert to a dictionary of dataframe with sample type (seawater,...) as keys.',
 'Convert data from long to wide with renamed columns.',
 'Encode time as `int` representing seconds since xxx',
 'Drop row when both longitude & latitude equal 0. Drop unrealistic longitude & latitude values. Convert longitude & latitude `,` separator to `.` separator.']

In [None]:
#| export
# Keywords written to the `keywords` global attribute (joined with ', ' by `get_attrs`).
# One keyword per list item; every hierarchy level is separated by ' > '.
# NOTE(review): several entries (fish, mollusks, crustaceans, seaweeds) look unrelated to
# seawater measurements — confirm they are wanted for this dataset.
kw = ['oceanography',
      'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers',
      'Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry',
      'Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| export
def get_attrs(tfm, zotero_key, kw=kw):
    "Run the global-attribute callbacks on `tfm.dfs` and return what `GlobAttrsFeeder` produces."
    feeders = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(cfg()),
        ZoteroCB(zotero_key, cfg=cfg()),
        # Keywords and processing logs are each flattened into one comma-separated string.
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    return GlobAttrsFeeder(tfm.dfs, cbs=feeders)()

In [None]:
#|eval: false
# Build the global attributes for the transformed data.
# NOTE(review): the recorded output below (title about the Irish marine environment,
# latitude bounds 30-65) does not look like Geotraces — confirm the Zotero key and re-run this cell.
get_attrs(tfm, zotero_key='3W354SQG', kw=kw)

{'geospatial_lat_min': '30.435833333333335',
 'geospatial_lat_max': '65.75',
 'geospatial_lon_min': '9.633333333333333',
 'geospatial_lon_max': '53.5',
 'geospatial_bounds': 'POLYGON ((9.633333333333333 53.5, 30.435833333333335 53.5, 30.435833333333335 65.75, 9.633333333333333 65.75, 9.633333333333333 53.5))',
 'time_coverage_start': '1984-01-10T00:00:00',
 'time_coverage_end': '2018-12-14T00:00:00',
 'title': 'Radioactivity Monitoring of the Irish Marine Environment 1991 and 1992',
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "A.", "lastName": "McGarry"}, {"creatorType": "author", "firstName": "S.", "lastName": "Lyons"}, {"creatorType": "author", "firstName": "C.", "lastName": "McEnri"}, {"creatorType": "author", "firstName": "T.", "lastName": "Ryan"}, {"creatorType": "author", "firstName": "M.", "lastName": "O\'Colmain"}, {"creatorType": "author", "firstName": "J.D.", "lastName": "Cunningham"}]',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean 

In [None]:
#| export
def enums_xtra(tfm, vars):
    "For each name in `vars`, keep only the `<name>_t` enum entries matching the values found in `tfm`."
    enums = Enums(lut_src_dir=lut_path(), cdl_enums=cdl_cfg()['enums'])
    subsets = {}
    for name in vars:
        present = tfm.unique(name)
        # Variables for which `.any()` is false get no enum entry.
        if not present.any(): continue
        enum_name = f'{name}_t'
        subsets[enum_name] = enums.filter(enum_name, present)
    return subsets

In [None]:
#| export
# TBD
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
    """Encode each reference of a dump to NetCDF (placeholder, marked TBD).

    NOTE(review): this body calls `load_dump`, `load_data`, `get_fname`, `get_zotero_key`,
    `RemapRdnNameCB`, `DropNAColumnsCB` and `SanitizeDetectionLimitCB`, none of which is
    defined or imported in the visible part of this notebook, and it does not use the
    Geotraces callbacks defined above — presumably carried over from another handler; confirm.

    Parameters:
        fname_in: passed to `load_dump`.
        fname_out: directory under which the file named by `get_fname(dfs)` is written.
        nc_tpl_path: passed as `src_fname` to `NetCDFEncoder` (shadows the imported `nc_tpl_path`).
        **kwargs: optional `ref_ids` (defaults to all unique `df.ref_id`) and `verbose` (default False).
    """
    df = load_dump(fname_in)
    ref_ids = kwargs.get('ref_ids', df.ref_id.unique())
    print('Encoding ...')
    for ref_id in tqdm(ref_ids, leave=False):
        dfs = load_data(df, ref_id)
        print(get_fname(dfs))
        tfm = Transformer(dfs, cbs=[
            RemapRdnNameCB(),
            RenameColumnCB(),
            DropNAColumnsCB(),
            SanitizeDetectionLimitCB(),
            ParseTimeCB(),
            ReshapeLongToWide(),
            EncodeTimeCB(cfg()),
            SanitizeLonLatCB(verbose=True)
            ])
       
        # Run the callbacks; the encoder below reads the result from `tfm.dfs`.
        tfm()
        encoder = NetCDFEncoder(tfm.dfs, 
                                src_fname=nc_tpl_path,
                                dest_fname=Path(fname_out) / get_fname(dfs), 
                                global_attrs=get_attrs(tfm, zotero_key=get_zotero_key(dfs), kw=kw),
                                verbose=kwargs.get('verbose', False),
                                enums_xtra=enums_xtra(tfm, vars=['species', 'body_part'])
                                )
        encoder.encode()