In [None]:
#| default_exp handlers.geotraces

# Geotraces
> Data pipeline (handler) to convert the BODC Geotraces seawater dataset to NetCDF format.

Ref: 
    
    - https://www.geotraces.org/geotraces-intermediate-data-product-2021/
    - zotero MARIS geotraces entry (2018 only?): https://www.zotero.org/groups/2432820/maris/items/97UIMEXN


BODC bottle id

temperature, oxygen, station (optional), samplabcode 

List of all accepted maris master db cols:
- sample_id
- sampquality
- ref_id
- osamcod
- samplabcode
- samptype_id
- lab_id
- station
- aqcs
- sedtrap
- aqcsyear
- aqcsmonth
- totdepth
- sampdepth
- volume
- salinity
- temperatur
- filtered
- acid
- oxygen
- samparea
- sliceup
- slicedown
- sieved
- organic
- oxic
- drytemp
- drywt
- wetwt
- percentwt
- filtpore
- sampnote
- begperiod
- endperiod
- sedtype_id
- drymet_id
- species_id
- bodypar_id
- sampmet_id
- prepmet_id
- gfe
- Commonname
- TaxonRepName
- SedRepName
- shiftedcoordinates
- shiftedLong
- shiftedLat
- longitude
- latitude

TODO & QUESTIONS:

- salinity, temperature, oxygen, station(?), samplabcode (BODC bottle ID?)
- How to handle this unit? U_236_238_T_RATIO_BOTTLE [per 10^12]

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
from tqdm import tqdm
from pathlib import Path
import fastcore.all as fc
import pandas as pd
import numpy as np
import re

from marisco.callbacks import (
    Callback, 
    Transformer, 
    SanitizeLonLatCB, 
    EncodeTimeCB,
    ReshapeLongToWide
)

from marisco.metadata import (
    GlobAttrsFeeder, 
    BboxCB,
    DepthRangeCB, 
    TimeRangeCB,
    ZoteroCB,
    KeyValuePairCB
)

from marisco.configs import (
    lut_path, 
    cdl_cfg, 
    cfg, 
    nc_tpl_path, 
    Enums
)

from marisco.serializers import NetCDFEncoder

In [None]:
pd.set_option('display.max_rows', 200)

In [None]:
import warnings
warnings.filterwarnings('ignore')

So again, the million-dollar question remains: what constitutes a single sample in the context of Geotraces?

## Input and output file names

In [None]:
# | exports
# Input: Geotraces IDP2021 v2 discrete seawater sample data (csv export).
# Output: NetCDF file written by the encoder.
# Both paths are relative, so they are resolved against the current working directory.
fname_in = '../../_data/geotraces/GEOTRACES_IDP2021_v2/seawater/ascii/GEOTRACES_IDP2021_Seawater_Discrete_Sample_Data_v2.csv'
fname_out = '../../_data/output/190-geotraces-2021.nc'

## Load data

In [None]:
#| exports
def load_data(fname):
    "Read the Geotraces csv export located at `fname` into a dataframe."
    # The previous lambda ignored `fname` and always read the global `fname_in`.
    return pd.read_csv(fname)

In [None]:
#| eval: false
df = load_data(fname_in)
print(f'df shape: {df.shape}')
df.head()

df shape: (105417, 1188)


Unnamed: 0,Cruise,Station:METAVAR:INDEXED_TEXT,Type,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],Operator's Cruise Name:METAVAR:INDEXED_TEXT,Ship Name:METAVAR:INDEXED_TEXT,Period:METAVAR:INDEXED_TEXT,...,QV:SEADATANET.581,Co_CELL_CONC_BOTTLE [amol/cell],QV:SEADATANET.582,Ni_CELL_CONC_BOTTLE [amol/cell],QV:SEADATANET.583,Cu_CELL_CONC_BOTTLE [amol/cell],QV:SEADATANET.584,Zn_CELL_CONC_BOTTLE [amol/cell],QV:SEADATANET.585,QV:ODV:SAMPLE
0,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1
1,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1
2,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1
3,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1
4,GA01,0,B,2014-05-17T22:29:00,349.29999,38.4329,4854.0,GEOVIDE,Pourquoi pas?,15/05/2014 - 30/06/2014,...,9,,9,,9,,9,,9,1


In [None]:
#| hide
#| eval: false
def find_print_col(s, cols, lower=True):
    "Print every name in `cols` (lower-cased unless `lower=False`) that contains the substring `s`."
    # Note: `s` itself is never lower-cased, only the candidate names are.
    candidates = (name.lower() if lower else name for name in cols)
    for name in candidates:
        if s in name:
            print(name)

find_print_col('sal', df.columns)
find_print_col('tmp', df.columns)
find_print_col('oxy', df.columns)
find_print_col('U_236_238', df.columns, lower=False)

ctdsal_d_conc_sensor [pss-78]
salinity_d_conc_bottle
salinity_d_conc_pump
salinity_d_conc_fish
salinity_d_conc_uway
salinity_d_conc_boat_pump
ctdtmp_t_value_sensor [deg c]
oxygen_d_conc_bottle [umol/kg]
ctdoxy_d_conc_sensor [umol/kg]
U_236_238_T_RATIO_BOTTLE [per 10^12]


## Data transformation pipeline

### Select columns of interest

In [None]:
#| exports
# Columns shared by every measurement (time, position, depths, bottle number).
# They are always selected and are used as identifier columns when melting to long format.
common_coi = ['yyyy-mm-ddThh:mm:ss.sss', 'Longitude [degrees_east]',
              'Latitude [degrees_north]', 'Bot. Depth [m]', 'DEPTH [m]', 'BODC Bottle Number:INTEGER']

# Regular expressions selecting the nuclide columns of interest.
# They are applied with `re.match`, which only matches at the start of the column name,
# so 'Np_237' (no leading '^') behaves exactly like the anchored patterns.
# '^U_236_[DT]' requires `D` or `T` right after 'U_236_', so the ratio column
# 'U_236_238_T_RATIO_BOTTLE [per 10^12]' is not selected.
nuclides_pattern = ['^TRITI', '^Th_228', '^Th_23[024]', '^Pa_231', 
                    '^U_236_[DT]', '^Be_', '^Cs_137', '^Pb_210', '^Po_210',
                    '^Ra_22[3468]', 'Np_237', '^Pu_239_[D]', '^Pu_240', '^Pu_239_Pu_240',
                    '^I_129', '^Ac_227']  

class SelectColsOfInterestCB(Callback):
    "Select columns of interest."
    def __init__(self, common_coi, nuclides_pattern): fc.store_attr()

    def _is_nuclide(self, col):
        "True when `col` starts with at least one of the nuclide patterns."
        for pattern in self.nuclides_pattern:
            if re.match(pattern, col):
                return True
        return False

    def __call__(self, tfm):
        # Shared columns come first, followed by the matching nuclide columns
        # in the order in which they appear in the dataframe.
        nuclide_cols = [col for col in tfm.df.columns if self._is_nuclide(col)]
        tfm.df = tfm.df[self.common_coi + nuclide_cols]

In [None]:
#| eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern)
])

In [None]:
#| eval: false
df_test = tfm()
print(f'df_test shape: {df_test.shape}')
df_test.head()

df_test shape: (105417, 86)


Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],BODC Bottle Number:INTEGER,TRITIUM_D_CONC_BOTTLE [TU],Cs_137_D_CONC_BOTTLE [uBq/kg],I_129_D_CONC_BOTTLE [atoms/kg],Np_237_D_CONC_BOTTLE [uBq/kg],...,Th_230_TP_CONC_PUMP [uBq/kg],Th_230_SPT_CONC_PUMP [uBq/kg],Th_230_LPT_CONC_PUMP [uBq/kg],Th_232_TP_CONC_PUMP [pmol/kg],Th_232_SPT_CONC_PUMP [pmol/kg],Th_232_LPT_CONC_PUMP [pmol/kg],Th_234_SPT_CONC_PUMP [mBq/kg],Th_234_LPT_CONC_PUMP [mBq/kg],Po_210_TP_CONC_UWAY [mBq/kg],Pb_210_TP_CONC_UWAY [mBq/kg]
0,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.1,1214048,,,,,...,,,,,,,,,,
1,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.2,1214039,,,,,...,,,,,,,,,,
2,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.2,1214027,,,,,...,,,,,,,,,,
3,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.2,1214018,,,,,...,,,,,,,,,,
4,2014-05-17T22:29:00,349.29999,38.4329,4854.0,2957.2,1214036,,,,,...,,,,,,,,,,


The `BODC Bottle Number:INTEGER` field uniquely identifies a sample, as shown below:

In [None]:
#| eval: false
cols_measurements = [col for col in df_test.columns if col not in common_coi]

unique_key = ['BODC Bottle Number:INTEGER']

df_test.dropna(subset=cols_measurements, how='all', inplace=True);
print(f'df_test shape after dropping rows with no measurements: {df_test.shape}')
print(f'df_test duplicated keys: {df_test[unique_key].duplicated().sum()}')

df_test[df_test[unique_key].duplicated(keep=False)].sort_values(by=unique_key)

df_test shape after dropping rows with no measurements: (9389, 86)
df_test duplicated keys: 0


Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],BODC Bottle Number:INTEGER,TRITIUM_D_CONC_BOTTLE [TU],Cs_137_D_CONC_BOTTLE [uBq/kg],I_129_D_CONC_BOTTLE [atoms/kg],Np_237_D_CONC_BOTTLE [uBq/kg],...,Th_230_TP_CONC_PUMP [uBq/kg],Th_230_SPT_CONC_PUMP [uBq/kg],Th_230_LPT_CONC_PUMP [uBq/kg],Th_232_TP_CONC_PUMP [pmol/kg],Th_232_SPT_CONC_PUMP [pmol/kg],Th_232_LPT_CONC_PUMP [pmol/kg],Th_234_SPT_CONC_PUMP [mBq/kg],Th_234_LPT_CONC_PUMP [mBq/kg],Po_210_TP_CONC_UWAY [mBq/kg],Pb_210_TP_CONC_UWAY [mBq/kg]


### Reshape: wide to long

We reshape the data to long format so that we can extract the information embedded in Geotraces nuclide names, such as sampling method, filtering status and unit.

In [None]:
#| exports
class WideToLongCB(Callback):
    """
    Get Geotraces nuclide names as values not column names 
    to extract contained information (unit, sampling method, ...).
    """
    def __init__(self, common_coi, nuclides_pattern, 
                 var_name='nuclide', value_name='value'): 
        fc.store_attr()
        
    def __call__(self, tfm):
        # Nuclide columns are the ones whose name starts with one of the patterns.
        nuc_of_interest = [c for c in tfm.df.columns if 
                           any(re.match(pattern, c) for pattern in self.nuclides_pattern)]
        melted = pd.melt(tfm.df, id_vars=self.common_coi, value_vars=nuc_of_interest, 
                         var_name=self.var_name, value_name=self.value_name)
        # Drop rows without a measurement, using the configured value column
        # (it was hard-coded to 'value', which fails for any other `value_name`).
        tfm.df = melted.dropna(subset=[self.value_name])

In [None]:
#| eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern)
])
df_test = tfm()
df_test.shape

(26745, 8)

### Extract

#### Unit

In [None]:
#| exports
class ExtractUnitCB(Callback):
    """
    Extract units from nuclide names.
    """
    def __init__(self, var_name='nuclide'): 
        fc.store_attr()
        self.unit_col_name = cdl_cfg()['vars']['suffixes']['unit']['name']

    def extract_unit(self, s):
        "Return the text enclosed by the first `[...]` found in `s`, or None when there is none."
        found = re.search(r'\[(.*?)\]', s)
        if found is None:
            return None
        return found.group(1)

    def __call__(self, tfm):
        # One unit per row, derived from the Geotraces variable name.
        names = tfm.df[self.var_name]
        tfm.df[self.unit_col_name] = names.apply(self.extract_unit)

In [None]:
#| eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB()
])

df_test = tfm()
df_test.head()

Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],BODC Bottle Number:INTEGER,nuclide,value,_unit
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,842525,TRITIUM_D_CONC_BOTTLE [TU],0.733,TU
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,842528,TRITIUM_D_CONC_BOTTLE [TU],0.696,TU
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,842531,TRITIUM_D_CONC_BOTTLE [TU],0.718,TU
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,842534,TRITIUM_D_CONC_BOTTLE [TU],0.709,TU
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,842540,TRITIUM_D_CONC_BOTTLE [TU],0.692,TU


#### Filtering status

In [None]:
#| exports
# Maps the phase code found between underscores in a Geotraces variable name
# (e.g. `D` in `Cs_137_D_CONC_BOTTLE [uBq/kg]`) to:
#   - 'filt': value written to the filtering-status column,
#   - 'group': value written to the `group` column, later used to dispatch rows.
# NOTE(review): presumably D = dissolved, T = total and TP/LPT/SPT = particulate fractions,
# with 'filt' holding MARIS "filtered" codes — confirm against the Geotraces/MARIS references.
phase = {
    'D': {'filt': 1, 'group': 'seawater'},
    'T': {'filt': 2, 'group': 'seawater'},
    'TP': {'filt': 1, 'group': 'suspended-matter'}, 
    'LPT': {'filt': 1, 'group': 'suspended-matter'},
    'SPT': {'filt': 1, 'group': 'suspended-matter'}}

In [None]:
#| exports
class ExtractFilteringStatusCB(Callback):
    "Extract filtering status from nuclide names."
    def __init__(self, phase, var_name='nuclide'): 
        fc.store_attr()
        self.filt_col_name = cdl_cfg()['vars']['suffixes']['filtered']['name']

    def match(self, s):
        "Search `s` for one of the `phase` keys enclosed by underscores."
        codes = '|'.join(self.phase.keys())
        return re.search(r'_(' + codes + ')_', s)

    def _lookup(self, s, field):
        "Return `phase[<matched code>][field]`, or None when `s` holds no phase code."
        hit = self.match(s)
        if not hit:
            return None
        return self.phase[hit.group(1)][field]

    def extract_filt_status(self, s):
        "Filtering status associated with the phase code found in `s` (None if absent)."
        return self._lookup(s, 'filt')

    def extract_group(self, s):
        "Group associated with the phase code found in `s` (None if absent)."
        return self._lookup(s, 'group')

    def __call__(self, tfm):
        # Both new columns are derived from the same Geotraces variable name.
        names = tfm.df[self.var_name]
        tfm.df[self.filt_col_name] = names.apply(self.extract_filt_status)
        tfm.df['group'] = names.apply(self.extract_group)

In [None]:
#|eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase)
])

df_test = tfm()
df_test.head()

Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],BODC Bottle Number:INTEGER,nuclide,value,_unit,_filt,group
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,842525,TRITIUM_D_CONC_BOTTLE [TU],0.733,TU,1,seawater
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,842528,TRITIUM_D_CONC_BOTTLE [TU],0.696,TU,1,seawater
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,842531,TRITIUM_D_CONC_BOTTLE [TU],0.718,TU,1,seawater
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,842534,TRITIUM_D_CONC_BOTTLE [TU],0.709,TU,1,seawater
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,842540,TRITIUM_D_CONC_BOTTLE [TU],0.692,TU,1,seawater


#### Sampling method

In [None]:
#| exports
# To be validated
# Maps the sampling-device token of a Geotraces variable name (e.g. `BOTTLE` in
# `Cs_137_D_CONC_BOTTLE [uBq/kg]`) to the integer written to the sampling-method column.
# NOTE(review): the integers are presumably MARIS sampling-method ids — confirm.
smp_method = {
    'BOTTLE': 1,
    'FISH': 18,
    'PUMP': 14,
    'UWAY': 24}

In [None]:
#| exports
class ExtractSamplingMethodCB(Callback):
    "Extract sampling method from nuclide names."
    def __init__(self, smp_method, var_name='nuclide'): 
        fc.store_attr()
        self.smp_method_col_name = cdl_cfg()['vars']['suffixes']['sampling_method']['name']

    def extract_smp_method(self, s):
        "Look for `_<METHOD> ` (a `smp_method` key followed by a space) in `s`; return its code or None."
        alternatives = '|'.join(self.smp_method.keys())
        found = re.search(r'_(' + alternatives + ') ', s)
        if found:
            return self.smp_method[found.group(1)]
        return None

    def __call__(self, tfm):
        names = tfm.df[self.var_name]
        tfm.df[self.smp_method_col_name] = names.apply(self.extract_smp_method)

In [None]:
#| eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method)
])

df_test = tfm()
df_test.head()

Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],BODC Bottle Number:INTEGER,nuclide,value,_unit,_filt,group,_sampmet
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,842525,TRITIUM_D_CONC_BOTTLE [TU],0.733,TU,1,seawater,1
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,842528,TRITIUM_D_CONC_BOTTLE [TU],0.696,TU,1,seawater,1
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,842531,TRITIUM_D_CONC_BOTTLE [TU],0.718,TU,1,seawater,1
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,842534,TRITIUM_D_CONC_BOTTLE [TU],0.709,TU,1,seawater,1
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,842540,TRITIUM_D_CONC_BOTTLE [TU],0.692,TU,1,seawater,1


### Remap to MARIS nuclide names 

In [None]:
#| exports
# Explicit Geotraces -> MARIS nuclide names for the cases not covered by the generic
# rule applied by `RenameNuclideCB` (lower-case the name and remove underscores).
nuclides_name = {'TRITIUM': 'h3', 'Pu_239_Pu_240': 'pu239_240_tot'}

In [None]:
#| exports
class RenameNuclideCB(Callback):
    "Remap nuclides name to MARIS standard."
    def __init__(self, nuclides_name, var_name='nuclide'): 
        fc.store_attr()
        self.patterns = ['_D', '_T', '_TP', '_LPT', '_SPT']

    def extract_nuclide_name(self, s):
        "Return the part of `s` preceding the first phase marker (`_D`, `_T`, ...), or None if no marker is found."
        found = re.search(r'(.*?)(' + '|'.join(self.patterns) + ')', s)
        return None if not found else found.group(1)

    def standardize_name(self, s):
        "Map the nuclide part of `s` through `nuclides_name`, else lower-case it and remove underscores."
        raw = self.extract_nuclide_name(s)
        if raw in self.nuclides_name:
            return self.nuclides_name[raw]
        # NOTE(review): `raw` is None when `s` carries no phase marker, in which case
        # `.lower()` raises AttributeError — decide whether that should be handled.
        return raw.lower().replace('_', '')

    def __call__(self, tfm):
        names = tfm.df[self.var_name]
        tfm.df[self.var_name] = names.apply(self.standardize_name)

In [None]:
#|eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name)
])

df_test = tfm()
df_test.head()

Unnamed: 0,yyyy-mm-ddThh:mm:ss.sss,Longitude [degrees_east],Latitude [degrees_north],Bot. Depth [m],DEPTH [m],BODC Bottle Number:INTEGER,nuclide,value,_unit,_filt,group,_sampmet
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,842525,h3,0.733,TU,1,seawater,1
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,842528,h3,0.696,TU,1,seawater,1
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,842531,h3,0.718,TU,1,seawater,1
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,842534,h3,0.709,TU,1,seawater,1
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,842540,h3,0.692,TU,1,seawater,1


In [None]:
#| eval: false
df_test.nuclide.unique()

array(['h3', 'cs137', 'i129', 'np237', 'pu239', 'pu239_240_tot', 'pu240',
       'u236', 'pa231', 'pb210', 'po210', 'ra224', 'ra226', 'ra228',
       'th230', 'th232', 'th234', 'ac227', 'be7', 'ra223', 'th228'],
      dtype=object)

### Standardize unit

In [None]:
#| exports
# Maps a unit string extracted from the Geotraces variable name to:
#   - 'id': value that replaces the unit string,
#   - 'factor': multiplier applied to the measured value.
# NOTE(review): ids are presumably MARIS unit ids and factors rescale to the matching
# MARIS unit (e.g. uBq/kg -> Bq/kg) — confirm.
# Units missing from this table (e.g. 'pmol/kg', used by the selected Th_232 columns)
# are not handled: `Series.map` yields NaN for them, for both the value and the unit id.
units_lut = {
    'TU': {'id': 7, 'factor': 1},
    'uBq/kg': {'id': 3, 'factor': 1e-6},
    'atoms/kg': {'id': 9, 'factor': 1},
    'mBq/kg': {'id': 3, 'factor': 1e-3}}

In [None]:
#| exports
class StandardizeUnitCB(Callback):
    "Remap unit to MARIS standard ones and apply conversion where needed."
    def __init__(self, units_lut, var_name='value'): 
        fc.store_attr()
        self.unit_col_name = cdl_cfg()['vars']['suffixes']['unit']['name']

    def _lut_field(self, field):
        "Flatten `units_lut` into a `{unit: units_lut[unit][field]}` dict."
        return {unit: spec[field] for unit, spec in self.units_lut.items()}

    def __call__(self, tfm):
        units = tfm.df[self.unit_col_name]
        # Rescale the values first, while the unit column still holds the unit strings.
        # Units absent from `units_lut` map to NaN, for the factor as well as for the id.
        tfm.df[self.var_name] *= units.map(self._lut_field('factor'))
        # Then replace each unit string by its id.
        tfm.df[self.unit_col_name] = units.map(self._lut_field('id'))

In [None]:
#|eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut)
])

df_test = tfm()
print(df_test.head())
print(df_test.columns)

     yyyy-mm-ddThh:mm:ss.sss  Longitude [degrees_east]  \
9223     2010-10-17T00:13:29                 350.33792   
9231     2010-10-17T00:13:29                 350.33792   
9237     2010-10-17T00:13:29                 350.33792   
9244     2010-10-17T00:13:29                 350.33792   
9256     2010-10-17T00:13:29                 350.33792   

      Latitude [degrees_north]  Bot. Depth [m]  DEPTH [m]  \
9223                   38.3271          2827.0       17.8   
9231                   38.3271          2827.0       34.7   
9237                   38.3271          2827.0       67.5   
9244                   38.3271          2827.0       91.9   
9256                   38.3271          2827.0      136.6   

      BODC Bottle Number:INTEGER nuclide  value  _unit  _filt     group  \
9223                      842525      h3  0.733    7.0      1  seawater   
9231                      842528      h3  0.696    7.0      1  seawater   
9237                      842531      h3  0.718    7.0     

### Rename common columns

In [None]:
#| exports
def renaming_rules():
    "Return the `{Geotraces column name: configured column name}` renaming lookup."
    cdl = cdl_cfg()
    defaults = cdl['vars']['defaults']
    # (Geotraces column, key of the matching default variable in the config)
    default_vars = [
        ('yyyy-mm-ddThh:mm:ss.sss', 'time'),
        ('Longitude [degrees_east]', 'lon'),
        ('Latitude [degrees_north]', 'lat'),
        ('DEPTH [m]', 'smp_depth'),
        ('Bot. Depth [m]', 'tot_depth'),
    ]
    rules = {source: defaults[key]['name'] for source, key in default_vars}
    # The bottle number is renamed to the name of the configured dimension.
    rules['BODC Bottle Number:INTEGER'] = cdl['dim']['name']
    return rules

In [None]:
#| exports
class RenameColumnCB(Callback):
    "Renaming variables to MARIS standard names."
    def __init__(self, renaming_rules=renaming_rules): fc.store_attr()

    def __call__(self, tfm):
        # `renaming_rules` is a callable returning the {old name: new name} lookup;
        # columns without a rule keep their current name.
        lut = self.renaming_rules()
        tfm.df.columns = [lut.get(name, name) for name in tfm.df.columns]

In [None]:
#|eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules)
])

df_test = tfm()
df_test.head()

Unnamed: 0,time,lon,lat,tot_depth,smp_depth,sample,nuclide,value,_unit,_filt,group,_sampmet
9223,2010-10-17T00:13:29,350.33792,38.3271,2827.0,17.8,842525,h3,0.733,7.0,1,seawater,1
9231,2010-10-17T00:13:29,350.33792,38.3271,2827.0,34.7,842528,h3,0.696,7.0,1,seawater,1
9237,2010-10-17T00:13:29,350.33792,38.3271,2827.0,67.5,842531,h3,0.718,7.0,1,seawater,1
9244,2010-10-17T00:13:29,350.33792,38.3271,2827.0,91.9,842534,h3,0.709,7.0,1,seawater,1
9256,2010-10-17T00:13:29,350.33792,38.3271,2827.0,136.6,842540,h3,0.692,7.0,1,seawater,1


### Unshift longitudes

In [None]:
#| exports
class UnshiftLongitudeCB(Callback):
    "Longitudes are coded between 0 and 360 in Geotraces. We rescale it between -180 and 180 instead."
    def __init__(self): 
        self.lon_col_name = cdl_cfg()['vars']['defaults']['lon']['name']
    
    def __call__(self, tfm):
        lon = tfm.df[self.lon_col_name]
        # Values in (180, 360] are western longitudes expressed in degrees east:
        # subtracting 360 maps them to (-180, 0], e.g. 350.3 -> -9.7.
        # Values up to 180 are already correct and are left untouched.
        # (Subtracting 180 from every value, as done previously, moved each point
        # half a globe away, e.g. 350.3 -> 170.3.)
        tfm.df[self.lon_col_name] = lon.where(lon <= 180, lon - 360)

In [None]:
#| eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB()
])

df_test = tfm()
df_test.head()

Unnamed: 0,time,lon,lat,tot_depth,smp_depth,sample,nuclide,value,_unit,_filt,group,_sampmet
9223,2010-10-17T00:13:29,170.33792,38.3271,2827.0,17.8,842525,h3,0.733,7.0,1,seawater,1
9231,2010-10-17T00:13:29,170.33792,38.3271,2827.0,34.7,842528,h3,0.696,7.0,1,seawater,1
9237,2010-10-17T00:13:29,170.33792,38.3271,2827.0,67.5,842531,h3,0.718,7.0,1,seawater,1
9244,2010-10-17T00:13:29,170.33792,38.3271,2827.0,91.9,842534,h3,0.709,7.0,1,seawater,1
9256,2010-10-17T00:13:29,170.33792,38.3271,2827.0,136.6,842540,h3,0.692,7.0,1,seawater,1


### Dispatch to groups

In [None]:
#| exports
class DispatchToGroupCB(Callback):
    "Convert to a dictionary of dataframe with sample type (seawater,...) as keys."
    def __init__(self, group_name='group'): 
        fc.store_attr()

    def __call__(self, tfm):
        # One dataframe per distinct value of the grouping column; that column is
        # dropped from each piece since it is now carried by the dictionary key.
        tfm.dfs = {key: piece.drop(self.group_name, axis=1)
                   for key, piece in tfm.df.groupby(self.group_name)}

In [None]:
#|eval: false
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB()
])

dfs_test = tfm()
print(f'dfs_test keys: {dfs_test.keys()}')
print(dfs_test['seawater'].head())

dfs_test keys: dict_keys(['seawater', 'suspended-matter'])
                     time        lon      lat  tot_depth  smp_depth  sample  \
9223  2010-10-17T00:13:29  170.33792  38.3271     2827.0       17.8  842525   
9231  2010-10-17T00:13:29  170.33792  38.3271     2827.0       34.7  842528   
9237  2010-10-17T00:13:29  170.33792  38.3271     2827.0       67.5  842531   
9244  2010-10-17T00:13:29  170.33792  38.3271     2827.0       91.9  842534   
9256  2010-10-17T00:13:29  170.33792  38.3271     2827.0      136.6  842540   

     nuclide  value  _unit  _filt  _sampmet  
9223      h3  0.733    7.0      1         1  
9231      h3  0.696    7.0      1         1  
9237      h3  0.718    7.0      1         1  
9244      h3  0.709    7.0      1         1  
9256      h3  0.692    7.0      1         1  


In [None]:
#| eval: false
dfs_test['seawater'].columns

Index(['time', 'lon', 'lat', 'tot_depth', 'smp_depth', 'sample', 'nuclide',
       'value', '_unit', '_filt', '_sampmet'],
      dtype='object')

### Reshape: long to wide

In [None]:
# NOTE(review): this definition shadows the `ReshapeLongToWide` imported from
# `marisco.callbacks` at the top of the notebook, and this cell carries no
# `#| export` directive — confirm which implementation is meant to be used.
class ReshapeLongToWide(Callback):
    "Convert data from long to wide with renamed columns."
    def __init__(self, 
                 unique_id: str='sample', # Column to use as unique index
                 columns: str='nuclide', # Column whose values will become column names
                 values: str='value', # Column containing values of each nuclide
                 ):
        fc.store_attr()
        self.derived_cols = self._get_derived_cols()
    
    def _get_derived_cols(self):
        "Retrieve all possible derived vars (e.g 'unc', 'dl', ...) from configs."
        return [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]

    def renamed_cols(self, cols):
        "Flatten the two-level `(outer, inner)` column names produced by `pivot`."
        flat = []
        for outer, inner in cols:
            if outer == self.values:
                # Value column: keep the nuclide name only. Compared with the configured
                # `values` name (it was hard-coded to "value").
                flat.append(inner)
            elif inner:
                # Derived column: nuclide name followed by the suffix.
                flat.append(f'{inner}{outer}')
            else:
                # Index column (no nuclide level).
                flat.append(outer)
        return flat
        
    def set_index(self, df):
        "Set the index of the dataframe using the unique id column."
        df.index.name = self.unique_id
        return df

    def pivot(self, df):
        "Pivot `df` so that each nuclide gets its own value and derived columns."
        derived_coi = [col for col in self.derived_cols if col in df.columns]
        reserved = {self.columns, self.values, self.unique_id, *derived_coi}
        # Remaining columns, kept in the dataframe's own order: building them from a
        # set difference made the order of the output columns arbitrary between runs.
        common_cols = [col for col in df.columns if col not in reserved]
                
        pivot_df = df.pivot_table(index=[self.unique_id] + common_cols,
                                  columns=self.columns,
                                  values=[self.values] + derived_coi,
                                  aggfunc=lambda x: x
                                  ).reset_index()
        return pivot_df

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp] = self.pivot(tfm.dfs[grp])
            tfm.dfs[grp].columns = self.renamed_cols(tfm.dfs[grp].columns)

In [None]:
#|eval: false
tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide()
])

dfs_test = tfm()
print('shape: ', dfs_test['seawater'].shape)
print('columns: ', dfs_test['seawater'] .columns)
dfs_test['seawater'].head()

shape:  (8779, 90)
columns:  Index(['sample', 'lon', 'smp_depth', 'tot_depth', 'time', 'lat', 'ac227_filt',
       'be7_filt', 'cs137_filt', 'h3_filt', 'i129_filt', 'np237_filt',
       'pa231_filt', 'pb210_filt', 'po210_filt', 'pu239_filt',
       'pu239_240_tot_filt', 'pu240_filt', 'ra223_filt', 'ra224_filt',
       'ra226_filt', 'ra228_filt', 'th228_filt', 'th230_filt', 'th232_filt',
       'th234_filt', 'u236_filt', 'ac227_sampmet', 'be7_sampmet',
       'cs137_sampmet', 'h3_sampmet', 'i129_sampmet', 'np237_sampmet',
       'pa231_sampmet', 'pb210_sampmet', 'po210_sampmet', 'pu239_sampmet',
       'pu239_240_tot_sampmet', 'pu240_sampmet', 'ra223_sampmet',
       'ra224_sampmet', 'ra226_sampmet', 'ra228_sampmet', 'th228_sampmet',
       'th230_sampmet', 'th232_sampmet', 'th234_sampmet', 'u236_sampmet',
       'ac227_unit', 'be7_unit', 'cs137_unit', 'h3_unit', 'i129_unit',
       'np237_unit', 'pa231_unit', 'pb210_unit', 'po210_unit', 'pu239_unit',
       'pu239_240_tot_unit', 'pu240

Unnamed: 0,sample,lon,smp_depth,tot_depth,time,lat,ac227_filt,be7_filt,cs137_filt,h3_filt,...,pu240,ra223,ra224,ra226,ra228,th228,th230,th232,th234,u236
0,194730,-145.9999,25.7,181.0,2007-07-30T10:37:19,75.0005,,,,,...,,,,,,,,,0.034914,
1,194732,-145.9999,49.5,181.0,2007-07-30T10:37:19,75.0005,,,,,...,,,,,,,,,0.037409,
2,194737,-145.9999,74.6,181.0,2007-07-30T10:37:19,75.0005,,,,,...,,,,,,,,,0.033527,
3,194748,-145.9999,100.2,181.0,2007-07-30T10:37:19,75.0005,,,,,...,,,,,,,,,0.034267,
4,194750,-146.0138,4.7,197.0,2007-07-31T08:55:44,77.5033,,,1.0,,...,,,,,,,,,,


Let's also have a quick look at how many nuclides are measured per sample:

In [None]:
#| eval: false
# Identifier columns of the wide dataframe (not measurements).
common_cols = [
    'sample',
    'lon',
    'lat', 
    'smp_depth',
    'tot_depth', 
    'time'
]

# In the wide dataframe derived columns are named `<nuclide><suffix>` (e.g. `h3_unit`),
# so they have to be excluded by suffix: comparing whole column names with the bare
# suffixes excluded nothing and inflated the counts.
derived_cols = tuple(value['name'] for value in cdl_cfg()['vars']['suffixes'].values())
seawater_wide = dfs_test['seawater']
measurement_cols = [col for col in seawater_wide.columns
                    if col not in common_cols and not col.endswith(derived_cols)]
non_nan_counts = seawater_wide[measurement_cols].notna().sum(axis=1)

print(non_nan_counts.sort_values(ascending=False))

4531    23
4135    23
3791    23
4207    23
6657    21
        ..
3820     5
3818     5
3816     5
3814     5
8778     5
Length: 8779, dtype: int64


### Parse time

In [None]:
#| export
class ParseTimeCB(Callback):
    "Parse ISO 8601 time strings into datetimes for each group."
    def __init__(self):
        # Read the time column name from the config, as `renaming_rules` does,
        # instead of hard-coding 'time'.
        self.time_col_name = cdl_cfg()['vars']['defaults']['time']['name']

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp][self.time_col_name] = pd.to_datetime(
                tfm.dfs[grp][self.time_col_name], format='ISO8601')

In [None]:
#|eval: false
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide(),
    ParseTimeCB()
])

print('time data type: ', tfm()['seawater'].time.dtype)

time data type:  datetime64[ns]


### Encode time (seconds since ...)

In [None]:
#|eval: false
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide(),
    ParseTimeCB(),
    EncodeTimeCB(cfg())
])

dfs_test = tfm()['seawater']
dfs_test.head()

Unnamed: 0,sample,lon,smp_depth,tot_depth,time,lat,ac227_filt,be7_filt,cs137_filt,h3_filt,...,pu240,ra223,ra224,ra226,ra228,th228,th230,th232,th234,u236
0,194730,-145.9999,25.7,181.0,1185791839,75.0005,,,,,...,,,,,,,,,0.034914,
1,194732,-145.9999,49.5,181.0,1185791839,75.0005,,,,,...,,,,,,,,,0.037409,
2,194737,-145.9999,74.6,181.0,1185791839,75.0005,,,,,...,,,,,,,,,0.033527,
3,194748,-145.9999,100.2,181.0,1185791839,75.0005,,,,,...,,,,,,,,,0.034267,
4,194750,-146.0138,4.7,197.0,1185872144,77.5033,,,1.0,,...,,,,,,,,,,


### Sanitize coordinates

In [None]:
#|eval: false
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide(),
    ParseTimeCB(),
    EncodeTimeCB(cfg()),
    SanitizeLonLatCB()
])
dfs_test = tfm()['seawater']
dfs_test.head()

Unnamed: 0,sample,lon,smp_depth,tot_depth,time,lat,ac227_filt,be7_filt,cs137_filt,h3_filt,...,pu240,ra223,ra224,ra226,ra228,th228,th230,th232,th234,u236
0,194730,-145.9999,25.7,181.0,1185791839,75.0005,,,,,...,,,,,,,,,0.034914,
1,194732,-145.9999,49.5,181.0,1185791839,75.0005,,,,,...,,,,,,,,,0.037409,
2,194737,-145.9999,74.6,181.0,1185791839,75.0005,,,,,...,,,,,,,,,0.033527,
3,194748,-145.9999,100.2,181.0,1185791839,75.0005,,,,,...,,,,,,,,,0.034267,
4,194750,-146.0138,4.7,197.0,1185872144,77.5033,,,1.0,,...,,,,,,,,,,


## NetCDF encoder

### Example change logs

In [None]:
#|eval: false
df = pd.read_csv(fname_in)

tfm = Transformer(df, cbs=[
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(common_coi, nuclides_pattern),
    ExtractUnitCB(),
    ExtractFilteringStatusCB(phase),
    ExtractSamplingMethodCB(smp_method),
    RenameNuclideCB(nuclides_name),
    StandardizeUnitCB(units_lut),
    RenameColumnCB(renaming_rules),
    UnshiftLongitudeCB(),
    DispatchToGroupCB(),
    ReshapeLongToWide(),
    ParseTimeCB(),
    EncodeTimeCB(cfg()),
    SanitizeLonLatCB()
])

tfm();

In [None]:
#|eval: false
tfm.logs

['Select columns of interest.',
 '\n    Get Geotraces nuclide names as values not column names \n    to extract contained information (unit, sampling method, ...).\n    ',
 '\n    Extract units from nuclide names.\n    ',
 'Extract filtering status from nuclide names.',
 'Extract sampling method from nuclide names.',
 'Remap nuclides name to MARIS standard.',
 'Remap unit to MARIS standard ones and apply conversion where needed.',
 'Renaming variables to MARIS standard names.',
 'Longitudes are coded between 0 and 360 in Geotraces. We rescale it between -180 and 180 instead.',
 'Convert to a dictionary of dataframe with sample type (seawater,...) as keys.',
 'Convert data from long to wide with renamed columns.',
 'Encode time as `int` representing seconds since xxx.',
 'Drop rows with invalid longitude & latitude values. Convert `,` separator to `.` separator.']

### Feed global attributes

In [None]:
#| export
# Keywords written to the `keywords` global attribute (joined with ', ').
# One keyword path per entry, levels separated by ' > '.
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers',
      'Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry',
      'Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| export
def get_attrs(tfm, zotero_key, kw=kw):
    """Assemble the global attributes describing the transformed dataset.

    Args:
        tfm: transformer that has already been run; its `dfs` are handed to
            `GlobAttrsFeeder` and its `logs` are stored as an attribute.
        zotero_key: key of the Zotero entry, forwarded to `ZoteroCB`.
        kw: iterable of keyword strings, joined with ', ' into `keywords`.

    Returns:
        Whatever calling the configured `GlobAttrsFeeder` returns (a mapping of
        attribute names to values in the notebook's example run).
    """
    keywords = ', '.join(kw)
    postprocess_logs = ', '.join(tfm.logs)

    feeder_cbs = [
        BboxCB(),
        DepthRangeCB(),
        TimeRangeCB(cfg()),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', keywords),
        KeyValuePairCB('publisher_postprocess_logs', postprocess_logs),
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=feeder_cbs)
    return feeder()

In [None]:
#|eval: false
# Build the global attributes for the transformed data and peek at the result.
# NOTE: despite its name, `zotero_metadata` holds every attribute returned by
# `get_attrs` (geospatial bounds, time coverage, keywords, logs, ...), not only
# the fields fetched from Zotero — see the keys printed below.
zotero_metadata = get_attrs(tfm, zotero_key='97UIMEXN', kw=kw)
print('Keys: ', zotero_metadata.keys())
print('Title: ', zotero_metadata['title'])

Keys:  dict_keys(['geospatial_lat_min', 'geospatial_lat_max', 'geospatial_lon_min', 'geospatial_lon_max', 'geospatial_bounds', 'time_coverage_start', 'time_coverage_end', 'title', 'summary', 'creator_name', 'keywords', 'publisher_postprocess_logs'])
Title:  The GEOTRACES Intermediate Data Product 2017


In [None]:
#| export
#def enums_xtra(tfm, vars):
#    "Retrieve a subset of the lengthy enum as 'species_t' for instance"
#    enums = Enums(lut_src_dir=lut_path(), cdl_enums=cdl_cfg()['enums'])
#    xtras = {}
#    for var in vars:
#        unique_vals = tfm.unique(var)
#        if unique_vals.any():
#            xtras[f'{var}_t'] = enums.filter(f'{var}_t', unique_vals)
#    return xtras

### Encoding

In [None]:
#| export
def encode(fname_in, fname_out, nc_tpl_path, **kwargs):
    """Run the Geotraces pipeline on a csv file and write the result as NetCDF.

    Args:
        fname_in: path of the csv file, read with `pd.read_csv`.
        fname_out: destination file passed to `NetCDFEncoder` as `dest_fname`.
        nc_tpl_path: NetCDF template passed to `NetCDFEncoder` as `src_fname`.
            Note that this parameter shadows the imported `nc_tpl_path`
            function inside the body, so callers pass the path itself.
        **kwargs: only `verbose` (default `False`) is read.

    Returns:
        None; the output is produced by `NetCDFEncoder.encode()`.
    """
    raw = pd.read_csv(fname_in)

    # Same sequence of callbacks as the interactive pipeline earlier in the notebook.
    pipeline = [
        SelectColsOfInterestCB(common_coi, nuclides_pattern),
        WideToLongCB(common_coi, nuclides_pattern),
        ExtractUnitCB(),
        ExtractFilteringStatusCB(phase),
        ExtractSamplingMethodCB(smp_method),
        RenameNuclideCB(nuclides_name),
        StandardizeUnitCB(units_lut),
        RenameColumnCB(renaming_rules),
        UnshiftLongitudeCB(),
        DispatchToGroupCB(),
        ReshapeLongToWide(),
        ParseTimeCB(),
        EncodeTimeCB(cfg()),
        SanitizeLonLatCB(),
    ]
    tfm = Transformer(raw, cbs=pipeline)
    tfm()

    # `enums_xtra` is intentionally not passed: its helper is commented out in this notebook.
    encoder_args = dict(
        src_fname=nc_tpl_path,
        dest_fname=fname_out,
        global_attrs=get_attrs(tfm, zotero_key='97UIMEXN', kw=kw),
        verbose=kwargs.get('verbose', False),
    )
    NetCDFEncoder(tfm.dfs, **encoder_args).encode()

In [None]:
#|eval: false
# End-to-end run: `nc_tpl_path()` is called here because `encode` expects the
# template path itself rather than the function returning it.
# FIXME: this call currently fails with the `TypeError: ufunc 'isnan' not
# supported for the input types ...` shown in the cell output; the root cause
# is not visible from this cell and still needs to be investigated.
encode(fname_in, fname_out, nc_tpl_path(), verbose=False)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''