In [None]:
#| default_exp netcdf_to_csv

# NetCDF to Open Refine CSV (WIP)

***

## Packages import

In [None]:

from pathlib import Path # This module offers classes representing filesystem paths
import xarray as xr
from netCDF4 import Dataset
import pandas as pd
import xarray as xr
import numpy as np
from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)
import fastcore.all as fc # package that brings fastcore functionality, see https://fastcore.fast.ai/.
from cftime import num2pydate 
from marisco.configs import cfg, lut_path, nuc_lut_path, cdl_cfg
from marisco.serializers import OpenRefineCsvEncoder




dict_values([{'name': '_unc', 'dtype': 'f4', 'attrs': {'long_name': ' uncertainty', 'standard_name': '_uncertainty'}}, {'name': '_dl', 'dtype': 'dl_t', 'attrs': {'long_name': ' detection limit', 'standard_name': '_detection_limit'}}, {'name': '_vol', 'dtype': 'f4', 'attrs': {'long_name': ' volume', 'standard_name': '_volume'}}, {'name': '_sal', 'dtype': 'f4', 'attrs': {'long_name': ' salinity', 'standard_name': '_sal'}}, {'name': '_temp', 'dtype': 'f4', 'attrs': {'long_name': ' temperature', 'standard_name': '_temp'}}, {'name': '_filt', 'dtype': 'filt_t', 'attrs': {'long_name': ' filtered', 'standard_name': '_filtered'}}, {'name': '_counmet', 'dtype': 'counmet_t', 'attrs': {'long_name': ' counting method', 'standard_name': '_counting_method'}}, {'name': '_sampmet', 'dtype': 'sampmet_t', 'attrs': {'long_name': ' sampling method', 'standard_name': '_sampling_method'}}, {'name': '_prepmet', 'dtype': 'prepmet_t', 'attrs': {'long_name': ' preparation method', 'standard_name': '_preparation_

Get the current working directory (cwd).

In [None]:
Path.cwd()

Path('/home/marisco/downloads/marisco/nbs/handlers')

In [None]:
fname_in = '../../_data/output/ospar_19950103_2021214.nc'
fname_out = '../../_data/output/ospar_test.csv'

### Load NetCDF 

Load the NetCDF4 data: each group of the file is read into its own DataFrame.

In [None]:
def netcdf4_to_df(fname_in):
    """Load every group of a NetCDF4 file into a pandas DataFrame.

    Parameters
    ----------
    fname_in : str or path-like
        Path of the NetCDF4 file to read.

    Returns
    -------
    dict
        Maps each group name found in the file to the DataFrame built from
        that group. Time values are left undecoded (`decode_times=False`).
    """
    # Context managers release the file handles even when a group fails to load.
    with Dataset(fname_in, "r") as netcdf4_data:
        groups = list(netcdf4_data.groups.keys())

    dfs = {}
    for group in groups:
        with xr.open_dataset(fname_in, group=group, decode_times=False) as ds:
            # `to_dataframe` materialises the data, so the dataset can be closed afterwards.
            dfs[group] = ds.to_dataframe()
    return dfs

In [None]:
dfs = netcdf4_to_df(fname_in)
dfs['biota']

Unnamed: 0_level_0,sample,lon,lat,time,bio_group,species,body_part,h3,h3_dl,h3_unit,...,pu238_dl,pu238_unit,am241,am241_unc,am241_dl,am241_unit,pu239_240_tot,pu239_240_tot_unc,pu239_240_tot_dl,pu239_240_tot_unit
biota,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,-4.901945,55.725277,1640908800,13,394,19,,-1,-1,...,-1,-1,,,-1,-1,0.3510,0.066,2,5
1,1,-3.240556,54.968887,1640908800,11,96,56,,-1,-1,...,-1,-1,,,-1,-1,,,-1,-1
2,2,-3.791389,58.565834,1640908800,13,394,19,,-1,-1,...,-1,-1,,,-1,-1,0.0938,0.018,2,5
3,3,-3.647778,58.618610,1640908800,13,394,19,,-1,-1,...,-1,-1,,,-1,-1,1.5400,0.310,2,5
4,4,-2.398056,55.964722,1640908800,11,96,56,,-1,-1,...,-1,-1,,,-1,-1,,,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7992,15305,7.566667,54.348331,789350400,4,99,52,,-1,-1,...,-1,-1,,,-1,-1,,,-1,-1
7993,15306,7.566667,54.348331,789350400,4,99,25,,-1,-1,...,-1,-1,,,-1,-1,,,-1,-1
7994,15307,-3.566111,54.455002,789264000,13,394,19,,-1,-1,...,-1,-1,,,-1,-1,,,-1,-1
7995,15308,-3.566111,54.455002,789264000,13,394,19,,-1,-1,...,-1,-1,,,-1,-1,,,-1,-1


In [None]:
dfs['seawater'].columns

Index(['sample', 'lon', 'lat', 'smp_depth', 'time', 'h3', 'h3_unc', 'h3_dl',
       'h3_unit', 'tc99', 'tc99_unc', 'tc99_dl', 'tc99_unit', 'cs137',
       'cs137_unc', 'cs137_dl', 'cs137_unit', 'po210', 'po210_unc', 'po210_dl',
       'po210_unit', 'ra226', 'ra226_unc', 'ra226_dl', 'ra226_unit', 'ra228',
       'ra228_unc', 'ra228_dl', 'ra228_unit', 'pu239_240_tot',
       'pu239_240_tot_unc', 'pu239_240_tot_dl', 'pu239_240_tot_unit'],
      dtype='object')

### Transpose Nuclides 

In [None]:
#| export
# Define cols that are not nuclides
# Every column listed here is kept as an identifier when the nuclide columns
# are melted; any column NOT listed is treated as a nuclide (or nuclide-derived)
# column. `bio_group` is a biota sample attribute and must therefore be listed,
# otherwise it is transposed as if it were a nuclide.
sample_cols_grp = {'seawater': ['sample', 'lon', 'lat', 'smp_depth', 'time'],
           #'sediment': ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type'],
           'biota': ['sample', 'lon', 'lat', 'time', 'bio_group', 'species', 'body_part']}

In [None]:
def allowed_nuclides():
    "Read the Excel file at `nuc_lut_path()` and return its `nuclide_id` and `nusymbol` columns."
    lut = pd.read_excel(nuc_lut_path())
    wanted = ['nuclide_id', 'nusymbol']
    return lut[wanted]

In [None]:
allowed_nuclides()

Unnamed: 0,nuclide_id,nusymbol
0,-1,
1,0,0
2,1,3H
3,2,7Be
4,3,14C
...,...,...
129,138,T-BETA-40K
130,139,55Fe
131,140,"144Ce, 144Pr"
132,141,240Pu/239Pu


In [None]:
# Collect the `name` entry of every suffix declared under vars -> suffixes in the CDL config.
derived_cols = [
    suffix_def['name']
    for suffix_def in cdl_cfg()['vars']['suffixes'].values()
]
derived_cols

['_unc',
 '_dl',
 '_vol',
 '_sal',
 '_temp',
 '_filt',
 '_counmet',
 '_sampmet',
 '_prepmet',
 '_unit']

0               NaN
1                 0
2                3H
3               7Be
4               14C
           ...     
129      T-BETA-40K
130            55Fe
131    144Ce, 144Pr
132     240Pu/239Pu
133     241Pu/239Pu
Name: nusymbol, Length: 134, dtype: object

In [None]:
# A bare generator expression is a syntax error; materialise the symbols as a list.
derived_nuc_cols = list(allowed_nuclides()['nusymbol'])

In [None]:
nuc=allowed_nuclides()['nusymbol'][2]
nuc

'3H'

In [None]:
nuc+(derived_cols[0])

'3H_unc'

In [None]:
# Build "<nuclide><suffix>" names (e.g. '3H_unc') for every nuclide symbol.
# Missing symbols (NaN) are skipped, and f-string formatting is used because
# the lookup table also holds a non-string symbol (the integer 0).
nuc_derived_names = [f'{nuc}{suffix}'
                     for nuc in allowed_nuclides()['nusymbol'].dropna()
                     for suffix in derived_cols]
nuc_derived_names[:10]

nan
0
3H
7Be
14C
40K
51Cr
54Mn
57Co
58Co
60Co
65Zn
89Sr
90Sr
95Zr
95Nb
99Tc
103Ru
106Ru
106Rh
106mAg
108Ag
108mAg
110mAg
124Sb
125Sb
129mTe
129I
131I
127Cs
134Cs
137Cs
140Ba
140La
141Ce
144Ce
147Pm
154Eu
155Eu
210Pb
212Pb
214Pb
207Bi
211Bi
214Bi
210Po
220Rn
222Rn
223Ra
224Ra
225Ra
226Ra
228Ra
228Ac
227Th
228Th
232Th
234Th
234Pa
234U
235U
238U
237Np
239Np
238Pu
239Pu
240Pu
241Pu
240Am
241Am
242Cm
243Cm
244Cm
134,137Cs
239,240Pu
239,240Pu III,IV
239,240Pu V,VI
243,244Cm
238Pu/239,240Pu
241Am/239,240Pu
137Cs/134Cs
109Cd
152Eu
59Fe
153Gd
192Ir
238,240Pu
86Rb
46Sc
113Sn
117mSn
208Tl
99Mo
99mTc
105Ru
129Te
132Te
132I
135I
136Cs
T-BETA
T-ALPHA
133I
230Th
231Pa
236U
111Ag
116mIn
123mTe
127Sb
133Ba
139Ce
201Tl
203Hg
22Na
234mPa
243Am
75Se
85Sr
88Y
140Ce
212Bi
236U/238U
125I
137mBa
232U
233Pa
106Ru,106Rh
T-U
T-BETA-40K
55Fe
144Ce, 144Pr
240Pu/239Pu
241Pu/239Pu


In [None]:
# `self` does not exist at module level; bind the suffix names to a plain variable.
derived_cols = [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]


In [None]:
cdl_cfg()['vars']['suffixes'].values()

In [None]:
#| export
# NOTE(review): a class with this exact name is defined again in the next
# cell; the later definition shadows this one. Remove one of the two.
class TransposeNuclideColumns(Callback):
    "Transpose NetCDF nuclide data from one column per nuclide to one row per nuclide."

    # (column suffix, long-format column name), in the order they are merged.
    # NOTE(review): any other suffixed column (e.g. '_vol') is not recognised
    # here and would be melted as if it were a nuclide - confirm.
    _suffixes = (('_unc', 'uncertainty'), ('_unit', 'unit_id'), ('_dl', 'dl'))

    def __init__(self, cols_grp=sample_cols_grp, missing_codes=(-1, 0)):
        # cols_grp: group name -> list of sample (identifier) columns.
        # missing_codes: `dl` / `unit_id` codes regarded as "no information".
        #   Assumes -1 and 0 are the "not applicable"/"not available" codes;
        #   -1 is what accompanies NaN activities in the loaded data.
        #   TODO confirm against the CDL enum definitions.
        fc.store_attr()

    def __call__(self, tfm):
        "Replace every group's DataFrame in `tfm.dfs` by its transposed version."
        for grp in tfm.dfs.keys():
            tfm.dfs[grp] = self.transpose_nuclides(tfm.dfs[grp], grp)

    def _melt(self, df, id_cols, value_cols, value_name, suffix=''):
        "Melt `value_cols` into (`nuclide`, `value_name`) rows, stripping `suffix` from the names."
        long_df = pd.melt(frame=df, id_vars=id_cols, value_vars=value_cols,
                          var_name='nuclide', value_name=value_name)
        if suffix:
            # Every melted name ends with `suffix` by construction.
            long_df['nuclide'] = long_df['nuclide'].map(lambda name: name[:-len(suffix)])
        return long_df

    def transpose_nuclides(self, df, group):
        """Return `df` in long format for the given `group`.

        The result holds the sample columns of `group` plus `nuclide`,
        `activity`, `uncertainty`, `unit_id` and `dl`. Rows carrying no
        activity, no uncertainty and only missing `dl` / `unit_id` codes are
        dropped. Raises `KeyError` when `group` is not in `cols_grp`.
        """
        sample_cols = list(self.cols_grp[group])
        other_cols = [col for col in df.columns if col not in sample_cols]

        # Match on the END of the name so a suffix appearing mid-name is not misclassified.
        suffixed = {sfx: [col for col in other_cols if col.endswith(sfx)]
                    for sfx, _ in self._suffixes}
        claimed = {col for cols in suffixed.values() for col in cols}
        nuclide_cols = [col for col in other_cols if col not in claimed]

        # Melt the activities first, then outer-merge each derived quantity onto them.
        combine_on = sample_cols + ['nuclide']
        long_df = self._melt(df, sample_cols, nuclide_cols, 'activity')
        for sfx, value_name in self._suffixes:
            part = self._melt(df, sample_cols, suffixed[sfx], value_name, sfx)
            long_df = pd.merge(long_df, part, how='outer', on=combine_on)

        # Keep a row when it has an activity OR an uncertainty OR a meaningful
        # `dl` / `unit_id` code. Plain `notna()` / `!= 0` tests keep every row
        # when missing values are coded as -1.
        codes = list(self.missing_codes)
        has_dl = long_df['dl'].notna() & ~long_df['dl'].isin(codes)
        has_unit = long_df['unit_id'].notna() & ~long_df['unit_id'].isin(codes)
        keep = (long_df['activity'].notna() | long_df['uncertainty'].notna()
                | has_dl | has_unit)
        return long_df[keep]

In [None]:
#| export
# NOTE(review): a class with this exact name is also defined in the previous
# cell; this later definition is the one in effect. Remove one of the two.
class TransposeNuclideColumns(Callback):
    "Transpose NetCDF nuclide data from one column per nuclide to one row per nuclide."

    # (column suffix, long-format column name), in the order they are merged.
    # NOTE(review): any other suffixed column (e.g. '_vol') is not recognised
    # here and would be melted as if it were a nuclide - confirm.
    _suffixes = (('_unc', 'uncertainty'), ('_unit', 'unit_id'), ('_dl', 'dl'))

    def __init__(self, cols_grp=sample_cols_grp, missing_codes=(-1, 0)):
        # cols_grp: group name -> list of sample (identifier) columns.
        # missing_codes: `dl` / `unit_id` codes regarded as "no information".
        #   Assumes -1 and 0 are the "not applicable"/"not available" codes;
        #   -1 is what accompanies NaN activities in the loaded data.
        #   TODO confirm against the CDL enum definitions.
        fc.store_attr()

    def __call__(self, tfm):
        "Replace every group's DataFrame in `tfm.dfs` by its transposed version."
        for grp in tfm.dfs.keys():
            tfm.dfs[grp] = self.transpose_nuclides(tfm.dfs[grp], grp)

    def _melt(self, df, id_cols, value_cols, value_name, suffix=''):
        "Melt `value_cols` into (`nuclide`, `value_name`) rows, stripping `suffix` from the names."
        long_df = pd.melt(frame=df, id_vars=id_cols, value_vars=value_cols,
                          var_name='nuclide', value_name=value_name)
        if suffix:
            # Every melted name ends with `suffix` by construction.
            long_df['nuclide'] = long_df['nuclide'].map(lambda name: name[:-len(suffix)])
        return long_df

    def transpose_nuclides(self, df, group):
        """Return `df` in long format for the given `group`.

        The result holds the sample columns of `group` plus `nuclide`,
        `activity`, `uncertainty`, `unit_id` and `dl`. Rows carrying no
        activity, no uncertainty and only missing `dl` / `unit_id` codes are
        dropped. Raises `KeyError` when `group` is not in `cols_grp`.
        """
        sample_cols = list(self.cols_grp[group])
        other_cols = [col for col in df.columns if col not in sample_cols]

        # Match on the END of the name so a suffix appearing mid-name is not misclassified.
        suffixed = {sfx: [col for col in other_cols if col.endswith(sfx)]
                    for sfx, _ in self._suffixes}
        claimed = {col for cols in suffixed.values() for col in cols}
        nuclide_cols = [col for col in other_cols if col not in claimed]

        # Melt the activities first, then outer-merge each derived quantity onto them.
        combine_on = sample_cols + ['nuclide']
        long_df = self._melt(df, sample_cols, nuclide_cols, 'activity')
        for sfx, value_name in self._suffixes:
            part = self._melt(df, sample_cols, suffixed[sfx], value_name, sfx)
            long_df = pd.merge(long_df, part, how='outer', on=combine_on)

        # Keep a row when it has an activity OR an uncertainty OR a meaningful
        # `dl` / `unit_id` code. Plain `notna()` / `!= 0` tests keep every row
        # when missing values are coded as -1.
        codes = list(self.missing_codes)
        has_dl = long_df['dl'].notna() & ~long_df['dl'].isin(codes)
        has_unit = long_df['unit_id'].notna() & ~long_df['unit_id'].isin(codes)
        keep = (long_df['activity'].notna() | long_df['uncertainty'].notna()
                | has_dl | has_unit)
        return long_df[keep]

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns()])
tfm()

{'seawater':        sample       lon        lat  smp_depth        time        nuclide  \
 0           0  3.188056  51.375278        3.0  1264550400          cs137   
 1           0  3.188056  51.375278        3.0  1264550400             h3   
 2           0  3.188056  51.375278        3.0  1264550400          po210   
 3           0  3.188056  51.375278        3.0  1264550400  pu239_240_tot   
 4           0  3.188056  51.375278        3.0  1264550400          ra226   
 ...       ...       ...        ...        ...         ...            ...   
 70625   18855 -4.086389  53.123333        0.0  1617753600          po210   
 70626   18855 -4.086389  53.123333        0.0  1617753600  pu239_240_tot   
 70627   18855 -4.086389  53.123333        0.0  1617753600          ra226   
 70628   18855 -4.086389  53.123333        0.0  1617753600          ra228   
 70629   18855 -4.086389  53.123333        0.0  1617753600           tc99   
 
        activity  uncertainty  unit_id  dl  
 0           0.2 

***

### Parse Time

In [None]:
#| export
class ParseTimeCB(Callback):
    "Convert the numeric `time` column of every group with `num2pydate`."
    def __init__(self, cfg): fc.store_attr()

    def __call__(self, tfm):
        for grp_df in tfm.dfs.values():
            raw_times = grp_df['time']
            grp_df['time'] = raw_times.apply(self.format_time)

    def format_time(self, x):
        "Decode one time value `x` using the units stored under `cfg['units']['time']`."
        time_units = self.cfg['units']['time']
        return num2pydate(x, units=time_units)

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg())])
tfm()

{'seawater':        sample       lon        lat  smp_depth       time        nuclide  \
 0           0  3.188056  51.375278        3.0 2010-01-27          cs137   
 1           0  3.188056  51.375278        3.0 2010-01-27             h3   
 2           0  3.188056  51.375278        3.0 2010-01-27          po210   
 3           0  3.188056  51.375278        3.0 2010-01-27  pu239_240_tot   
 4           0  3.188056  51.375278        3.0 2010-01-27          ra226   
 ...       ...       ...        ...        ...        ...            ...   
 70625   18855 -4.086389  53.123333        0.0 2021-04-07          po210   
 70626   18855 -4.086389  53.123333        0.0 2021-04-07  pu239_240_tot   
 70627   18855 -4.086389  53.123333        0.0 2021-04-07          ra226   
 70628   18855 -4.086389  53.123333        0.0 2021-04-07          ra228   
 70629   18855 -4.086389  53.123333        0.0 2021-04-07           tc99   
 
        activity  uncertainty  unit_id  dl  
 0           0.2          NaN

***

### Sample Type 

In [None]:
#| export
# Define sample types groups
# Sample type ids are consecutive integers starting at 1, in this order.
sample_type_lut = {
    grp_name: type_id
    for type_id, grp_name in enumerate(
        ('seawater', 'sediment', 'biota', 'suspended matter'), start=1)
}

In [None]:
#| export
class LookupSampleType(Callback):
    "Add a constant `samptype_id` column to each group, looked up from the group name."
    def __init__(self, lut=sample_type_lut): fc.store_attr()

    def __call__(self, tfm):
        for grp_name, grp_df in tfm.dfs.items():
            # Scalar assignment broadcasts the id to every row of the group.
            grp_df['samptype_id'] = self.lut[grp_name]
            

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType()])
tfm()

{'seawater':        sample       lon        lat  smp_depth       time        nuclide  \
 0           0  3.188056  51.375278        3.0 2010-01-27          cs137   
 1           0  3.188056  51.375278        3.0 2010-01-27             h3   
 2           0  3.188056  51.375278        3.0 2010-01-27          po210   
 3           0  3.188056  51.375278        3.0 2010-01-27  pu239_240_tot   
 4           0  3.188056  51.375278        3.0 2010-01-27          ra226   
 ...       ...       ...        ...        ...        ...            ...   
 70625   18855 -4.086389  53.123333        0.0 2021-04-07          po210   
 70626   18855 -4.086389  53.123333        0.0 2021-04-07  pu239_240_tot   
 70627   18855 -4.086389  53.123333        0.0 2021-04-07          ra226   
 70628   18855 -4.086389  53.123333        0.0 2021-04-07          ra228   
 70629   18855 -4.086389  53.123333        0.0 2021-04-07           tc99   
 
        activity  uncertainty  unit_id  dl  samptype_id  
 0           0.2

***

### Nuclide lookup

In [None]:
#| export
def get_nucnames_lut():
    "Return a dict mapping `nc_name` to `nuclide_id`, read from `dbo_nuclide.xlsx` under `lut_path()`."
    lut_file = lut_path() / 'dbo_nuclide.xlsx'
    nuclides = pd.read_excel(lut_file, usecols=['nuclide_id', 'nc_name'])
    by_nc_name = nuclides.set_index('nc_name')
    return by_nc_name.to_dict()['nuclide_id']

In [None]:
# | export
class LookupNuclideIdCB(Callback):
    "Add a `nuclide_id` column obtained by replacing `nuclide` names through a lookup dict."
    def __init__(self, fn_lut=get_nucnames_lut): fc.store_attr()

    def __call__(self, tfm):
        # The lookup is built once and shared by all groups.
        name_to_id = self.fn_lut()
        for grp_df in tfm.dfs.values():
            # `replace` leaves names that are absent from the lookup unchanged.
            grp_df['nuclide_id'] = grp_df['nuclide'].replace(name_to_id)

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ])
tfm()

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


{'seawater':        sample       lon        lat  smp_depth       time        nuclide  \
 0           0  3.188056  51.375278        3.0 2010-01-27          cs137   
 1           0  3.188056  51.375278        3.0 2010-01-27             h3   
 2           0  3.188056  51.375278        3.0 2010-01-27          po210   
 3           0  3.188056  51.375278        3.0 2010-01-27  pu239_240_tot   
 4           0  3.188056  51.375278        3.0 2010-01-27          ra226   
 ...       ...       ...        ...        ...        ...            ...   
 70625   18855 -4.086389  53.123333        0.0 2021-04-07          po210   
 70626   18855 -4.086389  53.123333        0.0 2021-04-07  pu239_240_tot   
 70627   18855 -4.086389  53.123333        0.0 2021-04-07          ra226   
 70628   18855 -4.086389  53.123333        0.0 2021-04-07          ra228   
 70629   18855 -4.086389  53.123333        0.0 2021-04-07           tc99   
 
        activity  uncertainty  unit_id  dl  samptype_id  nuclide_id  
 0  

In [None]:
tfm.dfs['biota']

Unnamed: 0,sample,lon,lat,time,species,body_part,nuclide,activity,uncertainty,unit_id,dl,samptype_id,nuclide_id
0,0,-4.901945,55.725277,2021-12-31,394,19,am241,,,-1.0,-1.0,3,72
1,0,-4.901945,55.725277,2021-12-31,394,19,bio_group,13.0,,,,3,bio_group
2,0,-4.901945,55.725277,2021-12-31,394,19,cs137,,,-1.0,-1.0,3,33
3,0,-4.901945,55.725277,2021-12-31,394,19,h3,,,-1.0,-1.0,3,1
4,0,-4.901945,55.725277,2021-12-31,394,19,pb210,,,-1.0,-1.0,3,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87962,15309,-3.566111,54.455002,1995-01-05,394,19,pu238,,,-1.0,-1.0,3,67
87963,15309,-3.566111,54.455002,1995-01-05,394,19,pu239_240_tot,,,-1.0,-1.0,3,77
87964,15309,-3.566111,54.455002,1995-01-05,394,19,ra226,,,-1.0,-1.0,3,53
87965,15309,-3.566111,54.455002,1995-01-05,394,19,ra228,,,-1.0,-1.0,3,54


***

### Include ref_id

TODO: Create a lookup table (LUT) for `ref_id` instead of hard-coding it?

In [None]:
#| export
def get_ref_id():
    "Return the hard-coded reference id (191)."
    return 191

In [None]:
# | export
class LookupRefIdCB(Callback):
    "Add a constant `ref_id` column to every group."
    def __init__(self, fn_lut=get_ref_id): fc.store_attr()

    def __call__(self, tfm):
        # `fn_lut` returns a single id that is applied to all groups.
        ref_id = self.fn_lut()
        for grp_df in tfm.dfs.values():
            grp_df['ref_id'] = ref_id

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            LookupRefIdCB(get_ref_id)
                            ])
tfm()

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


{'seawater':        sample       lon        lat  smp_depth       time        nuclide  \
 0           0  3.188056  51.375278        3.0 2010-01-27          cs137   
 1           0  3.188056  51.375278        3.0 2010-01-27             h3   
 2           0  3.188056  51.375278        3.0 2010-01-27          po210   
 3           0  3.188056  51.375278        3.0 2010-01-27  pu239_240_tot   
 4           0  3.188056  51.375278        3.0 2010-01-27          ra226   
 ...       ...       ...        ...        ...        ...            ...   
 70625   18855 -4.086389  53.123333        0.0 2021-04-07          po210   
 70626   18855 -4.086389  53.123333        0.0 2021-04-07  pu239_240_tot   
 70627   18855 -4.086389  53.123333        0.0 2021-04-07          ra226   
 70628   18855 -4.086389  53.123333        0.0 2021-04-07          ra228   
 70629   18855 -4.086389  53.123333        0.0 2021-04-07           tc99   
 
        activity  uncertainty  unit_id  dl  samptype_id  nuclide_id  ref_i

In [None]:
tfm.dfs['biota'].head()

Unnamed: 0,sample,lon,lat,time,species,body_part,nuclide,activity,uncertainty,unit_id,dl,samptype_id,nuclide_id,ref_id
0,0,-4.901945,55.725277,2021-12-31,394,19,am241,,,-1.0,-1.0,3,72,191
1,0,-4.901945,55.725277,2021-12-31,394,19,bio_group,13.0,,,,3,bio_group,191
2,0,-4.901945,55.725277,2021-12-31,394,19,cs137,,,-1.0,-1.0,3,33,191
3,0,-4.901945,55.725277,2021-12-31,394,19,h3,,,-1.0,-1.0,3,1,191
4,0,-4.901945,55.725277,2021-12-31,394,19,pb210,,,-1.0,-1.0,3,41,191


### Rename columns

In [None]:
#| export
# Define columns of interest by sample type
# Names must match the columns present after the transformation callbacks
# have run: seawater frames carry `smp_depth` (not `depth`), and biota frames
# carry `species` (not `species_id`) and no depth column.
coi_grp = {'seawater': ['sample', 'lon', 'lat', 'smp_depth', 'time', 'activity',
                        'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
           'biota': ['sample', 'lon', 'lat', 'time', 'species', 'body_part',
                     'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id']}

In [None]:
#| export
# Define column names renaming rules
renaming_rules = {
    'lat': 'latitude',
    'lon': 'longitude',
    'time': 'begperiod',
    'depth': 'sampdepth',
    # The seawater frames name the sampling depth `smp_depth`.
    'smp_depth': 'sampdepth',
    'nuclide': 'nuclide_id',
    'uncertainty': 'uncertaint',
    'dl': 'detection',
    'sed_type': 'sedtype_id (0)',
    'species_id': 'species_id (0)',
    # The biota frames name the species column `species`.
    'species': 'species_id (0)',
    'body_part': 'bodypar_id',
}

In [None]:
#| export
class RenameColumnCB(Callback):
    "Keep only the columns of interest of each group, then rename them."
    def __init__(self, coi=coi_grp, renaming_rules=renaming_rules): fc.store_attr()

    def __call__(self, tfm):
        for grp_name in tfm.dfs.keys():
            # Column selection; raises KeyError when a listed column is absent.
            selected = tfm.dfs[grp_name].loc[:, self.coi[grp_name]]
            # Rules whose key is not among the selected columns have no effect.
            tfm.dfs[grp_name] = selected.rename(columns=self.renaming_rules)

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            RenameColumnCB()
                            ])
tfm()

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


{'seawater':         sample  longitude   latitude  sampdepth  begperiod     nuclide_id  \
 8            0  11.078300  54.349998        0.0 1986-05-09          cs134   
 9            0  11.078300  54.349998        0.0 1986-05-09          cs137   
 38           1  10.316700  54.500000        0.0 1986-05-11          cs134   
 39           1  10.316700  54.500000        0.0 1986-05-11          cs137   
 69           2  21.026600  55.305000        0.0 2019-02-12          cs137   
 ...        ...        ...        ...        ...        ...            ...   
 144248    4818  18.235001  58.583302      460.0 1996-07-25          cs134   
 144249    4818  18.235001  58.583302      460.0 1996-07-25          cs137   
 144257    4818  18.235001  58.583302      460.0 1996-07-25          pu238   
 144259    4818  18.235001  58.583302      460.0 1996-07-25  pu239_240_tot   
 144265    4818  18.235001  58.583302      460.0 1996-07-25           sr90   
 
            activity  uncertaint  unit_id  detecti

***

### Encoding

In [None]:
#| export
def encode(fname_in, fname_out, **kwargs):
    """Convert the NetCDF file `fname_in` into an OpenRefine CSV at `fname_out`.

    Extra keyword arguments are forwarded to `OpenRefineCsvEncoder`.
    Returns the encoder after `encode()` has been called on it.
    """
    group_dfs = netcdf4_to_df(fname_in)
    # The callbacks run in list order.
    pipeline = [
        TransposeNuclideColumns(),
        ParseTimeCB(cfg()),
        LookupSampleType(),
        LookupNuclideIdCB(),
        RenameColumnCB(),
    ]
    tfm = Transformer(group_dfs, cbs=pipeline)
    encoder = OpenRefineCsvEncoder(tfm(), dest_fname=fname_out, **kwargs)
    encoder.encode()
    return encoder

In [None]:
encode(fname_in, fname_out, verbose=False)

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


<marisco.serializers.OpenRefineCsvEncoder>