In [1]:
#| default_exp netcdf_to_csv

# NetCDF to Open Refine CSV (WIP)

***

## Packages import

mamba install dask --force-reinstall


In [562]:
from pathlib import Path # This module offers classes representing filesystem paths
import xarray as xr
from netCDF4 import Dataset
import pandas as pd
import xarray as xr
import numpy as np
from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)
import fastcore.all as fc # package that brings fastcore functionality, see https://fastcore.fast.ai/.
from cftime import num2pydate 
from marisco.configs import cfg, lut_path, cdl_cfg, nuc_lut_path, unit_lut_path, detection_limit_lut_path
from marisco.serializers import OpenRefineCsvEncoder
from functools import reduce,partial

Get the current working directory (cwd). . 

In [153]:
Path.cwd()

Path('/home/marisco/downloads/marisco/nbs/handlers')

In [154]:
fname_in = '../../_data/output/ospar_19950103_2021214.nc'
fname_out = '../../_data/output/ospar_test.csv'

### Load NetCDF 

load netcdf4 data

In [341]:
def netcdf4_to_df(fname_in):  
    # Read nc file
    with Dataset(fname_in, "r", format='NETCDF4' ) as nc:
        # Read groups ('seawater', 'biota', 'sediment')
        groups= nc.groups.keys()
        # Read fill values 
        fill_value={}
        for group in groups:
            fill_value[group] = netcdf4_data.groups[group].variables['sample'][:].fill_value
    
    # Create dictionary of dataframes
    dfs={}
    for group in groups:
        # Read dataset
        ds = xr.open_dataset(fname_in, group=group,  decode_times=False)
        # Create Pandas dataframe 
        dfs[group]=ds.to_dataframe()
        # If the index is not 'sample' then set the index to be 'sample'
        if dfs[group].index.name != 'sample':
            dfs[group].set_index("sample", inplace=True)
        # Drop the rows where 'sample' uses the fill_value.
        dfs[group]=dfs[group].drop(fill_value[group], axis=0, errors='ignore') 
    return(dfs)

In [342]:
dfs = netcdf4_to_df(fname_in)
dfs

{'seawater':               lon        lat  smp_depth        time   h3  h3_unc  h3_dl  \
 sample                                                                    
 100     -1.973889  57.998890        0.0  1281916800  NaN     NaN     -1   
 101     -2.486944  58.484444        0.0  1281916800  NaN     NaN     -1   
 102     -2.007222  58.994446        0.0  1281916800  NaN     NaN     -1   
 103     -1.973889  57.998890        0.0  1281916800  NaN     NaN     -1   
 104     -2.486944  58.484444        0.0  1281916800  NaN     NaN     -1   
 ...           ...        ...        ...         ...  ...     ...    ...   
 16394   13.265278  73.723053     1675.0  1538092800  NaN     NaN     -1   
 16399   13.265278  73.723053     1675.0  1538092800  5.1     NaN      3   
 18659   13.267500  73.723610     1680.0  1630713600  NaN     NaN     -1   
 18694   13.267500  73.723610     1680.0  1630713600  NaN     NaN     -1   
 9439   -12.666667  68.000000     1850.0   975283200  NaN     NaN     -1   


In [344]:
dfs['seawater']

Unnamed: 0_level_0,lon,lat,smp_depth,time,h3,h3_unc,h3_dl,h3_unit,tc99,tc99_unc,...,ra226_dl,ra226_unit,ra228,ra228_unc,ra228_dl,ra228_unit,pu239_240_tot,pu239_240_tot_unc,pu239_240_tot_dl,pu239_240_tot_unit
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100,-1.973889,57.998890,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,,,-1,-1
101,-2.486944,58.484444,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,,,-1,-1
102,-2.007222,58.994446,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,,,-1,-1
103,-1.973889,57.998890,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,0.000032,1.610000e-06,2,1
104,-2.486944,58.484444,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,0.000031,1.565000e-06,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16394,13.265278,73.723053,1675.0,1538092800,,,-1,-1,,,...,-1,-1,,,-1,-1,0.000012,1.250000e-06,2,1
16399,13.265278,73.723053,1675.0,1538092800,5.1,,3,1,,,...,-1,-1,,,-1,-1,,,-1,-1
18659,13.267500,73.723610,1680.0,1630713600,,,-1,-1,,,...,-1,-1,,,-1,-1,,,-1,-1
18694,13.267500,73.723610,1680.0,1630713600,,,-1,-1,,,...,-1,-1,,,-1,-1,0.000010,8.000000e-07,2,1


### Reshape: wide to long

In [345]:
#| export
class ReshapeWideToLong(Callback):
    "Convert data from wide to long with renamed columns."
    def __init__(self, columns='nuclide', values=['value']):
        fc.store_attr()
        # Retrieve all possible suffixes vars (e.g '_unc', '_dl', ...) from configs
        suff_cfg = [value['name'] for value in cdl_cfg()['vars']['suffixes'].values()]
        # Retrieve all possible nuclides
        nucs_cfg = pd.read_excel(nuc_lut_path())['nc_name'].to_list()
        nucs_cfg = [x for x in nucs_cfg if str(x) != 'nan'] # remove 'nan' from nuclide list
        # Retrieve all possible vars thats are not in vars suffixes
        self.vars_cfg=[x['name'] for var_key in cdl_cfg()['vars'].keys() for x in cdl_cfg()['vars'][var_key].values() if var_key != 'suffixes']
        # combine all possible nuclides with its suffixes.    
        value_name='Activity or MDA'
        derived_nucs_cols={value_name:nucs_cfg}     
        for suf in suff_cfg:
            derived_nucs_cols[suf]= [str(nuc)+str(suf) for nuc in nucs_cfg]
        self.derived_nucs_cols=derived_nucs_cols
           
    def melt(self, df):
        # Among all possible 'self.derived_nuc_cols' include the ones present in df.
        derived_nucs_cols={}
        for key,derived_nuc_cols in self.derived_nucs_cols.items():
            derived_nuc_cols = [col for col in derived_nuc_cols if col in df.columns]
            if derived_nuc_cols:
                derived_nucs_cols[key] = derived_nuc_cols
        
        # Among all possible 'self.vars_cfg' include the ones present in df.
        vars_cfg = [var for var in self.vars_cfg if var in df.columns]
        
        # Melt cols included in self.derived_nucs_cols        
        df=df.reset_index()  # Reset the index so 'sample' can be used with id_vars
        nuc_dfs={}
        for key,val in derived_nucs_cols.items():
            # Transpose nuclide_cols
            df_t=pd.melt(frame=df, id_vars=vars_cfg+['sample'], value_vars=val, var_name='nuclide', value_name=key)
            df_t['nuclide']=df_t['nuclide'].str.replace(key, '')
            # Keep rows where 'key' value is not nan
            df_t=df_t[df_t[key].notna()]
            nuc_dfs[key]=df_t
        
        # Merge dfs created from melt. 
        combine_on= vars_cfg + ['sample'] + ['nuclide']
        merged_df=reduce(lambda df1, df2: pd.merge(df1, df2,  how='outer', left_on= combine_on, right_on = combine_on), nuc_dfs.values())
        # Keep rows where either value_name (i.e.Activity or MDA ) or _unc are not 'nan'.
        merged_df = merged_df[merged_df[['Activity or MDA','_unc']].notna().any(axis=1)]
        return (merged_df)
    
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k] = self.melt(tfm.dfs[k])


In [346]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong()])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

TODO: check the dfs as the dimensions should be longer. this means each row has a single nuclide. Is this what we want?

In [347]:
tfm.dfs['biota']

Unnamed: 0,lon,lat,time,bio_group,species,body_part,sample,nuclide,Activity or MDA,_unc,_dl,_unit
1,-39.634445,66.784164,1017014400,4,99,52,11474,cs137,0.1800,0.014634,2,5
11,-39.150002,62.116665,1375228800,4,426,52,4714,cs137,0.2198,0.025633,2,5
21,-35.919998,64.289719,1287273600,4,99,52,6465,cs137,0.2090,0.008778,2,5
31,-35.099998,64.720001,1287100800,4,381,52,6466,cs137,0.1830,0.007503,2,5
41,-34.000000,64.000000,1306886400,4,99,52,6143,cs137,0.1696,0.018266,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...
79921,34.336113,72.420280,1629676800,4,404,34,259,cs137,0.1200,0.013500,2,5
79931,34.351665,72.426392,1535760000,4,404,52,1913,cs137,0.1200,0.020000,2,5
79941,35.521946,78.758888,1632355200,4,402,3,140,cs137,0.0420,0.016500,2,5
79951,36.581112,73.510002,1536537600,4,99,52,1878,cs137,0.1700,0.015000,2,5


***

### Parse Time

In [531]:
#| export
class ParseTimeCB(Callback):
    def __init__(self, cfg): fc.store_attr()
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k]['Sampling start date'] = tfm.dfs[k]['time'].apply(self.format_time)
    def format_time(self, x): 
        date = num2pydate(x, units=self.cfg['units']['time'])
        date=date.strftime('%d-%b-%Y')
        return date

In [532]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg())])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

***

### Sample Type 

In [545]:
#| export
class LookupSampleType(Callback):
    def __init__(self): fc.store_attr()
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k]['Sample type'] = k.upper()
            

In [546]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType()])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

***

### Nuclide lookup

In [536]:
#| export
def get_nucnames_lut():
    df_nuclide = pd.read_excel(nuc_lut_path(), usecols=['nc_name','nusymbol'])
    return df_nuclide.set_index('nc_name').to_dict()['nusymbol']

TODO Review isotopes. Cs127?

In [537]:
# | export
class LookupNuclideIdCB(Callback):
    "Lookup MARIS nuclide_id."
    def __init__(self,
                 fn_lut=get_nucnames_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            tfm.dfs[k]['Nuclide'] = tfm.dfs[k]['nuclide'].replace(lut)
            tfm.dfs[k]['Nuclide']=tfm.dfs[k]['Nuclide'].str.strip()
            tfm.dfs[k]['Nuclide']=tfm.dfs[k]['Nuclide'].str.replace(',','_')
            
            

TODO Check the correct format. Went from h3 to 3H. 

In [538]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

In [539]:
tfm.dfs['biota']['Nuclide'].unique()

array(['137Cs', '210Po', '99Tc', '239_240Pu', '238Pu', '241Am', '210Pb',
       '226Ra', '228Ra', '3H'], dtype=object)

***

### Lon, Lat coordinates

Convert from Longitude and Latitude DDD.DDDDD° to degrees, minutes, seconds and direction.

In [540]:
# | export
def deg_to_dms(deg, coordinate='lat'):
    """Convert from decimal degrees to degrees, minutes, seconds."""
    #print(deg)
    m, s = divmod(abs(deg)*3600, 60)
    d, m = divmod(m, 60)
    
    if deg < 0:
        if coordinate == 'lat':
            cord = 'S'
        elif coordinate == 'lon':
            cord = 'W'
    else:
        if coordinate == 'lat':
            cord = 'N' 
        elif coordinate == 'lon':
            cord = 'E'                       
    
    #print(d, m ,s , cord)
    
    d, m = int(d), int(m)
    
    return pd.Series([d, m, s, cord])

In [541]:
# | export
class ConvertLonLatCB(Callback):
    "Convert from Longitude and Latitude DDD.DDDDD° to degrees, minutes, seconds and direction."
    def __init__(self, fn_convert=deg_to_dms):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp][['Latitude degrees','Latitude minutes','Latitude seconds','Latitude direction']] = tfm.dfs[grp]['lat'].apply(self.fn_convert, coordinate='lat')
            tfm.dfs[grp][['Longitude degrees','Longitude minutes','Longitude seconds','Longitude direction']] = tfm.dfs[grp]['lon'].apply(self.fn_convert, coordinate='lon')


In [553]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB()
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

In [544]:
tfm.dfs['seawater'].columns

Index(['lon', 'lat', 'smp_depth', 'time', 'sample', 'nuclide',
       'Activity or MDA', '_unc', '_dl', '_unit', 'Sampling start date',
       'Sample type', 'Nuclide', 'Latitude degrees', 'Latitude minutes',
       'Latitude seconds', 'Latitude direction', 'Longitude degrees',
       'Longitude minutes', 'Longitude seconds', 'Longitude direction'],
      dtype='object')

***

### Unit lookup

In [551]:
#| export
def get_unitnames_lut():
    df_unit = pd.read_excel(unit_lut_path(), usecols=['unit_id','unit'])
    return df_unit.set_index('unit_id').to_dict()['unit']

In [554]:
# | export
class LookupUnitIdCB(Callback):
    "Lookup MARIS unit_id."
    def __init__(self,
                 fn_lut=get_unitnames_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            tfm.dfs[k]['Unit'] = tfm.dfs[k]['_unit'].replace(lut)
                        

In [555]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitIdCB()
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

In [558]:
tfm.dfs['seawater'].columns

Index(['lon', 'lat', 'smp_depth', 'time', 'sample', 'nuclide',
       'Activity or MDA', '_unc', '_dl', '_unit', 'Sampling start date',
       'Sample type', 'Nuclide', 'Latitude degrees', 'Latitude minutes',
       'Latitude seconds', 'Latitude direction', 'Longitude degrees',
       'Longitude minutes', 'Longitude seconds', 'Longitude direction',
       'Unit'],
      dtype='object')

In [560]:
tfm.dfs['seawater']['_dl']

3        2
13       2
14       2
21       2
31       2
        ..
70595    2
70605    2
70609    2
70619    2
70624    2
Name: _dl, Length: 10090, dtype: int64

***

### Value type (_dl) lookup


In [579]:
#| export
def get_detectionlimitnames_lut():
    df_unit = pd.read_excel(detection_limit_lut_path(), usecols=['id','name'])
    return df_unit.set_index('id').to_dict()['name']

In [580]:
# | export
class LookupValueTypeIdCB(Callback):
    "Lookup MARIS unit_id."
    def __init__(self,
                 fn_lut=get_detectionlimitnames_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            tfm.dfs[k]['Value type'] = tfm.dfs[k]['_dl'].replace(lut)
                        

In [581]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitIdCB(),
                            LookupValueTypeIdCB()
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

In [586]:
tfm.dfs['biota'].columns

Index(['lon', 'lat', 'time', 'bio_group', 'species', 'body_part', 'sample',
       'nuclide', 'Activity or MDA', '_unc', '_dl', '_unit',
       'Sampling start date', 'Sample type', 'Nuclide', 'Latitude degrees',
       'Latitude minutes', 'Latitude seconds', 'Latitude direction',
       'Longitude degrees', 'Longitude minutes', 'Longitude seconds',
       'Longitude direction', 'Unit', 'Value type'],
      dtype='object')

In [587]:
tfm.dfs['biota']['bio_group']

1        4
11       4
21       4
31       4
41       4
        ..
79921    4
79931    4
79941    4
79951    4
79961    4
Name: bio_group, Length: 7997, dtype: int64

#### Biogroup

# HERE

Create a biogroup_lut in configs

In [588]:
#| export
def get_biogroup_lut(maris_lut):
    species = pd.read_excel(maris_lut)
    return species[['species_id', 'biogroup_id']].set_index('species_id').to_dict()['biogroup_id']

In [589]:
get_biogroup_lut()

TypeError: get_biogroup_lut() missing 1 required positional argument: 'maris_lut'

In [None]:
#| export
class LookupBiogroupCB(Callback):
    """
    Update biogroup id  based on MARIS dbo_species.xlsx
    """
    def __init__(self, fn_lut): fc.store_attr()
    def __call__(self, tfm):
        lut = self.fn_lut()
        tfm.dfs['biota']['bio_group'] = tfm.dfs['biota']['species'].apply(lambda x: lut[x])

### Rename columns

Rename 

TODO: What is ref_id? 
Should/is 'sample' used to create a ref_id? 

TODO: List COI in order

TODO 'nuclide' will be dropped in the actual OR

In [None]:
#| export
# Define columns of interest by sample type
coi_grp = {'seawater': ['sample', 'lon', 'lat', 'depth', 'time', 'nuclide', 'activity',
                     'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
       'sediment' : ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type', 'nuclide',
                     'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
       'biota' : ['sample', 'lon', 'lat', 'depth', 'time', 'species_id', 'body_part',
                     'nuclide', 'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id']}

In [None]:
#| export
# Define column names renaming rules
renaming_rules = {
    'lat': 'latitude',
    'lon': 'longitude',
    'time': 'begperiod',
    'depth': 'sampdepth',
    'nuclide': 'nuclide_id',
    'uncertainty': 'uncertaint',
    'dl': 'detection',
    'sed_type': 'sedtype_id (0)',
    'species_id': 'species_id (0)',
    'body_part': 'bodypar_id',
}

In [None]:
#| export
class RenameColumnCB(Callback):
    def __init__(self,
                 coi=coi_grp,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # Select cols of interest
            tfm.dfs[k] = tfm.dfs[k].loc[:, self.coi[k]]

            # Rename cols
            tfm.dfs[k].rename(columns=self.renaming_rules, inplace=True)

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            RenameColumnCB()
                            ])
tfm()

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


{'seawater':         sample  longitude   latitude  sampdepth  begperiod     nuclide_id  \
 8            0  11.078300  54.349998        0.0 1986-05-09          cs134   
 9            0  11.078300  54.349998        0.0 1986-05-09          cs137   
 38           1  10.316700  54.500000        0.0 1986-05-11          cs134   
 39           1  10.316700  54.500000        0.0 1986-05-11          cs137   
 69           2  21.026600  55.305000        0.0 2019-02-12          cs137   
 ...        ...        ...        ...        ...        ...            ...   
 144248    4818  18.235001  58.583302      460.0 1996-07-25          cs134   
 144249    4818  18.235001  58.583302      460.0 1996-07-25          cs137   
 144257    4818  18.235001  58.583302      460.0 1996-07-25          pu238   
 144259    4818  18.235001  58.583302      460.0 1996-07-25  pu239_240_tot   
 144265    4818  18.235001  58.583302      460.0 1996-07-25           sr90   
 
            activity  uncertaint  unit_id  detecti

***

### Encoding

In [None]:
#| export
def encode(fname_in, fname_out, **kwargs):
    dfs = netcdf4_to_df(fname_in)
    tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                                ParseTimeCB(cfg()),
                                LookupSampleType(),
                                LookupNuclideIdCB(),
                                RenameColumnCB()
                                ])
    
    encoder = OpenRefineCsvEncoder(tfm(), 
                            dest_fname=fname_out, 
                            **kwargs)
    encoder.encode()
    return encoder

In [None]:
encode(fname_in, fname_out, verbose=False)

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


<marisco.serializers.OpenRefineCsvEncoder>