In [1]:
#| default_exp netcdf_to_csv

# NetCDF to Open Refine CSV (WIP)

***

## Packages import

In [2]:
#| export
from pathlib import Path # This module offers classes representing filesystem paths
import xarray as xr
from netCDF4 import Dataset
import pandas as pd
import xarray as xr
import numpy as np
from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)
import fastcore.all as fc # package that brings fastcore functionality, see https://fastcore.fast.ai/.
from cftime import num2pydate 
from marisco.configs import cfg, cdl_cfg, nuc_lut_path, unit_lut_path, detection_limit_lut_path, species_lut_path, bodyparts_lut_path, sediments_lut_path
from marisco.serializers import OpenRefineCsvEncoder
from functools import reduce,partial

Get the current working directory (cwd).

In [3]:
#|eval: false
Path.cwd()

Path('/home/marisco/downloads/marisco/nbs/handlers')

In [4]:
# | export
# Path of the NetCDF file to read (passed to `netcdf4_to_df` in the cells below).
fname_in = '../../_data/output/ospar_19950103_2021214.nc'
# Path of the CSV to produce; not referenced in the visible cells —
# presumably consumed by the encoder further down (TODO confirm).
fname_out = '../../_data/output/ospar_test.csv'
ref_id=191 # OSPAR ref_id

## Load NetCDF 

In [5]:
# | export
def netcdf4_to_df(fname_in):
    """Load every group of a NetCDF4 file into a dict of DataFrames indexed by 'sample'.

    Args:
        fname_in: path of the NetCDF4 file to read.

    Returns:
        dict mapping each group name found in the file to a `pandas.DataFrame`
        whose index is 'sample'; rows whose 'sample' label equals the fill value
        of the group's 'sample' variable are dropped.
    """
    # First pass with netCDF4: collect group names and the 'sample' fill values.
    with Dataset(fname_in, "r", format='NETCDF4') as nc:
        # Materialise the names: they are reused after the file has been closed,
        # so do not rely on a live view over the closed dataset's dict.
        groups = list(nc.groups.keys())
        fill_value = {group: nc.groups[group].variables['sample'][:].fill_value
                      for group in groups}

    # Second pass with xarray: one dataframe per group.
    dfs = {}
    for group in groups:
        # The context manager releases the file handle; `to_dataframe` has
        # already copied the data into memory by the time the block exits.
        with xr.open_dataset(fname_in, group=group, decode_times=False) as ds:
            df = ds.to_dataframe()
        # If the index is not 'sample' then set the index to be 'sample'
        if df.index.name != 'sample':
            df = df.set_index("sample")
        # Drop the rows where 'sample' uses the fill_value.
        dfs[group] = df.drop(fill_value[group], axis=0, errors='ignore')
    return dfs

In [6]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
dfs

{'seawater':               lon        lat  smp_depth        time   h3  h3_unc  h3_dl  \
 sample                                                                    
 100     -1.973889  57.998890        0.0  1281916800  NaN     NaN     -1   
 101     -2.486944  58.484444        0.0  1281916800  NaN     NaN     -1   
 102     -2.007222  58.994446        0.0  1281916800  NaN     NaN     -1   
 103     -1.973889  57.998890        0.0  1281916800  NaN     NaN     -1   
 104     -2.486944  58.484444        0.0  1281916800  NaN     NaN     -1   
 ...           ...        ...        ...         ...  ...     ...    ...   
 16394   13.265278  73.723053     1675.0  1538092800  NaN     NaN     -1   
 16399   13.265278  73.723053     1675.0  1538092800  5.1     NaN      3   
 18659   13.267500  73.723610     1680.0  1630713600  NaN     NaN     -1   
 18694   13.267500  73.723610     1680.0  1630713600  NaN     NaN     -1   
 9439   -12.666667  68.000000     1850.0   975283200  NaN     NaN     -1   


In [7]:
#|eval: false
dfs['seawater']

Unnamed: 0_level_0,lon,lat,smp_depth,time,h3,h3_unc,h3_dl,h3_unit,tc99,tc99_unc,...,ra226_dl,ra226_unit,ra228,ra228_unc,ra228_dl,ra228_unit,pu239_240_tot,pu239_240_tot_unc,pu239_240_tot_dl,pu239_240_tot_unit
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100,-1.973889,57.998890,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,,,-1,-1
101,-2.486944,58.484444,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,,,-1,-1
102,-2.007222,58.994446,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,,,-1,-1
103,-1.973889,57.998890,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,0.000032,1.610000e-06,2,1
104,-2.486944,58.484444,0.0,1281916800,,,-1,-1,,,...,-1,-1,,,-1,-1,0.000031,1.565000e-06,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16394,13.265278,73.723053,1675.0,1538092800,,,-1,-1,,,...,-1,-1,,,-1,-1,0.000012,1.250000e-06,2,1
16399,13.265278,73.723053,1675.0,1538092800,5.1,,3,1,,,...,-1,-1,,,-1,-1,,,-1,-1
18659,13.267500,73.723610,1680.0,1630713600,,,-1,-1,,,...,-1,-1,,,-1,-1,,,-1,-1
18694,13.267500,73.723610,1680.0,1630713600,,,-1,-1,,,...,-1,-1,,,-1,-1,0.000010,8.000000e-07,2,1


## Transform data

### Reshape: wide to long

In [8]:
#| export
class ReshapeWideToLong(Callback):
    """Convert data from wide to long with renamed columns.

    Each wide nuclide column (e.g. `cs137`, `cs137_unc`, `cs137_dl`, ...) is melted
    so that the output has one row per (sample, nuclide) with one column for the
    measured value ('Activity or MDA') and one column per configured suffix.
    `columns` and `values` are stored on the instance but are not used by `melt`.
    """
    def __init__(self, columns='nuclide', values=['value']):
        fc.store_attr()
        # Evaluate the CDL config once instead of once per comprehension step.
        vars_by_kind = cdl_cfg()['vars']
        # Retrieve all possible suffixes vars (e.g '_unc', '_dl', ...) from configs
        suff_cfg = [value['name'] for value in vars_by_kind['suffixes'].values()]
        # Retrieve all possible nuclides
        nucs_cfg = pd.read_excel(nuc_lut_path())['nc_name'].to_list()
        nucs_cfg = [x for x in nucs_cfg if str(x) != 'nan'] # remove 'nan' from nuclide list
        # Retrieve all possible vars that are not in vars suffixes
        self.vars_cfg = [x['name']
                         for var_key, var_group in vars_by_kind.items() if var_key != 'suffixes'
                         for x in var_group.values()]
        # Name of the column receiving the bare nuclide columns (the measured value).
        self.value_name = 'Activity or MDA'
        # Combine all possible nuclides with their suffixes.
        derived_nucs_cols = {self.value_name: nucs_cfg}
        for suf in suff_cfg:
            derived_nucs_cols[suf] = [str(nuc)+str(suf) for nuc in nucs_cfg]
        self.derived_nucs_cols = derived_nucs_cols

    def melt(self, df):
        "Return the long-format version of `df` (one row per sample and nuclide)."
        # Among all possible 'self.derived_nucs_cols' include the ones present in df.
        derived_nucs_cols = {}
        for key, candidates in self.derived_nucs_cols.items():
            present = [col for col in candidates if col in df.columns]
            if present:
                derived_nucs_cols[key] = present

        # Among all possible 'self.vars_cfg' include the ones present in df.
        vars_cfg = [var for var in self.vars_cfg if var in df.columns]

        # Melt cols included in derived_nucs_cols
        df = df.reset_index()  # Reset the index so 'sample' can be used with id_vars
        id_vars = vars_cfg + ['sample']
        nuc_dfs = {}
        for key, val in derived_nucs_cols.items():
            df_t = pd.melt(frame=df, id_vars=id_vars, value_vars=val, var_name='nuclide', value_name=key)
            # Strip `key` only when it is a trailing suffix and treat it literally:
            # a plain `str.replace` would remove it anywhere in the name and may
            # interpret it as a regular expression depending on the pandas version.
            df_t['nuclide'] = df_t['nuclide'].map(
                lambda col, suf=key: col[:-len(suf)] if col.endswith(suf) else col)
            # Keep rows where 'key' value is not nan
            nuc_dfs[key] = df_t[df_t[key].notna()]

        # Merge dfs created from melt.
        combine_on = id_vars + ['nuclide']
        merged_df = reduce(lambda df1, df2: pd.merge(df1, df2, how='outer', left_on=combine_on, right_on=combine_on),
                           nuc_dfs.values())
        # Keep rows where either the value or '_unc' is not nan. Only test the columns
        # that actually exist so a group without '_unc' data does not raise KeyError;
        # with neither column present every row is dropped.
        required = [col for col in (self.value_name, '_unc') if col in merged_df.columns]
        merged_df = merged_df[merged_df[required].notna().any(axis=1)]
        return merged_df

    def __call__(self, tfm):
        "Replace every dataframe of `tfm.dfs` by its long-format version."
        for k in tfm.dfs.keys():
            tfm.dfs[k] = self.melt(tfm.dfs[k])


In [9]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong()])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

In [10]:
#|eval: false
tfm.dfs['biota']

Unnamed: 0,lon,lat,time,bio_group,species,body_part,sample,nuclide,Activity or MDA,_unc,_dl,_unit
1,-39.634445,66.784164,1017014400,4,99,52,11474,cs137,0.1800,0.014634,2,5
11,-39.150002,62.116665,1375228800,4,426,52,4714,cs137,0.2198,0.025633,2,5
21,-35.919998,64.289719,1287273600,4,99,52,6465,cs137,0.2090,0.008778,2,5
31,-35.099998,64.720001,1287100800,4,381,52,6466,cs137,0.1830,0.007503,2,5
41,-34.000000,64.000000,1306886400,4,99,52,6143,cs137,0.1696,0.018266,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...
79921,34.336113,72.420280,1629676800,4,404,34,259,cs137,0.1200,0.013500,2,5
79931,34.351665,72.426392,1535760000,4,404,52,1913,cs137,0.1200,0.020000,2,5
79941,35.521946,78.758888,1632355200,4,402,3,140,cs137,0.0420,0.016500,2,5
79951,36.581112,73.510002,1536537600,4,99,52,1878,cs137,0.1700,0.015000,2,5


***

### Format: Time

In [11]:
#| export
class ParseTimeCB(Callback):
    "Add a 'Sampling start date' column formatted as '%d-%b-%Y' from the numeric 'time' column."
    def __init__(self, cfg): fc.store_attr()

    def format_time(self, x):
        "Decode the numeric time `x` using the configured time units and format it."
        return num2pydate(x, units=self.cfg['units']['time']).strftime('%d-%b-%Y')

    def __call__(self, tfm):
        for df in tfm.dfs.values():
            df['Sampling start date'] = df['time'].apply(self.format_time)

In [12]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg())])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

***

### Lookup: Sample Type 

In [13]:
#| export
class LookupSampleType(Callback):
    "Add a 'Sample type' column holding the upper-cased group name."
    def __init__(self): fc.store_attr()

    def __call__(self, tfm):
        for grp, df in tfm.dfs.items():
            df['Sample type'] = grp.upper()
            

In [14]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType()])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

***

### Lookup : Nuclide 

In [15]:
#| export
def get_nucnames_lut():
    "Return a dict mapping `nc_name` to `nusymbol`, read from the nuclide lookup table."
    lut_df = pd.read_excel(nuc_lut_path(), usecols=['nc_name','nusymbol'])
    return lut_df.set_index('nc_name')['nusymbol'].to_dict()

In [16]:
# | export
class LookupNuclideIdCB(Callback):
    "Lookup MARIS nuclide_id."
    def __init__(self, fn_lut=get_nucnames_lut): fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            # Translate the names through the lookup table, then tidy up the result:
            # surrounding whitespace is trimmed and commas become underscores.
            symbols = df['nuclide'].replace(lut)
            df['Nuclide'] = symbols
            df['Nuclide'] = df['Nuclide'].str.strip()
            df['Nuclide'] = df['Nuclide'].str.replace(',','_')
            
            

In [17]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

In [18]:
#|eval: false
tfm.dfs['biota']['Nuclide'].unique()

array(['137Cs', '210Po', '99Tc', '239_240Pu', '238Pu', '241Am', '210Pb',
       '226Ra', '228Ra', '3H'], dtype=object)

***

### Format : Longitude and Latitude 

Convert longitude and latitude from decimal degrees (DDD.DDDDD°) to degrees, minutes, seconds and direction.

In [19]:
# | export
def deg_to_dms(deg, coordinate='lat'):
    """Convert from decimal degrees to degrees, minutes, seconds and direction.

    Args:
        deg: coordinate in decimal degrees; a negative value means South ('lat')
            or West ('lon').
        coordinate: 'lat' or 'lon', selects which pair of direction letters is used.

    Returns:
        `pd.Series` `[degrees (int), minutes (int), seconds (float), direction (str)]`.

    Raises:
        ValueError: if `coordinate` is neither 'lat' nor 'lon'.
    """
    # (direction for non-negative values, direction for negative values)
    directions = {'lat': ('N', 'S'), 'lon': ('E', 'W')}
    if coordinate not in directions:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(f"coordinate must be 'lat' or 'lon', got {coordinate!r}")

    m, s = divmod(abs(deg)*3600, 60)
    d, m = divmod(m, 60)

    positive, negative = directions[coordinate]
    cord = negative if deg < 0 else positive

    d, m = int(d), int(m)

    return pd.Series([d, m, s, cord])

In [20]:
# | export
class ConvertLonLatCB(Callback):
    "Split decimal 'lat' and 'lon' into degrees, minutes, seconds and direction columns."
    def __init__(self, fn_convert=deg_to_dms): fc.store_attr()

    def __call__(self, tfm):
        # (source column, the four columns filled from the converter's output)
        targets = (
            ('lat', ['Latitude degrees','Latitude minutes','Latitude seconds','Latitude direction']),
            ('lon', ['Longitude degrees','Longitude minutes','Longitude seconds','Longitude direction']),
        )
        for df in tfm.dfs.values():
            for src, cols in targets:
                df[cols] = df[src].apply(self.fn_convert, coordinate=src)


In [21]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB()
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

In [22]:
#|eval: false
tfm.dfs['seawater'].columns

Index(['lon', 'lat', 'smp_depth', 'time', 'sample', 'nuclide',
       'Activity or MDA', '_unc', '_dl', '_unit', 'Sampling start date',
       'Sample type', 'Nuclide', 'Latitude degrees', 'Latitude minutes',
       'Latitude seconds', 'Latitude direction', 'Longitude degrees',
       'Longitude minutes', 'Longitude seconds', 'Longitude direction'],
      dtype='object')

***

### Lookup : Units

In [23]:
#| export
def get_unitnames_lut():
    "Return a dict mapping `unit_id` to `unit`, read from the unit lookup table."
    lut_df = pd.read_excel(unit_lut_path(), usecols=['unit_id','unit'])
    return lut_df.set_index('unit_id')['unit'].to_dict()

In [24]:
# | export
class LookupUnitIdCB(Callback):
    "Lookup MARIS unit by unit_id."
    def __init__(self, fn_lut=get_unitnames_lut): fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            # '_unit' holds the ids; 'Unit' receives the looked-up names.
            df['Unit'] = df['_unit'].replace(lut)
                        

In [25]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitIdCB()
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

In [26]:
#|eval: false
tfm.dfs['seawater'].columns

Index(['lon', 'lat', 'smp_depth', 'time', 'sample', 'nuclide',
       'Activity or MDA', '_unc', '_dl', '_unit', 'Sampling start date',
       'Sample type', 'Nuclide', 'Latitude degrees', 'Latitude minutes',
       'Latitude seconds', 'Latitude direction', 'Longitude degrees',
       'Longitude minutes', 'Longitude seconds', 'Longitude direction',
       'Unit'],
      dtype='object')

In [27]:
#|eval: false
tfm.dfs['seawater']['_dl']

3        2
13       2
14       2
21       2
31       2
        ..
70595    2
70605    2
70609    2
70619    2
70624    2
Name: _dl, Length: 10090, dtype: int64

***

### Lookup : Value type (_dl) 


In [28]:
#| export
def get_detectionlimitnames_lut():
    "Return a dict mapping `id` to `name`, read from the detection limit lookup table."
    lut_df = pd.read_excel(detection_limit_lut_path(), usecols=['id','name'])
    return lut_df.set_index('id')['name'].to_dict()

In [29]:
# | export
class LookupValueTypeIdCB(Callback):
    "Lookup MARIS Value Type."
    def __init__(self, fn_lut=get_detectionlimitnames_lut): fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            # '_dl' holds the ids; 'Value type' receives the looked-up names.
            df['Value type'] = df['_dl'].replace(lut)
                        

In [30]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitIdCB(),
                            LookupValueTypeIdCB()
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

***

### Lookup : Biogroup

Biogroup is not used in the OpenRefine CSV format. **TODO**: confirm this.

***

### Lookup : Species

In [31]:
#| export
def get_species_lut():
    "Return a dict mapping `species_id` to `species`, read from the species lookup table."
    lut_df = pd.read_excel(species_lut_path(), usecols=['species_id','species'])
    return lut_df.set_index('species_id')['species'].to_dict()

In [32]:
# | export
class LookupSpeciesIdCB(Callback):
    "Lookup MARIS species by species_id."
    def __init__(self, fn_lut=get_species_lut): fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            # Groups without a 'species' column are left untouched.
            if 'species' not in df.columns:
                continue
            df['Species'] = df['species'].replace(lut)
                        

In [33]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitIdCB(),
                            LookupValueTypeIdCB(),
                            LookupSpeciesIdCB()
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

***

### Lookup : Body part

In [34]:
#| export
def get_bodypart_lut():
    "Return a dict mapping `bodypar_id` to `bodypar`, read from the body parts lookup table."
    lut_df = pd.read_excel(bodyparts_lut_path(), usecols=['bodypar_id','bodypar'])
    return lut_df.set_index('bodypar_id')['bodypar'].to_dict()

In [35]:
# | export
class LookupBodypartIdCB(Callback):
    "Lookup MARIS bodypart by bodypar_id."
    def __init__(self, fn_lut=get_bodypart_lut): fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            # Groups without a 'body_part' column are left untouched.
            if 'body_part' not in df.columns:
                continue
            df['Body part'] = df['body_part'].replace(lut)
                        

In [36]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitIdCB(),
                            LookupValueTypeIdCB(),
                            LookupSpeciesIdCB(),
                            LookupBodypartIdCB()
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

***

### Lookup :  Dry weight & Wet weight


In [37]:
#|eval: false
# Placeholder: the draft callback below is wrapped in a string literal, so this
# cell defines nothing; the text inside is currently a copy of the body-part lookup.
'''
# | export
class ConvertWetWeightDryWeightIdCB(Callback):
    "Covert  MARIS bodypart by bodypar_id."
    def __init__(self,
                 fn_lut=get_bodypart_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            if 'body_part' in tfm.dfs[k].columns: 
                tfm.dfs[k]['Body part'] = tfm.dfs[k]['body_part'].replace(lut)
'''

'\n# | export\nclass ConvertWetWeightDryWeightIdCB(Callback):\n    "Covert  MARIS bodypart by bodypar_id."\n    def __init__(self,\n                 fn_lut=get_bodypart_lut):\n        fc.store_attr()\n\n    def __call__(self, tfm):\n        lut = self.fn_lut()\n        for k in tfm.dfs.keys():\n            if \'body_part\' in tfm.dfs[k].columns: \n                tfm.dfs[k][\'Body part\'] = tfm.dfs[k][\'body_part\'].replace(lut)\n'

***

### Lookup : Sediment type

In [38]:
#| export
def get_sediments_lut():
    "Return a dict mapping `sedtype_id` to `sedtype`, read from the sediments lookup table."
    lut_df = pd.read_excel(sediments_lut_path(), usecols=['sedtype_id','sedtype'])
    return lut_df.set_index('sedtype_id')['sedtype'].to_dict()

In [39]:
# | export
class LookupSedimentTypeIdCB(Callback):
    """Lookup MARIS sedtype by sedtype_id.

    Args:
        fn_lut: zero-argument callable returning the `{sedtype_id: sedtype}` mapping.
            Defaults to `get_sediments_lut`; the previous default was the body-part
            lookup, which translated sediment ids with the wrong table.
    """
    def __init__(self,
                 fn_lut=get_sediments_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            # Only groups carrying a 'sed_type' column get a 'Sediment type' column.
            if 'sed_type' in tfm.dfs[k].columns:
                tfm.dfs[k]['Sediment type'] = tfm.dfs[k]['sed_type'].replace(lut)
                        

In [40]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitIdCB(),
                            LookupValueTypeIdCB(),
                            LookupSpeciesIdCB(),
                            LookupBodypartIdCB(),
                            LookupSedimentTypeIdCB()
                            ])
tfm()

{'seawater':              lon        lat  smp_depth        time  sample        nuclide  \
 3     -58.231667  68.816666        0.0  1339718400   15765  pu239_240_tot   
 13    -58.231667  68.816666        0.0  1339718400   15781           tc99   
 14    -58.231667  68.816666        0.0  1339718400   15799          cs137   
 21    -57.793499  68.100502        0.0  1371168000   15811          cs137   
 31    -57.582001  72.114502        0.0  1339718400   15766  pu239_240_tot   
 ...          ...        ...        ...         ...     ...            ...   
 70595  40.000000  72.500000        0.0   804902400    5973          cs137   
 70605  40.000000  72.500000        0.0   804902400    5985  pu239_240_tot   
 70609  40.000000  74.000000        0.0   804988800    5974          cs137   
 70619  40.000000  74.000000        0.0   804988800    5986  pu239_240_tot   
 70624  40.000000  74.000000        0.0   804988800    5992             h3   
 
        Activity or MDA          _unc  _dl  _unit 

***

### Rename columns

In [41]:
#| export
# Define columns of interest (keys) and renaming rules (values).
# Outer keys are tuples of group names; a rule set applies to a group when the group
# name is a member of the tuple (see `RenameColumnCB`). Inner dicts map a dataframe
# column name to its name in the output. Commented-out entries are not selected.
renaming_rules = {('seawater','biota', 'sediment') : {    
                                                        ## DEFAULT
                                                        'Sample type' : 'Sample type',
                                                        'Latitude degrees' : 'Latitude degrees',
                                                        'Latitude minutes' : 'Latitude minutes',
                                                        'Latitude seconds' : 'Latitude seconds',
                                                        'Latitude direction' : 'Latitude direction',
                                                        'Longitude degrees' : 'Longitude degrees',
                                                        'Longitude minutes' : 'Longitude minutes',
                                                        'Longitude seconds' : 'Longitude seconds', 
                                                        'Longitude direction' : 'Longitude direction',    
                                                        'lat' : 'Latitude decimal' ,
                                                        'lon' : 'Longitude decimal',
                                                        'Sampling start date': 'Sampling start date',
                                                        #'Sampling start time' : 'Sampling start time',
                                                        #'Sampling end date' : 'Sampling end date',
                                                        #'Sampling end time' : 'Sampling end time',
                                                        'Nuclide': 'Nuclide',
                                                        'Value type': 'Value type',
                                                        'Unit' : 'Unit',
                                                        'Activity or MDA' : 'Activity or MDA',
                                                        '_unc' : 'Uncertainty',
                                                        #'Quality flag' : 'Quality flag'
                                                        #'Station ID' : 'Station ID '
                                                        #'Sample ID' : 'Sample ID'
                                                        #'Profile or transect ID' : 'Profile or transect ID'
                                                        #'Sampling method' : 'Sampling method'
                                                        #'Preparation method' : 'Preparation method'
                                                        #'Counting method' : 'Counting method'
                                                        #'Sample notes' : 'Sample notes'
                                                        #'Measurement notes' : 'Measurement notes'
                                                    },
                  ('seawater',) : {
                                ## SEAWATER
                                #'smp_depth': 'Sampling depth',
                                #'Salinity' : 'Salinity',
                                #'Temperature' : 'Temperature',
                                #'Filtered' : 'Filtered',
                                #'Mesh size' : 'Mesh size',
                                #'Total depth' : 'Total depth'
                                },
                  ('biota', 'sediment') : {
                                            ## BIOTA & SEDIMENT
                                            #'Dry weight' : 'Dry weight',
                                            #'Wet weight' : 'Wet weight',
                                            #'Dry/wet ratio' : 'Dry/wet ratio',
                                            #'Drying method' : 'Drying method'
                                            },
                  ('biota',) : { 
                                ## BIOTA
                                'Species' : 'Species',
                                'Body part' : 'Body part'
                                },
                  ('sediment',) : {
                                ## SEDIMENT
                                #'Top' : 'Top',
                                #'Bottom' : 'Bottom',
                                #'Sediment type' : 'Sediment type'
                                }
                    }

In [42]:
#| export
class RenameColumnCB(Callback):
    "Keep, for each group, only the columns named in the applicable renaming rules and rename them."
    def __init__(self,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for grp_name, df in tfm.dfs.items():
            # Collect every rule set whose key (a tuple of group names) mentions this group
            mapping = {}
            for grp_names, col_map in self.renaming_rules.items():
                if grp_name in grp_names:
                    mapping.update(col_map)
            # Restrict to the mapped columns, then apply the new names in one go
            tfm.dfs[grp_name] = df.loc[:, list(mapping)].rename(columns=mapping)

In [43]:
#|eval: false
# Load the NetCDF file; netcdf4_to_df builds one DataFrame per NetCDF group.
dfs = netcdf4_to_df(fname_in)
# Assemble the conversion pipeline. The callbacks are listed in the order they are
# handed to Transformer (presumably also their execution order -- TODO confirm).
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitIdCB(),
                            LookupValueTypeIdCB(),
                            LookupSpeciesIdCB(),
                            LookupBodypartIdCB(),
                            LookupSedimentTypeIdCB(),
                            RenameColumnCB()
                            ])
# Run the pipeline; being the last expression of the cell, its result is displayed.
tfm()

{'seawater':       Sample type  Latitude degrees  Latitude minutes  Latitude seconds  \
 3        SEAWATER                68                48         59.996338   
 13       SEAWATER                68                48         59.996338   
 14       SEAWATER                68                48         59.996338   
 21       SEAWATER                68                 6          1.807251   
 31       SEAWATER                72                 6         52.207031   
 ...           ...               ...               ...               ...   
 70595    SEAWATER                72                30          0.000000   
 70605    SEAWATER                72                30          0.000000   
 70609    SEAWATER                74                 0          0.000000   
 70619    SEAWATER                74                 0          0.000000   
 70624    SEAWATER                74                 0          0.000000   
 
       Latitude direction  Longitude degrees  Longitude minutes  \
 3     

***

### Encoding

In [44]:
#| export
def encode(fname_in, fname_out, ref_id=-1, **kwargs):
    "Load `fname_in`, run the transformation callbacks and encode the result with `OpenRefineCsvEncoder`."
    # One DataFrame per NetCDF group
    dfs = netcdf4_to_df(fname_in)

    # Callbacks, in the order they are passed to the Transformer
    callbacks = [
        ReshapeWideToLong(),
        ParseTimeCB(cfg()),
        LookupSampleType(),
        LookupNuclideIdCB(),
        ConvertLonLatCB(),
        LookupUnitIdCB(),
        LookupValueTypeIdCB(),
        LookupSpeciesIdCB(),
        LookupBodypartIdCB(),
        LookupSedimentTypeIdCB(),
        RenameColumnCB(),
    ]
    transformed = Transformer(dfs, cbs=callbacks)()

    # Extra keyword arguments are forwarded untouched to the encoder
    encoder = OpenRefineCsvEncoder(transformed, dest_fname=fname_out, ref_id=ref_id, **kwargs)
    encoder.encode()
    return encoder

In [45]:
# Convert the file configured at the top of the notebook; `verbose` reaches
# OpenRefineCsvEncoder through encode's **kwargs.
encode(fname_in, fname_out, ref_id, verbose=False)

<marisco.serializers.OpenRefineCsvEncoder at 0x7f5b13edccd0>

TODO Review nuclides. Cs127?

TODO: Check the dfs, as their dimensions should be longer. This means each row holds a single nuclide. Is this what we want?

TODO: Include refid in encoder

TODO HELCOM seawater is missing depth

TODO: Check whether the OSPAR weight is missing in the NetCDF file.

TODO: Should the var be called 'detection limit'? Is 'value type' more appropriate?

TODO: Biogroup is not used in the OpenRefine CSV format. Confirm this.

TODO Ask about Species dbo. Paul said there is a larger one. 