In [None]:
#| default_exp netcdf_to_csv

# NetCDF to Open Refine CSV (WIP)

***

## Packages import

In [None]:
#| export
from pathlib import Path # This module offers classes representing filesystem paths
import xarray as xr
from netCDF4 import Dataset
import pandas as pd
import xarray as xr
import numpy as np
from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)
import fastcore.all as fc # package that brings fastcore functionality, see https://fastcore.fast.ai/.
from cftime import num2pydate 
from marisco.configs import cfg, cdl_cfg, nuc_lut_path, unit_lut_path, detection_limit_lut_path, species_lut_path, bodyparts_lut_path, sediments_lut_path
from marisco.serializers import OpenRefineCsvEncoder
from functools import reduce,partial

Get the current working directory (cwd).

In [None]:
#|eval: false
Path.cwd()

Path('/home/marisco/downloads/marisco/nbs/handlers')

In [None]:
# | export
# Input NetCDF file to convert and path of the CSV file to produce.
fname_in = '../../_data/output/100-HELCOM-MORS-2024.nc'
fname_out = '../../_data/output/ospar_test.csv'
# NOTE(review): the input file name refers to HELCOM while the output file name and
# the comment below refer to OSPAR -- confirm which dataset / ref_id is intended.
ref_id=191 # OSPAR ref_id

## Load NetCDF 

In [None]:
# | export
def netcdf4_to_df(fname_in):
    """Load every group of a NetCDF4 file into a pandas DataFrame indexed by 'sample'.

    Args:
        fname_in: Path of the NetCDF4 file to read.

    Returns:
        dict mapping each group name found in the file to a DataFrame whose index
        is 'sample'. Rows whose 'sample' index equals the fill value of the group's
        'sample' variable are dropped.
    """
    # First pass (netCDF4): collect, for each group, the fill value of its 'sample'
    # variable. Building a plain dict here means nothing refers to the file once
    # the `with` block has closed it.
    with Dataset(fname_in, "r", format='NETCDF4') as nc:
        fill_value = {group: grp.variables['sample'][:].fill_value
                      for group, grp in nc.groups.items()}

    # Second pass (xarray): one DataFrame per group.
    dfs = {}
    for group, fill in fill_value.items():
        # The context manager closes the dataset (and its file handle) as soon as
        # the data has been materialised by `to_dataframe`.
        with xr.open_dataset(fname_in, group=group, decode_times=False) as ds:
            df = ds.to_dataframe()
        # If the index is not 'sample' then set the index to be 'sample'.
        if df.index.name != 'sample':
            df = df.set_index("sample")
        # Drop the rows where 'sample' uses the fill value (no error if absent).
        dfs[group] = df.drop(fill, axis=0, errors='ignore')
    return dfs

In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
dfs

{'seawater':               lon        lat  smp_depth  tot_depth        time  h3  h3_unc  \
 sample                                                                       
 18083   14.257800  53.942200        0.0       10.0  1339545600 NaN     NaN   
 18264   14.257800  53.942200        0.0       10.0  1339545600 NaN     NaN   
 18592   14.257800  53.942200        0.0        9.0  1370390400 NaN     NaN   
 18643   14.257800  53.942200        0.0        9.0  1370390400 NaN     NaN   
 19023   14.263100  53.948101        0.0        9.0  1401753600 NaN     NaN   
 ...           ...        ...        ...        ...         ...  ..     ...   
 3633    18.233299  58.583302      433.0      435.0   614044800 NaN     NaN   
 3634    18.233299  58.583302      433.0      435.0   614044800 NaN     NaN   
 13276   18.231701  58.583302      437.0      460.0   965260800 NaN     NaN   
 18604   19.333300  54.833302        NaN      107.0  1370476800 NaN     NaN   
 18655   19.333300  54.833302        NaN

In [None]:
#|eval: false
dfs['seawater']

Unnamed: 0_level_0,lon,lat,smp_depth,tot_depth,time,h3,h3_unc,h3_dl,h3_sal,h3_temp,...,pu239_240_tot_dl,pu239_240_tot_sal,pu239_240_tot_temp,pu239_240_tot_unit,cm243_244_tot,cm243_244_tot_unc,cm243_244_tot_dl,cm243_244_tot_sal,cm243_244_tot_temp,cm243_244_tot_unit
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18083,14.257800,53.942200,0.0,10.0,1339545600,,,-1,,,...,-1,,,-1,,,-1,,,-1
18264,14.257800,53.942200,0.0,10.0,1339545600,,,-1,,,...,-1,,,-1,,,-1,,,-1
18592,14.257800,53.942200,0.0,9.0,1370390400,,,-1,,,...,-1,,,-1,,,-1,,,-1
18643,14.257800,53.942200,0.0,9.0,1370390400,,,-1,,,...,-1,,,-1,,,-1,,,-1
19023,14.263100,53.948101,0.0,9.0,1401753600,,,-1,,,...,-1,,,-1,,,-1,,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3633,18.233299,58.583302,433.0,435.0,614044800,,,-1,,,...,-1,,,-1,,,-1,,,-1
3634,18.233299,58.583302,433.0,435.0,614044800,,,-1,,,...,1,9.84,4.0,1,,,-1,,,-1
13276,18.231701,58.583302,437.0,460.0,965260800,,,-1,,,...,-1,,,-1,,,-1,,,-1
18604,19.333300,54.833302,,107.0,1370476800,,,-1,,,...,-1,,,-1,,,-1,,,-1


## Transform data

### Reshape: wide to long

In [None]:
#| export
class ReshapeWideToLong(Callback):
    "Convert data from wide to long with renamed columns."
    def __init__(self, columns='nuclide', values=['value']):
        # `columns` and `values` are only stored as attributes; nothing below reads them.
        fc.store_attr()
        # Read the 'vars' section of the configuration once.
        vars_section = cdl_cfg()['vars']
        # All possible suffixes (e.g. '_unc', '_dl', ...) from configs.
        suff_cfg = [value['name'] for value in vars_section['suffixes'].values()]
        # All possible nuclides; missing names read from the lookup file show up as 'nan'.
        nucs_cfg = pd.read_excel(nuc_lut_path())['nc_name'].to_list()
        nucs_cfg = [x for x in nucs_cfg if str(x) != 'nan']
        # All possible variable names that are not suffixes.
        self.vars_cfg = [x['name']
                         for var_key, var_group in vars_section.items() if var_key != 'suffixes'
                         for x in var_group.values()]
        # Map each output column to the wide columns feeding it: the bare nuclide
        # names feed 'Activity or MDA'; '<nuclide><suffix>' columns feed '<suffix>'.
        value_name = 'Activity or MDA'
        derived_nucs_cols = {value_name: nucs_cfg}
        for suf in suff_cfg:
            derived_nucs_cols[suf] = [str(nuc) + str(suf) for nuc in nucs_cfg]
        self.derived_nucs_cols = derived_nucs_cols

    def melt(self, df):
        "Return the long-format version of `df` (one row per sample and nuclide)."
        # Among all possible derived columns keep only the ones present in df.
        derived_nucs_cols = {}
        for key, candidates in self.derived_nucs_cols.items():
            present = [col for col in candidates if col in df.columns]
            if present:
                derived_nucs_cols[key] = present

        # Among all possible 'self.vars_cfg' include the ones present in df.
        vars_cfg = [var for var in self.vars_cfg if var in df.columns]

        # Reset the index so 'sample' can be used with id_vars.
        df = df.reset_index()
        id_vars = vars_cfg + ['sample']
        combine_on = id_vars + ['nuclide']

        # No nuclide-related column at all: return an empty long-format frame
        # instead of letting `reduce` fail on an empty sequence.
        if not derived_nucs_cols:
            return pd.DataFrame(columns=combine_on + list(self.derived_nucs_cols))

        nuc_dfs = {}
        for key, val in derived_nucs_cols.items():
            df_t = pd.melt(frame=df, id_vars=id_vars, value_vars=val,
                           var_name='nuclide', value_name=key)
            # Recover the nuclide name by removing the suffix from the END of the
            # column name only (the bare nuclide columns are left untouched).
            df_t['nuclide'] = df_t['nuclide'].map(
                lambda col, suffix=key: col[:-len(suffix)] if suffix and col.endswith(suffix) else col)
            # Keep rows where the melted value is not nan.
            nuc_dfs[key] = df_t[df_t[key].notna()]

        # Outer-merge the per-key frames on the identifying columns.
        merged_df = reduce(lambda df1, df2: pd.merge(df1, df2, how='outer', on=combine_on),
                           nuc_dfs.values())
        # Keep rows where either the value or its uncertainty is not nan; only
        # test the columns that actually exist to avoid a KeyError.
        filter_cols = [col for col in ('Activity or MDA', '_unc') if col in merged_df.columns]
        merged_df = merged_df[merged_df[filter_cols].notna().any(axis=1)]
        return merged_df

    def __call__(self, tfm):
        # Replace every group's wide frame by its long-format version.
        for k in tfm.dfs.keys():
            tfm.dfs[k] = self.melt(tfm.dfs[k])


In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong()])
tfm()

{'seawater':             lon        lat  smp_depth  tot_depth        time  sample nuclide  \
 8        9.6333  54.840000        0.0       17.0   597628800    3437   cs134   
 39       9.6333  54.840000        0.0       17.0   597628800    3438   cs137   
 68       9.6333  54.841702        0.0       16.0   690854400    4683   cs134   
 99       9.6333  54.841702        0.0       16.0   690854400    4684   cs137   
 128      9.6333  54.841702        0.0       17.0   724464000    5116   cs134   
 ...         ...        ...        ...        ...         ...     ...     ...   
 609415  31.1667  60.000000        5.0        NaN  1343088000      60    sr90   
 609430  31.1667  60.000000        5.0        NaN  1343088000      91      h3   
 609459  31.1700  60.000000        0.0        0.0  1293840000   17353   cs137   
 609505  31.1700  60.000000        0.0        0.0  1293840000   17396    sr90   
 609520  31.1700  60.000000        0.0        0.0  1293840000   17439      h3   
 
         Activ

In [None]:
#|eval: false
tfm.dfs['biota']

Unnamed: 0,lon,lat,smp_depth,time,bio_group,species,body_part,sample,nuclide,Activity or MDA,_unc,_dl,_unit
11,9.683333,54.516666,2.0,1323561600,4,50,52,192,co60,0.00848,,2,5
64,9.683333,54.516666,2.0,1323561600,4,50,52,193,cs134,0.00545,,2,5
124,9.683333,54.516666,2.0,1323561600,4,50,52,194,k40,117.00000,4.212000,1,0
170,9.683333,54.516666,2.0,1323561600,4,50,52,195,cs137,0.77600,0.029488,1,5
219,10.000000,54.750000,4.0,1323734400,4,99,52,184,co60,0.00794,,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
774190,27.750000,59.466999,0.0,1035763200,4,50,1,5959,cs137,8.00000,0.400000,1,5
774242,27.750000,59.466999,0.0,1069804800,4,50,1,5955,cs137,5.80000,0.290000,1,5
774294,27.750000,59.466999,0.0,1104364800,4,50,1,5951,cs137,6.50000,0.260000,1,5
774346,27.750000,59.466999,0.0,1134518400,4,50,1,5947,cs137,6.80000,0.136000,1,5


***

### Format: Time

In [None]:
#| export
class LookupTimeFromEncodedTime(Callback):
    "Add 'Sampling start date' and 'Sampling start time' string columns decoded from the 'time' column."
    def __init__(self, cfg): fc.store_attr()

    def _decode(self, x):
        # Decode one encoded time value using the time units found in the configuration.
        return num2pydate(x, units=self.cfg['units']['time'])

    def format_date(self, x):
        "Return the date part of encoded time `x`, formatted as '%d-%b-%Y'."
        return self._decode(x).strftime('%d-%b-%Y')

    def format_time(self, x):
        "Return the time part of encoded time `x`, formatted as '%H:%M:%S'."
        return self._decode(x).strftime('%H:%M:%S')

    def __call__(self, tfm):
        for df in tfm.dfs.values():
            encoded = df['time']
            df['Sampling start date'] = encoded.apply(self.format_date)
            df['Sampling start time'] = encoded.apply(self.format_time)

In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            LookupTimeFromEncodedTime(cfg())])
tfm()

{'seawater':             lon        lat  smp_depth  tot_depth        time  sample nuclide  \
 8        9.6333  54.840000        0.0       17.0   597628800    3437   cs134   
 39       9.6333  54.840000        0.0       17.0   597628800    3438   cs137   
 68       9.6333  54.841702        0.0       16.0   690854400    4683   cs134   
 99       9.6333  54.841702        0.0       16.0   690854400    4684   cs137   
 128      9.6333  54.841702        0.0       17.0   724464000    5116   cs134   
 ...         ...        ...        ...        ...         ...     ...     ...   
 609415  31.1667  60.000000        5.0        NaN  1343088000      60    sr90   
 609430  31.1667  60.000000        5.0        NaN  1343088000      91      h3   
 609459  31.1700  60.000000        0.0        0.0  1293840000   17353   cs137   
 609505  31.1700  60.000000        0.0        0.0  1293840000   17396    sr90   
 609520  31.1700  60.000000        0.0        0.0  1293840000   17439      h3   
 
         Activ

In [None]:
#|eval: false
tfm.dfs['seawater']['Sampling start date']

8        1988-12-09
39       1988-12-09
68       1991-11-23
99       1991-11-23
128      1992-12-16
            ...    
609415   2012-07-24
609430   2012-07-24
609459   2011-01-01
609505   2011-01-01
609520   2011-01-01
Name: Sampling start date, Length: 20242, dtype: datetime64[ns]

***

### Lookup: Sample Type 

In [None]:
#| export
class GetSampleTypeCB(Callback):
    "Add a 'Sample type' column holding the upper-cased name of each group."
    def __init__(self): fc.store_attr()
    def __call__(self, tfm):
        for grp, df in tfm.dfs.items():
            df['Sample type'] = grp.upper()
            

In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            LookupTimeFromEncodedTime(cfg()),
                            GetSampleTypeCB()])
tfm()['biota']['Sample type']

11        BIOTA
64        BIOTA
124       BIOTA
170       BIOTA
219       BIOTA
          ...  
774190    BIOTA
774242    BIOTA
774294    BIOTA
774346    BIOTA
774398    BIOTA
Name: Sample type, Length: 14873, dtype: object

***

### Lookup : Nuclide 

In [None]:
#| export
def get_nucnames_lut():
    "Return a dict mapping 'nc_name' to 'nusymbol', read from the nuclide lookup file."
    lut_df = pd.read_excel(nuc_lut_path(), usecols=['nc_name','nusymbol'])
    by_name = lut_df.set_index('nc_name')
    return by_name['nusymbol'].to_dict()

In [None]:
# | export
class LookupNuclideByIdCB(Callback):
    "Add a 'Nuclide' column: 'nuclide' values translated through the lookup table, stripped, with ',' replaced by '_'."
    def __init__(self,
                 fn_lut=get_nucnames_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            # Values without an entry in `lut` are left as they are by `replace`.
            df['Nuclide'] = (df['nuclide']
                             .replace(lut)
                             .str.strip()
                             .str.replace(',','_'))
            
            

In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            LookupTimeFromEncodedTime(cfg()),
                            GetSampleTypeCB(),
                            LookupNuclideByIdCB(),
                            ])
tfm()

{'seawater':             lon        lat  smp_depth  tot_depth        time  sample nuclide  \
 8        9.6333  54.840000        0.0       17.0   597628800    3437   cs134   
 39       9.6333  54.840000        0.0       17.0   597628800    3438   cs137   
 68       9.6333  54.841702        0.0       16.0   690854400    4683   cs134   
 99       9.6333  54.841702        0.0       16.0   690854400    4684   cs137   
 128      9.6333  54.841702        0.0       17.0   724464000    5116   cs134   
 ...         ...        ...        ...        ...         ...     ...     ...   
 609415  31.1667  60.000000        5.0        NaN  1343088000      60    sr90   
 609430  31.1667  60.000000        5.0        NaN  1343088000      91      h3   
 609459  31.1700  60.000000        0.0        0.0  1293840000   17353   cs137   
 609505  31.1700  60.000000        0.0        0.0  1293840000   17396    sr90   
 609520  31.1700  60.000000        0.0        0.0  1293840000   17439      h3   
 
         Activ

In [None]:
#|eval: false
tfm.dfs['biota']['Nuclide'].unique()

array(['60Co', '134Cs', '40K', '137Cs', '241Am', '239_240Pu', '238Pu',
       '90Sr', '108mAg', '106Ru', '110mAg', '125Sb', '228Ra', '228Th',
       '226Ra', '54Mn', '7Be', '95Zr', '65Zn', '58Co', '103Ru', '95Nb',
       '99Tc', '210Po', '210Pb', '57Co', '235U', '214Bi', '214Pb',
       '212Pb', '208Tl', '228Ac', '223Ra', '144Ce', '141Ce', '134_137Cs',
       '117mSn', '153Gd', '155Eu', '152Eu', '59Fe', '46Sc', '124Sb',
       '86Rb', '113Sn', '224Ra', '232Th', '129mTe', '89Sr', '140La',
       '131I', '140Ba'], dtype=object)

***

### Format : Longitude and Latitude 

Convert from Longitude and Latitude DDD.DDDDD° to degrees, minutes, seconds and direction.

In [None]:
# | export
def deg_to_dms(deg, coordinate='lat'):
    """Convert from decimal degrees to degrees, minutes, seconds and direction.

    Args:
        deg: Coordinate in decimal degrees (negative for south / west).
        coordinate: 'lat' for a latitude or 'lon' for a longitude.

    Returns:
        pd.Series holding [degrees (int), minutes (int), seconds (float), direction],
        where direction is 'N'/'S' for 'lat' and 'E'/'W' for 'lon'.

    Raises:
        ValueError: if `coordinate` is neither 'lat' nor 'lon'.
    """
    # (direction for deg >= 0, direction for deg < 0) per coordinate kind.
    directions = {'lat': ('N', 'S'), 'lon': ('E', 'W')}
    if coordinate not in directions:
        # Fail with a clear message rather than with an unbound local variable.
        raise ValueError(f"coordinate must be 'lat' or 'lon', got {coordinate!r}")
    positive, negative = directions[coordinate]
    cord = negative if deg < 0 else positive

    # Work on the absolute value expressed in seconds, then split it up.
    m, s = divmod(abs(deg)*3600, 60)
    d, m = divmod(m, 60)
    d, m = int(d), int(m)

    return pd.Series([d, m, s, cord])

In [None]:
# | export
class ConvertLonLatCB(Callback):
    "Split the decimal 'lat' and 'lon' columns into degrees, minutes, seconds and direction columns."
    def __init__(self, fn_convert=deg_to_dms):
        fc.store_attr()

    def __call__(self, tfm):
        # (source column, target columns); latitude is processed before longitude.
        targets = (
            ('lat', ['Latitude degrees','Latitude minutes','Latitude seconds','Latitude direction']),
            ('lon', ['Longitude degrees','Longitude minutes','Longitude seconds','Longitude direction']),
        )
        for grp in tfm.dfs.keys():
            df = tfm.dfs[grp]
            for source, new_cols in targets:
                df[new_cols] = df[source].apply(self.fn_convert, coordinate=source)


In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            LookupTimeFromEncodedTime(cfg()),
                            GetSampleTypeCB(),
                            LookupNuclideByIdCB(),
                            ConvertLonLatCB()
                            ])
tfm()

{'seawater':             lon        lat  smp_depth  tot_depth        time  sample nuclide  \
 8        9.6333  54.840000        0.0       17.0   597628800    3437   cs134   
 39       9.6333  54.840000        0.0       17.0   597628800    3438   cs137   
 68       9.6333  54.841702        0.0       16.0   690854400    4683   cs134   
 99       9.6333  54.841702        0.0       16.0   690854400    4684   cs137   
 128      9.6333  54.841702        0.0       17.0   724464000    5116   cs134   
 ...         ...        ...        ...        ...         ...     ...     ...   
 609415  31.1667  60.000000        5.0        NaN  1343088000      60    sr90   
 609430  31.1667  60.000000        5.0        NaN  1343088000      91      h3   
 609459  31.1700  60.000000        0.0        0.0  1293840000   17353   cs137   
 609505  31.1700  60.000000        0.0        0.0  1293840000   17396    sr90   
 609520  31.1700  60.000000        0.0        0.0  1293840000   17439      h3   
 
         Activ

In [None]:
#|eval: false
tfm.dfs['seawater'].columns

Index(['lon', 'lat', 'smp_depth', 'tot_depth', 'time', 'sample', 'nuclide',
       'Activity or MDA', '_unc', '_dl', '_sal', '_temp', '_unit',
       'Sampling start date', 'Sample type', 'Nuclide', 'Latitude degrees',
       'Latitude minutes', 'Latitude seconds', 'Latitude direction',
       'Longitude degrees', 'Longitude minutes', 'Longitude seconds',
       'Longitude direction'],
      dtype='object')

***

### Lookup : Units

In [None]:
#| export
def get_unitnames_lut():
    "Return a dict mapping 'unit_id' to 'unit', read from the unit lookup file."
    lut_df = pd.read_excel(unit_lut_path(), usecols=['unit_id','unit'])
    by_id = lut_df.set_index('unit_id')
    return by_id['unit'].to_dict()

In [None]:
# | export
class LookupUnitByIdCB(Callback):
    "Add a 'Unit' column: '_unit' values translated through the lookup table."
    def __init__(self,
                 fn_lut=get_unitnames_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            df['Unit'] = df['_unit'].replace(lut)
                        

In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            LookupTimeFromEncodedTime(cfg()),
                            GetSampleTypeCB(),
                            LookupNuclideByIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitByIdCB()
                            ])
tfm()

{'seawater':             lon        lat  smp_depth  tot_depth        time  sample nuclide  \
 8        9.6333  54.840000        0.0       17.0   597628800    3437   cs134   
 39       9.6333  54.840000        0.0       17.0   597628800    3438   cs137   
 68       9.6333  54.841702        0.0       16.0   690854400    4683   cs134   
 99       9.6333  54.841702        0.0       16.0   690854400    4684   cs137   
 128      9.6333  54.841702        0.0       17.0   724464000    5116   cs134   
 ...         ...        ...        ...        ...         ...     ...     ...   
 609415  31.1667  60.000000        5.0        NaN  1343088000      60    sr90   
 609430  31.1667  60.000000        5.0        NaN  1343088000      91      h3   
 609459  31.1700  60.000000        0.0        0.0  1293840000   17353   cs137   
 609505  31.1700  60.000000        0.0        0.0  1293840000   17396    sr90   
 609520  31.1700  60.000000        0.0        0.0  1293840000   17439      h3   
 
         Activ

In [None]:
#|eval: false
tfm.dfs['seawater'].columns

Index(['lon', 'lat', 'smp_depth', 'tot_depth', 'time', 'sample', 'nuclide',
       'Activity or MDA', '_unc', '_dl', '_sal', '_temp', '_unit',
       'Sampling start date', 'Sample type', 'Nuclide', 'Latitude degrees',
       'Latitude minutes', 'Latitude seconds', 'Latitude direction',
       'Longitude degrees', 'Longitude minutes', 'Longitude seconds',
       'Longitude direction', 'Unit'],
      dtype='object')

In [None]:
#|eval: false
tfm.dfs['seawater']['_dl']

8         1
39        1
68        1
99        1
128       1
         ..
609415    1
609430    1
609459    1
609505    1
609520    1
Name: _dl, Length: 20242, dtype: int64

***

### Lookup : Value type (_dl) 


In [None]:
#| export
def get_detectionlimitnames_lut():
    "Return a dict mapping 'id' to 'name', read from the detection limit lookup file."
    lut_df = pd.read_excel(detection_limit_lut_path(), usecols=['id','name'])
    by_id = lut_df.set_index('id')
    return by_id['name'].to_dict()

In [None]:
# | export
class LookupValueTypeByIdCB(Callback):
    "Add a 'Value type' column: '_dl' values translated through the lookup table."
    def __init__(self,
                 fn_lut=get_detectionlimitnames_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            df['Value type'] = df['_dl'].replace(lut)
                        

In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            LookupTimeFromEncodedTime(cfg()),
                            GetSampleTypeCB(),
                            LookupNuclideByIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitByIdCB(),
                            LookupValueTypeByIdCB()
                            ])
tfm()

{'seawater':             lon        lat  smp_depth  tot_depth        time  sample nuclide  \
 8        9.6333  54.840000        0.0       17.0   597628800    3437   cs134   
 39       9.6333  54.840000        0.0       17.0   597628800    3438   cs137   
 68       9.6333  54.841702        0.0       16.0   690854400    4683   cs134   
 99       9.6333  54.841702        0.0       16.0   690854400    4684   cs137   
 128      9.6333  54.841702        0.0       17.0   724464000    5116   cs134   
 ...         ...        ...        ...        ...         ...     ...     ...   
 609415  31.1667  60.000000        5.0        NaN  1343088000      60    sr90   
 609430  31.1667  60.000000        5.0        NaN  1343088000      91      h3   
 609459  31.1700  60.000000        0.0        0.0  1293840000   17353   cs137   
 609505  31.1700  60.000000        0.0        0.0  1293840000   17396    sr90   
 609520  31.1700  60.000000        0.0        0.0  1293840000   17439      h3   
 
         Activ

***

### Lookup : Biogroup

Biogroup is not used in the OpenRefine CSV format. TODO: confirm this.

***

### Lookup : Species

In [None]:
#| export
def get_species_lut():
    "Return a dict mapping 'species_id' to 'species', read from the species lookup file."
    lut_df = pd.read_excel(species_lut_path(), usecols=['species_id','species'])
    by_id = lut_df.set_index('species_id')
    return by_id['species'].to_dict()

In [None]:
# | export
class LookupSpeciesByIdCB(Callback):
    "Add a 'Species' column ('species' values translated through the lookup table) to groups having a 'species' column."
    def __init__(self,
                 fn_lut=get_species_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            # Groups without a 'species' column are left untouched.
            if 'species' not in df.columns:
                continue
            df['Species'] = df['species'].replace(lut)
                        

In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            LookupTimeFromEncodedTime(cfg()),
                            GetSampleTypeCB(),
                            LookupNuclideByIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitByIdCB(),
                            LookupValueTypeByIdCB(),
                            LookupSpeciesByIdCB()
                            ])
tfm()

{'seawater':             lon        lat  smp_depth  tot_depth        time  sample nuclide  \
 8        9.6333  54.840000        0.0       17.0   597628800    3437   cs134   
 39       9.6333  54.840000        0.0       17.0   597628800    3438   cs137   
 68       9.6333  54.841702        0.0       16.0   690854400    4683   cs134   
 99       9.6333  54.841702        0.0       16.0   690854400    4684   cs137   
 128      9.6333  54.841702        0.0       17.0   724464000    5116   cs134   
 ...         ...        ...        ...        ...         ...     ...     ...   
 609415  31.1667  60.000000        5.0        NaN  1343088000      60    sr90   
 609430  31.1667  60.000000        5.0        NaN  1343088000      91      h3   
 609459  31.1700  60.000000        0.0        0.0  1293840000   17353   cs137   
 609505  31.1700  60.000000        0.0        0.0  1293840000   17396    sr90   
 609520  31.1700  60.000000        0.0        0.0  1293840000   17439      h3   
 
         Activ

***

### Lookup : Body part

In [None]:
#| export
def get_bodypart_lut():
    "Return a dict mapping 'bodypar_id' to 'bodypar', read from the body parts lookup file."
    lut_df = pd.read_excel(bodyparts_lut_path(), usecols=['bodypar_id','bodypar'])
    by_id = lut_df.set_index('bodypar_id')
    return by_id['bodypar'].to_dict()

In [None]:
# | export
class LookupBodypartByIdCB(Callback):
    "Add a 'Body part' column ('body_part' values translated through the lookup table) to groups having a 'body_part' column."
    def __init__(self,
                 fn_lut=get_bodypart_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for df in tfm.dfs.values():
            # Groups without a 'body_part' column are left untouched.
            if 'body_part' not in df.columns:
                continue
            df['Body part'] = df['body_part'].replace(lut)
                        

In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            LookupTimeFromEncodedTime(cfg()),
                            GetSampleTypeCB(),
                            LookupNuclideByIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitByIdCB(),
                            LookupValueTypeByIdCB(),
                            LookupSpeciesByIdCB(),
                            LookupBodypartByIdCB()
                            ])
tfm()

{'seawater':             lon        lat  smp_depth  tot_depth        time  sample nuclide  \
 8        9.6333  54.840000        0.0       17.0   597628800    3437   cs134   
 39       9.6333  54.840000        0.0       17.0   597628800    3438   cs137   
 68       9.6333  54.841702        0.0       16.0   690854400    4683   cs134   
 99       9.6333  54.841702        0.0       16.0   690854400    4684   cs137   
 128      9.6333  54.841702        0.0       17.0   724464000    5116   cs134   
 ...         ...        ...        ...        ...         ...     ...     ...   
 609415  31.1667  60.000000        5.0        NaN  1343088000      60    sr90   
 609430  31.1667  60.000000        5.0        NaN  1343088000      91      h3   
 609459  31.1700  60.000000        0.0        0.0  1293840000   17353   cs137   
 609505  31.1700  60.000000        0.0        0.0  1293840000   17396    sr90   
 609520  31.1700  60.000000        0.0        0.0  1293840000   17439      h3   
 
         Activ

***

### Lookup : Sediment type

In [None]:
#| export
def get_sediments_lut():
    "Return a dict mapping 'sedtype_id' to 'sedtype', read from the sediments lookup file."
    lut_df = pd.read_excel(sediments_lut_path(), usecols=['sedtype_id','sedtype'])
    by_id = lut_df.set_index('sedtype_id')
    return by_id['sedtype'].to_dict()

In [None]:
# | export
class LookupSedimentTypeByIdCB(Callback):
    "Lookup MARIS sedtype by sedtype_id."
    def __init__(self,
                 fn_lut=get_sediments_lut):
        # The default lookup must be the sediment one: using the body part lookup
        # here would translate sediment type ids into body part names.
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            # Only groups carrying a 'sed_type' column get a 'Sediment type' column.
            if 'sed_type' in tfm.dfs[k].columns:
                tfm.dfs[k]['Sediment type'] = tfm.dfs[k]['sed_type'].replace(lut)
                        

In [None]:
#|eval: false
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[ReshapeWideToLong(),
                            LookupTimeFromEncodedTime(cfg()),
                            GetSampleTypeCB(),
                            LookupNuclideByIdCB(),
                            ConvertLonLatCB(), 
                            LookupUnitByIdCB(),
                            LookupValueTypeByIdCB(),
                            LookupSpeciesByIdCB(),
                            LookupBodypartByIdCB(),
                            LookupSedimentTypeByIdCB()
                            ])
tfm()

{'seawater':             lon        lat  smp_depth  tot_depth        time  sample nuclide  \
 8        9.6333  54.840000        0.0       17.0   597628800    3437   cs134   
 39       9.6333  54.840000        0.0       17.0   597628800    3438   cs137   
 68       9.6333  54.841702        0.0       16.0   690854400    4683   cs134   
 99       9.6333  54.841702        0.0       16.0   690854400    4684   cs137   
 128      9.6333  54.841702        0.0       17.0   724464000    5116   cs134   
 ...         ...        ...        ...        ...         ...     ...     ...   
 609415  31.1667  60.000000        5.0        NaN  1343088000      60    sr90   
 609430  31.1667  60.000000        5.0        NaN  1343088000      91      h3   
 609459  31.1700  60.000000        0.0        0.0  1293840000   17353   cs137   
 609505  31.1700  60.000000        0.0        0.0  1293840000   17396    sr90   
 609520  31.1700  60.000000        0.0        0.0  1293840000   17439      h3   
 
         Activ

***

### Rename columns

In [None]:
#| export
# Columns of interest (keys) and the names they are exported under (values),
# grouped by the tuple of groups each set of rules applies to.
# Commented-out entries are candidate columns that are not exported yet.
renaming_rules = {
    # DEFAULT
    ('seawater', 'biota', 'sediment'): {
        'Sample type': 'Sample type',
        'Latitude degrees': 'Latitude degrees',
        'Latitude minutes': 'Latitude minutes',
        'Latitude seconds': 'Latitude seconds',
        'Latitude direction': 'Latitude direction',
        'Longitude degrees': 'Longitude degrees',
        'Longitude minutes': 'Longitude minutes',
        'Longitude seconds': 'Longitude seconds',
        'Longitude direction': 'Longitude direction',
        'lat': 'Latitude decimal',
        'lon': 'Longitude decimal',
        'Sampling start date': 'Sampling start date',
        # 'Sampling start time': 'Sampling start time',
        # 'Sampling end date': 'Sampling end date',
        # 'Sampling end time': 'Sampling end time',
        'Nuclide': 'Nuclide',
        'Value type': 'Value type',
        'Unit': 'Unit',
        'Activity or MDA': 'Activity or MDA',
        '_unc': 'Uncertainty',
        # 'Quality flag': 'Quality flag',
        # 'Station ID': 'Station ID ',
        # 'Sample ID': 'Sample ID',
        # 'Profile or transect ID': 'Profile or transect ID',
        # 'Sampling method': 'Sampling method',
        # 'Preparation method': 'Preparation method',
        # 'Counting method': 'Counting method',
        # 'Sample notes': 'Sample notes',
        # 'Measurement notes': 'Measurement notes',
    },
    # SEAWATER
    ('seawater',): {
        # 'smp_depth': 'Sampling depth',
        # 'Salinity': 'Salinity',
        # 'Temperature': 'Temperature',
        # 'Filtered': 'Filtered',
        # 'Mesh size': 'Mesh size',
        # 'Total depth': 'Total depth',
    },
    # BIOTA & SEDIMENT
    ('biota', 'sediment'): {
        # 'Dry weight': 'Dry weight',
        # 'Wet weight': 'Wet weight',
        # 'Dry/wet ratio': 'Dry/wet ratio',
        # 'Drying method': 'Drying method',
    },
    # BIOTA
    ('biota',): {
        'Species': 'Species',
        'Body part': 'Body part',
    },
    # SEDIMENT
    ('sediment',): {
        # 'Top': 'Top',
        # 'Bottom': 'Bottom',
        # 'Sediment type': 'Sediment type',
    },
}

In [None]:
#| export
class RenameColumnCB(Callback):
    """Keep only the columns named in `renaming_rules` and rename them, group by group.

    `renaming_rules` maps a tuple of group names to a `{current_name: new_name}`
    dictionary; every entry whose tuple contains the group applies to that group.
    """
    def __init__(self,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            # Gather the rules of every entry whose key mentions this group
            # (on duplicate column names, the later entry wins)
            rules = {}
            for groups, mapping in self.renaming_rules.items():
                if grp in groups:
                    rules.update(mapping)
            # Restrict the dataframe to the columns covered by the rules, then rename them
            tfm.dfs[grp] = tfm.dfs[grp].loc[:, list(rules)].rename(columns=rules)

In [None]:
#|eval: false
# Load the NetCDF file into a dictionary of dataframes
dfs = netcdf4_to_df(fname_in)

# Assemble the callback pipeline and hand it to the Transformer
tfm = Transformer(
    dfs,
    cbs=[
        ReshapeWideToLong(),
        LookupTimeFromEncodedTime(cfg()),
        GetSampleTypeCB(),
        LookupNuclideByIdCB(),
        ConvertLonLatCB(),
        LookupUnitByIdCB(),
        LookupValueTypeByIdCB(),
        LookupSpeciesByIdCB(),
        LookupBodypartByIdCB(),
        LookupSedimentTypeByIdCB(),
        RenameColumnCB(),
    ],
)

# Run the pipeline; the returned value is displayed as the cell output
tfm()

{'seawater':        Sample type  Latitude degrees  Latitude minutes  Latitude seconds  \
 8         SEAWATER                54                50         24.000549   
 39        SEAWATER                54                50         24.000549   
 68        SEAWATER                54                50         30.125427   
 99        SEAWATER                54                50         30.125427   
 128       SEAWATER                54                50         30.125427   
 ...            ...               ...               ...               ...   
 609415    SEAWATER                60                 0          0.000000   
 609430    SEAWATER                60                 0          0.000000   
 609459    SEAWATER                60                 0          0.000000   
 609505    SEAWATER                60                 0          0.000000   
 609520    SEAWATER                60                 0          0.000000   
 
        Latitude direction  Longitude degrees  Longitude minut

***

### Encoding

In [None]:
#| export
def encode(fname_in, fname_out, ref_id=-1, **kwargs):
    """Transform the NetCDF file `fname_in` and encode it with `OpenRefineCsvEncoder`.

    Args:
        fname_in: path of the NetCDF file, read with `netcdf4_to_df`.
        fname_out: passed to the encoder as `dest_fname`.
        ref_id: passed to the encoder as `ref_id` (defaults to -1).
        **kwargs: forwarded unchanged to `OpenRefineCsvEncoder`.

    Returns:
        The `OpenRefineCsvEncoder` instance, after its `encode()` method has been called.
    """
    dfs = netcdf4_to_df(fname_in)

    # Callback pipeline handed to the Transformer
    callbacks = [
        ReshapeWideToLong(),
        LookupTimeFromEncodedTime(cfg()),
        GetSampleTypeCB(),
        LookupNuclideByIdCB(),
        ConvertLonLatCB(),
        LookupUnitByIdCB(),
        LookupValueTypeByIdCB(),
        LookupSpeciesByIdCB(),
        LookupBodypartByIdCB(),
        LookupSedimentTypeByIdCB(),
        RenameColumnCB(),
    ]
    tfm = Transformer(dfs, cbs=callbacks)

    # The result of running the transformer is what gets encoded
    transformed = tfm()
    encoder = OpenRefineCsvEncoder(transformed,
                                   dest_fname=fname_out,
                                   ref_id=ref_id,
                                   **kwargs)
    encoder.encode()
    return encoder

In [None]:
# Run the conversion with the `fname_in`, `fname_out` and `ref_id` values set near the top of the notebook;
# `verbose=False` is forwarded to `OpenRefineCsvEncoder` through `encode`'s **kwargs.
# NOTE(review): unlike the demo cell that runs the Transformer, this cell carries no
# "eval: false" directive — confirm it is meant to execute (and write the output file) on every run.
encode(fname_in, fname_out, ref_id, verbose=False)

<marisco.serializers.OpenRefineCsvEncoder>

TODO Review nuclides. Cs127?

TODO: Check the dfs, as the dimensions should be longer. This means each row has a single nuclide. Is this what we want?

TODO: Include refid in encoder

TODO HELCOM seawater is missing depth

TODO Check : OSPAR weight missing in netcdf? 

TODO: Should the var be called 'detection limit'? Is 'value type' more appropriate?

TODO: Biogroup is not used in the Open Refine CSV format. Confirm this.

TODO Ask about Species dbo. Paul said there is a larger one. 