In [None]:
#| default_exp handlers.tepco

# TEPCO 
> Data pipeline (handler) to convert TEPCO dataset ([Source](https://radioactivity.nsr.go.jp/ja/list/349/list-1.html)) to `NetCDF` format

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import fastcore.all as fc

from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)

from marisco.utils import has_valid_varname
from marisco.serializers import to_netcdf
from marisco.configs import get_nc_tpl_path

from marisco.metadata import (GlobAttrsFeeder, BboxCB,
                              DepthRangeCB, TimeRangeCB,
                              ZoteroCB, KeyValuePairCB)

In [None]:
# Path of the NetCDF template, as returned by marisco's `get_nc_tpl_path()`
NC_TPL_PATH = get_nc_tpl_path()

In [None]:
# Input Excel workbook and output NetCDF file (relative paths)
fname_in = '../../_data/xls/tepco/coastal_water.xlsx'
fname_out = '../../_data/output/tepco.nc'

## Utils

In [None]:
#| export
def load_data(fname_in, n_meas_rows=23643, loc_skiprows=23647, n_loc_rows=48):
    """Load TEPCO seawater data.

    The sheet stacks two tables: the measurements and, further down, the
    sampling locations. Both are read and inner-joined on
    `Sampling point number`.

    Args:
        fname_in: path of the Excel workbook.
        n_meas_rows: number of measurement rows to read (one sheet row is
            skipped before the measurements header).
        loc_skiprows: number of sheet rows to skip before the locations table.
        n_loc_rows: number of location rows to read.

    Returns:
        A dict with the single key `'seawater'` holding the merged DataFrame,
        whose index is named `'sample'`.
    """
    # Measurements table; an empty `Sampling time` cell falls back to midnight
    df_meas = pd.read_excel(fname_in,
                            skiprows=1, nrows=n_meas_rows,
                            converters={'Sampling time': lambda x: x if x != '' else '00:00:00'})
    # Locations table: only its first three columns are read
    df_loc = pd.read_excel(fname_in, skiprows=loc_skiprows, nrows=n_loc_rows,
                           usecols=[0, 1, 2])
    # Inner join: measurements whose sampling point has no location row are dropped
    df = pd.merge(df_meas, df_loc, how="inner", on='Sampling point number')
    # The join key is no longer needed once coordinates are attached
    df = df.drop(columns=['Sampling point number'])
    df.index.name = 'sample'
    return {'seawater': df}

## Configs

In [None]:
# Handler configuration: global attributes plus a source-header -> variable-name mapping.
# NOTE(review): `CONFIGS` is not referenced by any other cell visible in this notebook,
# and the `var_names` pairs are repeated in `renaming_rules_cols` / `renaming_rules_rdn`
# further down — confirm whether it is still needed.
CONFIGS = {
    'global_attr': {
        'description': 'TEPCO dataset ...',
        'summary': '...',
        # NOTE(review): mentions "sediments" whereas `load_data` loads seawater data — confirm.
        'keyword': 'MARIS TEPCO sediments',
        'license': 'tbd',
    },
    # Spreadsheet column header -> short variable name
    # (`_dl`: detection limit, `_unc`: statistical error, as per the headers)
    'var_names': {
        'Sampling coordinate North latitude (Decimal)': 'lat',
        'Sampling coordinate East longitude (Decimal)': 'lon',
        '131I radioactivity concentration (Bq/L)': 'i131',
        '131I detection limit (Bq/L)': 'i131_dl',
        '134Cs radioactivity concentration (Bq/L)': 'cs134',
        '134Cs detection limit (Bq/L)': 'cs134_dl',
        '137Cs radioactivity concentration (Bq/L)': 'cs137',
        '137Cs detection limit (Bq/L)': 'cs137_dl',
        '132I radioactivity concentration (Bq/L)': 'i132',
        '132I detection limit (Bq/L)': 'i132_dl',
        '132Te radioactivity concentration (Bq/L)': 'te132',
        '132Te detection limit (Bq/L)': 'te132_dl',
        '136Cs radioactivity concentration (Bq/L)': 'cs136',
        '136Cs detection limit (Bq/L)': 'cs136_dl',
        '140La radioactivity concentration (Bq/L)': 'la140',
        '140La detection limit (Bq/L)': 'la140_dl',
        '89Sr radioactivity concentration (Bq/L)': 'sr89',
        '89Sr detection limit (Bq/L)': 'sr89_dl',
        '90Sr radioactivity concentration (Bq/L)': 'sr90',
        '90Sr detection limit (Bq/L)': 'sr90_dl',
        '238Pu radioactivity concentration (Bq/L)': 'pu238',
        '238Pu detection limit (Bq/L)': 'pu238_dl',
        '239Pu+240Pu radioactivity concentration (Bq/L)': 'pu239_240_tot',
        '239Pu+240Pu statistical error (Bq/L)': 'pu239_240_tot_unc',
        '239Pu+240Pu detection limit (Bq/L)': 'pu239_240_tot_dl',
        'Total alpha radioactivity concentration (Bq/L)': 'talpha',
        'Total alpha detection limit (Bq/L)': 'talpha_dl',
        'Total beta radioactivity concentration (Bq/L)': 'tbeta',
        'Total beta detection limit (Bq/L)': 'tbeta_dl',
        '140Ba radioactivity concentration (Bq/L)': 'ba140',
        '140Ba detection limit (Bq/L)': 'ba140_dl',
        '106Ru radioactivity concentration (Bq/L)': 'ru106',
        '106Ru detection limit (Bq/L)': 'ru106_dl',
        '58Co radioactivity concentration (Bq/L)': 'co58',
        '58Co detection limit (Bq/L)': 'co58_dl',
        '60Co radioactivity concentration (Bq/L)': 'co60',
        '60Co detection limit (Bq/L)': 'co60_dl',
        '144Ce radioactivity concentration (Bq/L)': 'ce144',
        '144Ce detection limit (Bq/L)': 'ce144_dl',
        '54Mn radioactivity concentration (Bq/L)': 'mn54',
        '54Mn detection limit (Bq/L)': 'mn54_dl',
        '3H radioactivity concentration (Bq/L)': 'h3',
        '3H detection limit (Bq/L)': 'h3_dl', 
        '125Sb radioactivity concentration (Bq/L)': 'sb125',
        '125Sb detection limit (Bq/L)': 'sb125_dl',
        '105Ru radioactivity concentration (Bq/L)': 'ru105',
        '105Ru detection limit (Bq/L)': 'ru105_dl'
    }    
}

## Load tables
The data is provided as a single `.xlsx` file. A preview of this file, e.g. using [Open Office](http://www.openoffice.org/), shows two datasets in a single sheet: the measurements and their locations, which can be joined on the `Sampling point number` column.

In [None]:
# Load the workbook; the result is a dict holding a single 'seawater' DataFrame
dfs = load_data(fname_in)

In [None]:
# `DataFrame.shape` is (n_rows, n_cols), so the label lists rows first
print(f"# of rows, cols: {dfs['seawater'].shape}")
dfs['seawater'].head()

# of cols, rows: (21477, 51)


Unnamed: 0_level_0,Sampling date,Sampling time,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),...,54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-03-21,23:15:00,,1100,13,48,9.2,53,8.8,160,...,,,,,,,,,37.32,141.03
1,2011-03-22,14:28:00,,1100,20,46,14.0,40,14.0,ND,...,,,,,,,,,37.32,141.03
2,2011-03-23,13:51:00,,740,27,51,20.0,55,20.0,200,...,,,,,,34.0,25.0,,37.32,141.03
3,2011-03-24,09:30:00,,1100,52,99,38.0,94,41.0,120,...,,,,,,56.0,44.0,,37.32,141.03
4,2011-03-25,10:00:00,,430,10,26,7.4,34,5.9,58,...,,,,,,,,,37.32,141.03


## Data transformation pipeline

### Fix missing values

`ND` is assigned `NaN`. This needs to be confirmed.

In [None]:
#| export
class FixMissingValuesCB(Callback):
    "Assign `NaN` to values equal to `ND` (not detected) - to be confirmed "

    def __call__(self, tfm):
        # For every group, overwrite in place each cell holding the literal
        # string 'ND' with NaN.
        for grp in tfm.dfs.keys():
            df = tfm.dfs[grp]
            df[df.eq('ND')] = np.nan

In [None]:
# Reload the raw data and run a pipeline made of the `ND` fix only
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB()])

# Preview the transformed seawater table
tfm()['seawater'].head()

Unnamed: 0_level_0,Sampling date,Sampling time,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),...,54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-03-21,23:15:00,,1100,13,48,9.2,53,8.8,160.0,...,,,,,,,,,37.32,141.03
1,2011-03-22,14:28:00,,1100,20,46,14.0,40,14.0,,...,,,,,,,,,37.32,141.03
2,2011-03-23,13:51:00,,740,27,51,20.0,55,20.0,200.0,...,,,,,,34.0,25.0,,37.32,141.03
3,2011-03-24,09:30:00,,1100,52,99,38.0,94,41.0,120.0,...,,,,,,56.0,44.0,,37.32,141.03
4,2011-03-25,10:00:00,,430,10,26,7.4,34,5.9,58.0,...,,,,,,,,,37.32,141.03


### Normalize nuclide name

In [None]:
# | export
# Define nuclides-related columns renaming rules:
# keys are the measurement-table column headers, values the target variable names.
# Suffixes follow the headers: `_dl` = detection limit, `_unc` = statistical error.
# Insertion order matters: `coi` (defined further down) is built from these values.
renaming_rules_rdn = {
    '131I radioactivity concentration (Bq/L)': 'i131',
    '131I detection limit (Bq/L)': 'i131_dl',
    '134Cs radioactivity concentration (Bq/L)': 'cs134',
    '134Cs detection limit (Bq/L)': 'cs134_dl',
    '137Cs radioactivity concentration (Bq/L)': 'cs137',
    '137Cs detection limit (Bq/L)': 'cs137_dl',
    '132I radioactivity concentration (Bq/L)': 'i132',
    '132I detection limit (Bq/L)': 'i132_dl',
    '132Te radioactivity concentration (Bq/L)': 'te132',
    '132Te detection limit (Bq/L)': 'te132_dl',
    '136Cs radioactivity concentration (Bq/L)': 'cs136',
    '136Cs detection limit (Bq/L)': 'cs136_dl',
    '140La radioactivity concentration (Bq/L)': 'la140',
    '140La detection limit (Bq/L)': 'la140_dl',
    '89Sr radioactivity concentration (Bq/L)': 'sr89',
    '89Sr detection limit (Bq/L)': 'sr89_dl',
    '90Sr radioactivity concentration (Bq/L)': 'sr90',
    '90Sr detection limit (Bq/L)': 'sr90_dl',
    '238Pu radioactivity concentration (Bq/L)': 'pu238',
    '238Pu detection limit (Bq/L)': 'pu238_dl',
    '239Pu+240Pu radioactivity concentration (Bq/L)': 'pu239_240_tot',
    '239Pu+240Pu statistical error (Bq/L)': 'pu239_240_tot_unc',
    '239Pu+240Pu detection limit (Bq/L)': 'pu239_240_tot_dl',
    'Total alpha radioactivity concentration (Bq/L)': 'talpha',
    'Total alpha detection limit (Bq/L)': 'talpha_dl',
    'Total beta radioactivity concentration (Bq/L)': 'tbeta',
    'Total beta detection limit (Bq/L)': 'tbeta_dl',
    '140Ba radioactivity concentration (Bq/L)': 'ba140',
    '140Ba detection limit (Bq/L)': 'ba140_dl',
    '106Ru radioactivity concentration (Bq/L)': 'ru106',
    '106Ru detection limit (Bq/L)': 'ru106_dl',
    '58Co radioactivity concentration (Bq/L)': 'co58',
    '58Co detection limit (Bq/L)': 'co58_dl',
    '60Co radioactivity concentration (Bq/L)': 'co60',
    '60Co detection limit (Bq/L)': 'co60_dl',
    '144Ce radioactivity concentration (Bq/L)': 'ce144',
    '144Ce detection limit (Bq/L)': 'ce144_dl',
    '54Mn radioactivity concentration (Bq/L)': 'mn54',
    '54Mn detection limit (Bq/L)': 'mn54_dl',
    '3H radioactivity concentration (Bq/L)': 'h3',
    '3H detection limit (Bq/L)': 'h3_dl',
    '125Sb radioactivity concentration (Bq/L)': 'sb125',
    '125Sb detection limit (Bq/L)': 'sb125_dl',
    '105Ru radioactivity concentration (Bq/L)': 'ru105',
    '105Ru detection limit (Bq/L)': 'ru105_dl'}


In [None]:
# Check that these variable names are consistent with the MARIS CDL (template at `NC_TPL_PATH`)
has_valid_varname(renaming_rules_rdn.values(), NC_TPL_PATH)

True

In [None]:
#| export
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names"
    def __init__(self, renaming_rules=renaming_rules_rdn):
        # `renaming_rules`: dict of source column header -> target name;
        # it is read back as `self.renaming_rules` in `__call__`.
        fc.store_attr()

    def __call__(self, tfm):
        # Rename in place so the DataFrame objects held by `tfm.dfs` are updated directly.
        rules = self.renaming_rules
        for df in tfm.dfs.values():
            df.rename(columns=rules, inplace=True)


In [None]:
# Pipeline so far: `ND` fix, then nuclide column renaming
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemapRdnNameCB()])

# Preview the transformed seawater table
tfm()['seawater'].head()

Unnamed: 0_level_0,Sampling date,Sampling time,Collection layer of seawater,i131,i131_dl,cs134,cs134_dl,cs137,cs137_dl,i132,...,mn54_dl,h3,h3_dl,sb125,sb125_dl,ru105,ru105_dl,Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-03-21,23:15:00,,1100,13,48,9.2,53,8.8,160.0,...,,,,,,,,,37.32,141.03
1,2011-03-22,14:28:00,,1100,20,46,14.0,40,14.0,,...,,,,,,,,,37.32,141.03
2,2011-03-23,13:51:00,,740,27,51,20.0,55,20.0,200.0,...,,,,,,34.0,25.0,,37.32,141.03
3,2011-03-24,09:30:00,,1100,52,99,38.0,94,41.0,120.0,...,,,,,,56.0,44.0,,37.32,141.03
4,2011-03-25,10:00:00,,430,10,26,7.4,34,5.9,58.0,...,,,,,,,,,37.32,141.03


In [None]:
# Calls the pipeline again and lists the resulting column names
tfm()['seawater'].columns

Index(['Sampling date', 'Sampling time', 'Collection layer of seawater',
       'i131', 'i131_dl', 'cs134', 'cs134_dl', 'cs137', 'cs137_dl', 'i132',
       'i132_dl', 'te132', 'te132_dl', 'cs136', 'cs136_dl', 'la140',
       'la140_dl', 'sr89', 'sr89_dl', 'sr90', 'sr90_dl', 'pu238', 'pu238_dl',
       'pu239_240_tot', 'pu239_240_tot_unc', 'pu239_240_tot_dl', 'talpha',
       'talpha_dl', 'tbeta', 'tbeta_dl', 'ba140', 'ba140_dl', 'ru106',
       'ru106_dl', 'co58', 'co58_dl', 'co60', 'co60_dl', 'ce144', 'ce144_dl',
       'mn54', 'mn54_dl', 'h3', 'h3_dl', 'sb125', 'sb125_dl', 'ru105',
       'ru105_dl', 'Unnamed: 49',
       'Sampling coordinate North latitude (Decimal)',
       'Sampling coordinate East longitude (Decimal)'],
      dtype='object')

### Parse time

In [None]:
#| export
def time_parser(col):
    """Combine a sampling date and a sampling time into a single `datetime`.

    Args:
        col: two-item row, date first and time second. The date must expose
            `.date()`; the time must render via `str()` as `HH:MM:SS`.
            Either a pandas Series (as passed by `DataFrame.apply(axis=1)`)
            or a plain sequence is accepted.

    Returns:
        `datetime.datetime` built from both parts.

    Raises:
        ValueError: if the combined text does not match `%Y-%m-%d %H:%M:%S`.
    """
    # Read items by position. On a label-indexed Series, integer keys passed
    # to `[]` are deprecated in recent pandas, so go through `.iloc` when the
    # row has one.
    items = col.iloc if hasattr(col, 'iloc') else col
    day = str(items[0].date())
    time = str(items[1])
    return datetime.strptime(day + ' ' + time, '%Y-%m-%d %H:%M:%S')


In [None]:
#| export
class ParseTimeCB(Callback):
    "Parse sampling date and time columns into a single `time` column"
    # NOTE(review): callback docstrings appear to be what ends up in `tfm.logs`
    # (the docstring-less callbacks are absent from the logged steps), so the
    # docstring above also makes this step show up there — confirm.
    def __init__(self,
                 fn_parser=time_parser,
                 cols_time=['Sampling date', 'Sampling time']):
        # `fn_parser`: row-wise function turning the `cols_time` values into a datetime.
        # `cols_time`: source columns, in the order `fn_parser` expects them.
        # The list default is shared between instances; it is only read, never mutated.
        fc.store_attr()

    def __call__(self, tfm):
        cols = list(self.cols_time)
        for k in tfm.dfs.keys():
            df = tfm.dfs[k]
            df['time'] = df[cols].apply(self.fn_parser, axis=1)
            # `drop` returns a new frame unless `inplace=True`; the result was
            # previously discarded, so the source columns were never removed.
            df.drop(columns=cols, inplace=True)

In [None]:
# Pipeline so far: `ND` fix, nuclide renaming, then time parsing
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB()])

# Show the first parsed timestamps
print(tfm()['seawater']['time'][:5])

sample
0   2011-03-21 23:15:00
1   2011-03-22 14:28:00
2   2011-03-23 13:51:00
3   2011-03-24 09:30:00
4   2011-03-25 10:00:00
Name: time, dtype: datetime64[ns]


### Rename columns

In [None]:
#| export
# Coordinate column headers -> short names (`lat`, `lon`)
renaming_rules_cols = dict([
    ('Sampling coordinate North latitude (Decimal)', 'lat'),
    ('Sampling coordinate East longitude (Decimal)', 'lon'),
])

In [None]:
#| export
class RenameColumnCB(Callback):
    "Normalizing, renaming columns"
    def __init__(self, renaming_rules=renaming_rules_cols):
        # `renaming_rules`: dict of source column header -> target name;
        # it is read back as `self.renaming_rules` in `__call__`.
        fc.store_attr()

    def __call__(self, tfm):
        # Rename in place so the DataFrame objects held by `tfm.dfs` are updated directly.
        rules = self.renaming_rules
        for df in tfm.dfs.values():
            df.rename(columns=rules, inplace=True)


In [None]:
# Pipeline so far: `ND` fix, nuclide renaming, time parsing, then coordinate renaming
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB()])

# Show the first five transformed rows
print(tfm()['seawater'][:5])

       Sampling date Sampling time Collection layer of seawater  i131 i131_dl  \
sample                                                                          
0         2011-03-21      23:15:00                          NaN  1100      13   
1         2011-03-22      14:28:00                          NaN  1100      20   
2         2011-03-23      13:51:00                          NaN   740      27   
3         2011-03-24      09:30:00                          NaN  1100      52   
4         2011-03-25      10:00:00                          NaN   430      10   

       cs134 cs134_dl cs137 cs137_dl i132  ...   h3  h3_dl  sb125 sb125_dl  \
sample                                     ...                               
0         48      9.2    53      8.8  160  ...  NaN    NaN    NaN      NaN   
1         46       14    40       14  NaN  ...  NaN    NaN    NaN      NaN   
2         51       20    55       20  200  ...  NaN    NaN    NaN      NaN   
3         99       38    94       41  120 

### Select columns of interest

In [None]:
#| export
# Columns of interest: time and coordinates first, then every nuclide variable
# in the order in which `renaming_rules_rdn` lists them.
coi = ['time', 'lat', 'lon', *renaming_rules_rdn.values()]

In [None]:
#| export
class SelectColumnsCB(Callback):
    "Select columns of interest"
    # NOTE(review): callback docstrings appear to be what ends up in `tfm.logs`
    # (the docstring-less callbacks are absent from the logged steps), so the
    # docstring above also makes this step show up there — confirm.
    def __init__(self,
                 coi=coi):
        # `coi`: list of column names to keep, in output order.
        fc.store_attr()

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # Replaces the group's entry with a frame restricted to `coi`;
            # a missing column raises `KeyError`.
            tfm.dfs[k] = tfm.dfs[k][self.coi]


In [None]:
# Pipeline so far: previous steps plus the selection of the columns of interest
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB(),
                            SelectColumnsCB()])

# Show the first five transformed rows
print(tfm()['seawater'][:5])

                      time    lat     lon  i131 i131_dl cs134 cs134_dl cs137  \
sample                                                                         
0      2011-03-21 23:15:00  37.32  141.03  1100      13    48      9.2    53   
1      2011-03-22 14:28:00  37.32  141.03  1100      20    46       14    40   
2      2011-03-23 13:51:00  37.32  141.03   740      27    51       20    55   
3      2011-03-24 09:30:00  37.32  141.03  1100      52    99       38    94   
4      2011-03-25 10:00:00  37.32  141.03   430      10    26      7.4    34   

       cs137_dl i132  ...  ce144  ce144_dl  mn54 mn54_dl   h3  h3_dl  sb125  \
sample                ...                                                     
0           8.8  160  ...    NaN       NaN   NaN     NaN  NaN    NaN    NaN   
1            14  NaN  ...    NaN       NaN   NaN     NaN  NaN    NaN    NaN   
2            20  200  ...    NaN       NaN   NaN     NaN  NaN    NaN    NaN   
3            41  120  ...    NaN       NaN  

### Encode time (seconds since ...)

In [None]:
# Pipeline so far: previous steps plus marisco's `EncodeTimeCB`
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB(),
                            SelectColumnsCB(),
                            EncodeTimeCB()])

# Show the first five transformed rows (`time` is now an integer)
print(tfm()['seawater'][:5])

              time    lat     lon  i131 i131_dl cs134 cs134_dl cs137 cs137_dl  \
sample                                                                          
0       1300749300  37.32  141.03  1100      13    48      9.2    53      8.8   
1       1300804080  37.32  141.03  1100      20    46       14    40       14   
2       1300888260  37.32  141.03   740      27    51       20    55       20   
3       1300959000  37.32  141.03  1100      52    99       38    94       41   
4       1301047200  37.32  141.03   430      10    26      7.4    34      5.9   

       i132  ...  ce144  ce144_dl  mn54 mn54_dl   h3  h3_dl  sb125 sb125_dl  \
sample       ...                                                              
0       160  ...    NaN       NaN   NaN     NaN  NaN    NaN    NaN      NaN   
1       NaN  ...    NaN       NaN   NaN     NaN  NaN    NaN    NaN      NaN   
2       200  ...    NaN       NaN   NaN     NaN  NaN    NaN    NaN      NaN   
3       120  ...    NaN       NaN   N

### Sanitize coordinates

In [None]:
# Full pipeline: previous steps plus marisco's `SanitizeLonLatCB`
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB(),
                            SelectColumnsCB(),
                            EncodeTimeCB(),
                            SanitizeLonLatCB()])

# Show the first five transformed rows
print(tfm()['seawater'][:5])

              time    lat     lon  i131 i131_dl cs134 cs134_dl cs137 cs137_dl  \
sample                                                                          
0       1300749300  37.32  141.03  1100      13    48      9.2    53      8.8   
1       1300804080  37.32  141.03  1100      20    46       14    40       14   
2       1300888260  37.32  141.03   740      27    51       20    55       20   
3       1300959000  37.32  141.03  1100      52    99       38    94       41   
4       1301047200  37.32  141.03   430      10    26      7.4    34      5.9   

       i132  ...  ce144  ce144_dl  mn54 mn54_dl   h3  h3_dl  sb125 sb125_dl  \
sample       ...                                                              
0       160  ...    NaN       NaN   NaN     NaN  NaN    NaN    NaN      NaN   
1       NaN  ...    NaN       NaN   NaN     NaN  NaN    NaN    NaN      NaN   
2       200  ...    NaN       NaN   NaN     NaN  NaN    NaN    NaN      NaN   
3       120  ...    NaN       NaN   N

## Encode to NetCDF

In [None]:
# Run the full pipeline once and keep its result for the cells below
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB(),
                            SelectColumnsCB(),
                            EncodeTimeCB(),
                            SanitizeLonLatCB()])


dfs_tfm = tfm()

In [None]:
# Inspect the transformed data (a dict with one DataFrame per group)
dfs_tfm

{'seawater':               time    lat     lon  i131 i131_dl cs134 cs134_dl  cs137  \
 sample                                                                  
 0       1300749300  37.32  141.03  1100      13    48      9.2     53   
 1       1300804080  37.32  141.03  1100      20    46       14     40   
 2       1300888260  37.32  141.03   740      27    51       20     55   
 3       1300959000  37.32  141.03  1100      52    99       38     94   
 4       1301047200  37.32  141.03   430      10    26      7.4     34   
 ...            ...    ...     ...   ...     ...   ...      ...    ...   
 21472   1657620600  37.48  141.04   NaN     NaN   NaN   0.0011  0.011   
 21473   1657620600  37.48  141.04   NaN     NaN   NaN      NaN    NaN   
 21474   1658224800  37.48  141.04   NaN     NaN   NaN   0.0013   0.01   
 21475   1658224800  37.48  141.04   NaN     NaN   NaN      NaN    NaN   
 21476   1658830200  37.48  141.04   NaN     NaN   NaN   0.0014  0.015   
 
        cs137_dl i132  .

In [None]:
# Log of the processing steps recorded by the transformer run above
tfm.logs

['Assign `NaN` to values equal to `ND` (not detected) - to be confirmed ',
 'Remap to MARIS radionuclide names',
 'Normalizing, renaming columns',
 'Encode time as `int` representing seconds since xxx',
 'Drop row when both longitude & latitude equal 0']

### Feed global attributes

In [None]:
#| export
# Keywords; `get_attrs` joins them with ', ' into the `keywords` attribute.
kw = [
    'oceanography',
    'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
    'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
    'Earth Science > Oceans > Water Quality > Ocean Contaminants',
]


In [None]:
#| export
def get_attrs(tfm, zotero_key, kw=kw):
    "Build the global attributes from the data and logs of `tfm`, the Zotero item `zotero_key` and the keywords `kw`"
    # NOTE(review): `DepthRangeCB` is imported at the top of the notebook but is
    # not applied here — confirm this is intended.
    feeder_cbs = [
        BboxCB(),
        TimeRangeCB(),
        ZoteroCB(zotero_key),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=feeder_cbs)
    return feeder()

In [None]:
# Build the global attributes for the transformed data, using Zotero item '26VMZZ2Q'
# NOTE(review): the saved output shows a HELCOM title/summary for this key —
# confirm it is the right Zotero item for the TEPCO dataset.
get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw)

{'id': '',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annually by HELCOM MORS EG.',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Water Quality > Ocean Contaminants',
 'keywords_vocabulary': 'GCMD Science Keywords',
 '

### Encoding

In [None]:
#| export
def units_fn(grp_name):
    "Return the unit string for group `grp_name`; the same 'Bq/l' is returned for any group"
    unit = 'Bq/l'
    return unit

In [None]:
#| export
def encode(fname_in, fname_out, nc_tpl_path, zotero_key='26VMZZ2Q'):
    """Convert the TEPCO workbook `fname_in` to the NetCDF file `fname_out`.

    Args:
        fname_in: path of the Excel workbook passed to `load_data`.
        fname_out: path of the NetCDF file to write.
        nc_tpl_path: path of the NetCDF template passed to `to_netcdf`.
        zotero_key: key of the Zotero item used to fill the global attributes.
            NOTE(review): the notebook output for the default key shows a
            HELCOM title/summary — confirm it is the right item for TEPCO.
    """
    dfs = load_data(fname_in)
    # Order matters: columns are renamed and `time` is created before the
    # selection of the columns of interest, which expects those names.
    tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                                RemapRdnNameCB(),
                                ParseTimeCB(),
                                RenameColumnCB(),
                                SelectColumnsCB(),
                                EncodeTimeCB(),
                                SanitizeLonLatCB()])

    dfs_tfm = tfm()
    # Attributes are built after the run so that `tfm.logs` is populated
    attrs = get_attrs(tfm, zotero_key=zotero_key, kw=kw)
    to_netcdf(dfs_tfm, nc_tpl_path, fname_out, attrs, units_fn)

In [None]:
# Run the full conversion and write the NetCDF file to `fname_out`
encode(fname_in, fname_out, NC_TPL_PATH)

% of discarded data for grp seawater: 6.429371649410898
