In [None]:
#| default_exp handlers.tepco

# TEPCO 
> Data pipeline (handler) to convert TEPCO dataset ([Source](https://radioactivity.nsr.go.jp/ja/list/349/list-1.html)) to `NetCDF` format

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import re
import numpy as np
from datetime import datetime
import fastcore.all as fc

from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)

from marisco.utils import has_valid_varname
from marisco.configs import nc_tpl_path, cfg

from marisco.metadata import (GlobAttrsFeeder, BboxCB,
                              DepthRangeCB, TimeRangeCB,
                              ZoteroCB, KeyValuePairCB)

In [None]:
fname_in = '../../_data/xls/tepco/coastal_water.xlsx'
fname_out = '../../_data/output/tepco.nc'

## Utils

In [None]:
#| export
def load_data(fname_in,
              meas_skiprows=1, meas_nrows=23643,
              loc_skiprows=23647, loc_nrows=48,
              join_col='Sampling point number'):
    """Load TEPCO seawater data.

    The workbook holds two tables in the same sheet: the measurements and,
    further down, the sampling locations. Both are read and joined on
    `join_col`.

    Args:
        fname_in: path of the Excel workbook to read.
        meas_skiprows: rows to skip before the measurements header.
        meas_nrows: number of measurement rows to read.
        loc_skiprows: rows to skip before the locations header.
        loc_nrows: number of location rows to read.
        join_col: column shared by both tables and used as the join key.

    Returns:
        A dict `{'seawater': DataFrame}` whose index is named `sample`.

    The default offsets and row counts describe the layout of one specific
    export of the file; pass other values when the layout differs.
    """
    # Measurements: an empty 'Sampling time' cell is read as midnight.
    df_meas = pd.read_excel(fname_in,
                            skiprows=meas_skiprows, nrows=meas_nrows,
                            converters={'Sampling time': lambda x: x if x != '' else '00:00:00'})
    # Locations: only the first three columns of that table are kept.
    df_loc = pd.read_excel(fname_in, skiprows=loc_skiprows, nrows=loc_nrows,
                           usecols=[0, 1, 2])
    # Inner join: measurement rows without a matching location are dropped.
    # The join column itself is kept in the result.
    df = pd.merge(df_meas, df_loc, how="inner", on=join_col)
    df.index.name = 'sample'
    return {'seawater': df}

## Load tables
The data is provided as a single `.xlsx` file. A preview of this file, e.g. using [Open Office](http://www.openoffice.org/), shows two datasets in a single sheet: the measurements and their locations, which can be joined using the `Sampling point number` column.

In [None]:
dfs = load_data(fname_in)

In [None]:
print(f"# of cols, rows: {dfs['seawater'].shape}")
dfs['seawater'].head()

# of cols, rows: (21477, 52)


Unnamed: 0_level_0,Sampling date,Sampling time,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),...,54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-03-21,23:15:00,T-3,,1100,13,48,9.2,53,8.8,...,,,,,,,,,37.32,141.03
1,2011-03-21,23:45:00,T-4,,660,12,31,8.7,33,8.3,...,,,,,,,,,37.24,141.01
2,2011-03-22,14:28:00,T-3,,1100,20,46,14.0,40,14.0,...,,,,,,,,,37.32,141.03
3,2011-03-22,15:06:00,T-4,,670,19,39,11.0,44,11.0,...,,,,,,,,,37.24,141.01
4,2011-03-23,13:51:00,T-3,,740,27,51,20.0,55,20.0,...,,,,,,34.0,25.0,,37.32,141.03


In [None]:
dfs['seawater'].sample(50)

Unnamed: 0_level_0,Sampling date,Sampling time,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),...,54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21125,2022-05-16,08:27:00,T-D9,下層,,,ND,0.0012,0.0055,,...,,,,,,,,,37.33,141.07
7954,2014-10-16,10:00:00,T-3,上層,,,,,,,...,,ND,0.3,,,,,,37.32,141.03
17124,2019-12-16,09:14:00,T-D5,上層,,,ND,0.0013,0.012,,...,,,,,,,,,37.42,141.07
2736,2012-02-24,08:25:00,T-3,上層,ND,約8.6E-01,ND,約9.0E-01,ND,約1.0E+00,...,,,,,,,,,37.32,141.03
4765,2013-03-06,09:06:00,T-D9,上層,,,,,,,...,,ND,2.9,,,,,,37.33,141.07
7516,2014-07-22,09:31:00,T-D9,上層,,,0.014,,0.035,,...,,,,,,,,,37.33,141.07
10929,2016-04-08,09:03:00,T-7,上層,,,ND,0.0013,0.0022,,...,,,,,,,,,37.23,141.2
11897,2016-11-07,09:42:00,T-B4,下層,,,ND,0.0013,0.0043,,...,,,,,,,,,37.35,141.15
1628,2011-10-16,07:35:00,T-4,上層,ND,約4.0E+00,ND,約6.0E+00,ND,約9.0E+00,...,,,,,,,,,37.24,141.01
15570,2019-01-22,06:27:00,T-B1,上層,,,ND,0.0011,0.0031,,...,,,,,,,,,37.53,141.22


In [None]:
dfs['seawater']['Sampling point number'].unique()

array(['T-3', 'T-4', 'T-5', 'T-7', 'T-11', 'T-12', 'T-14', 'T-18', 'T-20',
       'T-22', 'T-MA', 'T-M10', 'T-A', 'T-D', 'T-E', 'T-B', 'T-C',
       'T-MG1', 'T-MG2', 'T-MG3', 'T-MG4', 'T-MG5', 'T-MG6', 'T-D1',
       'T-D5', 'T-D9', 'T-E1', 'T-G4', 'T-H1', 'T-S6', 'T-17-1', 'T-B3',
       'T-13-1', 'T-S3', 'T-S4', 'T-B4', 'T-S1', 'T-S2', 'T-MG0', 'T-Z',
       'T-B1', 'T-B2', 'T-S7', 'T-S8', 'T-0', 'T-4-1', 'T-4-2', 'T-6'],
      dtype=object)

## Data transformation pipeline

### Fix missing values

Values reported as `ND` (not detected) are replaced by `NaN`. Whether this is the right treatment still needs to be confirmed.

In [None]:
#| export
class FixMissingValuesCB(Callback):
    "Assign `NaN` to values equal to `ND` (not detected) - to be confirmed "
    # NOTE: the class docstring is kept verbatim; it appears to be what
    # ends up in `tfm.logs`.
    def __call__(self, tfm):
        "Overwrite, in place, every cell equal to the string `ND` with `NaN`."
        for df in tfm.dfs.values():
            is_not_detected = df == 'ND'
            df[is_not_detected] = np.nan

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB()])

tfm()['seawater'].head()

Unnamed: 0_level_0,Sampling date,Sampling time,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),...,54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-03-21,23:15:00,T-3,,1100,13,48,9.2,53,8.8,...,,,,,,,,,37.32,141.03
1,2011-03-21,23:45:00,T-4,,660,12,31,8.7,33,8.3,...,,,,,,,,,37.24,141.01
2,2011-03-22,14:28:00,T-3,,1100,20,46,14.0,40,14.0,...,,,,,,,,,37.32,141.03
3,2011-03-22,15:06:00,T-4,,670,19,39,11.0,44,11.0,...,,,,,,,,,37.24,141.01
4,2011-03-23,13:51:00,T-3,,740,27,51,20.0,55,20.0,...,,,,,,34.0,25.0,,37.32,141.03


### Remove Japanese character
    

In [None]:
#| export
class RemoveJapanaseCharCB(Callback):
    "Remove 約 (about) char"
    def _transform_if_about(self, value, about_char='約'):
        "Return `value` without `about_char`; missing values and values lacking it pass through unchanged."
        if pd.isna(value):
            return value
        if about_char in str(value):
            return value.replace(about_char, '')
        return value

    def __call__(self, tfm):
        "Clean every object-typed `(Bq/L)` column of each dataframe in place."
        for df in tfm.dfs.values():
            # Only activity / detection-limit columns still stored as
            # Python objects can contain the marker character.
            cols_rdn = [col for col in df.columns
                        if ('(Bq/L)' in col) and (df[col].dtype == 'object')]
            cleaned = df[cols_rdn].map(self._transform_if_about)
            df[cols_rdn] = cleaned

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB()])

tfm()['seawater'].sample(100)

Unnamed: 0_level_0,Sampling date,Sampling time,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),...,54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14037,2018-02-14,07:52:00,T-D1,上層,,,,0.0012,0.0052,,...,,,,,,,,,37.50,141.07
10236,2015-12-02,10:13:00,T-D5,下層,,,0.0021,,0.0110,,...,,,,,,,,,37.42,141.07
13253,2017-08-29,09:40:00,T-6,上層,,,0.0065,,0.0390,,...,,,,,,,,,37.48,141.04
10490,2016-01-21,10:20:00,T-6,上層,,,,,,,...,,,0.37,,,,,,37.48,141.04
17480,2020-03-12,07:10:00,T-MA,下層,,,,0.0016,0.0033,,...,,,,,,,,,37.75,141.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7866,2014-09-26,08:40:00,T-D5,下層,,,0.0031,,0.0091,,...,,,,,,,,,37.42,141.07
12771,2017-05-16,09:40:00,T-6,上層,,,0.0047,,0.0330,,...,,,,,,,,,37.48,141.04
2765,2012-02-28,10:34:00,T-MG1,中層,,1.0E+00,,1.3E+00,,1.2E+00,...,,,,,,,,,38.33,141.28
14050,2018-02-16,07:10:00,T-7,下層,,,,0.0013,0.0023,,...,,,,,,,,,37.23,141.20


### Fix values range string

In [None]:
#| export
class FixRangeValueStringCB(Callback):
    "Replace e.g `4.0E+00<&<8.0E+00` by its mean (here 6)"

    def _extract_and_calculate_mean(self, s):
        "Average all numbers written in scientific notation (e.g. `4.0E+00`) found in `s`."
        tokens = re.findall(r"[+-]?\d+\.\d+E[+-]\d+", s)
        bounds = np.array(tokens, dtype=float)
        return bounds.mean()

    def _transform_if_range(self, value, range_pattern=r'<&<'):
        "Return the mean of the bounds when `value` matches `range_pattern`, else `value` untouched."
        if pd.isna(value):
            return value
        if re.search(range_pattern, str(value)) is None:
            return value
        return self._extract_and_calculate_mean(value)

    def __call__(self, tfm):
        "Apply the range-to-mean replacement to every object-typed `(Bq/L)` column, in place."
        for df in tfm.dfs.values():
            cols_rdn = [col for col in df.columns
                        if ('(Bq/L)' in col) and (df[col].dtype == 'object')]
            averaged = df[cols_rdn].map(self._transform_if_range)
            df[cols_rdn] = averaged

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB()])

tfm()['seawater'].sample(100)

Unnamed: 0_level_0,Sampling date,Sampling time,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),...,54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12263,2017-01-26,08:57:00,T-5,下層,,,,0.0013,0.0036,,...,,,,,,,,,37.42,141.20
16389,2019-07-12,08:15:00,T-B,下層,,,,0.79,,1.1,...,,,,,,,,,36.51,140.67
13998,2018-02-08,06:01:00,T-20,下層,,,,0.0012,0.0031,,...,,,,,,,,,36.97,141.00
11115,2016-05-17,07:32:00,T-D9,上層,,,,0.0011,0.0077,,...,,,,,,,,,37.33,141.07
17110,2019-12-13,09:33:00,T-MG2,上層,,,,0.0015,0.0020,,...,,,,,,,,,38.30,141.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9963,2015-10-14,08:05:00,T-5,上層,,,,0.0017,0.0051,,...,,,,,,,,,37.42,141.20
11651,2016-09-15,05:56:00,T-S3,下層,,,,0.0012,0.0059,,...,,,,,,,,,37.46,141.08
17483,2020-03-12,10:41:00,T-S1,上層,,,,0.0012,0.0150,,...,,,,,,,,,37.58,141.04
11589,2016-09-03,10:06:00,T-MG1,上層,,,,0.0015,0.0062,,...,,,,,,,,,38.33,141.28


### Normalize nuclide name

In [None]:
# | export
# Nuclide-related column renaming rules, generated from the table below.
# Source headers look like '<label> <field> (Bq/L)'; the target name is the
# variable name followed by the suffix of the field.
_rdn_vars = {
    '131I': 'i131', '134Cs': 'cs134', '137Cs': 'cs137',
    '132I': 'i132', '132Te': 'te132', '136Cs': 'cs136',
    '140La': 'la140', '89Sr': 'sr89', '90Sr': 'sr90',
    '238Pu': 'pu238', '239Pu+240Pu': 'pu239_240_tot',
    'Total alpha': 'talpha', 'Total beta': 'tbeta',
    '140Ba': 'ba140', '106Ru': 'ru106',
    '58Co': 'co58', '60Co': 'co60',
    '144Ce': 'ce144', '54Mn': 'mn54', '3H': 'h3',
    '125Sb': 'sb125', '105Ru': 'ru105'}

# (field as written in the header, suffix appended to the variable name)
_rdn_fields = (('radioactivity concentration', ''),
               ('statistical error', '_unc'),
               ('detection limit', '_dl'))

# Only these labels get a 'statistical error' rule.
_rdn_with_unc = {'239Pu+240Pu'}

renaming_rules_rdn = {
    f'{label} {field} (Bq/L)': f'{var}{suffix}'
    for label, var in _rdn_vars.items()
    for field, suffix in _rdn_fields
    if suffix != '_unc' or label in _rdn_with_unc}


In [None]:
# Check that these variable names are consistent with the MARIS CDL
has_valid_varname(renaming_rules_rdn.values(), nc_tpl_path())

True

In [None]:
#| export
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names"
    def __init__(self, renaming_rules=renaming_rules_rdn):
        # renaming_rules: mapping {source column name: new column name}
        fc.store_attr()

    def __call__(self, tfm):
        "Rename, in place, the columns of every dataframe according to `renaming_rules`."
        for df in tfm.dfs.values():
            df.rename(columns=self.renaming_rules, inplace=True)


In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemoveJapanaseCharCB(),
                            FixRangeValueStringCB(),
                            RemapRdnNameCB()])

tfm()['seawater'].head()

Unnamed: 0_level_0,Sampling date,Sampling time,Sampling point number,Collection layer of seawater,i131,i131_dl,cs134,cs134_dl,cs137,cs137_dl,...,mn54_dl,h3,h3_dl,sb125,sb125_dl,ru105,ru105_dl,Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-03-21,23:15:00,T-3,,1100.0,13,48.0,9.2,53.0,8.8,...,,,,,,,,,37.32,141.03
1,2011-03-21,23:45:00,T-4,,660.0,12,31.0,8.7,33.0,8.3,...,,,,,,,,,37.24,141.01
2,2011-03-22,14:28:00,T-3,,1100.0,20,46.0,14.0,40.0,14.0,...,,,,,,,,,37.32,141.03
3,2011-03-22,15:06:00,T-4,,670.0,19,39.0,11.0,44.0,11.0,...,,,,,,,,,37.24,141.01
4,2011-03-23,13:51:00,T-3,,740.0,27,51.0,20.0,55.0,20.0,...,,,,,,34.0,25.0,,37.32,141.03


In [None]:
tfm()['seawater'].columns

Index(['Sampling date', 'Sampling time', 'Sampling point number',
       'Collection layer of seawater', 'i131', 'i131_dl', 'cs134', 'cs134_dl',
       'cs137', 'cs137_dl', 'i132', 'i132_dl', 'te132', 'te132_dl', 'cs136',
       'cs136_dl', 'la140', 'la140_dl', 'sr89', 'sr89_dl', 'sr90', 'sr90_dl',
       'pu238', 'pu238_dl', 'pu239_240_tot', 'pu239_240_tot_unc',
       'pu239_240_tot_dl', 'talpha', 'talpha_dl', 'tbeta', 'tbeta_dl', 'ba140',
       'ba140_dl', 'ru106', 'ru106_dl', 'co58', 'co58_dl', 'co60', 'co60_dl',
       'ce144', 'ce144_dl', 'mn54', 'mn54_dl', 'h3', 'h3_dl', 'sb125',
       'sb125_dl', 'ru105', 'ru105_dl', 'Unnamed: 49',
       'Sampling coordinate North latitude (Decimal)',
       'Sampling coordinate East longitude (Decimal)'],
      dtype='object')

### Parse time

In [None]:
#| export
def time_parser(col):
    """Build a `datetime` from a two-element series.

    The first element must expose `.date()` (e.g. a timestamp) and the second
    must render as `HH:MM:SS`. Despite its name, `col` is one row when the
    function is used with `DataFrame.apply(..., axis=1)`.
    """
    day = col.iloc[0].date()
    clock = col.iloc[1]
    return datetime.strptime(f'{day} {clock}', '%Y-%m-%d %H:%M:%S')

In [None]:
#| export
class ParseTimeCB(Callback):
    "Combine the sampling date and time columns into a single `time` column"
    def __init__(self,
                 fn_parser=time_parser,
                 cols_time=['Sampling date', 'Sampling time']):
        # fn_parser: called on each row restricted to `cols_time`; returns
        #            the parsed value stored in the new `time` column.
        # cols_time: source columns (date first, then time). The default
        #            list is only read, never mutated.
        fc.store_attr()

    def __call__(self, tfm):
        for df in tfm.dfs.values():
            df['time'] = df[self.cols_time].apply(self.fn_parser, axis=1)
            # `drop` returns a new frame unless `inplace=True`; the previous
            # call discarded that result, so the source columns were never
            # removed.
            df.drop(columns=self.cols_time, inplace=True)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemoveJapanaseCharCB(),
                            FixRangeValueStringCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB()])

print(tfm()['seawater']['time'][:5])

sample
0   2011-03-21 23:15:00
1   2011-03-21 23:45:00
2   2011-03-22 14:28:00
3   2011-03-22 15:06:00
4   2011-03-23 13:51:00
Name: time, dtype: datetime64[ns]


### Rename columns

In [None]:
#| export
# Short names for the two coordinate columns coming from the locations table.
renaming_rules_cols = dict([
    ('Sampling coordinate North latitude (Decimal)', 'lat'),
    ('Sampling coordinate East longitude (Decimal)', 'lon'),
])

In [None]:
#| export
class RenameColumnCB(Callback):
    "Normalizing, renaming columns"
    def __init__(self, renaming_rules=renaming_rules_cols):
        # renaming_rules: mapping {source column name: new column name}
        fc.store_attr()

    def __call__(self, tfm):
        "Rename, in place, the columns of every dataframe according to `renaming_rules`."
        for df in tfm.dfs.values():
            df.rename(columns=self.renaming_rules, inplace=True)


In [None]:
# Each callback is listed once (`FixRangeValueStringCB` used to be applied
# twice in this cell).
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemoveJapanaseCharCB(),
                            FixRangeValueStringCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB()])

print(tfm()['seawater'][:5])

       Sampling date Sampling time Sampling point number  \
sample                                                     
0         2011-03-21      23:15:00                   T-3   
1         2011-03-21      23:45:00                   T-4   
2         2011-03-22      14:28:00                   T-3   
3         2011-03-22      15:06:00                   T-4   
4         2011-03-23      13:51:00                   T-3   

       Collection layer of seawater    i131 i131_dl  cs134 cs134_dl  cs137  \
sample                                                                       
0                               NaN  1100.0      13   48.0      9.2   53.0   
1                               NaN   660.0      12   31.0      8.7   33.0   
2                               NaN  1100.0      20   46.0       14   40.0   
3                               NaN   670.0      19   39.0       11   44.0   
4                               NaN   740.0      27   51.0       20   55.0   

       cs137_dl  ...  h3  h3_dl 

### Select columns of interest

In [None]:
#| export
# Columns of interest: time and coordinates, then every renamed nuclide column.
coi = ['time', 'lat', 'lon', *renaming_rules_rdn.values()]

In [None]:
#| export
class SelectColumnsCB(Callback):
    # Keeps only the columns listed in `coi`, in that order.
    def __init__(self, coi=coi):
        # coi: list of column names to keep
        fc.store_attr()

    def __call__(self, tfm):
        "Replace every dataframe by its restriction to the `coi` columns."
        for name, df in tfm.dfs.items():
            tfm.dfs[name] = df[self.coi]


In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemoveJapanaseCharCB(),
                            FixRangeValueStringCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB(),
                            SelectColumnsCB()])

print(tfm()['seawater'][:5])

                      time    lat     lon    i131 i131_dl  cs134 cs134_dl  \
sample                                                                      
0      2011-03-21 23:15:00  37.32  141.03  1100.0      13   48.0      9.2   
1      2011-03-21 23:45:00  37.24  141.01   660.0      12   31.0      8.7   
2      2011-03-22 14:28:00  37.32  141.03  1100.0      20   46.0       14   
3      2011-03-22 15:06:00  37.24  141.01   670.0      19   39.0       11   
4      2011-03-23 13:51:00  37.32  141.03   740.0      27   51.0       20   

        cs137 cs137_dl   i132  ...  ce144  ce144_dl  mn54  mn54_dl  h3  h3_dl  \
sample                         ...                                              
0        53.0      8.8  160.0  ...    NaN       NaN   NaN      NaN NaN    NaN   
1        33.0      8.3  120.0  ...    NaN       NaN   NaN      NaN NaN    NaN   
2        40.0       14    NaN  ...    NaN       NaN   NaN      NaN NaN    NaN   
3        44.0       11    NaN  ...    NaN       NaN   N

### Encode time (seconds since ...)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemoveJapanaseCharCB(),
                            FixRangeValueStringCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB(),
                            SelectColumnsCB(),
                            EncodeTimeCB(cfg())])

print(tfm()['seawater'][:5])

              time    lat     lon    i131 i131_dl  cs134 cs134_dl  cs137  \
sample                                                                     
0       1300749300  37.32  141.03  1100.0      13   48.0      9.2   53.0   
1       1300751100  37.24  141.01   660.0      12   31.0      8.7   33.0   
2       1300804080  37.32  141.03  1100.0      20   46.0       14   40.0   
3       1300806360  37.24  141.01   670.0      19   39.0       11   44.0   
4       1300888260  37.32  141.03   740.0      27   51.0       20   55.0   

       cs137_dl   i132  ...  ce144  ce144_dl  mn54  mn54_dl  h3  h3_dl  sb125  \
sample                  ...                                                     
0           8.8  160.0  ...    NaN       NaN   NaN      NaN NaN    NaN    NaN   
1           8.3  120.0  ...    NaN       NaN   NaN      NaN NaN    NaN    NaN   
2            14    NaN  ...    NaN       NaN   NaN      NaN NaN    NaN    NaN   
3            11    NaN  ...    NaN       NaN   NaN      NaN Na

### Retrieve depth

In [None]:
# TO BE DONE
# Options: 
#   - GEBCO: provides raster grid at ~160m (15 arc-second) at equator. We can download it and query via GDAL

### Sanitize coordinates

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemoveJapanaseCharCB(),
                            FixRangeValueStringCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB(),
                            SelectColumnsCB(),
                            EncodeTimeCB(cfg()),
                            SanitizeLonLatCB()])

tfm()['seawater'][:5]

Unnamed: 0_level_0,time,lat,lon,i131,i131_dl,cs134,cs134_dl,cs137,cs137_dl,i132,...,ce144,ce144_dl,mn54,mn54_dl,h3,h3_dl,sb125,sb125_dl,ru105,ru105_dl
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1300749300,37.32,141.03,1100.0,13,48.0,9.2,53.0,8.8,160.0,...,,,,,,,,,,
1,1300751100,37.24,141.01,660.0,12,31.0,8.7,33.0,8.3,120.0,...,,,,,,,,,,
2,1300804080,37.32,141.03,1100.0,20,46.0,14.0,40.0,14.0,,...,,,,,,,,,,
3,1300806360,37.24,141.01,670.0,19,39.0,11.0,44.0,11.0,,...,,,,,,,,,,
4,1300888260,37.32,141.03,740.0,27,51.0,20.0,55.0,20.0,200.0,...,,,,,,,,,34.0,25.0


## Encode to NetCDF

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
                            RemoveJapanaseCharCB(),
                            FixRangeValueStringCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB(),
                            SelectColumnsCB(),
                            EncodeTimeCB(cfg()),
                            SanitizeLonLatCB()])


dfs_tfm = tfm()

In [None]:
dfs_tfm

{'seawater':               time    lat     lon    i131 i131_dl  cs134 cs134_dl    cs137  \
 sample                                                                       
 0       1300749300  37.32  141.03  1100.0      13   48.0      9.2  53.0000   
 1       1300751100  37.24  141.01   660.0      12   31.0      8.7  33.0000   
 2       1300804080  37.32  141.03  1100.0      20   46.0       14  40.0000   
 3       1300806360  37.24  141.01   670.0      19   39.0       11  44.0000   
 4       1300888260  37.32  141.03   740.0      27   51.0       20  55.0000   
 ...            ...    ...     ...     ...     ...    ...      ...      ...   
 21472   1658736480  37.33  141.07     NaN     NaN    NaN   0.0013   0.0036   
 21473   1658736480  37.33  141.07     NaN     NaN    NaN   0.0012   0.0024   
 21474   1658843100  37.32  141.03     NaN     NaN    NaN   0.0014   0.0150   
 21475   1658825400  37.24  141.01     NaN     NaN    NaN   0.0011   0.0110   
 21476   1658830200  37.48  141.04     N

In [None]:
tfm.logs

['Assign `NaN` to values equal to `ND` (not detected) - to be confirmed ',
 'Remove 約 (about) char',
 'Replace e.g `4.0E+00<&<8.0E+00` by its mean (here 6)',
 'Remap to MARIS radionuclide names',
 'Normalizing, renaming columns',
 'Encode time as `int` representing seconds since xxx',
 'Drop row when both longitude & latitude equal 0']

### Feed global attributes

In [None]:
#| export
# Keywords written to the `keywords` global attribute (joined with ', ').
kw = [
    'oceanography',
    'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
    'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
    'Earth Science > Oceans > Water Quality > Ocean Contaminants',
]


In [None]:
#| export
def get_attrs(tfm, zotero_key, kw=kw):
    """Build the global attributes for the transformed dataframes.

    Args:
        tfm: transformer exposing `dfs` (the dataframes) and `logs`
            (a list of strings).
        zotero_key: key handed to `ZoteroCB`.
        kw: list of keywords, joined with ', ' into the `keywords` attribute.

    Returns:
        Whatever calling the `GlobAttrsFeeder` returns.
    """
    callbacks = [
        BboxCB(),
        # DepthRangeCB() is deliberately left out for now.
        TimeRangeCB(cfg()),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=callbacks)
    return feeder()

In [None]:
get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw)

{'id': '',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annually by HELCOM MORS EG.',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Water Quality > Ocean Contaminants',
 'keywords_vocabulary': 'GCMD Science Keywords',
 '

### Encoding

In [None]:
#| export
# def units_fn(grp_name): 
#     return 'Bq/l'

In [None]:
#| export
# def encode(fname_in, fname_out, nc_tpl_path):
#     dfs = load_data(fname_in)
#     tfm = Transformer(dfs, cbs=[FixMissingValuesCB(),
#                                 RemoveJapanaseCharCB(),
#                                 FixRangeValueStringCB(),
#                                 RemapRdnNameCB(),
#                                 ParseTimeCB(),
#                                 RenameColumnCB(),
#                                 SelectColumnsCB(),
#                                 EncodeTimeCB(),
#                                 SanitizeLonLatCB()])
    
#     dfs_tfm = tfm()
#     attrs = get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw)
#     to_netcdf(dfs_tfm, nc_tpl_path, fname_out, attrs, units_fn)

In [None]:
# encode(fname_in, fname_out, NC_TPL_PATH)