In [None]:
#| default_exp handlers.tepco

# TEPCO 
> Data pipeline (handler) to convert TEPCO dataset ([Source](https://radioactivity.nsr.go.jp/ja/list/349/list-1.html)) to `NetCDF` format

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import warnings
warnings.filterwarnings('ignore')

In [None]:
#| export
import pandas as pd
import re
import numpy as np
from typing import Tuple
from datetime import datetime
import fastcore.all as fc
from tqdm import tqdm
from collections import defaultdict

from marisco.callbacks import (
    Callback, 
    Transformer,
    EncodeTimeCB, 
    SanitizeLonLatCB,
    EncodeTimeCB, 
    )

from marisco.utils import has_valid_varname
from marisco.configs import nc_tpl_path, cfg
from marisco.encoders import NetCDFEncoder

from marisco.metadata import (
    GlobAttrsFeeder, 
    BboxCB,
    DepthRangeCB, 
    TimeRangeCB,
    ZoteroCB, 
    KeyValuePairCB    
    )

## Configuration & file paths

In [None]:
#| exports
fname_coastal_water = 'https://radioactivity.nra.go.jp/cont/en/results/sea/coastal_water.csv'
fname_clos1F = 'https://radioactivity.nra.go.jp/cont/en/results/sea/close1F_water.xlsx'
fname_iaea_orbs = 'https://raw.githubusercontent.com/RML-IAEA/iaea.orbs/refs/heads/main/src/iaea/orbs/stations/station_points.csv'

fname_out = '../../_data/output/tepco.nc'

## Load data

We here load the data from the [NRA (Nuclear Regulatory Authority)](https://radioactivity.nra.go.jp/en/results) website. For the moment, we only process radioactivity concentration data in the seawater around Fukushima Dai-ichi NPP [TEPCO] (`coastal_water.csv`) and in the `close1F_water.xlsx` file.

In near future, MARIS will provide a dedicated handler for all related [ALPS data](https://radioactivity.nra.go.jp/en/results#sec-12) including measurements not only provided by TEPCO but also MOE, NRA, MLITT and Fukushima Prefecture.



:::{.callout-tip}
## FEEDBACK TO DATA PROVIDER

The **coastal_water.csv** file contains two sections: the measurements and the locations. We identify below the line number where the locations begin. A single point of truth for the location of the stations would ease the processing in future.

:::

In [None]:
#| exports
def find_location_section(df, 
                          col_idx=0,
                          pattern='Sampling point number'
                          ):
    "Find the line number where location data begins."
    mask = df.iloc[:, col_idx] == pattern
    indices = df[mask].index
    return indices[0] if len(indices) > 0 else -1

In [None]:
#| eval: false
find_location_section(pd.read_csv(fname_coastal_water, low_memory=False))

27844

:::{.callout-tip}
## FEEDBACK TO DATA PROVIDER

Distinct parsing of the time from `coastal_water.csv` and `close1F_water.xlsx` files are required. Indeed:

- `coastal_water.csv` uses the format `YYYY/MM/DD` in the `Sampling  HH:MM` and 
- `close1F_water.xlsx` uses the format `YYYY-MM-DD HH:MM:SS`.

:::

In [None]:
#| exports
def fix_sampling_time(x):
    if pd.isna(x): 
        return '00:00:00'
    else:
        hour, min =  x.split(':')[:2]
        return f"{hour if len(hour) == 2 else '0' + hour}:{min}:00"

In [None]:
#| exports
def get_coastal_water_df(fname_coastal_water):
    "Get the measurements dataframe from the `coastal_water.csv` file."
    
    locs_idx = find_location_section(pd.read_csv(fname_coastal_water, 
                                      skiprows=0, low_memory=False))
    
    df = pd.read_csv(fname_coastal_water, skiprows=1, 
                     nrows=locs_idx - 1,
                     low_memory=False)
    df.dropna(subset=['Sampling point number'], inplace=True)
    df['Sampling time'] = df['Sampling time'].map(fix_sampling_time)
    
    df['TIME'] = df['Sampling date'].replace('-', '/') + ' ' + df['Sampling time']
    
    df = df.drop(columns=['Sampling date', 'Sampling time'])
    return df

In [None]:
#| eval: false
df_coastal_water = get_coastal_water_df(fname_coastal_water)
df_coastal_water.tail()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME
27836,T-D5,上層,,,,,,,,,...,,,ND,6.9,,,,,,2024/12/9 07:51:00
27837,T-S8,上層,,,,,,,,,...,,,ND,6.9,,,,,,2024/12/9 10:07:00
27838,T-S3,上層,,,,,,,,,...,,,ND,6.9,,,,,,2024/12/11 10:37:00
27839,T-S4,上層,,,,,,,,,...,,,ND,6.8,,,,,,2024/12/11 11:06:00
27840,T-D5,上層,,,,,,,,,...,,,ND,8.6,,,,,,2024/12/16 07:24:00


:::{.callout-tip}
## FEEDBACK TO DATA PROVIDER

Identification of the stations location requires three distinct files:

- the second section of the `coastal_water.csv` file
- the `R6zahyo.pdf` file further processed by [https://github.com/RML-IAEA/iaea.orbs](https://github.com/RML-IAEA/iaea.orbs)
- the second sections of all sheets of `close1F_water.xlsx` file
  
All files and sheets required to look up the location of the stations.

:::

In [None]:
#| exports
def get_locs_coastal_water(fname_coastal_water):
    locs_idx = find_location_section(pd.read_csv(fname_coastal_water, 
                                      skiprows=0, low_memory=False))
    
    df = pd.read_csv(fname_coastal_water, skiprows=locs_idx+1, 
                     low_memory=False).iloc[:, :3]
    
    df.columns = ['station', 'LON', 'LAT']
    df.dropna(subset=['LAT'], inplace=True)
    df['org'] = 'coastal_seawater.csv'
    return df

In [None]:
#| eval: false
df_locs_coastal_water = get_locs_coastal_water(fname_coastal_water)
df_locs_coastal_water.head()

Unnamed: 0,station,LON,LAT,org
0,T-0,37.42,141.04,coastal_seawater.csv
1,T-11,37.24,141.05,coastal_seawater.csv
2,T-12,37.15,141.04,coastal_seawater.csv
3,T-13-1,37.64,141.04,coastal_seawater.csv
4,T-14,37.55,141.06,coastal_seawater.csv


In [None]:
#| eval: false
df_locs_coastal_water['station'].unique()

array(['T-0', 'T-11', 'T-12', 'T-13-1', 'T-14', 'T-17-1', 'T-18', 'T-20',
       'T-22', 'T-3', 'T-4', 'T-4-1', 'T-4-2', 'T-5', 'T-6', 'T-7', 'T-A',
       'T-B', 'T-B1', 'T-B2', 'T-B3', 'T-B4', 'T-C', 'T-D', 'T-D1',
       'T-D5', 'T-D9', 'T-E', 'T-E1', 'T-Z', 'T-MG6', 'T-S1', 'T-S7',
       'T-H1', 'T-S2', 'T-S6', 'T-M10', 'T-MA', 'T-S3', 'T-S4', 'T-S8',
       'T-MG4', 'T-G4', 'T-MG5', 'T-MG1', 'T-MG0', 'T-MG3', 'T-MG2'],
      dtype=object)

:::{.callout-tip}
## FEEDBACK TO DATA PROVIDER

Data contained in the `close1F_water.xlsx` file are spread in several sheets (one per station). Each sheet further contains two sections: the measurements and the locations. 

For each sheet, we have to identify the line number where to split both measurements and the location. We then need to further iterate over all sheets to concatenate the results.

:::

In [None]:
#| exports
def get_clos1F_df(fname_clos1F):
    "Get measurements dataframe from close1F_water.xlsx file and parse datetime."
    excel_file = pd.ExcelFile(fname_clos1F)
    dfs = {}
    
    for sheet_name in tqdm(excel_file.sheet_names):
        locs_idx = find_location_section(pd.read_excel(excel_file, 
                                                       sheet_name=sheet_name,
                                                       skiprows=1))
        df = pd.read_excel(excel_file, 
                   sheet_name=sheet_name, 
                   skiprows=1,
                   nrows=locs_idx-1)
        
        df.dropna(subset=['Sampling point number'], inplace=True)
        df['Sampling date'] = df['Sampling date']\
            .astype(str)\
            .apply(lambda x: x.split(' ')[0]\
            .replace('-', '/'))
            
        dfs[sheet_name] = df
    
    df = pd.concat(dfs.values(), ignore_index=True)
    df.dropna(subset=['Sampling date'], inplace=True)
    df['TIME'] = df['Sampling date'] + ' ' + df['Sampling time'].astype(str)
    df = df.drop(columns=['Sampling date', 'Sampling time'])
    return df

In [None]:
#| eval: false
df_clos1F = get_clos1F_df(fname_clos1F); df_clos1F.head()

100%|██████████| 11/11 [00:06<00:00,  1.80it/s]


Unnamed: 0,Sampling point number,134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),Total beta radioactivity concentration (Bq/L),Total beta detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),Collection layer of seawater,...,106Ru detection limit (Bq/L),60Co radioactivity concentration (Bq/L),60Co detection limit (Bq/L),95Zr radioactivity concentration (Bq/L),95Zr detection limit (Bq/L),99Mo radioactivity concentration (Bq/L),99Mo detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),TIME
0,T-0-1,ND,1.5,ND,1.4,ND,18.0,,,,...,,,,,,,,,,2013/08/14 08:17:00
1,T-0-1,,,,,,,4.7,,,...,,,,,,,,,,2013/08/14 08:17:00
2,T-0-1,ND,1.1,ND,1.4,ND,20.0,,,,...,,,,,,,,,,2013/08/21 08:09:00
3,T-0-1,,,,,,,ND,2.9,,...,,,,,,,,,,2013/08/21 08:09:00
4,T-0-1,ND,0.66,ND,0.49,ND,17.0,,,,...,,,,,,,,,,2013/08/27 08:14:00


In [None]:
#| eval: false
df_clos1F['Sampling point number'].unique()

array(['T-0-1', 'T-0-1A', 'T-0-2', 'T-0-3', 'T-0-3A', 'T-1', 'T-2',
       'T-2-1', 'T-A1', 'T-A2', 'T-A3'], dtype=object)

In [None]:
#| exports
def get_locs_clos1F(fname_clos1F):
    "Get locations dataframe from close1F_water.xlsx file from each sheets."
    excel_file = pd.ExcelFile(fname_clos1F)
    dfs = {}
    
    for sheet_name in tqdm(excel_file.sheet_names):
        locs_idx = find_location_section(pd.read_excel(excel_file, 
                                                       sheet_name=sheet_name,
                                                       skiprows=1))
        df = pd.read_excel(excel_file, 
                           sheet_name=sheet_name, 
                           skiprows=locs_idx+2)
            
        dfs[sheet_name] = df
    
    df = pd.concat(dfs.values(), ignore_index=True).iloc[:, :3]
    df.dropna(subset=['Sampling coordinate North latitude (Decimal)'], inplace=True)    
    df.columns = ['station', 'LON', 'LAT']
    df['org'] = 'close1F.csv'
    return df

In [None]:
#| eval: false
df_locs_clos1F = get_locs_clos1F(fname_clos1F)
df_locs_clos1F.head()

100%|██████████| 11/11 [00:06<00:00,  1.78it/s]


Unnamed: 0,station,LON,LAT,org
0,T-0-1,37.43,141.04,close1F.csv
11,T-0-1A,37.43,141.05,close1F.csv
22,T-0-2,37.42,141.05,close1F.csv
33,T-0-3,37.42,141.04,close1F.csv
44,T-0-3A,37.42,141.05,close1F.csv


:::{.callout-tip}
## FEEDBACK TO DATA PROVIDER

In theory all locations are supposed to be provided in the [R6zahyo.pdf](https://radioactivity.nra.go.jp/cont/en/results/sea/R6zahyo.pdf) file. This file is further processed by https://github.com/RML-IAEA/iaea.orbs and the result is provided in the `station_points.csv` file. 

However, this file does not contain all locations refered to in both `coastal_water.csv` and `close1F_water.xlsx` files.

:::

In [None]:
#| exports
def get_locs_orbs(fname_iaea_orbs):
    df = pd.read_csv(fname_iaea_orbs)
    df.columns = ['org', 'station', 'LON', 'LAT']
    return df

In [None]:
#| eval: false
df_locs_orbs = get_locs_orbs(fname_iaea_orbs)
df_locs_orbs.head()

Unnamed: 0,org,station,LON,LAT
0,MOE,E-31,141.727667,39.059167
1,MOE,E-32,141.635667,38.996
2,MOE,E-37,141.948611,39.259167
3,MOE,E-38,141.755,39.008333
4,MOE,E-39,141.766667,38.991667


In [None]:
#| exports
def concat_locs(dfs):
    "Concatenate and drop duplicates from coastal_seawater.csv and iaea_orbs.csv (kept)"
    df = pd.concat(dfs)
    # Group by org to be used for sorting
    df['org_grp'] = df['org'].apply(
        lambda x: 1 if x == 'coastal_seawater.csv' else 2 if x == 'close1F.csv' else 0)
    df.sort_values('org_grp', ascending=True, inplace=True)
    # Drop duplicates and keep orbs data first
    df.drop_duplicates(subset='station', keep='first', inplace=True)
    df.drop(columns=['org_grp'], inplace=True)
    df.sort_values('station', ascending=True, inplace=True)
    return df

In [None]:
#| eval: false
# df_locs = concat_locs(df_locs_coastal_water, df_locs_orbs)
df_locs = concat_locs([df_locs_clos1F, df_locs_coastal_water, df_locs_orbs])
df_locs.head()

Unnamed: 0,station,LON,LAT,org
214,C-P1,139.863333,35.425,NRA
215,C-P2,139.863333,35.401667,NRA
216,C-P3,139.881667,35.37,NRA
217,C-P4,139.846667,35.356667,NRA
218,C-P5,139.8,35.343333,NRA


In [None]:
#| exports
def align_dfs(df_from, df_to):
    "Align columns structure of df_from to df_to."
    df = defaultdict()    
    for c in df_to.columns:
        df[c] = df_from[c].values if c in df_from.columns else np.NAN
    return pd.DataFrame(df)

In [None]:
# | eval: false
align_dfs(df_clos1F, df_coastal_water).head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME
0,T-0-1,,,,ND,1.5,ND,1.4,,,...,,,,,,,,,,2013/08/14 08:17:00
1,T-0-1,,,,,,,,,,...,,,4.7,,,,,,,2013/08/14 08:17:00
2,T-0-1,,,,ND,1.1,ND,1.4,,,...,,,,,,,,,,2013/08/21 08:09:00
3,T-0-1,,,,,,,,,,...,,,ND,2.9,,,,,,2013/08/21 08:09:00
4,T-0-1,,,,ND,0.66,ND,0.49,,,...,,,,,,,,,,2013/08/27 08:14:00


In [None]:
#| exports
def concat_dfs(df_coastal_water, df_clos1F):
    "Concatenate and drop duplicates from coastal_seawater.csv and close1F_water.xlsx (kept)"
    df_clos1F = align_dfs(df_clos1F, df_coastal_water)
    df = pd.concat([df_coastal_water, df_clos1F])
    return df

In [None]:
#| eval: false
df_meas = concat_dfs(df_coastal_water, df_clos1F)
df_meas.head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,,,,,2011/3/21 23:15:00
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,,,,,2011/3/21 23:45:00
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,,,,,2011/3/22 14:28:00
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,,,,,2011/3/22 15:06:00
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,,,,,34.0,25.0,,2011/3/23 13:51:00


In [None]:
#| exports
def georef_data(df_meas, df_locs):
    "Georeference measurements dataframe using locations dataframe."
    assert "Sampling point number" in df_meas.columns and "station" in df_locs.columns
    return pd.merge(df_meas, df_locs, how="inner", 
                    left_on='Sampling point number', right_on='station')

In [None]:
#| eval: false
df_meas_georef = georef_data(df_meas, df_locs)
df_meas_georef.head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,station,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


In [None]:
#| exports
def load_data(fname_coastal_water, fname_clos1F, fname_iaea_orbs):
    "Load, align and georeference TEPCO data"
    df_locs = concat_locs(
        [get_locs_coastal_water(fname_coastal_water), 
         get_locs_clos1F(fname_clos1F),
         get_locs_orbs(fname_iaea_orbs)])
    df_meas = concat_dfs(get_coastal_water_df(fname_coastal_water), get_clos1F_df(fname_clos1F))
    df_meas.dropna(subset=['Sampling point number'], inplace=True)
    return {'SEAWATER': georef_data(df_meas, df_locs)}

In [None]:
#| eval: false
dfs = load_data(fname_coastal_water, fname_clos1F, fname_iaea_orbs)
dfs['SEAWATER'].head()

100%|██████████| 11/11 [00:06<00:00,  1.77it/s]
100%|██████████| 11/11 [00:06<00:00,  1.74it/s]


Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,station,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


In [None]:
#| eval: false
print(f"# of cols, rows: {dfs['SEAWATER'].shape}")
dfs['SEAWATER'].head()

# of cols, rows: (47148, 53)


Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,station,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


In [None]:
#| eval: false
dfs['SEAWATER']['Sampling point number'].unique()


array(['T-3', 'T-4', 'T-5', 'T-7', 'T-11', 'T-12', 'T-14', 'T-18', 'T-20',
       'T-22', 'T-MA', 'T-M10', 'T-A', 'T-D', 'T-E', 'T-B', 'T-C',
       'T-MG1', 'T-MG2', 'T-MG3', 'T-MG4', 'T-MG5', 'T-MG6', 'T-D1',
       'T-D5', 'T-D9', 'T-E1', 'T-G4', 'T-H1', 'T-S5', 'T-S6', 'T-17-1',
       'T-B3', 'T-13-1', 'T-S3', 'T-S4', 'T-B4', 'T-S1', 'T-S2', 'T-MG0',
       'T-Z', 'T-B1', 'T-B2', 'T-S7', 'T-S8', 'T-0', 'T-4-1', 'T-4-2',
       'T-6', 'T-0-1', 'T-0-1A', 'T-0-2', 'T-0-3', 'T-0-3A', 'T-1', 'T-2',
       'T-2-1', 'T-A1', 'T-A2', 'T-A3'], dtype=object)

## Fix missing values

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: We remap the `ND` value to `NaN`. Please confirm that this is the correct way to handle missing values.
:::


`ND` is assigned `NaN`. This needs to be confirmed.

In [None]:
#| exports
class FixMissingValuesCB(Callback):
    "Assign `NaN` to values equal to `ND` (not detected) - to be confirmed "
    def __call__(self, tfm): 
        for k in tfm.dfs.keys():
            predicate = tfm.dfs[k] == 'ND'
            tfm.dfs[k][predicate] = np.nan

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[FixMissingValuesCB()])
tfm()['SEAWATER'].head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,station,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,160.0,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,120.0,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,200.0,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


## Remove 約 (about) character
    

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: We systematically remove the `約` character. Please confirm that this is the correct way to handle this. We could imagine that mentioning uncertainty would be less ambiguous in future.

:::

In [None]:
#| exports
class RemoveJapanaseCharCB(Callback):
    "Remove 約 (about) char"
    def _transform_if_about(self, value, about_char='約'):
        if pd.isna(value): return value
        return (value.replace(about_char, '') if str(value).count(about_char) != 0 
                else value)
    
    def __call__(self, tfm): 
        for k in tfm.dfs.keys():
            cols_rdn = [c for c in tfm.dfs[k].columns if ('(Bq/L)' in c) and (tfm.dfs[k][c].dtype == 'object')]
            tfm.dfs[k][cols_rdn] = tfm.dfs[k][cols_rdn].map(self._transform_if_about)

In [None]:
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB()])

tfm()['SEAWATER'].sample(10)

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,station,LON,LAT,org
22315,T-3,上層,,,,0.001,0.016,,,,...,,,,,,2022/12/6 13:50:00,T-3,141.026389,37.322222,TEPCO
29546,T-0-2,,,,,,,,,,...,,,,,,2021/08/30 06:26:00,T-0-2,141.046667,37.423333,TEPCO
20600,T-5,上層,,,,,,,,,...,,,,,,2021/12/6 07:29:00,T-5,141.2,37.416667,TEPCO
30414,T-0-3,,,,,,,,,,...,,,,,,2016/06/06 07:51,T-0-3,141.040278,37.416111,TEPCO
10101,T-6,上層,,,0.034,,0.12,,,,...,,,,,,2015/10/20 09:17:00,T-6,141.040556,37.478889,TEPCO
37930,T-1,上層,,,0.0052,,0.19,,,,...,,,,,,2022/06/10 09:35:00,T-1,141.034444,37.431111,TEPCO
20349,T-D1,上層,,,,,,,,,...,,,,,,2021/10/11 08:16:00,T-D1,141.072222,37.5,TEPCO
31955,T-0-3A,,,,,0.53,,0.71,,,...,,,,,,2018/11/05 07:00:00,T-0-3A,141.046667,37.416111,TEPCO
32714,T-0-3A,,,,,0.3,,0.25,,,...,,,,,,2024/11/18 08:06:00,T-0-3A,141.046667,37.416111,TEPCO
10810,T-D5,上層,,,0.002,,0.0098,,,,...,,,,,,2016/2/22 10:52:00,T-D5,141.072222,37.416667,TEPCO


## Fix values range string

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: Value ranges are provided as strings (e.g '4.0E+00<&<8.0E+00' or '1.0～2.7'). We replace them by their mean. Please confirm that this is the correct way to handle this. Again, mentioning uncertainty would be less ambiguous in future.

:::

In [None]:
#| exports
class FixRangeValueStringCB(Callback):
    "Replace range values (e.g '4.0E+00<&<8.0E+00' or '1.0～2.7') by their mean"
    
    def _extract_and_calculate_mean(self, s):
        # For scientific notation ranges
        float_strings = re.findall(r"[+-]?\d+\.?\d*E?[+-]?\d*", s)
        if float_strings:
            float_numbers = np.array(float_strings, dtype=float)
            return float_numbers.mean()
        return s
    
    def _transform_if_range(self, value):
        if pd.isna(value): 
            return value
        value = str(value)
        # Check for both range patterns
        if '<&<' in value or '～' in value:
            return self._extract_and_calculate_mean(value)
        return value

    def __call__(self, tfm): 
        for k in tfm.dfs.keys():
            cols_rdn = [c for c in tfm.dfs[k].columns 
                       if ('(Bq/L)' in c) and (tfm.dfs[k][c].dtype == 'object')]
            tfm.dfs[k][cols_rdn] = tfm.dfs[k][cols_rdn].map(self._transform_if_range).astype(float)

In [None]:
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB()
    ])

df_test = tfm()['SEAWATER']
df_test.sample(10)

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,station,LON,LAT,org
6679,T-MG2,中層,,,,0.002,0.003,,,,...,,,,,,2014/2/20 08:25:00,T-MG2,141.666667,38.3,TEPCO
42545,T-2,上層,,,,0.53,,0.6,,,...,,,,,,2021/11/25 06:51:00,T-2,141.033611,37.415833,TEPCO
32711,T-0-3A,,,,,,,,,,...,,,,,,2024/11/04 07:48:00,T-0-3A,141.046667,37.416111,TEPCO
18476,T-5,下層,,,,0.0014,0.0014,,,,...,,,,,,2020/9/4 07:12:00,T-5,141.2,37.416667,TEPCO
4731,T-11,上層,,,0.0042,,0.0081,,,,...,,,,,,2013/2/27 09:22:00,T-11,141.047222,37.241667,TEPCO
5912,T-14,下層,,,0.0048,,0.01,,,,...,,,,,,2013/10/4 11:04:00,T-14,141.0625,37.552778,TEPCO
25410,T-D1,上層,,,,,,,,,...,,,,,,2024/8/13 08:00:00,T-D1,141.072222,37.5,TEPCO
9937,T-11,下層,,,0.0019,,0.0088,,,,...,,,,,,2015/9/24 09:56:00,T-11,141.047222,37.241667,TEPCO
25609,T-6,上層,,,,0.0012,0.0074,,,,...,,,,,,2024/9/17 06:25:00,T-6,141.040556,37.478889,TEPCO
34350,T-1,上層,,,,,,,,,...,,,,,,2014/11/17 06:35:00,T-1,141.034444,37.431111,TEPCO


## Select columns of interest

We select the columns of interest and in particular the elements of interest, in our case radionuclides.

In [None]:
#| exports
common_coi = ['org', 'LON', 'LAT', 'TIME', 'station']
nuclides_pattern = '(Bq/L)'

In [None]:
#| exports
class SelectColsOfInterestCB(Callback):
    "Select columns of interest."
    def __init__(self, common_coi, nuclides_pattern): fc.store_attr()
    def __call__(self, tfm):
        nuc_of_interest = [c for c in tfm.dfs['SEAWATER'].columns if nuclides_pattern in c]
        tfm.dfs['SEAWATER'] = tfm.dfs['SEAWATER'][self.common_coi + nuc_of_interest]

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern)
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

Unnamed: 0,org,LON,LAT,TIME,station,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),...,144Ce radioactivity concentration (Bq/L),144Ce detection limit (Bq/L),54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L)
8057,TEPCO,141.666667,38.3,2014/10/21 08:18:00,T-MG2,,,,0.0018,0.0032,...,,,,,,,,,,
32748,TEPCO,141.034444,37.431111,2011/04/06 nan,T-1,,,,,,...,,,,,,,,,,
28989,TEPCO,141.046667,37.423333,2016/04/18 08:26,T-0-2,,,,0.7,,...,,,,,,,,,,
6387,TEPCO,141.072222,37.416667,2013/12/29 09:22:00,T-D5,,,0.014,,0.03,...,,,,,,,,,,
44385,close1F.csv,37.41,141.03,2013/06/25 06:55:00,T-2-1,,1.1,,1.1,,...,,,,,,,,,,


## Reshape: wide to long

This step is necessary to extract information such as nuclide names,  detection limint, uncertainty, ...


In [None]:
#| exports
class WideToLongCB(Callback):
    """
    Parse TEPCO measurement columns to extract nuclide name, measurement value, 
    detection limit and uncertainty
    """
    def __init__(self): fc.store_attr()
    
    
    def _melt(self, df):
        "Melt dataframe to long format."
        return df.melt(id_vars=['LON', 'LAT', 'TIME', 'station'])
        
    def _extract_nuclide(self, text):
        words = text.split(' ')
        # Handle special cases for alpha/beta
        if len(words) >= 2 and words[1].lower() in ['alpha', 'beta']:
            return f"{words[0]} {words[1]}"
        return words[0]
    
    def _nuclide_name(self, df):
        "Extract nuclide name from nuclide names."
        df['NUCLIDE'] = df['variable'].map(self._extract_nuclide)
        return df
    
    def _type_indicator(self, df):
        "Create type indicators."
        df['is_concentration'] = df['variable'].str.contains('radioactivity concentration')
        df['is_dl'] = df['variable'].str.contains('detection limit')
        df['is_unc'] = df['variable'].str.contains('statistical error')
        return df
    
    def _unit(self, df):
        "Extract unit from nuclide names."
        df['UNIT'] = df['variable'].str.extract(r'\((.*?)\)')
        return df
    
    def _type_column(self, df):
        "Create type column."
        conditions = [
            df['is_concentration'],
            df['is_dl'],
            df['is_unc']
        ]
        choices = ['VALUE', 'DL', 'UNC']
        df['type'] = np.select(conditions, choices)
        df = df.drop(['is_concentration', 'is_dl', 'is_unc'], axis=1)
        return df
    
    def __call__(self, tfm):
        tfm.dfs['SEAWATER'] = self._melt(tfm.dfs['SEAWATER'])
        tfm.dfs['SEAWATER'] = self._nuclide_name(tfm.dfs['SEAWATER'])
        tfm.dfs['SEAWATER'] = self._type_indicator(tfm.dfs['SEAWATER'])
        tfm.dfs['SEAWATER'] = self._unit(tfm.dfs['SEAWATER'])
        tfm.dfs['SEAWATER'] = self._type_column(tfm.dfs['SEAWATER'])
        tfm.dfs['SEAWATER'] = pd.pivot_table(
            tfm.dfs['SEAWATER'],
            values='value',
            index=['LON', 'LAT', 'TIME', 'station', 'NUCLIDE', 'UNIT'],
            columns='type',
            aggfunc='first'
        ).reset_index()
        # reset the index and rename it ID
        tfm.dfs['SEAWATER'].reset_index(inplace=True)
        tfm.dfs['SEAWATER'].rename(columns={'index': 'ID'}, inplace=True)
        

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.head()

type,ID,LON,LAT,TIME,station,NUCLIDE,UNIT,DL,UNC,VALUE
0,0,37.21,141.01,2012/10/16 07:25:00,T-4-1,131I,Bq/L,0.13,,
1,1,37.21,141.01,2012/10/16 07:25:00,T-4-1,134Cs,Bq/L,0.19,,
2,2,37.21,141.01,2012/10/16 07:25:00,T-4-1,137Cs,Bq/L,0.27,,
3,3,37.21,141.01,2012/10/2 07:30:00,T-4-1,131I,Bq/L,0.11,,
4,4,37.21,141.01,2012/10/2 07:30:00,T-4-1,134Cs,Bq/L,0.22,,


## Remap `UNIT` name to MARIS nomenclature

In [None]:
#| exports
unit_mapping = {'Bq/L': 3}

In [None]:
#| exports
class RemapUnitNameCB(Callback):
    """
    Remap `UNIT` name to MARIS id.
    """
    def __init__(self, unit_mapping): fc.store_attr()
    def __call__(self, tfm):
        tfm.dfs['SEAWATER']['UNIT'] = tfm.dfs['SEAWATER']['UNIT'].map(self.unit_mapping)


In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping)
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

type,ID,LON,LAT,TIME,station,NUCLIDE,UNIT,DL,UNC,VALUE
36217,36217,141.034444,37.431111,2014/03/20 07:10:00,T-1,134Cs,3,0.78,,
45060,45060,141.034444,37.431111,2022/08/13 07:50:00,T-1,134Cs,3,0.7,,
7734,7734,140.702222,35.9875,2013/2/21 13:38:00,T-D,131I,3,1.7,,
19344,19344,141.033611,37.415833,2011/06/08 13:40:00,T-2,131I,3,7.0,,
64933,64933,141.047222,37.311111,2021/12/16 05:47:00,T-S7,137Cs,3,,,0.014


## Remap `NUCLIDE` name to MARIS nomenclature

In [None]:
#| exports
nuclide_mapping = {
    '131I': 29,
    '134Cs': 31,
    '137Cs': 33,
    '125Sb': 24,
    'Total beta': 103,
    '238Pu': 67,
    '239Pu+240Pu': 77,
    '3H': 1,
    '89Sr': 11,
    '90Sr': 12,
    'Total alpha': 104,
    '132I': 100,
    '136Cs': 102,
    '58Co': 8,
    '105Ru': 97,
    '106Ru': 17,
    '140La': 35,
    '140Ba': 34,
    '132Te': 99,
    '60Co': 9,
    '144Ce': 37,
    '54Mn': 6
}

In [None]:
#| exports
class RemapNuclideNameCB(Callback):
    """
    Remap `NUCLIDE` name to MARIS id.
    """
    def __init__(self, nuclide_mapping): fc.store_attr()
    def __call__(self, tfm):
        tfm.dfs['SEAWATER']['NUCLIDE'] = tfm.dfs['SEAWATER']['NUCLIDE'].map(self.nuclide_mapping)

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping)
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

type,ID,LON,LAT,TIME,station,NUCLIDE,UNIT,DL,UNC,VALUE
46208,46208,141.034444,37.431111,2023/11/16 07:20:00,T-1,33,3,0.72,,
19814,19814,141.033611,37.415833,2011/10/04 08:25:00,T-2,31,3,6.0,,
53426,53426,141.040556,37.478889,2015/4/7 09:20:00,T-6,103,3,17.0,,
57260,57260,141.046667,37.416111,2021/11/01 06:40:00,T-0-3A,31,3,0.63,,
2954,2954,37.41,141.03,2014/11/10 05:35:00,T-2-1,1,3,1.8,,


## Remap `DL` value to MARIS nomenclature

We remap `DL` (Detection Limit) value to MARIS ids as follows:
    
- if a `DL` value is reported with assign `2` (Detection limit or '<')
- if a `DL` value is not reported with assign `1` (Detected value or '=')

In [None]:
#| exports
class RemapDLCB(Callback):
    """
    Remap `DL` name to MARIS id.
    """
    def __init__(self): fc.store_attr()
    def dl_mapping(self, value): return 1 if pd.isna(value) else 2
    def __call__(self, tfm): 
        tfm.dfs['SEAWATER']['DL'] = tfm.dfs['SEAWATER']['DL'].map(self.dl_mapping)

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(10)

type,ID,LON,LAT,TIME,station,NUCLIDE,UNIT,DL,UNC,VALUE
56583,56583,141.046667,37.416111,2018/07/02 06:48:00,T-0-3A,1,3,2,,
33149,33149,141.034444,37.431111,2011/07/11 10:50:00,T-1,67,3,2,,
87985,87985,141.666667,38.3,2019/4/10 08:30:00,T-MG2,31,3,2,,
59706,59706,141.046667,37.423333,2021/08/18 06:26:00,T-0-2,1,3,2,,
42999,42999,141.034444,37.431111,2020/03/09 07:40:00,T-1,33,3,2,,
82975,82975,141.25,38.166667,2012/7/10 08:12:00,T-MG5,11,3,2,,
82276,82276,141.216667,37.533333,2020/3/24 06:25:00,T-B1,31,3,2,,
22102,22102,141.033611,37.415833,2017/01/24 07:00:00,T-2,31,3,2,,
1941,1941,37.41,141.03,2014/03/05 05:37:00,T-2-1,29,3,2,,
5117,5117,37.41,141.03,2016/6/10 06:05,T-2-1,29,3,2,,


## Parse & encode time

In [None]:
#| exports
class ParseTimeCB(Callback):
    "Parse time column from TEPCO."
    def __init__(self,
                 time_name='TIME'):
        fc.store_attr()
        
    def __call__(self, tfm):
        tfm.dfs['SEAWATER'][self.time_name] = pd.to_datetime(tfm.dfs['SEAWATER'][self.time_name], 
                                                             format='%Y/%m/%d %H:%M:%S', errors='coerce')

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB(),
    ParseTimeCB(),
    EncodeTimeCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)




type,ID,LON,LAT,TIME,station,NUCLIDE,UNIT,DL,UNC,VALUE
64008,64008,141.047222,37.241667,1488278220,T-11,31,3,2,,
76840,76840,141.083333,37.75,1330931400,T-MA,31,3,2,,
61656,61656,141.046667,37.430556,1537165680,T-0-1A,31,3,2,,
26767,26767,141.033611,37.415833,1589784300,T-2,31,3,2,,0.0049
21477,21477,141.033611,37.415833,1352793600,T-2,29,3,2,,


## Sanitize coordinates

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

df_test = tfm()['SEAWATER']
df_test.sample(5)



type,ID,LON,LAT,TIME,station,NUCLIDE,UNIT,DL,UNC,VALUE
47215,47215,141.0375,37.15,1320039300,T-12,31,3,2,,
87089,87089,141.583333,38.633333,1725525900,T-MG0,31,3,2,,
49547,49547,141.040278,37.416111,1602660060,T-0-3,31,3,2,,
70272,70272,141.072167,37.333333,1654934520,T-D9,31,3,2,,
68448,68448,141.072167,37.333333,1352361300,T-D9,12,3,1,,0.011


## Encode to NetCDF

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

dfs_tfm = tfm()
tfm.logs



['Assign `NaN` to values equal to `ND` (not detected) - to be confirmed ',
 'Remove 約 (about) char',
 "Replace range values (e.g '4.0E+00<&<8.0E+00' or '1.0～2.7') by their mean",
 'Select columns of interest.',
 '\n    Parse TEPCO measurement columns to extract nuclide name, measurement value, \n    detection limit and uncertainty\n    ',
 '\n    Remap `UNIT` name to MARIS id.\n    ',
 '\n    Remap `NUCLIDE` name to MARIS id.\n    ',
 '\n    Remap `DL` name to MARIS id.\n    ',
 'Parse time column from TEPCO.',
 'Encode time as seconds since epoch.',
 'Drop rows with invalid longitude & latitude values. Convert `,` separator to `.` separator.']

In [None]:
dfs_tfm['SEAWATER'].sample(10)

type,ID,LON,LAT,TIME,station,NUCLIDE,UNIT,DL,UNC,VALUE
42098,42098,141.034444,37.431111,1551599400,T-1,29,3,2,,
49104,49104,141.040278,37.416111,1534140660,T-0-3,33,3,2,,
43640,43640,141.034444,37.431111,1607416500,T-1,31,3,2,,
35686,35686,141.034444,37.431111,1381388100,T-1,29,3,2,,
88211,88211,141.666667,38.3,1709626680,T-MG2,31,3,2,,
73680,73680,141.072222,37.5,1477989120,T-D1,104,3,2,,
61621,61621,141.046667,37.430556,1531722660,T-0-1A,33,3,2,,
38168,38168,141.034444,37.431111,1448873100,T-1,29,3,2,,
29043,29043,141.033611,37.415833,1649572380,T-2,103,3,1,,11.0
15774,15774,141.0225,37.824444,1691485260,T-22,1,3,1,,0.087


In [None]:
#| exports
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| exports
def get_attrs(tfm, zotero_key, kw=kw):
    "Retrieve global attributes from MARIS dump."
    return GlobAttrsFeeder(tfm.dfs, cbs=[
        BboxCB(),
        TimeRangeCB(),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
        ])()

In [None]:
#| eval: false
get_attrs(tfm, zotero_key='JEV6HP5A', kw=kw)

{'geospatial_lat_min': '141.66666667',
 'geospatial_lat_max': '38.63333333',
 'geospatial_lon_min': '140.60388889',
 'geospatial_lon_max': '35.79611111',
 'geospatial_bounds': 'POLYGON ((140.60388889 35.79611111, 141.66666667 35.79611111, 141.66666667 38.63333333, 140.60388889 38.63333333, 140.60388889 35.79611111))',
 'time_coverage_start': '2011-03-21T14:30:00',
 'time_coverage_end': '2024-12-21T08:03:00',
 'id': 'JEV6HP5A',
 'title': "Readings of Sea Area Monitoring - Monitoring of sea water - Sea area close to TEPCO's Fukushima Daiichi NPS / Coastal area - Readings of Sea Area Monitoring [TEPCO]",
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "", "lastName": "TEPCO - Tokyo Electric Power Company"}]',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Mar

In [None]:
#| exports
def encode(
    fname_out: str, # Path to the folder where the NetCDF output will be saved
    **kwargs # Additional keyword arguments
    ):
    "Encode TEPCO data to NetCDF."
    dfs = load_data(fname_coastal_water, fname_clos1F, fname_iaea_orbs)
    
    tfm = Transformer(dfs, cbs=[
        FixMissingValuesCB(),
        RemoveJapanaseCharCB(),
        FixRangeValueStringCB(),
        SelectColsOfInterestCB(common_coi, nuclides_pattern),
        WideToLongCB(),
        RemapUnitNameCB(unit_mapping),
        RemapNuclideNameCB(nuclide_mapping),
        RemapDLCB(),
        ParseTimeCB(),
        EncodeTimeCB(),
        SanitizeLonLatCB()
    ])        
    tfm()
    encoder = NetCDFEncoder(tfm.dfs, 
                            dest_fname=fname_out, 
                            global_attrs=get_attrs(tfm, zotero_key='JEV6HP5A', kw=kw),
                            verbose=kwargs.get('verbose', False)
                            )
    encoder.encode()

In [None]:
encode(fname_out, verbose=False)

100%|██████████| 11/11 [00:06<00:00,  1.73it/s]
100%|██████████| 11/11 [00:06<00:00,  1.76it/s]


