In [None]:
#| default_exp handlers.tepco

# TEPCO 
> Data pipeline (handler) to convert TEPCO dataset ([Source](https://radioactivity.nsr.go.jp/ja/list/349/list-1.html)) to `NetCDF` format

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
import warnings
warnings.filterwarnings('ignore')

In [None]:
#| export
import pandas as pd
import re
import numpy as np
import fastcore.all as fc
from tqdm import tqdm
from collections import defaultdict

from marisco.callbacks import (
    Callback, 
    Transformer,
    EncodeTimeCB, 
    SanitizeLonLatCB,
    EncodeTimeCB, 
    )

from marisco.configs import cfg
from marisco.encoders import NetCDFEncoder

from marisco.metadata import (
    GlobAttrsFeeder, 
    BboxCB,
    TimeRangeCB,
    ZoteroCB, 
    KeyValuePairCB    
    )

from marisco.netcdf2csv import decode

## Configuration & file paths

In [None]:
#| exports
fname_coastal_water = 'https://radioactivity.nra.go.jp/cont/en/results/sea/coastal_water.csv'
fname_clos1F = 'https://radioactivity.nra.go.jp/cont/en/results/sea/close1F_water.xlsx'
fname_iaea_orbs = 'https://raw.githubusercontent.com/RML-IAEA/iaea.orbs/refs/heads/main/src/iaea/orbs/stations/station_points.csv'

fname_out = '../../_data/output/tepco.nc'

## Load data

We here load the data from the [NRA (Nuclear Regulatory Authority)](https://radioactivity.nra.go.jp/en/results) website. For the moment, we only process radioactivity concentration data in the seawater around Fukushima Dai-ichi NPP [TEPCO] (`coastal_water.csv`) and in the `close1F_water.xlsx` file.

In near future, MARIS will provide a dedicated handler for all related [ALPS data](https://radioactivity.nra.go.jp/en/results#sec-12) including measurements not only provided by TEPCO but also MOE, NRA, MLITT and Fukushima Prefecture.



:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

The **coastal_water.csv** file contains two sections: the measurements and the locations. We identify below the line number where the locations begin. A single point of truth for the location of the stations would ease the processing in future.

:::

In [None]:
#| exports
def find_location_section(df, 
                          col_idx=0,
                          pattern='Sampling point number'
                          ):
    "Find the line number where location data begins."
    mask = df.iloc[:, col_idx] == pattern
    indices = df[mask].index
    return indices[0] if len(indices) > 0 else -1

In [None]:
#| eval: false
find_location_section(pd.read_csv(fname_coastal_water, low_memory=False))

np.int64(28039)

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

Distinct parsing of the time from `coastal_water.csv` and `close1F_water.xlsx` files are required. Indeed:

- `coastal_water.csv` uses the format `YYYY/MM/DD` in the `Sampling  HH:MM` and 
- `close1F_water.xlsx` uses the format `YYYY-MM-DD HH:MM:SS`.

:::

In [None]:
#| exports
def fix_sampling_time(x):
    if pd.isna(x): 
        return '00:00:00'
    else:
        hour, min =  x.split(':')[:2]
        return f"{hour if len(hour) == 2 else '0' + hour}:{min}:00"

In [None]:
#| exports
def get_coastal_water_df(fname_coastal_water):
    "Get the measurements dataframe from the `coastal_water.csv` file."
    
    locs_idx = find_location_section(pd.read_csv(fname_coastal_water, 
                                      skiprows=0, low_memory=False))
    
    df = pd.read_csv(fname_coastal_water, skiprows=1, 
                     nrows=locs_idx - 1,
                     low_memory=False)
    df.dropna(subset=['Sampling point number'], inplace=True)
    df['Sampling time'] = df['Sampling time'].map(fix_sampling_time)
    
    df['TIME'] = df['Sampling date'].replace('-', '/') + ' ' + df['Sampling time']
    
    df = df.drop(columns=['Sampling date', 'Sampling time'])
    return df

In [None]:
#| eval: false
df_coastal_water = get_coastal_water_df(fname_coastal_water)
df_coastal_water.tail()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME
28031,T-S3,上層,,,,,,,,,...,,,ND,6.9,,,,,,2025/1/8 09:36:00
28032,T-S4,上層,,,,,,,,,...,,,ND,6.9,,,,,,2025/1/8 09:59:00
28033,T-D5,上層,,,,,,,,,...,,,ND,6.3,,,,,,2025/1/13 07:44:00
28034,T-S8,上層,,,,,,,,,...,,,ND,6.6,,,,,,2025/1/15 05:22:00
28035,T-D5,上層,,,,,,,,,...,,,ND,7.4,,,,,,2025/1/20 07:57:00


:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

Identification of the stations location requires three distinct files:

- the second section of the `coastal_water.csv` file
- the `R6zahyo.pdf` file further processed by [https://github.com/RML-IAEA/iaea.orbs](https://github.com/RML-IAEA/iaea.orbs)
- the second sections of all sheets of `close1F_water.xlsx` file
  
All files and sheets required to look up the location of the stations.

:::

In [None]:
#| exports
def get_locs_coastal_water(fname_coastal_water):
    locs_idx = find_location_section(pd.read_csv(fname_coastal_water, 
                                      skiprows=0, low_memory=False))
    
    df = pd.read_csv(fname_coastal_water, skiprows=locs_idx+1, 
                     low_memory=False).iloc[:, :3]
    
    df.columns = ['STATION', 'LON', 'LAT']
    df.dropna(subset=['LAT'], inplace=True)
    df['org'] = 'coastal_seawater.csv'
    return df

In [None]:
#| eval: false
df_locs_coastal_water = get_locs_coastal_water(fname_coastal_water)
print(f'Nb. of stations: {len(df_locs_coastal_water)}')
df_locs_coastal_water.head()

Nb. of stations: 48


Unnamed: 0,STATION,LON,LAT,org
0,T-0,37.42,141.04,coastal_seawater.csv
1,T-11,37.24,141.05,coastal_seawater.csv
2,T-12,37.15,141.04,coastal_seawater.csv
3,T-13-1,37.64,141.04,coastal_seawater.csv
4,T-14,37.55,141.06,coastal_seawater.csv


In [None]:
#| eval: false
df_locs_coastal_water.STATION.unique()

array(['T-0', 'T-11', 'T-12', 'T-13-1', 'T-14', 'T-17-1', 'T-18', 'T-20',
       'T-22', 'T-3', 'T-4', 'T-4-1', 'T-4-2', 'T-5', 'T-6', 'T-7', 'T-A',
       'T-B', 'T-B1', 'T-B2', 'T-B3', 'T-B4', 'T-C', 'T-D', 'T-D1',
       'T-D5', 'T-D9', 'T-E', 'T-E1', 'T-Z', 'T-MG6', 'T-S1', 'T-S7',
       'T-H1', 'T-S2', 'T-S6', 'T-M10', 'T-MA', 'T-S3', 'T-S4', 'T-S8',
       'T-MG4', 'T-G4', 'T-MG5', 'T-MG1', 'T-MG0', 'T-MG3', 'T-MG2'],
      dtype=object)

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

Data contained in the `close1F_water.xlsx` file are spread in several sheets (one per station). Each sheet further contains two sections: the measurements and the locations. 

For each sheet, we have to identify the line number where to split both measurements and the location. We then need to further iterate over all sheets to concatenate the results.

:::

In [None]:
#| exports
def get_clos1F_df(fname_clos1F):
    "Get measurements dataframe from close1F_water.xlsx file and parse datetime."
    excel_file = pd.ExcelFile(fname_clos1F)
    dfs = {}
    
    for sheet_name in tqdm(excel_file.sheet_names):
        locs_idx = find_location_section(pd.read_excel(excel_file, 
                                                       sheet_name=sheet_name,
                                                       skiprows=1))
        df = pd.read_excel(excel_file, 
                   sheet_name=sheet_name, 
                   skiprows=1,
                   nrows=locs_idx-1)
        
        df.dropna(subset=['Sampling point number'], inplace=True)
        df['Sampling date'] = df['Sampling date']\
            .astype(str)\
            .apply(lambda x: x.split(' ')[0]\
            .replace('-', '/'))
            
        dfs[sheet_name] = df
    
    df = pd.concat(dfs.values(), ignore_index=True)
    df.dropna(subset=['Sampling date'], inplace=True)
    df['TIME'] = df['Sampling date'] + ' ' + df['Sampling time'].astype(str)
    df = df.drop(columns=['Sampling date', 'Sampling time'])
    return df

In [None]:
#| eval: false
df_clos1F = get_clos1F_df(fname_clos1F)
df_clos1F.head()

100%|██████████| 11/11 [00:06<00:00,  1.69it/s]


Unnamed: 0,Sampling point number,134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),Total beta radioactivity concentration (Bq/L),Total beta detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),Collection layer of seawater,...,106Ru detection limit (Bq/L),60Co radioactivity concentration (Bq/L),60Co detection limit (Bq/L),95Zr radioactivity concentration (Bq/L),95Zr detection limit (Bq/L),99Mo radioactivity concentration (Bq/L),99Mo detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),TIME
0,T-0-1,ND,1.5,ND,1.4,ND,18.0,,,,...,,,,,,,,,,2013/08/14 08:17:00
1,T-0-1,,,,,,,4.7,,,...,,,,,,,,,,2013/08/14 08:17:00
2,T-0-1,ND,1.1,ND,1.4,ND,20.0,,,,...,,,,,,,,,,2013/08/21 08:09:00
3,T-0-1,,,,,,,ND,2.9,,...,,,,,,,,,,2013/08/21 08:09:00
4,T-0-1,ND,0.66,ND,0.49,ND,17.0,,,,...,,,,,,,,,,2013/08/27 08:14:00


In [None]:
#| eval: false
df_clos1F['Sampling point number'].unique()

array(['T-0-1', 'T-0-1A', 'T-0-2', 'T-0-3', 'T-0-3A', 'T-1', 'T-2',
       'T-2-1', 'T-A1', 'T-A2', 'T-A3'], dtype=object)

In [None]:
#| exports
def get_locs_clos1F(fname_clos1F):
    "Get locations dataframe from close1F_water.xlsx file from each sheets."
    excel_file = pd.ExcelFile(fname_clos1F)
    dfs = {}
    
    for sheet_name in tqdm(excel_file.sheet_names):
        locs_idx = find_location_section(pd.read_excel(excel_file, 
                                                       sheet_name=sheet_name,
                                                       skiprows=1))
        df = pd.read_excel(excel_file, 
                           sheet_name=sheet_name, 
                           skiprows=locs_idx+2)
            
        dfs[sheet_name] = df
    
    df = pd.concat(dfs.values(), ignore_index=True).iloc[:, :3]
    df.dropna(subset=['Sampling coordinate North latitude (Decimal)'], inplace=True)    
    df.columns = ['STATION', 'LON', 'LAT']
    df['org'] = 'close1F.csv'
    return df

In [None]:
#| eval: false
df_locs_clos1F = get_locs_clos1F(fname_clos1F)
print(f'Nb. of stations: {len(df_locs_clos1F)}')
df_locs_clos1F.head()

100%|██████████| 11/11 [00:04<00:00,  2.36it/s]

Nb. of stations: 11





Unnamed: 0,STATION,LON,LAT,org
0,T-0-1,37.43,141.04,close1F.csv
11,T-0-1A,37.43,141.05,close1F.csv
22,T-0-2,37.42,141.05,close1F.csv
33,T-0-3,37.42,141.04,close1F.csv
44,T-0-3A,37.42,141.05,close1F.csv


:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

The `close1F_water.xlsx` file contains station locations that are not present in the `coastal_water.csv` dataset, as demonstrated in the comparison below:
:::

In [None]:
#| eval: false
set(df_locs_clos1F.STATION) - set(df_locs_coastal_water.STATION)

{'T-0-1',
 'T-0-1A',
 'T-0-2',
 'T-0-3',
 'T-0-3A',
 'T-1',
 'T-2',
 'T-2-1',
 'T-A1',
 'T-A2',
 'T-A3'}

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

In theory all locations are supposed to be provided in the [R6zahyo.pdf](https://radioactivity.nra.go.jp/cont/en/results/sea/R6zahyo.pdf) file. This file is further processed by https://github.com/RML-IAEA/iaea.orbs and the result is provided in the `station_points.csv` file. 

However, this file lacks complete coverage of locations referenced in both `coastal_water.csv` and `close1F_water.xlsx` files, while simultaneously containing additional locations not present in either (see below). A more standardized and comprehensive location reference system would significantly improve the efficiency and reliability of the data ingestion process.

:::

In [None]:
#| exports
def get_locs_orbs(fname_iaea_orbs):
    df = pd.read_csv(fname_iaea_orbs)
    df.columns = ['org', 'STATION', 'LON', 'LAT']
    return df

In [None]:
#| eval: false
df_locs_orbs = get_locs_orbs(fname_iaea_orbs)
df_locs_orbs.head()

Unnamed: 0,org,STATION,LON,LAT
0,MOE,E-31,141.727667,39.059167
1,MOE,E-32,141.635667,38.996
2,MOE,E-37,141.948611,39.259167
3,MOE,E-38,141.755,39.008333
4,MOE,E-39,141.766667,38.991667


In [None]:
#| eval: False
set(df_locs_orbs.STATION) - (set(df_locs_clos1F.STATION) | set(df_locs_coastal_water.STATION))

{'C-P1',
 'C-P2',
 'C-P3',
 'C-P4',
 'C-P5',
 'C-P8',
 'E-31',
 'E-32',
 'E-37',
 'E-38',
 'E-39',
 'E-3A',
 'E-41',
 'E-42',
 'E-43',
 'E-44',
 'E-45',
 'E-46',
 'E-47',
 'E-48',
 'E-49',
 'E-4A',
 'E-4B',
 'E-4C',
 'E-4F',
 'E-4G',
 'E-4H',
 'E-4J',
 'E-4K',
 'E-4L',
 'E-4M',
 'E-71',
 'E-72',
 'E-73',
 'E-74',
 'E-75',
 'E-76',
 'E-77',
 'E-78',
 'E-79',
 'E-7A',
 'E-7B',
 'E-7C',
 'E-7D',
 'E-7F',
 'E-7G',
 'E-7H',
 'E-7I',
 'E-7J',
 'E-7K',
 'E-7L',
 'E-81',
 'E-82',
 'E-83',
 'E-84',
 'E-85',
 'E-S1',
 'E-S10',
 'E-S13',
 'E-S14',
 'E-S15',
 'E-S17',
 'E-S18',
 'E-S19',
 'E-S20',
 'E-S21',
 'E-S22',
 'E-S23',
 'E-S24',
 'E-S25',
 'E-S26',
 'E-S27',
 'E-S28',
 'E-S29',
 'E-S3',
 'E-S30',
 'E-S31',
 'E-S32',
 'E-S33',
 'E-S34',
 'E-S35',
 'E-S36',
 'E-S4',
 'E-S5',
 'E-T1',
 'E-T2',
 'E-T3',
 'E-T4',
 'E-T5',
 'E-T6',
 'E-T7',
 'E-T8',
 'F-P01',
 'F-P02',
 'F-P03',
 'F-P04',
 'F-P05',
 'F-P06',
 'F-P07',
 'F-P08',
 'F-P09',
 'F-P10',
 'F-P11',
 'F-P12',
 'F-P13',
 'F-P14',
 'F-P15'

In [None]:
#| exports
def concat_locs(dfs):
    "Concatenate and drop duplicates from coastal_seawater.csv and iaea_orbs.csv (kept)"
    df = pd.concat(dfs)
    # Group by org to be used for sorting
    df['org_grp'] = df['org'].apply(
        lambda x: 1 if x == 'coastal_seawater.csv' else 2 if x == 'close1F.csv' else 0)
    df.sort_values('org_grp', ascending=True, inplace=True)
    # Drop duplicates and keep orbs data first
    df.drop_duplicates(subset='STATION', keep='first', inplace=True)
    df.drop(columns=['org_grp'], inplace=True)
    df.sort_values('STATION', ascending=True, inplace=True)
    return df

In [None]:
#| eval: false
df_locs = concat_locs([df_locs_clos1F, df_locs_coastal_water, df_locs_orbs])
df_locs.head()

Unnamed: 0,STATION,LON,LAT,org
214,C-P1,139.863333,35.425,NRA
215,C-P2,139.863333,35.401667,NRA
216,C-P3,139.881667,35.37,NRA
217,C-P4,139.846667,35.356667,NRA
218,C-P5,139.8,35.343333,NRA


In [None]:
#| exports
def align_dfs(df_from, df_to):
    "Align columns structure of df_from to df_to."
    df = defaultdict()    
    for c in df_to.columns:
        df[c] = df_from[c].values if c in df_from.columns else np.nan
    return pd.DataFrame(df)

In [None]:
# | eval: false
align_dfs(df_clos1F, df_coastal_water).head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME
0,T-0-1,,,,ND,1.5,ND,1.4,,,...,,,,,,,,,,2013/08/14 08:17:00
1,T-0-1,,,,,,,,,,...,,,4.7,,,,,,,2013/08/14 08:17:00
2,T-0-1,,,,ND,1.1,ND,1.4,,,...,,,,,,,,,,2013/08/21 08:09:00
3,T-0-1,,,,,,,,,,...,,,ND,2.9,,,,,,2013/08/21 08:09:00
4,T-0-1,,,,ND,0.66,ND,0.49,,,...,,,,,,,,,,2013/08/27 08:14:00


In [None]:
#| exports
def concat_dfs(df_coastal_water, df_clos1F):
    "Concatenate and drop duplicates from coastal_seawater.csv and close1F_water.xlsx (kept)"
    df_clos1F = align_dfs(df_clos1F, df_coastal_water)
    df = pd.concat([df_coastal_water, df_clos1F])
    return df

In [None]:
#| eval: false
df_meas = concat_dfs(df_coastal_water, df_clos1F)
df_meas.head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,,,,,2011/3/21 23:15:00
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,,,,,2011/3/21 23:45:00
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,,,,,2011/3/22 14:28:00
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,,,,,2011/3/22 15:06:00
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,,,,,34.0,25.0,,2011/3/23 13:51:00


In [None]:
#| exports
def georef_data(df_meas, df_locs):
    "Georeference measurements dataframe using locations dataframe."
    assert "Sampling point number" in df_meas.columns and "STATION" in df_locs.columns
    return pd.merge(df_meas, df_locs, how="inner", 
                    left_on='Sampling point number', right_on='STATION')

In [None]:
#| eval: false
df_meas_georef = georef_data(df_meas, df_locs)
df_meas_georef.head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


In [None]:
#| exports
def load_data(fname_coastal_water, fname_clos1F, fname_iaea_orbs):
    "Load, align and georeference TEPCO data"
    df_locs = concat_locs(
        [get_locs_coastal_water(fname_coastal_water), 
         get_locs_clos1F(fname_clos1F),
         get_locs_orbs(fname_iaea_orbs)])
    df_meas = concat_dfs(get_coastal_water_df(fname_coastal_water), get_clos1F_df(fname_clos1F))
    df_meas.dropna(subset=['Sampling point number'], inplace=True)
    return {'SEAWATER': georef_data(df_meas, df_locs)}

In [None]:
#| eval: false
dfs = load_data(fname_coastal_water, fname_clos1F, fname_iaea_orbs)
dfs['SEAWATER'].head()

100%|██████████| 11/11 [00:05<00:00,  1.93it/s]
100%|██████████| 11/11 [00:04<00:00,  2.28it/s]


Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


In [None]:
#| eval: false
print(f"# of cols, rows: {dfs['SEAWATER'].shape}")
dfs['SEAWATER'].head()

# of cols, rows: (47526, 53)


Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


In [None]:
#| eval: false
dfs['SEAWATER'].STATION.unique()

array(['T-3', 'T-4', 'T-5', 'T-7', 'T-11', 'T-12', 'T-14', 'T-18', 'T-20',
       'T-22', 'T-MA', 'T-M10', 'T-A', 'T-D', 'T-E', 'T-B', 'T-C',
       'T-MG1', 'T-MG2', 'T-MG3', 'T-MG4', 'T-MG5', 'T-MG6', 'T-D1',
       'T-D5', 'T-D9', 'T-E1', 'T-G4', 'T-H1', 'T-S5', 'T-S6', 'T-17-1',
       'T-B3', 'T-13-1', 'T-S3', 'T-S4', 'T-B4', 'T-S1', 'T-S2', 'T-MG0',
       'T-Z', 'T-B1', 'T-B2', 'T-S7', 'T-S8', 'T-0', 'T-4-1', 'T-4-2',
       'T-6', 'T-0-1', 'T-0-1A', 'T-0-2', 'T-0-3', 'T-0-3A', 'T-1', 'T-2',
       'T-2-1', 'T-A1', 'T-A2', 'T-A3'], dtype=object)

## Fix missing values

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

We remap the `ND` value to `NaN`. Please confirm that this is the correct way to handle missing values.
:::


`ND` is assigned `NaN`. This needs to be confirmed.

In [None]:
#| exports
class FixMissingValuesCB(Callback):
    "Assign `NaN` to values equal to `ND` (not detected) - to be confirmed "
    def __call__(self, tfm): 
        for k in tfm.dfs.keys():
            predicate = tfm.dfs[k] == 'ND'
            tfm.dfs[k][predicate] = np.nan

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[FixMissingValuesCB()])
tfm()['SEAWATER'].head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,160.0,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,120.0,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,200.0,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


## Remove 約 (about) character
    

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

We systematically remove the `約` character. Please confirm that this is the correct way to handle this. We could imagine that mentioning uncertainty would be less ambiguous in future.

:::

In [None]:
#| exports
class RemoveJapanaseCharCB(Callback):
    "Remove 約 (about) char"
    def _transform_if_about(self, value, about_char='約'):
        if pd.isna(value): return value
        return (value.replace(about_char, '') if str(value).count(about_char) != 0 
                else value)
    
    def __call__(self, tfm): 
        for k in tfm.dfs.keys():
            cols_rdn = [c for c in tfm.dfs[k].columns if ('(Bq/L)' in c) and (tfm.dfs[k][c].dtype == 'object')]
            tfm.dfs[k][cols_rdn] = tfm.dfs[k][cols_rdn].map(self._transform_if_about)

In [None]:
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB()])

tfm()['SEAWATER'].sample(10)

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
21961,T-B1,下層,,,,0.0014,0.0013,,,,...,,,,,,2022/9/13 12:08:00,T-B1,141.216667,37.533333,TEPCO
38200,T-1,上層,,,,,,,,,...,,,,,,2022/06/27 08:40:00,T-1,141.034444,37.431111,TEPCO
38352,T-1,上層,,,,0.65,,0.72,,,...,,,,,,2022/10/20 08:05:00,T-1,141.034444,37.431111,TEPCO
13774,T-D1,上層,,,,,,,,,...,,,,,,2017/11/20 08:37:00,T-D1,141.072222,37.5,TEPCO
21489,T-6,上層,,,,0.0013,0.02,,,,...,,,,,,2022/6/9 11:15:00,T-6,141.040556,37.478889,TEPCO
2346,T-4,上層,,0.77,,0.94,,1.1,,,...,,,,,,2012/1/7 08:00:00,T-4,141.013889,37.241667,TEPCO
12676,T-14,上層,,,,0.0011,0.0084,,,,...,,,,,,2017/4/4 08:33:00,T-14,141.0625,37.552778,TEPCO
12696,T-11,下層,,,,0.0014,0.0076,,,,...,,,,,,2017/4/10 07:18:00,T-11,141.047222,37.241667,TEPCO
30063,T-0-2,,,,,,,,,,...,,,,,,2023/11/17 07:32:00,T-0-2,141.046667,37.423333,TEPCO
34977,T-1,上層,,0.74,,0.73,,0.7,,,...,,,,,,2015/09/24 07:14:00,T-1,141.034444,37.431111,TEPCO


## Fix values range string

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

Value ranges are provided as strings (e.g '4.0E+00<&<8.0E+00' or '1.0～2.7'). We replace them by their mean. Please confirm that this is the correct way to handle this. Again, mentioning uncertainty would be less ambiguous in future.

:::

In [None]:
#| exports
class FixRangeValueStringCB(Callback):
    "Replace range values (e.g '4.0E+00<&<8.0E+00' or '1.0～2.7') by their mean"
    
    def _extract_and_calculate_mean(self, s):
        # For scientific notation ranges
        float_strings = re.findall(r"[+-]?\d+\.?\d*E?[+-]?\d*", s)
        if float_strings:
            float_numbers = np.array(float_strings, dtype=float)
            return float_numbers.mean()
        return s
    
    def _transform_if_range(self, value):
        if pd.isna(value): 
            return value
        value = str(value)
        # Check for both range patterns
        if '<&<' in value or '～' in value:
            return self._extract_and_calculate_mean(value)
        return value

    def __call__(self, tfm): 
        for k in tfm.dfs.keys():
            cols_rdn = [c for c in tfm.dfs[k].columns 
                       if ('(Bq/L)' in c) and (tfm.dfs[k][c].dtype == 'object')]
            tfm.dfs[k][cols_rdn] = tfm.dfs[k][cols_rdn].map(self._transform_if_range).astype(float)

In [None]:
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB()
    ])

df_test = tfm()['SEAWATER']
df_test.sample(10)

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
7623,T-MG0,中層,,,,0.0015,0.0021,,,,...,,,,,,2014/8/4 13:48:00,T-MG0,141.583333,38.633333,TEPCO
6637,T-4,上層,,,0.16,,0.4,,,,...,,,,,,2014/2/12 13:00:00,T-4,141.013889,37.241667,TEPCO
27374,T-0-1,,,,,,,,,,...,,,,,,2024/05/29 06:57:00,T-0-1,141.040278,37.430556,TEPCO
12765,T-S4,下層,,,,0.0013,0.0074,,,,...,,,,,,2017/4/19 05:45:00,T-S4,141.0825,37.428611,TEPCO
33529,T-1,上層,,0.53,,1.3,,1.6,,,...,,,,,,2012/07/02 07:20:00,T-1,141.034444,37.431111,TEPCO
46442,T-A1,上層,,,,,,,,,...,,,,,,2023/12/25 07:18:00,T-A1,141.050761,37.440794,TEPCO
39393,T-1,上層,,,,0.97,,0.9,,,...,,,,,,2024/12/10 07:43:00,T-1,141.034444,37.431111,TEPCO
4422,T-5,下層,,,0.008,,0.014,,,,...,,,,,,2013/1/5 07:59:00,T-5,141.2,37.416667,TEPCO
28399,T-0-1A,,,,,0.6,,0.72,,,...,,,,,,2022/03/21 06:19:00,T-0-1A,141.046667,37.430556,TEPCO
44868,T-2-1,,,1.1,,1.9,,1.4,,,...,,,,,,2013/10/03 05:00:00,T-2-1,37.41,141.03,close1F.csv


## Select columns of interest

We select the columns of interest and in particular the elements of interest, in our case radionuclides.

In [None]:
#| exports
common_coi = ['LON', 'LAT', 'TIME', 'STATION']
nuclides_pattern = '(Bq/L)'

In [None]:
#| exports
class SelectColsOfInterestCB(Callback):
    "Select columns of interest."
    def __init__(self, common_coi, nuclides_pattern): fc.store_attr()
    def __call__(self, tfm):
        nuc_of_interest = [c for c in tfm.dfs['SEAWATER'].columns if nuclides_pattern in c]
        tfm.dfs['SEAWATER'] = tfm.dfs['SEAWATER'][self.common_coi + nuc_of_interest]

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern)
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

Unnamed: 0,LON,LAT,TIME,STATION,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),...,144Ce radioactivity concentration (Bq/L),144Ce detection limit (Bq/L),54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L)
16072,141.040556,37.478889,2019/4/2 10:05:00,T-6,,,,,,,...,,,,,0.3,,,,,
27564,141.046667,37.430556,2014/03/11 10:24:00,T-0-1A,,,,,,,...,,,,,,1.9,,,,
17995,141.072222,37.5,2020/5/18 08:25:00,T-D1,,,,,,,...,,,,,,0.35,,,,
3628,141.133333,38.25,2012/7/26 10:22:00,T-MG4,,,0.0098,,0.017,,...,,,,,,,,,,
16531,141.0,36.966667,2019/7/5 06:34:00,T-20,,,,0.0014,0.0029,,...,,,,,,,,,,


## Reshape: wide to long

So that we can extract information such as nuclide name, unit, derived quantities such as uncertainty, detection limit, ...

In [None]:
#| exports
class WideToLongCB(Callback):
    """
    Get TEPCO nuclide names as values not column names 
    to extract contained information (nuclide name, unc, dl, ...).
    """
    def __init__(self, id_vars=['LON', 'LAT', 'TIME', 'STATION']): 
        fc.store_attr()
        
    def __call__(self, tfm): 
        tfm.dfs['SEAWATER'] = pd.melt(tfm.dfs['SEAWATER'], id_vars=self.id_vars)
#| eval: false

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.head()

Unnamed: 0,LON,LAT,TIME,STATION,variable,value
0,141.026389,37.322222,2011/3/21 23:15:00,T-3,131I radioactivity concentration (Bq/L),1100.0
1,141.013889,37.241667,2011/3/21 23:45:00,T-4,131I radioactivity concentration (Bq/L),660.0
2,141.026389,37.322222,2011/3/22 14:28:00,T-3,131I radioactivity concentration (Bq/L),1100.0
3,141.013889,37.241667,2011/3/22 15:06:00,T-4,131I radioactivity concentration (Bq/L),670.0
4,141.026389,37.322222,2011/3/23 13:51:00,T-3,131I radioactivity concentration (Bq/L),740.0


## Extract

Nulide name, dl, unc, ... are extracted from column names as embedded in TEPCO data source.

### Nuclide name

In [None]:
#| exports
def extract_nuclide(text: str) -> str:
    "Extract the nuclide identifier from a measurement variable name using regex."
    pattern = r'^(Total\s+(?:alpha|beta)|[^\s]+)'
    match = re.match(pattern, text, re.IGNORECASE)
    return match.group(1) if match else text 

For instance:

In [None]:
print(extract_nuclide("Total alpha radioactivity concentration (Bq/L)"))
print(extract_nuclide("131I radioactivity concentration (Bq/L)"))

Total alpha
131I


In [None]:
#| exports
class ExtractNuclideNameCB(Callback):
    "Extract nuclide name from TEPCO data."
    def __init__(self, src_col='variable', dest_col='NUCLIDE'): fc.store_attr()
    def __call__(self, tfm): 
        tfm.dfs['SEAWATER'][self.dest_col] = tfm.dfs['SEAWATER'][self.src_col].map(extract_nuclide)

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

Unnamed: 0,LON,LAT,TIME,STATION,variable,value,NUCLIDE
1376151,37.41,141.03,2014/12/26 06:10:00,T-2-1,140Ba detection limit (Bq/L),,140Ba
1230257,141.033611,37.415833,2020/07/03 07:00:00,T-2,Total beta radioactivity concentration (Bq/L),9.5,Total beta
1139074,37.41,141.03,2016/04/04 06:10:00,T-2-1,Total alpha radioactivity concentration (Bq/L),,Total alpha
1526386,141.072222,37.416667,2013/7/30 08:11:00,T-D5,58Co detection limit (Bq/L),,58Co
1132947,141.033611,37.415833,2012/01/01 08:20:00,T-2,Total alpha radioactivity concentration (Bq/L),,Total alpha


### Unit

In [None]:
#| exports
class ExtractUnitCB(Callback):
    "Extract unit from TEPCO data."
    def __init__(self, src_col='variable', dest_col='UNIT'): fc.store_attr()
    def __call__(self, tfm): 
        tfm.dfs['SEAWATER'][self.dest_col] = tfm.dfs['SEAWATER'][self.src_col].str.extract(r'\((.*?)\)')

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB(),
    ExtractUnitCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

Unnamed: 0,LON,LAT,TIME,STATION,variable,value,NUCLIDE,UNIT
245092,141.072222,37.416667,2014/7/2 10:18:00,T-D5,137Cs detection limit (Bq/L),,137Cs,Bq/L
588346,141.039444,37.265,2020/5/27 05:56:00,T-S5,140La radioactivity concentration (Bq/L),,140La,Bq/L
877200,141.072222,37.5,2022/7/25 08:06:00,T-D1,238Pu radioactivity concentration (Bq/L),,238Pu,Bq/L
961592,141.2,37.416667,2016/4/18 07:57:00,T-5,239Pu+240Pu radioactivity concentration (Bq/L),,239Pu+240Pu,Bq/L
205581,141.040556,37.478889,2018/11/20 10:05:00,T-6,137Cs radioactivity concentration (Bq/L),0.019,137Cs,Bq/L


### Value type
Is it a measurement or derived detection such as detection limit or uncertainty?

In [None]:
#| exports
class ExtractValueTypeCB(Callback):
    "Extract value type from TEPCO data."
    def __init__(self, src_col='variable', dest_col='type'): fc.store_attr()
    def __call__(self, tfm): 
        tfm.dfs['SEAWATER'][self.dest_col] = np.select(
            [
                tfm.dfs['SEAWATER'][self.src_col].str.contains('detection limit', case=False),
                tfm.dfs['SEAWATER'][self.src_col].str.contains('statistical error', case=False)],
            ['DL', 'UNC'],
            default='VALUE'
        )

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB(),
    ExtractUnitCB(),
    ExtractValueTypeCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

Unnamed: 0,LON,LAT,TIME,STATION,variable,value,NUCLIDE,UNIT,type
1705490,141.033611,37.415833,2020/06/14 06:50:00,T-2,144Ce radioactivity concentration (Bq/L),,144Ce,Bq/L,VALUE
482951,141.072167,37.333333,2014/8/18 09:39:00,T-D9,136Cs radioactivity concentration (Bq/L),,136Cs,Bq/L,VALUE
1051781,141.583333,38.233333,2013/11/22 09:24:00,T-MG3,239Pu+240Pu detection limit (Bq/L),,239Pu+240Pu,Bq/L,DL
1824738,140.763889,36.713889,2020/10/29 08:40:00,T-A,54Mn detection limit (Bq/L),,54Mn,Bq/L,DL
1154352,141.047222,37.241667,2017/11/13 10:01:00,T-11,Total alpha detection limit (Bq/L),,Total alpha,Bq/L,DL


## Reshape: long to wide
Send `type` column to columns names (`VALUE`, `DL`, `UNC`)


In [None]:
#| exports
class LongToWideCB(Callback):
    "Reshape: long to wide"
    def __init__(self, src_col='variable', dest_col='type'): fc.store_attr()
    def __call__(self, tfm): 
        tfm.dfs['SEAWATER'] = pd.pivot_table(
            tfm.dfs['SEAWATER'],
            values='value',
            index=['LON', 'LAT', 'TIME', 'STATION', 'NUCLIDE', 'UNIT'],
            columns='type',
            aggfunc='first'
        ).reset_index()
        tfm.dfs['SEAWATER'].reset_index(inplace=True)
        tfm.dfs['SEAWATER'].rename(columns={'index': 'ID'}, inplace=True)

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB(),
    ExtractUnitCB(),
    ExtractValueTypeCB(),
    LongToWideCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
9810,9810,140.837222,35.796111,2011/6/7 07:52:00,T-E,134Cs,Bq/L,15.0,,
74464,74464,141.072222,37.5,2017/7/25 08:24:00,T-D1,137Cs,Bq/L,,,0.005
13276,13276,141.013889,37.241667,2011/12/18 08:05:00,T-4,137Cs,Bq/L,,,1.1
20554,20554,141.033611,37.415833,2012/04/20 08:40:00,T-2,137Cs,Bq/L,1.6,,
73525,73525,141.072222,37.5,2012/4/10 09:20:00,T-D1,Total alpha,Bq/L,3.2,,


## Remap `UNIT` name to MARIS nomenclature

In [None]:
#| exports
unit_mapping = {'Bq/L': 3}

In [None]:
#| exports
class RemapUnitNameCB(Callback):
    """
    Remap `UNIT` name to MARIS id.
    """
    def __init__(self, unit_mapping): fc.store_attr()
    def __call__(self, tfm):
        tfm.dfs['SEAWATER']['UNIT'] = tfm.dfs['SEAWATER']['UNIT'].map(self.unit_mapping)


In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB(),
    ExtractUnitCB(),
    ExtractValueTypeCB(),
    LongToWideCB(),
    RemapUnitNameCB(unit_mapping)
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
34619,34619,141.034444,37.431111,2012/09/10 08:20:00,T-1,3H,3,,,3.1
85479,85479,141.283333,38.333333,2022/7/19 09:37:00,T-MG1,134Cs,3,0.00099,,
23348,23348,141.033611,37.415833,2017/11/09 06:55:00,T-2,137Cs,3,0.58,,
51333,51333,141.040278,37.430556,2015/11/02 08:12:00,T-0-1,137Cs,3,0.71,,
25215,25215,141.033611,37.415833,2019/01/31 06:55:00,T-2,134Cs,3,0.73,,


## Remap `NUCLIDE` name to MARIS nomenclature

In [None]:
#| exports
nuclide_mapping = {
    '131I': 29,
    '134Cs': 31,
    '137Cs': 33,
    '125Sb': 24,
    'Total beta': 103,
    '238Pu': 67,
    '239Pu+240Pu': 77,
    '3H': 1,
    '89Sr': 11,
    '90Sr': 12,
    'Total alpha': 104,
    '132I': 100,
    '136Cs': 102,
    '58Co': 8,
    '105Ru': 97,
    '106Ru': 17,
    '140La': 35,
    '140Ba': 34,
    '132Te': 99,
    '60Co': 9,
    '144Ce': 37,
    '54Mn': 6
}

In [None]:
#| exports
class RemapNuclideNameCB(Callback):
    "Remap `NUCLIDE` name to MARIS id."
    def __init__(self, nuclide_mapping): fc.store_attr()
    def __call__(self, tfm):
        tfm.dfs['SEAWATER']['NUCLIDE'] = tfm.dfs['SEAWATER']['NUCLIDE'].map(self.nuclide_mapping)

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB(),
    ExtractUnitCB(),
    ExtractValueTypeCB(),
    LongToWideCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping)
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
70314,70314,141.072167,37.333333,2020/12/16 08:13:00,T-D9,31,3,0.0012,,
44916,44916,141.034444,37.431111,2022/03/25 08:08:00,T-1,31,3,0.75,,
25513,25513,141.033611,37.415833,2019/04/15 08:15:00,T-2,33,3,0.58,,0.089
82709,82709,141.216667,37.533333,2014/2/25 06:31:00,T-B1,31,3,,,0.0049
63751,63751,141.047222,37.241667,2011/7/17 06:45:00,T-11,33,3,25.0,,


## Remap `DL` value to MARIS nomenclature

We remap `DL` (Detection Limit) value to MARIS ids as follows:
    
- if a `DL` value is reported with assign `2` (Detection limit or '<')
- if a `DL` value is not reported with assign `1` (Detected value or '=')

In [None]:
#| exports
class RemapDLCB(Callback):
    "Remap `DL` name to MARIS id."
    def __init__(self): fc.store_attr()
    def dl_mapping(self, value): return 2 if pd.isna(value) else 1
    def __call__(self, tfm): 
        tfm.dfs['SEAWATER']['DL'] = tfm.dfs['SEAWATER']['DL'].map(self.dl_mapping)

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB(),
    ExtractUnitCB(),
    ExtractValueTypeCB(),
    LongToWideCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(10)

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
24094,24094,141.033611,37.415833,2018/05/11 08:25:00,T-2,29,3,1,,
86077,86077,141.583333,38.233333,2014/4/15 08:43:00,T-MG3,33,3,2,,0.003
59264,59264,141.046667,37.423333,2017/08/21 07:35:00,T-0-2,33,3,1,,
59874,59874,141.046667,37.423333,2020/08/24 06:54:00,T-0-2,103,3,1,,
18714,18714,141.026389,37.322222,2024/1/4 11:35:00,T-3,1,3,1,,
52168,52168,141.040278,37.430556,2019/12/09 06:54:00,T-0-1,31,3,1,,
56403,56403,141.046667,37.416111,2015/12/14 08:18:00,T-0-3A,1,3,1,,
24091,24091,141.033611,37.415833,2018/05/10 08:25:00,T-2,31,3,1,,
14052,14052,141.013889,37.241667,2012/2/8 08:10:00,T-4,29,3,1,,
80709,80709,141.2,37.416667,2013/6/26 08:50:00,T-5,33,3,2,,0.0045


## Parse & encode time

In [None]:
#| exports
class ParseTimeCB(Callback):
    "Parse time column from TEPCO."
    def __init__(self, time_name='TIME'): fc.store_attr()
    def __call__(self, tfm):
        tfm.dfs['SEAWATER'][self.time_name] = pd.to_datetime(tfm.dfs['SEAWATER'][self.time_name], 
                                                             format='%Y/%m/%d %H:%M:%S', errors='coerce')

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB(),
    ExtractUnitCB(),
    ExtractValueTypeCB(),
    LongToWideCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB(),
    ParseTimeCB(),
    EncodeTimeCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)




type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
84018,84018,141.25,38.166667,1453452900,T-MG5,33,3,2,,0.0053
32547,32547,141.033611,37.415833,1736404080,T-2,33,3,1,,
66062,66062,141.050761,37.424686,1661152980,T-A2,31,3,1,,
87647,87647,141.583333,38.633333,1704877020,T-MG0,33,3,2,,0.0019
64051,64051,141.047222,37.241667,1375868100,T-11,31,3,2,,0.024


## Sanitize coordinates

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB(),
    ExtractUnitCB(),
    ExtractValueTypeCB(),
    LongToWideCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

df_test = tfm()['SEAWATER']
df_test.sample(5)



type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
81202,81202,141.2,37.416667,1460704020,T-5,31,3,1,,
73470,73470,141.072222,37.5,1589790300,T-D1,103,3,1,,
6676,6676,140.665556,36.506389,1326358260,T-B,33,3,1,,
65334,65334,141.047222,37.311111,1639633620,T-S7,33,3,2,,0.014
81523,81523,141.2,37.416667,1543908540,T-5,33,3,2,,0.0029


## Encode to NetCDF

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    ExtractNuclideNameCB(),
    ExtractUnitCB(),
    ExtractValueTypeCB(),
    LongToWideCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

dfs_tfm = tfm()
tfm.logs



['Assign `NaN` to values equal to `ND` (not detected) - to be confirmed ',
 'Remove 約 (about) char',
 "Replace range values (e.g '4.0E+00<&<8.0E+00' or '1.0～2.7') by their mean",
 'Select columns of interest.',
 '\n    Get TEPCO nuclide names as values not column names \n    to extract contained information (nuclide name, unc, dl, ...).\n    ',
 'Extract nuclide name from TEPCO data.',
 'Extract unit from TEPCO data.',
 'Extract value type from TEPCO data.',
 'Reshape: long to wide',
 '\n    Remap `UNIT` name to MARIS id.\n    ',
 'Remap `NUCLIDE` name to MARIS id.',
 'Remap `DL` name to MARIS id.',
 'Parse time column from TEPCO.',
 'Encode time as seconds since epoch.',
 'Drop rows with invalid longitude & latitude values. Convert `,` separator to `.` separator.']

In [None]:
dfs_tfm['SEAWATER'].sample(10)

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
81323,81323,141.2,37.416667,1510561380,T-5,33,3,2,,0.0017
73053,73053,141.072222,37.416667,1661157360,T-D5,1,3,1,,
25538,25538,141.033611,37.415833,1555916400,T-2,33,3,1,,0.033
18353,18353,141.026389,37.322222,1641909900,T-3,31,3,1,,
49488,49488,141.040278,37.416111,1550472600,T-0-3,1,3,1,,
85353,85353,141.283333,38.333333,1602065280,T-MG1,31,3,1,,
79748,79748,141.2,37.233333,1449560640,T-7,31,3,1,,
43041,43041,141.034444,37.431111,1578384600,T-1,33,3,1,,
43844,43844,141.034444,37.431111,1608103200,T-1,31,3,1,,
43390,43390,141.034444,37.431111,1591173900,T-1,31,3,1,,


In [None]:
#| exports
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| exports
def get_attrs(tfm, zotero_key, kw=kw):
    "Retrieve global attributes from MARIS dump."
    return GlobAttrsFeeder(tfm.dfs, cbs=[
        BboxCB(),
        TimeRangeCB(),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))
        ])()

In [None]:
#| eval: false
get_attrs(tfm, zotero_key='JEV6HP5A', kw=kw)

{'geospatial_lat_min': '141.66666667',
 'geospatial_lat_max': '38.63333333',
 'geospatial_lon_min': '140.60388889',
 'geospatial_lon_max': '35.79611111',
 'geospatial_bounds': 'POLYGON ((140.60388889 35.79611111, 141.66666667 35.79611111, 141.66666667 38.63333333, 140.60388889 38.63333333, 140.60388889 35.79611111))',
 'time_coverage_start': '2011-03-21T14:30:00',
 'time_coverage_end': '2025-01-25T07:24:00',
 'id': 'JEV6HP5A',
 'title': "Readings of Sea Area Monitoring - Monitoring of sea water - Sea area close to TEPCO's Fukushima Daiichi NPS / Coastal area - Readings of Sea Area Monitoring [TEPCO]",
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "", "lastName": "TEPCO - Tokyo Electric Power Company"}]',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Mar

In [None]:
#| exports
def encode(
    fname_out: str, # Path to the folder where the NetCDF output will be saved
    **kwargs # Additional keyword arguments
    ):
    "Encode TEPCO data to NetCDF."
    dfs = load_data(fname_coastal_water, fname_clos1F, fname_iaea_orbs)
    
    tfm = Transformer(dfs, cbs=[
        FixMissingValuesCB(),
        RemoveJapanaseCharCB(),
        FixRangeValueStringCB(),
        SelectColsOfInterestCB(common_coi, nuclides_pattern),
        WideToLongCB(),
        ExtractNuclideNameCB(),
        ExtractUnitCB(),
        ExtractValueTypeCB(),
        LongToWideCB(),
        RemapUnitNameCB(unit_mapping),
        RemapNuclideNameCB(nuclide_mapping),
        RemapDLCB(),
        ParseTimeCB(),
        EncodeTimeCB(),
        SanitizeLonLatCB()
    ])        
    tfm()
    encoder = NetCDFEncoder(tfm.dfs, 
                            dest_fname=fname_out, 
                            global_attrs=get_attrs(tfm, zotero_key='JEV6HP5A', kw=kw),
                            verbose=kwargs.get('verbose', False)
                            )
    encoder.encode()

In [None]:
#| eval: false
encode(fname_out, verbose=False)

100%|██████████| 11/11 [00:04<00:00,  2.29it/s]
100%|██████████| 11/11 [00:05<00:00,  2.01it/s]




In [None]:
#| eval: false
decode(fname_in=fname_out, verbose=True)

Saved SEAWATER to ../../_data/output/tepco_SEAWATER.csv
